diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake
index 92c3165fbaa904..a166e43c7b95ea 100644
--- a/cmake/external/libmct.cmake
+++ b/cmake/external/libmct.cmake
@@ -45,7 +45,7 @@ ExternalProject_Add(
   PREFIX ${LIBMCT_PREFIX_DIR}
   DOWNLOAD_DIR ${LIBMCT_DOWNLOAD_DIR}
   DOWNLOAD_COMMAND wget --no-check-certificate ${LIBMCT_URL} -c -q -O ${LIBMCT_NAME}.tar.gz
-    && tar zxvf ${LIBMCT_NAME}.tar.gz
+    && tar --no-same-owner -zxvf ${LIBMCT_NAME}.tar.gz
   DOWNLOAD_NO_PROGRESS 1
   UPDATE_COMMAND ""
   CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBMCT_INSTALL_ROOT}
diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake
index 47bbfee57451c6..f1d206dd5e1992 100644
--- a/cmake/external/lite.cmake
+++ b/cmake/external/lite.cmake
@@ -86,7 +86,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   GIT_REPOSITORY "${GIT_URL}/PaddlePaddle/Paddle-Lite.git"
   GIT_TAG ${LITE_GIT_TAG}
   PREFIX ${LITE_PREFIX_DIR}
-  PATCH_COMMAND mkdir -p ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/gen_code && touch ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/gen_code/__generated_code__.cc && sed -i "/aarch64-linux-gnu-gcc/d" ${LITE_PREFIX_DIR}/src/extern_lite/cmake/cross_compiling/armlinux.cmake && sed -i "/aarch64-linux-gnu-g++/d" ${LITE_PREFIX_DIR}/src/extern_lite/cmake/cross_compiling/armlinux.cmake
+  PATCH_COMMAND mkdir -p ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/gen_code && touch ${LITE_PREFIX_DIR}/src/extern_lite-build/lite/gen_code/__generated_code__.cc && sed -i "/aarch64-linux-gnu-gcc/d" ${LITE_PREFIX_DIR}/src/extern_lite/cmake/os/armlinux.cmake && sed -i "/aarch64-linux-gnu-g++/d" ${LITE_PREFIX_DIR}/src/extern_lite/cmake/os/armlinux.cmake
   UPDATE_COMMAND ""
   BUILD_COMMAND ${LITE_BUILD_COMMAND}
   INSTALL_COMMAND ""
diff --git a/cmake/external/poplar.cmake b/cmake/external/poplar.cmake
index 7a8fa3ef5d710a..8b2de14e966201 100644
--- a/cmake/external/poplar.cmake
+++ b/cmake/external/poplar.cmake
@@ -12,6 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+macro(find_popart_version popart_version_file)
+  file(READ ${popart_version_file} popart_version_file_content)
+  string(REGEX MATCH "(POPART_VERSION_STRING)[ \t\r\n](\")([0-9]+\.[0-9]+\.[0-9]+)(\\+)([A-Za-z0-9_]*)(\")" POPART_VERSION ${popart_version_file_content})
+  string(REPLACE "POPART_VERSION_STRING" "" POPART_VERSION "${POPART_VERSION}")
+  string(REPLACE "\"" "" POPART_VERSION "${POPART_VERSION}")
+  string(REPLACE " " "" POPART_VERSION "${POPART_VERSION}")
+  if(NOT POPART_VERSION)
+    set(POPART_VERSION "Unknown version")
+  else()
+    message(STATUS "Current PopART version is ${POPART_VERSION}")
+  endif()
+endmacro()
+
 
 if(WITH_IPU)
   set(POPLAR_DIR CACHE PATH "Path to a Poplar install")
   set(POPART_DIR CACHE PATH "Path to a Popart install")
@@ -64,6 +77,8 @@ if(WITH_IPU)
     message(FATAL_ERROR "You must provide a path to a Popart build using -DPOPART_DIR=/path/to/popart/build")
   endif()
 
+  find_popart_version("${POPART_DIR}/include/popart/version.hpp")
+
   add_definitions(-DONNX_NAMESPACE=onnx)
   add_custom_target(extern_poplar DEPENDS poplar popart-only)
 endif()
diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index e83bdef327891a..2b84def46520f5 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -36,7 +36,7 @@ ENDIF()
 
 if(NOT DEFINED XPU_BASE_URL)
   SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220402")
+  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220408")
 else()
   SET(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index e3e6e1cced2aa0..1b38f208716b37 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -398,7 +398,8 @@ function(version version_file)
       "WITH_GPU: ${WITH_GPU}\n"
       "WITH_ROCM: ${WITH_ROCM}\n"
       "WITH_ASCEND_CL: ${WITH_ASCEND_CL}\n"
-      "WITH_ASCEND_CXX11: ${WITH_ASCEND_CXX11}\n")
+      "WITH_ASCEND_CXX11: ${WITH_ASCEND_CXX11}\n"
+      "WITH_IPU: ${WITH_IPU}\n")
   if(WITH_GPU)
     file(APPEND ${version_file}
       "CUDA version: ${CUDA_VERSION}\n"
@@ -414,6 +415,10 @@ function(version version_file)
       "Ascend Toolkit version: ${ASCEND_TOOLKIT_VERSION}\n"
       "Ascend Driver version: ${ASCEND_DRIVER_VERSION}\n")
   endif()
+  if(WITH_IPU)
+    file(APPEND ${version_file}
+         "PopART version: ${POPART_VERSION}\n")
+  endif()
   file(APPEND ${version_file} "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n")
   if(TENSORRT_FOUND)
     file(APPEND ${version_file}
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
index eeb5e3b397c10e..b1d892e2521a39 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -110,7 +110,8 @@ void ProcessGroupNCCL::BroadcastUniqueNCCLID(
     std::vector<ncclUniqueId>& nccl_ids) {  // NOLINT
   if (rank_ == 0) {
     for (size_t i = 0; i < nccl_ids.size(); i++) {
-      auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(i);
+      auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(gid_) + "/" +
+                 std::to_string(i);
       auto nccl_id = std::vector<uint8_t>(
           reinterpret_cast<uint8_t*>(&nccl_ids[i]),
           reinterpret_cast<uint8_t*>(&nccl_ids[i]) + NCCL_UNIQUE_ID_BYTES);
@@ -118,7 +119,8 @@ void ProcessGroupNCCL::BroadcastUniqueNCCLID(
     std::vector<ncclUniqueId>& nccl_ids) {  // NOLINT
   } else {
     for (size_t i = 0; i < nccl_ids.size(); i++) {
-      auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(i);
+      auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(gid_) + "/" +
+                 std::to_string(i);
       auto ret = store_->get(key);
       std::memcpy(&nccl_ids[i], ret.data(),
ret.size()); } diff --git a/paddle/fluid/distributed/common/sparse_sharding_merge.h b/paddle/fluid/distributed/common/sparse_sharding_merge.h deleted file mode 100644 index 147403d08e6be8..00000000000000 --- a/paddle/fluid/distributed/common/sparse_sharding_merge.h +++ /dev/null @@ -1,310 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once -#include - -#include -#include -#include -#include // NOLINT -#include - -#include -#include "glog/logging.h" -#include "paddle/fluid/distributed/common/utils.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/string/split.h" -#include "paddle/phi/core/utils/dim.h" - -constexpr int FG = 256 * 1024 * 1024; -constexpr int Q_SIZE = 10000; -constexpr int BUCKET = 10; -constexpr char XEOF[] = "EOF"; - -inline double GetCurrentUS() { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; -} - -namespace paddle { -namespace distributed { - -class ShardingMerge { - public: - ShardingMerge() {} - ~ShardingMerge() {} - - void Merge(const std::vector &inputs, - const std::vector &feasigns, const std::string &output, - const int embedding_dim) { - pool_.reset(new ::ThreadPool(inputs.size())); - - std::vector> tasks(inputs.size()); - std::vector> rows; - rows.resize(inputs.size()); - - auto begin = GetCurrentUS(); - for (int x = 0; x < inputs.size(); ++x) { - tasks[x] = pool_->enqueue([this, x, &rows, &inputs, &feasigns]() -> int { - DeserializeRowsFromFile(inputs[x], feasigns[x], &rows[x]); - return 0; - }); - } - - for (size_t x = 0; x < tasks.size(); ++x) { - tasks[x].wait(); - } - - int64_t total_rows = 0; - for (auto x = 0; x < rows.size(); x++) { - total_rows += rows[x].size(); - } - - auto end = GetCurrentUS(); - - VLOG(0) << "got " << total_rows - << " feasigin ids from sparse embedding using " << end - begin; - - std::vector total_dims = {total_rows, - static_cast(embedding_dim)}; - - std::vector> batch_buckets; - batch_buckets.resize(inputs.size()); - - for (int x = 0; x < rows.size(); ++x) { - batch_buckets[x] = bucket(rows[x].size(), BUCKET); - } - - std::ofstream out(output, std::ios::binary); - - begin = GetCurrentUS(); - SerializeRowsToStream(out, rows, batch_buckets, total_rows); - end = GetCurrentUS(); - VLOG(0) << "write rows to oostrream using " << end - begin; - - begin = GetCurrentUS(); - SerializePreTensorToStream(out, total_dims); - end = GetCurrentUS(); - VLOG(0) << "write pretensor to oostrream using " << end - begin; - - begin = GetCurrentUS(); - SerializeValueToStream(out, inputs, batch_buckets, embedding_dim); - end = GetCurrentUS(); - VLOG(0) << "write values to oostrream using " << end - begin; - } - - private: - void SerializeRowsToStream(std::ostream &os, - const std::vector> &rows, - const std::vector> &batch_buckets, - int64_t 
total_rows) { - { // the 1st field, uint32_t version - constexpr uint32_t version = 0; - os.write(reinterpret_cast(&version), sizeof(version)); - } - - { - // the 2st field, rows information - os.write(reinterpret_cast(&total_rows), sizeof(total_rows)); - - for (int b = 0; b < BUCKET; ++b) { - for (int x = 0; x < batch_buckets.size(); ++x) { - auto begin = batch_buckets[x][b]; - auto end = batch_buckets[x][b + 1]; - - if (end - begin == 0) continue; - - os.write(reinterpret_cast(rows[x].data() + begin), - sizeof(int64_t) * (end - begin)); - } - } - - // the 3st field, the height of SelectedRows - int64_t height = total_rows; - os.write(reinterpret_cast(&height), sizeof(height)); - } - } - - void SerializePreTensorToStream(std::ostream &os, - const std::vector &dims) { - { // the 1st field, uint32_t version - constexpr uint32_t version = 0; - os.write(reinterpret_cast(&version), sizeof(version)); - } - { // the 2nd field, tensor description - // int32_t size - framework::proto::VarType::TensorDesc desc; - desc.set_data_type(framework::proto::VarType::FP32); - auto *pb_dims = desc.mutable_dims(); - pb_dims->Resize(static_cast(dims.size()), 0); - std::copy(dims.begin(), dims.end(), pb_dims->begin()); - int32_t size = desc.ByteSize(); - os.write(reinterpret_cast(&size), sizeof(size)); - auto out = desc.SerializeAsString(); - os.write(out.data(), size); - } - } - - void SerializeValueToVec(std::ifstream &in, const int batch, - const int embedding_dim, std::vector *out) { - auto queue = - std::make_shared>>(); - - auto read = [batch, &in, &queue]() { - std::string line; - std::vector columns; - std::vector values_str; - - int count = 0; - - while (std::getline(in, line)) { - ++count; - columns = string::Split(line, '\t'); - - if (columns.size() != 5) { - VLOG(0) << "unexpected line: " << line << ", skip it"; - continue; - } - - values_str = string::Split(columns[4], ','); - queue->Push(values_str); - - if (count >= batch) { - break; - } - } - queue->Push({}); - }; - - auto write = [embedding_dim, &out, &queue]() { - std::vector values_str; - std::string line; - - while (true) { - queue->Pop(&values_str); - - if (values_str.size() == 0) { - break; - } - - for (int x = 0; x < embedding_dim; ++x) { - float v = 0.0; - try { - v = std::stof(values_str[x]); - } catch (std::invalid_argument &e) { - VLOG(0) << " get unexpected line: " << line; - } catch (std::out_of_range &e) { - VLOG(0) << " get unexpected line: " << line; - } - out->push_back(v); - } - } - }; - - std::thread p_read(read); - std::thread p_write(write); - p_read.join(); - p_write.join(); - } - - void SerializeVecToStream(std::ostream &out, - const std::vector &value) { - out.write(reinterpret_cast(value.data()), - static_cast(sizeof(float) * value.size())); - } - - void SerializeValueToStream( - std::ostream &out, const std::vector &ins, - const std::vector> &batch_buckets, - const int embedding_dim) { - std::vector> in_streams; - - for (int x = 0; x < ins.size(); ++x) { - in_streams.emplace_back(std::make_shared(ins[x])); - } - - std::vector> tasks(ins.size()); - - for (int b = 0; b < BUCKET; ++b) { - std::vector> values; - values.resize(tasks.size()); - - auto begin = GetCurrentUS(); - - for (int x = 0; x < tasks.size(); ++x) { - auto batch = batch_buckets[x][b + 1] - batch_buckets[x][b]; - values[x].clear(); - values[x].reserve(batch * embedding_dim); - } - - for (int x = 0; x < tasks.size(); ++x) { - tasks[x] = - pool_->enqueue([this, b, x, &out, &in_streams, &batch_buckets, - &values, embedding_dim]() -> int { - auto batch = 
batch_buckets[x][b + 1] - batch_buckets[x][b]; - if (batch == 0) return 0; - SerializeValueToVec(*(in_streams[x].get()), batch, embedding_dim, - &values[x]); - return 0; - }); - } - - for (size_t x = 0; x < tasks.size(); ++x) { - tasks[x].wait(); - } - - auto end = GetCurrentUS(); - - auto begin1 = GetCurrentUS(); - for (size_t x = 0; x < tasks.size(); ++x) { - SerializeVecToStream(out, values[x]); - } - auto end1 = GetCurrentUS(); - - VLOG(0) << "serialize buckets " << b << " read using " << end - begin - << ", to oostream using " << end1 - begin1; - } - } - - void DeserializeRowsFromFile(const std::string &input_file, - const int64_t feasigns, - std::vector *rows) { - std::string line; - std::vector columns; - std::ifstream file(input_file); - - rows->reserve(feasigns); - - while (std::getline(file, line)) { - columns = string::Split(line, '\t'); - if (columns.size() != 5) { - VLOG(0) << "unexpected line: " << line << ", skip it"; - continue; - } - rows->push_back(std::stoull(columns[0])); - } - - VLOG(0) << "parse " << rows->size() << " embedding rows from " - << input_file; - } - - private: - std::unique_ptr<::ThreadPool> pool_; -}; -} // namespace distributed -} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt index 4a2dfcb554ad33..977a125627ba54 100644 --- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt @@ -13,7 +13,7 @@ endif() cc_library(task_loop_thread_pool SRCS task_loop_thread_pool.cc task_loop_thread.cc task_loop.cc DEPS enforce glog) cc_library(fleet_executor SRCS fleet_executor.cc carrier.cc task_node.cc runtime_graph.cc dist_model.cc interceptor.cc - compute_interceptor.cc amplifier_interceptor.cc source_interceptor.cc message_service.cc message_bus.cc dist_model_tensor_wrapper.cc + compute_interceptor.cc amplifier_interceptor.cc source_interceptor.cc sink_interceptor.cc message_service.cc message_bus.cc dist_model_tensor_wrapper.cc DEPS proto_desc fleet_executor_desc_proto interceptor_message_proto task_loop_thread_pool collective_helper op_registry executor_gc_helper gflags glog ${BRPC_DEPS}) @@ -26,6 +26,7 @@ if(WITH_DISTRIBUTE) set_source_files_properties(compute_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(amplifier_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(source_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(sink_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(message_bus.h PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(message_bus.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(fleet_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 358393d97f0710..2d2a3b688fefed 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -31,6 +31,7 @@ namespace distributed { USE_INTERCEPTOR(Source); USE_INTERCEPTOR(Compute); USE_INTERCEPTOR(Amplifier); +USE_INTERCEPTOR(Sink); void Carrier::Init( int64_t rank, diff --git a/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc b/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc 
new file mode 100644
index 00000000000000..af707c28acd9e9
--- /dev/null
+++ b/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc
@@ -0,0 +1,65 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/fleet_executor/sink_interceptor.h"
+#include "paddle/fluid/distributed/fleet_executor/task_node.h"
+
+namespace paddle {
+namespace distributed {
+
+SinkInterceptor::SinkInterceptor(int64_t interceptor_id, TaskNode* node)
+    : Interceptor(interceptor_id, node), max_run_times_(node->max_run_times()) {
+  // prepare the upstream running status
+  for (const auto& up : node->upstream()) {
+    upstream_step_.emplace(up.first, 0);
+  }
+  RegisterMsgHandle([this](const InterceptorMessage& msg) { Run(msg); });
+}
+
+void SinkInterceptor::StopCarrierIfComplete() {
+  bool flag = true;
+  for (const auto& up : upstream_step_) {
+    flag = flag & (up.second == max_run_times_);
+  }
+  if (flag) {
+    VLOG(3) << "Sink Interceptor is stopping carrier";
+    StopCarrier();
+    for (const auto& up : upstream_step_) {
+      upstream_step_.at(up.first) = 0;
+    }
+  }
+}
+
+void SinkInterceptor::ReplyCompletedToUpStream(int64_t upstream_id) {
+  int64_t micro_step = upstream_step_.at(upstream_id);
+  int64_t scope_idx = micro_step % max_run_times_;
+  InterceptorMessage msg;
+  msg.set_message_type(DATA_IS_USELESS);
+  msg.set_scope_idx(scope_idx);
+  Send(upstream_id, msg);
+  upstream_step_.at(upstream_id) = micro_step + 1;
+  if (micro_step == max_run_times_ - 1) {
+    StopCarrierIfComplete();
+  }
+}
+
+void SinkInterceptor::Run(const InterceptorMessage& msg) {
+  if (msg.message_type() == DATA_IS_READY) {
+    ReplyCompletedToUpStream(msg.src_id());
+  }
+}
+
+REGISTER_INTERCEPTOR(Sink, SinkInterceptor);
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/fleet_executor/sink_interceptor.h b/paddle/fluid/distributed/fleet_executor/sink_interceptor.h
new file mode 100644
index 00000000000000..cb1d698a78526f
--- /dev/null
+++ b/paddle/fluid/distributed/fleet_executor/sink_interceptor.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/distributed/fleet_executor/interceptor.h"
+
+namespace paddle {
+namespace distributed {
+
+/*
+ * Sink interceptor
+ * There is only one sink in the runtime graph
+ * Take charge of:
+ * 1. record the num of micro-step
+ * 2. check whether to notify carrier the current step is finished
+ */
+class SinkInterceptor : public Interceptor {
+ public:
+  SinkInterceptor(int64_t interceptor_id, TaskNode* node);
+
+ private:
+  void ReplyCompletedToUpStream(int64_t up_id);
+  void Run(const InterceptorMessage& msg);
+  void StopCarrierIfComplete();
+  int64_t max_run_times_;
+  // upstream_id->cur_step
+  std::map<int64_t, int64_t> upstream_step_;
+};
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt
index 33c08acd4498df..e0db8a261b5859 100644
--- a/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt
+++ b/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt
@@ -7,6 +7,9 @@ cc_test(compute_interceptor_test SRCS compute_interceptor_test.cc DEPS fleet_exe
 set_source_files_properties(source_interceptor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 cc_test(source_interceptor_test SRCS source_interceptor_test.cc DEPS fleet_executor ${BRPC_DEPS})
 
+set_source_files_properties(sink_interceptor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(sink_interceptor_test SRCS sink_interceptor_test.cc DEPS fleet_executor ${BRPC_DEPS})
+
 set_source_files_properties(interceptor_pipeline_short_path_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 cc_test(interceptor_pipeline_short_path_test SRCS interceptor_pipeline_short_path_test.cc DEPS fleet_executor ${BRPC_DEPS})
diff --git a/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc
new file mode 100644
index 00000000000000..6b1a555e987a38
--- /dev/null
+++ b/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc
@@ -0,0 +1,89 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include
+
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/distributed/fleet_executor/carrier.h"
+#include "paddle/fluid/distributed/fleet_executor/global.h"
+#include "paddle/fluid/distributed/fleet_executor/interceptor.h"
+#include "paddle/fluid/distributed/fleet_executor/message_bus.h"
+#include "paddle/fluid/distributed/fleet_executor/task_node.h"
+
+namespace paddle {
+namespace distributed {
+
+class FakeInterceptor : public Interceptor {
+ public:
+  FakeInterceptor(int64_t interceptor_id, TaskNode* node)
+      : Interceptor(interceptor_id, node) {
+    RegisterMsgHandle([this](const InterceptorMessage& msg) { NOP(msg); });
+  }
+
+  void NOP(const InterceptorMessage& msg) {
+    if (msg.message_type() == DATA_IS_READY) {
+      std::cout << "FakeInterceptor run in scope " << msg.scope_idx()
+                << std::endl;
+      InterceptorMessage reply;
+      reply.set_message_type(DATA_IS_USELESS);
+      Send(-1, reply);
+      InterceptorMessage ready;
+      ready.set_message_type(DATA_IS_READY);
+      Send(-2, ready);
+    } else if (msg.message_type() == DATA_IS_USELESS) {
+      std::cout << "FakeInterceptor remove result in scope " << msg.scope_idx()
+                << std::endl;
+    }
+  }
+
+ private:
+  int64_t step_;
+};
+
+TEST(SourceInterceptor, Source) {
+  std::string carrier_id = "0";
+  Carrier* carrier =
+      GlobalMap<std::string, Carrier>::Create(carrier_id, carrier_id);
+  carrier->Init(0, {{-1, 0}, {0, 0}, {-2, 0}});
+
+  MessageBus* msg_bus = GlobalVal<MessageBus>::Create();
+  msg_bus->Init(0, {{0, "127.0.0.0:0"}}, "");
+
+  // NOTE: don't delete, otherwise interceptor will use undefined node
+  TaskNode* source = new TaskNode(0, -1, 0, 3, 0);  // role, rank, task_id
+  TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0);   // role, rank, task_id
+  TaskNode* sink = new TaskNode(0, -2, 0, 3, 0);    // role, rank, task_id
+
+  source->AddDownstreamTask(0, 1);
+  node_a->AddUpstreamTask(-1, 1);
+  node_a->AddDownstreamTask(-2, 1);
+  sink->AddUpstreamTask(0, 1);
+  carrier->SetInterceptor(-1, InterceptorFactory::Create("Source", -1, source));
+  carrier->SetInterceptor(0, std::make_unique<FakeInterceptor>(0, node_a));
+  carrier->SetInterceptor(-2, InterceptorFactory::Create("Sink", -2, sink));
+
+  // start
+  InterceptorMessage msg;
+  msg.set_message_type(START);
+  msg.set_dst_id(-1);
+  carrier->EnqueueInterceptorMessage(msg);
+
+  carrier->Wait();
+  carrier->Release();
+}
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/ps/README.md b/paddle/fluid/distributed/ps/README.md
index d287dcd1111982..afa6d60a4e0bbf 100755
--- a/paddle/fluid/distributed/ps/README.md
+++ b/paddle/fluid/distributed/ps/README.md
@@ -1,3 +1,39 @@
 # 目录说明
 
-> 成型之后，上级目录的 table、thirdparty、table、service 目录可以删除，communicator_common.h 、fleet.cc、fleet.h 删除
+Table: for param storage and update
+-----MemorySparseTable: table for sparse param, used in cpu async mode
+-----MemoryDenseTable: table for dense param, used in cpu async/geo mode
+-----MemorySparseGeoTable: table for sparse param, used in cpu async mode
+-----CommonGraphTable: table used for graph learning
+-----BarrierTable: table for barrier function, used in cpu sync mode
+-----TensorTable: table which run program, used for learning rate decay only
+
+ValueAccessor: for pull param and push gradient
+-----CtrCommonAccessor: pull/push value with show/click, float type
+-----DownpourCtrDoubleAccessor: same as CtrCommonAccessor, other than show/click with double type
+-----SparseAccessor: used for common embedding, pull value without show/click, push value with show/click
+-----CommMergeAccessor: used for dense table only, for get param dim
+
+PsService(proto): for server to handle request +-----PsBaseService +----------BrpcPsService: for cpu dnn training task +----------GraphBrpcService: for graph learning +-----HeterService: for dnn training task with heterogeneous computing resources + +PSServer: recv request from trainer and handle it by service +-----BrpcPsServer: for cpu dnn training task +-----GraphBrpcServer: for graph learning +-----PsLocalServer: for GpuPS + +HeterServer: for HeterPS + +PSClient: pull param and push gradient for trainer +-----BrpcPsClient: for cpu dnn training task +----------GraphBrpcClient: for graph learning +-----PsLocalClient: for GpuPS + +HeterClient: for HeterPS + +PSCore: Wrapper for InitServer + +GraphPyService: for graph learning diff --git a/paddle/fluid/distributed/ps/table/common_sparse_table.h b/paddle/fluid/distributed/ps/table/common_sparse_table.h deleted file mode 100644 index 2673e8dfae3c64..00000000000000 --- a/paddle/fluid/distributed/ps/table/common_sparse_table.h +++ /dev/null @@ -1,203 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include "Eigen/Dense" -#include "paddle/fluid/distributed/ps/table/accessor.h" -#include "paddle/fluid/distributed/ps/table/common_table.h" -#include "paddle/fluid/distributed/ps/table/depends/initializers.h" -#include "paddle/fluid/distributed/ps/table/depends/large_scale_kv.h" -#include "paddle/fluid/distributed/ps/table/depends/sparse.h" -#include "paddle/fluid/string/string_helper.h" -#include "paddle/phi/core/utils/rw_lock.h" - -#define PSERVER_SAVE_SUFFIX ".shard" - -namespace paddle { -namespace distributed { - -class SparseOptimizer; - -enum SaveMode { all, base, delta }; - -struct Meta { - std::string param; - int shard_id; - std::vector names; - std::vector dims; - uint64_t count; - std::unordered_map dims_map; - - explicit Meta(const std::string& metapath) { - std::ifstream file(metapath); - std::string line; - int num_lines = 0; - while (std::getline(file, line)) { - if (StartWith(line, "#")) { - continue; - } - auto pairs = paddle::string::split_string(line, "="); - PADDLE_ENFORCE_EQ( - pairs.size(), 2, - paddle::platform::errors::InvalidArgument( - "info in %s except k=v, but got %s", metapath, line)); - - if (pairs[0] == "param") { - param = pairs[1]; - } - if (pairs[0] == "shard_id") { - shard_id = std::stoi(pairs[1]); - } - if (pairs[0] == "row_names") { - names = paddle::string::split_string(pairs[1], ","); - } - if (pairs[0] == "row_dims") { - auto dims_strs = - paddle::string::split_string(pairs[1], ","); - for (auto& str : dims_strs) { - dims.push_back(std::stoi(str)); - } - } - if (pairs[0] == "count") { - count = std::stoull(pairs[1]); - } - } - for (int x = 0; x < names.size(); ++x) { - dims_map[names[x]] = dims[x]; - } - } - - Meta(std::string param, int shard_id, std::vector row_names, - std::vector dims, uint64_t count) { - 
this->param = param; - this->shard_id = shard_id; - this->names = row_names; - this->dims = dims; - this->count = count; - } - - std::string ToString() { - std::stringstream ss; - ss << "param=" << param << "\n"; - ss << "shard_id=" << shard_id << "\n"; - ss << "row_names=" << paddle::string::join_strings(names, ',') << "\n"; - ss << "row_dims=" << paddle::string::join_strings(dims, ',') << "\n"; - ss << "count=" << count << "\n"; - return ss.str(); - } -}; - -class CommonSparseTable : public Table { - public: - CommonSparseTable() { rwlock_.reset(new phi::RWLock); } - virtual ~CommonSparseTable() {} - - // unused method begin - // virtual int32_t PullDense(float* pull_values, size_t num) { return 0; } - // virtual int32_t PushDenseParam(const float* values, size_t num) { return - // 0; } - // virtual int32_t PushDense(const float* values, size_t num) { return 0; } - // unused method end - - virtual int32_t Pull(TableContext& context); - virtual int32_t Push(TableContext& context); - - virtual int32_t Initialize(); - virtual int32_t InitializeShard() { return 0; } - virtual int32_t InitializeValue(); - virtual int32_t InitializeOptimizer(); - virtual int32_t InitializeRecorder(); - - virtual int32_t Load(const std::string& path, const std::string& param); - - virtual int32_t Save(const std::string& path, const std::string& param); - - void SaveMetaToText(std::ostream* os, const CommonAccessorParameter& common, - const size_t shard_idx, const int64_t total); - - int64_t SaveValueToText(std::ostream* os, std::shared_ptr block, - std::shared_ptr<::ThreadPool> pool, const int mode, - int shard_id); - - virtual void ProcessALine(const std::vector& columns, - const Meta& meta, const int64_t id, - std::vector>* values); - - virtual int64_t LoadFromText( - const std::string& valuepath, const std::string& metapath, - const int pserver_id, const int pserver_num, const int local_shard_num, - std::vector>* blocks); - - virtual std::pair PrintTableStat(); - virtual int32_t PullSparse(float* values, const PullSparseValue& pull_value); - - virtual int32_t PullSparsePtr(char** pull_values, const uint64_t* keys, - size_t num); - - virtual int32_t PushSparse(const uint64_t* keys, const float* values, - size_t num); - - virtual int32_t PushSparse(const uint64_t* keys, const float** values, - size_t num); - - // only for sparse geo table - virtual int32_t PushSparseParam(const uint64_t* keys, const float* values, - size_t num); - virtual int32_t SetGlobalLR(float* lr); - - virtual int32_t Pour(); - virtual int32_t Flush(); - virtual int32_t Shrink(const std::string& param); - virtual void Clear(); - - virtual void* GetShard(size_t shard_idx) { return 0; } - - protected: - virtual int32_t _PushSparse(const uint64_t* keys, const float* values, - size_t num); - virtual int32_t _PushSparse(const uint64_t* keys, const float** values, - size_t num); - - protected: - const int task_pool_size_ = 11; - std::vector> _shards_task_pool; - - bool sync = false; - int param_dim_ = 0; - int param_offset_ = 0; - - std::unordered_map value_idx_; - std::vector value_names_; - std::vector value_dims_; - std::vector value_offsets_; - std::vector initializer_attrs_; - - std::shared_ptr optimizer_; - std::vector> shard_values_; - std::unordered_map> pull_reservoir_; - std::unique_ptr rwlock_{nullptr}; -}; - -} // namespace distributed -} // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.h b/paddle/fluid/distributed/ps/table/ctr_accessor.h index b8895e74d1d091..a599bfca7f6d29 100644 --- 
a/paddle/fluid/distributed/ps/table/ctr_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.h @@ -186,6 +186,7 @@ class CtrCommonAccessor : public ValueAccessor { // CtrCommonFeatureValue common_feature_value; float _show_click_decay_rate; int32_t _ssd_unseenday_threshold; + bool _show_scale = false; public: // TODO(zhaocaibei123): it should be private, but we make it public // for unit test diff --git a/paddle/fluid/distributed/ps/table/depends/large_scale_kv.h b/paddle/fluid/distributed/ps/table/depends/large_scale_kv.h deleted file mode 100644 index 68c80ad737ec4e..00000000000000 --- a/paddle/fluid/distributed/ps/table/depends/large_scale_kv.h +++ /dev/null @@ -1,353 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include "gflags/gflags.h" - -#include "butil/object_pool.h" -#include "paddle/fluid/distributed/common/utils.h" -#include "paddle/fluid/distributed/ps/table/depends/initializers.h" -#include "paddle/fluid/distributed/ps/thirdparty/round_robin.h" -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows_utils.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/string_helper.h" -#include "paddle/phi/backends/dynload/port.h" -#include "paddle/phi/core/utils/rw_lock.h" - -namespace paddle { -namespace distributed { - -enum Mode { training, infer }; - -static const int SPARSE_SHARD_BUCKET_NUM_BITS = 6; -static const size_t SPARSE_SHARD_BUCKET_NUM = (size_t)1 - << SPARSE_SHARD_BUCKET_NUM_BITS; - -struct VALUE { - explicit VALUE(size_t length) - : length_(length), - count_(0), - unseen_days_(0), - need_save_(false), - is_entry_(false) { - data_.resize(length); - memset(data_.data(), 0, sizeof(float) * length); - } - - size_t length_; - std::vector data_; - int count_; - int unseen_days_; // use to check knock-out - bool need_save_; // whether need to save - bool is_entry_; // whether knock-in -}; - -inline bool count_entry(VALUE *value, int threshold) { - return value->count_ >= threshold; -} - -inline bool probility_entry(VALUE *value, float threshold) { - UniformInitializer uniform = UniformInitializer({"uniform", "0", "0", "1"}); - return uniform.GetValue() >= threshold; -} - -class ValueBlock { - public: - typedef typename robin_hood::unordered_map map_type; - explicit ValueBlock(const std::vector &value_names, - const std::vector &value_dims, - const std::vector &value_offsets, - const std::unordered_map &value_idx, - const std::vector &init_attrs, - const std::string 
&entry_attr) - : value_names_(value_names), - value_dims_(value_dims), - value_offsets_(value_offsets), - value_idx_(value_idx) { - for (size_t x = 0; x < value_dims.size(); ++x) { - value_length_ += value_dims[x]; - } - - // for Entry - { - auto slices = string::split_string(entry_attr, ":"); - if (slices[0] == "none") { - entry_func_ = std::bind(&count_entry, std::placeholders::_1, 0); - threshold_ = 0; - } else if (slices[0] == "count_filter_entry") { - threshold_ = std::stoi(slices[1]); - entry_func_ = - std::bind(&count_entry, std::placeholders::_1, threshold_); - } else if (slices[0] == "probability_entry") { - threshold_ = std::stof(slices[1]); - entry_func_ = - std::bind(&probility_entry, std::placeholders::_1, threshold_); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Not supported Entry Type : %s, Only support [CountFilterEntry, " - "ProbabilityEntry]", - slices[0])); - } - } - - // for Initializer - { - for (auto &attr : init_attrs) { - auto slices = string::split_string(attr, "&"); - - if (slices[0] == "gaussian_random") { - initializers_.emplace_back( - std::make_shared(slices)); - } else if (slices[0] == "fill_constant") { - initializers_.emplace_back( - std::make_shared(slices)); - } else if (slices[0] == "uniform_random") { - initializers_.emplace_back( - std::make_shared(slices)); - } else if (slices[0] == "truncated_gaussian_random") { - initializers_.emplace_back( - std::make_shared(slices)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s can not be supported", attr)); - } - } - } - } - - ~ValueBlock() {} - - std::vector Get(const uint64_t &id, - const std::vector &value_names, - const std::vector &value_dims) { - auto pts = std::vector(); - pts.reserve(value_names.size()); - auto values = GetValue(id); - for (int i = 0; i < static_cast(value_names.size()); i++) { - PADDLE_ENFORCE_EQ( - value_dims[i], value_dims_[i], - platform::errors::InvalidArgument("value dims is not match")); - pts.push_back(values->data_.data() + - value_offsets_.at(value_idx_.at(value_names[i]))); - } - return pts; - } - - // pull - float *Init(const uint64_t &id, const bool with_update = true, - const int counter = 1) { - size_t hash = _hasher(id); - size_t bucket = compute_bucket(hash); - - auto &table = values_[bucket]; - auto res = table.find(id); - - VALUE *value = nullptr; - if (res == table.end()) { - value = butil::get_object(value_length_); - - table[id] = value; - - } else { - value = res->second; - } - - if (with_update) { - AttrUpdate(value, counter); - } - return value->data_.data(); - } - - VALUE *InitGet(const uint64_t &id, const bool with_update = true, - const int counter = 1) { - size_t hash = _hasher(id); - size_t bucket = compute_bucket(hash); - - auto &table = values_[bucket]; - auto res = table.find(id); - - VALUE *value = nullptr; - if (res == table.end()) { - value = butil::get_object(value_length_); - // value = _alloc.acquire(value_length_); - table[id] = value; - } else { - value = (VALUE *)(void *)(res->second); // NOLINT - } - return value; - } - - void AttrUpdate(VALUE *value, const int counter) { - // update state - value->unseen_days_ = 0; - value->count_ += counter; - - if (!value->is_entry_) { - value->is_entry_ = entry_func_(value); - if (value->is_entry_) { - // initialize - for (size_t x = 0; x < value_names_.size(); ++x) { - initializers_[x]->GetValue(value->data_.data() + value_offsets_[x], - value_dims_[x]); - } - value->need_save_ = true; - } - } else { - value->need_save_ = true; - } - - return; - } - - // dont jude if 
(has(id)) - float *Get(const uint64_t &id) { - size_t hash = _hasher(id); - size_t bucket = compute_bucket(hash); - auto &table = values_[bucket]; - - // auto &value = table.at(id); - // return value->data_.data(); - auto res = table.find(id); - VALUE *value = res->second; - return value->data_.data(); - } - - // for load, to reset count, unseen_days - VALUE *GetValue(const uint64_t &id) { - size_t hash = _hasher(id); - size_t bucket = compute_bucket(hash); - - auto &table = values_[bucket]; - auto res = table.find(id); - return res->second; - } - - bool GetEntry(const uint64_t &id) { - auto value = GetValue(id); - return value->is_entry_; - } - - void SetEntry(const uint64_t &id, const bool state) { - auto value = GetValue(id); - value->is_entry_ = state; - } - - void erase(uint64_t feasign) { - size_t hash = _hasher(feasign); - size_t bucket = compute_bucket(hash); - auto &table = values_[bucket]; - - auto iter = table.find(feasign); - if (iter != table.end()) { - butil::return_object(iter->second); - iter = table.erase(iter); - } - } - - void Shrink(const int threshold) { - for (auto &table : values_) { - for (auto iter = table.begin(); iter != table.end();) { - // VALUE* value = (VALUE*)(void*)(iter->second); - VALUE *value = iter->second; - value->unseen_days_++; - if (value->unseen_days_ >= threshold) { - butil::return_object(iter->second); - // _alloc.release(iter->second); - // _alloc.release(value); - iter = table.erase(iter); - } else { - ++iter; - } - } - } - return; - } - - float GetThreshold() { return threshold_; } - size_t compute_bucket(size_t hash) { - if (SPARSE_SHARD_BUCKET_NUM == 1) { - return 0; - } else { - return hash >> (sizeof(size_t) * 8 - SPARSE_SHARD_BUCKET_NUM_BITS); - } - } - - map_type::iterator end() { - return values_[SPARSE_SHARD_BUCKET_NUM - 1].end(); - } - - map_type::iterator Find(uint64_t id) { - size_t hash = _hasher(id); - size_t bucket = compute_bucket(hash); - auto &table = values_[bucket]; - - auto got = table.find(id); - if (got == table.end()) { - return end(); - } else { - return got; - } - } - - private: - bool Has(const uint64_t id) { - size_t hash = _hasher(id); - size_t bucket = compute_bucket(hash); - auto &table = values_[bucket]; - - auto got = table.find(id); - if (got == table.end()) { - return false; - } else { - return true; - } - } - - public: - map_type values_[SPARSE_SHARD_BUCKET_NUM]; - size_t value_length_ = 0; - std::hash _hasher; - - private: - const std::vector &value_names_; - const std::vector &value_dims_; - const std::vector &value_offsets_; - const std::unordered_map &value_idx_; - - std::function entry_func_; - std::vector> initializers_; - float threshold_; -}; - -} // namespace distributed -} // namespace paddle diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index d5397e20e7d680..be425cf91bdef2 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -22,10 +22,10 @@ #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" - -#include "glog/logging.h" +#include "paddle/phi/kernels/autotune/switch_autotune.h" namespace egr { @@ -799,6 +799,7 @@ void Backward( paddle::platform::RecordEvent backward_record_event( "backward", paddle::platform::TracerEventType::Operator, 1); RunBackward(tensors, grad_tensors, retain_graph); + phi::autotune::AutoTuneStatus::Instance().Update(); } std::vector Grad( diff --git 
a/paddle/fluid/eager/pylayer/py_layer_node.cc b/paddle/fluid/eager/pylayer/py_layer_node.cc index 5008e958c5f11a..42036a28cfa15e 100644 --- a/paddle/fluid/eager/pylayer/py_layer_node.cc +++ b/paddle/fluid/eager/pylayer/py_layer_node.cc @@ -154,6 +154,12 @@ operator()( } } + Py_XDECREF(backward_fn); + Py_XDECREF(backward_args); + if (!PyTuple_Check(outputs)) { + Py_XDECREF(outputs_tuple); + } + return grad_out; } } // namespace egr diff --git a/paddle/fluid/framework/conv_search_cache.h b/paddle/fluid/framework/conv_search_cache.h index 51446f287e94b7..4da2aeb4d04722 100644 --- a/paddle/fluid/framework/conv_search_cache.h +++ b/paddle/fluid/framework/conv_search_cache.h @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator_kernel_configs.h" - #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { diff --git a/paddle/fluid/framework/dist_multi_trainer.cc b/paddle/fluid/framework/dist_multi_trainer.cc index d16469e265e2e3..7f9aac4d3f1d31 100644 --- a/paddle/fluid/framework/dist_multi_trainer.cc +++ b/paddle/fluid/framework/dist_multi_trainer.cc @@ -117,6 +117,9 @@ void DistMultiTrainer::InitOtherEnv(const ProgramDesc &main_program) { InitDumpEnv(); } pull_dense_worker_->SetRootScope(root_scope_); +#if defined(PADDLE_WITH_PSCORE) && defined(PADDLE_WITH_CUDA) + pull_dense_worker_->CreatePinVar(); +#endif pull_dense_worker_->Start(); #if defined(PADDLE_WITH_PSLIB) || defined(PADDLE_WITH_PSCORE) for (int i = 0; i < thread_num_; ++i) { diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 75f5c24af5a996..c7852de00a18ef 100755 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -37,6 +37,43 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +#ifdef PADDLE_WITH_PSLIB +void AfsWrapper::init(const std::string& fs_name, const std::string& fs_user, + const std::string& pass_wd, const std::string& conf) { + int ret = afs_handler_.init(fs_name.c_str(), fs_user.c_str(), pass_wd.c_str(), + conf.c_str()); + if (ret != 0) { + LOG(ERROR) << "AFS Init Error"; + } +} + +int AfsWrapper::remove(const std::string& path) { + return afs_handler_.remove(path); +} + +int AfsWrapper::mkdir(const std::string& path) { + return afs_handler_.mkdir(path); +} + +std::vector AfsWrapper::list(const std::string& path) { + return afs_handler_.list(path); +} + +int AfsWrapper::exist(const std::string& path) { + return afs_handler_.exist(path); +} + +int AfsWrapper::upload(const std::string& local_file, + const std::string& afs_file) { + return afs_handler_.upload_file(local_file, afs_file); +} + +int AfsWrapper::download(const std::string& local_file, + const std::string& afs_file) { + return afs_handler_.download_file(local_file, afs_file); +} +#endif + std::shared_ptr PSGPUWrapper::s_instance_ = NULL; bool PSGPUWrapper::is_initialized_ = false; #ifdef PADDLE_WITH_PSLIB diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index d9d29cc072dd7b..9b7d6de082d1c1 100755 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -55,6 +55,27 @@ namespace framework { #define TYPEALIGN(ALIGNVAL, LEN) \ (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) +#ifdef PADDLE_WITH_PSLIB +class AfsWrapper { + public: + AfsWrapper() {} + virtual ~AfsWrapper() {} + void init(const std::string& fs_name, const std::string& fs_user, + const std::string& pass_wd, const std::string& conf); + int remove(const std::string& path); + int mkdir(const std::string& path); + std::vector list(const std::string& path); + + int exist(const std::string& path); + int upload(const std::string& local_file, const std::string& afs_file); + + int download(const std::string& local_file, const std::string& afs_file); + + private: + paddle::ps::AfsApiWrapper afs_handler_; +}; +#endif + class PSGPUWrapper { public: virtual ~PSGPUWrapper() { delete HeterPs_; } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 16a95b2ccf7f19..834a2c953eab83 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -140,6 +140,8 @@ if(WITH_MKLDNN) pass_library(batch_norm_act_fuse_pass inference DIR mkldnn) pass_library(multi_gru_fuse_pass inference DIR mkldnn) pass_library(multi_gru_seq_fuse_pass inference DIR mkldnn) + pass_library(quant_dequant_mkldnn_pass inference DIR mkldnn) + pass_library(compute_propagate_scales_mkldnn_pass inference DIR mkldnn) endif() if(WITH_IPU) diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc new file mode 100644 index 00000000000000..d7d0b988b551eb --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc @@ -0,0 +1,438 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { + +void ComputePropagateScalesMkldnnPass::GetTensorFromVector( + const std::vector& data_v, Tensor* tensor) const { + const int size = static_cast(data_v.size()); + auto* data = tensor->mutable_data({size}, platform::CPUPlace()); + for (int i = 0; i < size; i++) { + data[i] = data_v[i]; + } +} + +void ComputePropagateScalesMkldnnPass::GetQuantInfo( + ir::Graph* graph, StringPairMap* var_quant_scales) const { + std::unordered_map> info_map{}; + GetInfoFromTheFirstOp(graph, "has_quant_info", "var_quant_scales", &info_map); + + for (auto iter = info_map.begin(); iter != info_map.end(); iter++) { + Tensor tensor; + GetTensorFromVector(iter->second, &tensor); + auto pair = std::make_pair(false, tensor); + var_quant_scales->insert(std::make_pair(iter->first, pair)); + } +} + +std::vector ComputePropagateScalesMkldnnPass::GetScales(Tensor* tensor, + int axis) const { + PADDLE_ENFORCE_LT(axis, 2, + platform::errors::InvalidArgument( + "The input axis is required to be less than 2.")); + auto* data = tensor->data(); + const auto dims = tensor->dims(); + PADDLE_ENFORCE_EQ(dims.size(), 2, + platform::errors::InvalidArgument( + "The input tensor's rank is required to be 2.")); + + const int rows = dims.at(0); + const int columns = dims.at(1); + std::vector scales; + if (axis == 0) { + for (int i = 0; i < columns; i++) { + float max_value = FLT_MIN; + for (int j = 0; j < rows; j++) { + max_value = std::max(max_value, std::abs(data[i + j * columns])); + } + max_value = 1.0 / max_value; + if (std::isinf(max_value) || std::isnan(max_value)) { + max_value = 0.0; + } + scales.push_back(max_value); + } + } else { + for (int i = 0; i < rows; i++) { + float max_value = FLT_MIN; + for (int j = i * columns; j < (i + 1) * columns; j++) { + max_value = std::max(max_value, std::abs(data[j])); + } + max_value = 1.0 / max_value; + if (std::isinf(max_value) || std::isnan(max_value)) { + max_value = 0.0; + } + scales.push_back(max_value); + } + } + return scales; +} + +void ComputePropagateScalesMkldnnPass::ComputeVarScales( + ir::Graph* graph, Scope* scope, const std::unordered_set& ops, + const std::string& weight_name, const int axis, + StringPairMap* var_quant_scales) const { + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + auto* op_desc = op_node->Op(); + if (ops.count(op_desc->Type())) { + auto var_name = op_desc->Input(weight_name)[0]; + auto* var = scope->FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound( + "The input persistable var [%s] of [%s] op is not found.", + var_name, op_desc->Type())); + auto* weight_tensor = var->GetMutable(); + const auto dims = weight_tensor->dims(); + int volume = 1; + for (int i = 1; i < dims.size(); i++) { + volume *= dims[i]; + } + 
+ Tensor tmp_tensor; + std::vector reshape_dims = {dims[0], volume}; + tmp_tensor.Resize(phi::make_ddim(reshape_dims)); + auto* weight_data = weight_tensor->data(); + auto* tmp_data = tmp_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < weight_tensor->numel(); i++) { + tmp_data[i] = std::abs(weight_data[i]); + } + + auto scales_v = GetScales(&tmp_tensor, axis); + Tensor tensor; + GetTensorFromVector(scales_v, &tensor); + auto pair = std::make_pair(false, tensor); + var_quant_scales->insert(std::make_pair(var_name, pair)); + } + } +} + +void ComputePropagateScalesMkldnnPass::ComputeSingleGruWeightScales( + Scope* scope, const std::string& wx_var_name, + const std::string& wh_var_name, Tensor* tensor) const { + auto* wx_var = scope->FindVar(wx_var_name); + PADDLE_ENFORCE_NOT_NULL( + wx_var, platform::errors::NotFound( + "The input persistable var [%s] is not found.", wx_var_name)); + auto* wh_var = scope->FindVar(wh_var_name); + PADDLE_ENFORCE_NOT_NULL( + wh_var, platform::errors::NotFound( + "The input persistable var [%s] is not found.", wh_var_name)); + + const auto* wx_tensor = wx_var->GetMutable(); + const auto* wh_tensor = wh_var->GetMutable(); + const int OC = wh_tensor->dims()[0]; + std::vector scale_ur(2 * OC); + std::vector scale_o(OC); + for (int row_id = 0; row_id < wx_tensor->dims()[0]; row_id++) { + for (int col_id = 0; col_id < 2 * OC; col_id++) { + int idx = (row_id * wx_tensor->dims()[1]) + col_id; + auto abs_value = std::abs(wx_tensor->data()[idx]); + if (row_id == 0) { + scale_ur[col_id] = abs_value; + } else { + if (abs_value > scale_ur[col_id]) scale_ur[col_id] = abs_value; + } + } + } + + for (int i = 0; i < 2 * OC * OC; i++) { + int col_id = i % (2 * OC); + auto abs_value = std::abs(wh_tensor->data()[i]); + if (abs_value > scale_ur[col_id]) scale_ur[col_id] = abs_value; + } + + for (int row_id = 0; row_id < wx_tensor->dims()[0]; row_id++) { + for (int col_id = 2 * OC; col_id < wx_tensor->dims()[1]; col_id++) { + int idx = (row_id * wx_tensor->dims()[1]) + col_id; + auto abs_value = std::abs(wx_tensor->data()[idx]); + if (row_id == 0) { + scale_o[col_id % OC] = abs_value; + } else { + if (abs_value > scale_o[col_id]) scale_o[col_id % OC] = abs_value; + } + } + } + + for (int i = 2 * OC * OC; i < OC * wh_tensor->dims()[1]; i++) { + int col_id = i % OC; + auto abs_value = std::abs(wh_tensor->data()[i]); + if (abs_value > scale_o[col_id]) scale_o[col_id] = abs_value; + } + + scale_ur.insert(scale_ur.end(), scale_o.begin(), scale_o.end()); + transform(scale_ur.begin(), scale_ur.end(), scale_ur.begin(), + [](float c) { return 1 / c; }); + GetTensorFromVector(scale_ur, tensor); +} + +void ComputePropagateScalesMkldnnPass::ComputeGruWeightScales( + ir::Graph* graph, Scope* scope, const std::string& wx_name, + const std::string& wh_name, StringPairMap* var_quant_scales) const { + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + auto* op_desc = op_node->Op(); + if (op_desc->Type() == "fusion_gru" || op_desc->Type() == "multi_gru") { + auto wx_var_names = op_desc->Input(wx_name); + auto wh_var_names = op_desc->Input(wh_name); + const int wx_names_size = static_cast(wx_var_names.size()); + const int wh_names_size = static_cast(wh_var_names.size()); + PADDLE_ENFORCE_EQ( + wx_names_size, wh_names_size, + platform::errors::Fatal("Mismatch in number of weights inputs (%d " + "for WeightX vs. 
%d for WeightH).", + wx_names_size, wh_names_size)); + for (int i = 0; i < wx_names_size; i++) { + auto wh_var_name = wh_var_names[i]; + auto wx_var_name = wx_var_names[i]; + Tensor tensor; + ComputeSingleGruWeightScales(scope, wx_var_name, wh_var_name, &tensor); + auto pair = std::make_pair(false, tensor); + var_quant_scales->insert(std::make_pair(wx_var_name, pair)); + } + } + } +} + +void ComputePropagateScalesMkldnnPass::ComputeSingleLstmWeightScales( + Scope* scope, const std::string& wx_var_name, + const std::string& wh_var_name, Tensor* tensor) const { + auto* wx_var = scope->FindVar(wx_var_name); + PADDLE_ENFORCE_NOT_NULL( + wx_var, platform::errors::NotFound( + "The input persistable var [%s] is not found.", wx_var_name)); + auto* wh_var = scope->FindVar(wh_var_name); + PADDLE_ENFORCE_NOT_NULL( + wh_var, platform::errors::NotFound( + "The input persistable var [%s] is not found.", wh_var_name)); + + const auto* wx_tensor = wx_var->GetMutable(); + const auto* wh_tensor = wh_var->GetMutable(); + std::vector scale(wx_tensor->dims()[1]); + + for (int row_id = 0; row_id < wx_tensor->dims()[0]; row_id++) { + for (int col_id = 0; col_id < wx_tensor->dims()[1]; col_id++) { + int idx = (row_id * wx_tensor->dims()[1]) + col_id; + auto abs_value = std::abs(wx_tensor->data()[idx]); + if (row_id == 0) { + scale[col_id] = abs_value; + } else { + if (abs_value > scale[col_id]) scale[col_id] = abs_value; + } + } + } + for (int row_id = 0; row_id < wh_tensor->dims()[0]; row_id++) { + for (int col_id = 0; col_id < wh_tensor->dims()[1]; col_id++) { + int idx = (row_id * wh_tensor->dims()[1]) + col_id; + auto abs_value = std::abs(wh_tensor->data()[idx]); + if (abs_value > scale[col_id]) scale[col_id] = abs_value; + } + } + transform(scale.begin(), scale.end(), scale.begin(), + [](float c) { return 1 / c; }); + GetTensorFromVector(scale, tensor); +} + +void ComputePropagateScalesMkldnnPass::ComputeLstmWeightScales( + ir::Graph* graph, Scope* scope, const std::string& wx_name, + const std::string& wh_name, StringPairMap* var_quant_scales) const { + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + auto* op_desc = op_node->Op(); + if (op_desc->Type() == "fusion_lstm") { + auto wx_var_names = op_desc->Input(wx_name); + auto wh_var_names = op_desc->Input(wh_name); + const int wx_names_size = static_cast(wx_var_names.size()); + const int wh_names_size = static_cast(wh_var_names.size()); + PADDLE_ENFORCE_EQ( + wx_names_size, wh_names_size, + platform::errors::Fatal("Mismatch in number of weights inputs (%d " + "for WeightX vs. 
%d for WeightH).", + wx_names_size, wh_names_size)); + + for (int i = 0; i < wx_names_size; i++) { + auto wh_var_name = wh_var_names[i]; + auto wx_var_name = wx_var_names[i]; + Tensor tensor; + ComputeSingleLstmWeightScales(scope, wx_var_name, wh_var_name, &tensor); + auto pair = std::make_pair(false, tensor); + var_quant_scales->insert(std::make_pair(wx_var_name, pair)); + } + } + } +} + +void ComputePropagateScalesMkldnnPass::ComputeWeightScales( + ir::Graph* graph, Scope* scope, StringPairMap* var_quant_scales) const { + ComputeVarScales(graph, scope, {"conv2d", "depthwise_conv2d"}, "Filter", 1, + var_quant_scales); + ComputeVarScales(graph, scope, {"fc"}, "W", 0, var_quant_scales); + ComputeVarScales(graph, scope, {"fusion_gru", "multi_gru"}, "WeightH", 0, + var_quant_scales); + ComputeVarScales(graph, scope, {"fusion_lstm"}, "WeightH", 0, + var_quant_scales); + ComputeGruWeightScales(graph, scope, "WeightX", "WeightH", var_quant_scales); + ComputeLstmWeightScales(graph, scope, "WeightX", "WeightH", var_quant_scales); +} + +void ComputePropagateScalesMkldnnPass::UpdateScaleOpInScale( + Node* op_node, const std::string& input_name, + const std::string& output_name, StringPairMap* var_quant_scales) const { + auto iter = var_quant_scales->find(output_name); + if (iter != var_quant_scales->end()) { + auto pair = iter->second; + const auto tensor = pair.second; + + const auto scale = BOOST_GET_CONST(float, op_node->Op()->GetAttr("scale")); + Tensor tmp_tensor; + tmp_tensor.Resize(tensor.dims()); + auto* data = tmp_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < tensor.numel(); i++) { + data[i] = data[i] * scale; + } + + auto new_pair = std::make_pair(pair.first, tmp_tensor); + var_quant_scales->insert(std::make_pair(input_name, new_pair)); + } +} + +std::unordered_set ComputePropagateScalesMkldnnPass::UpdateScales( + ir::Graph* graph, StringPairMap* var_quant_scales, + const std::unordered_set& scale_immutable_ops) const { + std::unordered_set waiting_for_scale{}; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + const auto op_name = op_node->Name(); + if (scale_immutable_ops.count(op_name)) { + std::string input_name; + if (op_name == "slice") { + input_name = op_node->Op()->Input("Input")[0]; + } else { + input_name = op_node->Op()->Input("X")[0]; + } + + const std::string output_name = op_node->Op()->Output("Out")[0]; + auto in_iter = var_quant_scales->find(input_name); + auto out_iter = var_quant_scales->find(output_name); + if (in_iter == var_quant_scales->end() && + out_iter == var_quant_scales->end()) { + waiting_for_scale.insert(input_name); + waiting_for_scale.insert(output_name); + } else if (in_iter != var_quant_scales->end()) { + out_iter->second = in_iter->second; + } else if (out_iter != var_quant_scales->end()) { + in_iter->second = out_iter->second; + } + } else if (op_name == "scale") { + const std::string output_name = op_node->Op()->Output("Out")[0]; + auto out_iter = var_quant_scales->find(output_name); + if (out_iter != var_quant_scales->end()) { + const std::string input_name = op_node->Op()->Input("X")[0]; + UpdateScaleOpInScale(op_node, input_name, output_name, + var_quant_scales); + } + } + } + return waiting_for_scale; +} + +void ComputePropagateScalesMkldnnPass::PropagateScales( + ir::Graph* graph, StringPairMap* var_quant_scales, + const std::unordered_set& scale_immutable_ops) const { + auto waiting_for_scale = + UpdateScales(graph, var_quant_scales, scale_immutable_ops); + 
std::unordered_set waiting_for_scale_prev{}; + while (waiting_for_scale.size() != 0 && + waiting_for_scale != waiting_for_scale_prev) { + waiting_for_scale_prev.clear(); + waiting_for_scale_prev.insert(waiting_for_scale.begin(), + waiting_for_scale.end()); + waiting_for_scale = + UpdateScales(graph, var_quant_scales, scale_immutable_ops); + } +} + +void ComputePropagateScalesMkldnnPass::ConvertStringPairMap( + const StringPairMap& var_quant_scales, + std::unordered_map>* info_map) const { + for (auto iter = var_quant_scales.begin(); iter != var_quant_scales.end(); + iter++) { + auto* data = iter->second.second.data(); + std::vector data_v; + for (int i = 0; i < iter->second.second.numel(); i++) { + data_v.push_back(data[i]); + } + + info_map->insert(std::make_pair(iter->first, data_v)); + } +} + +void ComputePropagateScalesMkldnnPass::ApplyImpl(ir::Graph* graph) const { + VLOG(3) << "Convert paddle model to mkldnn quantized model."; + const std::string pattern_name = "compute_propagate_scales_mkldnn_pass"; + FusePassBase::Init(pattern_name, graph); + + const std::unordered_set scale_immutable_ops = { + "transpose2", "reshape2", "pool2d", + "slice", "nearest_interp", "nearest_interp_v2"}; + + StringPairMap var_quant_scales{}; + + auto* scope = param_scope(); + GetQuantInfo(graph, &var_quant_scales); + ComputeWeightScales(graph, scope, &var_quant_scales); + PropagateScales(graph, &var_quant_scales, scale_immutable_ops); + + // save var_quant_scales in the first op's attr + // for cpu_quantize_pass + std::unordered_map> info_map; + ConvertStringPairMap(var_quant_scales, &info_map); + SaveInfoInTheFirstOp(graph, "has_quant_info", "var_quant_scales", info_map); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(compute_propagate_scales_mkldnn_pass, + paddle::framework::ir::ComputePropagateScalesMkldnnPass); + +REGISTER_PASS_CAPABILITY(compute_propagate_scales_mkldnn_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("conv2d", 1) + .EQ("fc", 0) + .LE("conv2d_transpose", 2) + .EQ("fake_quantize_abs_max", 0) + .EQ("fake_quantize_range_abs_max", 0) + .EQ("fake_quantize_moving_average_abs_max", 0) + .LE("fake_channel_wise_quantize_abs_max", 1) + .EQ("fake_dequantize_max_abs", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h new file mode 100644 index 00000000000000..b0076c1b38cd46 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h @@ -0,0 +1,92 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
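A minimal, self-contained sketch (not the pass code itself) of the fixed-point idea behind UpdateScales/PropagateScales above: scale-immutable ops such as transpose2 or reshape2 keep the quantization scale unchanged, so a known scale on either side of such an op can be copied to the other side, and sweeps repeat until nothing is left waiting or a sweep stops making progress. The ImmutableOp struct and OneSweep helper are illustrative stand-ins, not real Paddle types.

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

struct ImmutableOp { std::string in, out; };  // scale(in) == scale(out)

// One propagation sweep: copy scales across scale-immutable ops and report
// the tensors that are still missing a scale on both sides.
std::set<std::string> OneSweep(const std::vector<ImmutableOp>& ops,
                               std::map<std::string, float>* scales) {
  std::set<std::string> waiting;
  for (const auto& op : ops) {
    bool has_in = scales->count(op.in) > 0;
    bool has_out = scales->count(op.out) > 0;
    if (!has_in && !has_out) {
      waiting.insert(op.in);
      waiting.insert(op.out);
    } else if (has_in && !has_out) {
      (*scales)[op.out] = (*scales)[op.in];
    } else if (!has_in && has_out) {
      (*scales)[op.in] = (*scales)[op.out];
    }
  }
  return waiting;
}

int main() {
  // x has a scale; y and z receive theirs through two scale-immutable ops.
  std::map<std::string, float> scales{{"x", 0.05f}};
  std::vector<ImmutableOp> ops{{"y", "z"}, {"x", "y"}};
  auto waiting = OneSweep(ops, &scales);
  std::set<std::string> prev;
  // Stop when every tensor has a scale or a sweep makes no further progress.
  while (!waiting.empty() && waiting != prev) {
    prev = waiting;
    waiting = OneSweep(ops, &scales);
  }
  std::cout << "scale(z) = " << scales["z"] << "\n";  // prints 0.05
  return 0;
}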
+ +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +using StringPairMap = std::unordered_map>; + +class ComputePropagateScalesMkldnnPass : public FusePassBase { + public: + ComputePropagateScalesMkldnnPass() = default; + virtual ~ComputePropagateScalesMkldnnPass() {} + +#ifdef PADDLE_WITH_TESTING + friend class ComputePropagateScalesMkldnnPassTest; +#endif + + protected: + void ApplyImpl(ir::Graph* graph) const override; + + private: + void GetTensorFromVector(const std::vector& data_v, + Tensor* tensor) const; + + void GetQuantInfo(ir::Graph* graph, StringPairMap* var_quant_scales) const; + + std::vector GetScales(Tensor* tensor, int axis) const; + + void ComputeVarScales(ir::Graph* graph, Scope* scope, + const std::unordered_set& ops, + const std::string& weight_name, const int axis, + StringPairMap* var_quant_scales) const; + + void ComputeSingleGruWeightScales(Scope* scope, + const std::string& wx_var_name, + const std::string& wh_var_name, + Tensor* tensor) const; + + void ComputeGruWeightScales(ir::Graph* graph, Scope* scope, + const std::string& wx_name, + const std::string& wh_name, + StringPairMap* var_quant_scales) const; + + void ComputeSingleLstmWeightScales(Scope* scope, + const std::string& wx_var_name, + const std::string& wh_var_name, + Tensor* tensor) const; + + void ComputeLstmWeightScales(ir::Graph* graph, Scope* scope, + const std::string& wx_name, + const std::string& wh_name, + StringPairMap* var_quant_scales) const; + + void ComputeWeightScales(ir::Graph* graph, Scope* scope, + StringPairMap* var_quant_scales) const; + + void UpdateScaleOpInScale(Node* op_node, const std::string& input_name, + const std::string& output_name, + StringPairMap* var_quant_scales) const; + + std::unordered_set UpdateScales( + ir::Graph* graph, StringPairMap* var_quant_scales, + const std::unordered_set& scale_immutable_ops) const; + + void PropagateScales( + ir::Graph* graph, StringPairMap* var_quant_scales, + const std::unordered_set& scale_immutable_ops) const; + + void ConvertStringPairMap( + const StringPairMap& var_quant_scales, + std::unordered_map>* info_map) const; +}; +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h new file mode 100644 index 00000000000000..505bb2739e1d45 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
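A self-contained sketch (under simplified assumptions, not the pass code) of the per-output-channel scale rule used by GetScales/ComputeVarScales declared above: for each output channel take the maximum absolute weight and store its reciprocal, i.e. scale[c] = 1 / max_i |w[c][i]|. The flat row-major weight layout and the PerChannelScales name are illustrative only.

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

// weights: channels x inner values in row-major order.
std::vector<float> PerChannelScales(const std::vector<float>& weights,
                                    int channels, int inner) {
  std::vector<float> scales(channels, 0.f);
  for (int c = 0; c < channels; ++c) {
    float max_abs = 0.f;
    for (int i = 0; i < inner; ++i) {
      max_abs = std::max(max_abs, std::abs(weights[c * inner + i]));
    }
    scales[c] = 1.f / max_abs;  // assumes no all-zero channel
  }
  return scales;
}

int main() {
  // Two output channels with three weights each.
  std::vector<float> w = {0.5f, -2.0f, 1.0f, 0.25f, 0.1f, -0.5f};
  for (float s : PerChannelScales(w, 2, 3)) std::cout << s << " ";  // 0.5 2
  std::cout << "\n";
  return 0;
}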
+ +#pragma once + +#include +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +static void SaveInfoInTheFirstOp( + ir::Graph* graph, const std::string& flag, const std::string& key_suffix, + const std::unordered_map>& info_map) { + VLOG(3) << "save variables in the first op's attr"; + + const std::string suffix = "_" + key_suffix + "_" + flag; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + op_node->Op()->SetAttr(flag, true); + for (auto iter = info_map.begin(); iter != info_map.end(); ++iter) { + op_node->Op()->SetAttr(iter->first + suffix, iter->second); + } + break; + } +} + +static void GetInfoFromTheFirstOp( + ir::Graph* graph, const std::string& flag, const std::string& key_suffix, + std::unordered_map>* info_map) { + VLOG(3) << "get variables from the first op's attr"; + + const std::string suffix = "_" + key_suffix + "_" + flag; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + auto* op_desc = op_node->Op(); + if (op_desc->GetAttrIfExists(flag)) { + op_desc->RemoveAttr(flag); + std::vector attr_names = op_desc->AttrNames(); + for (auto fake_name : attr_names) { + size_t pos = fake_name.find(suffix); + if (pos != std::string::npos) { + std::string name = fake_name.substr(0, pos); + auto scales_vector = + BOOST_GET_CONST(std::vector, op_desc->GetAttr(fake_name)); + info_map->insert(std::make_pair(name, scales_vector)); + op_desc->RemoveAttr(fake_name); + } + } + break; + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc new file mode 100644 index 00000000000000..808d043a4b226c --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc @@ -0,0 +1,582 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
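A self-contained sketch (not the real graph helpers) of the attribute round trip implemented by SaveInfoInTheFirstOp/GetInfoFromTheFirstOp above: each "name -> scales" entry is stored on the first non-feed/fetch op as an attribute named name + "_" + key_suffix + "_" + flag, together with a boolean flag attribute, and the reader strips that suffix to rebuild the map. The Op struct and the Save/Load helpers are stand-ins for an OpDesc attribute table, not Paddle APIs.

#include <iostream>
#include <map>
#include <string>
#include <vector>

struct Op {  // stand-in for an OpDesc attribute table
  std::map<std::string, std::vector<float>> vec_attrs;
  std::map<std::string, bool> flag_attrs;
};

void Save(Op* first_op, const std::string& flag, const std::string& key_suffix,
          const std::map<std::string, std::vector<float>>& info) {
  const std::string suffix = "_" + key_suffix + "_" + flag;
  first_op->flag_attrs[flag] = true;
  for (const auto& kv : info) {
    first_op->vec_attrs[kv.first + suffix] = kv.second;
  }
}

std::map<std::string, std::vector<float>> Load(const Op& first_op,
                                               const std::string& flag,
                                               const std::string& key_suffix) {
  std::map<std::string, std::vector<float>> info;
  if (!first_op.flag_attrs.count(flag)) return info;
  const std::string suffix = "_" + key_suffix + "_" + flag;
  for (const auto& kv : first_op.vec_attrs) {
    size_t pos = kv.first.find(suffix);
    // Strip the suffix to recover the original variable name.
    if (pos != std::string::npos) info[kv.first.substr(0, pos)] = kv.second;
  }
  return info;
}

int main() {
  Op first;
  Save(&first, "has_quant_info", "var_quant_scales", {{"conv1.w", {0.5f}}});
  auto info = Load(first, "has_quant_info", "var_quant_scales");
  std::cout << "conv1.w scale = " << info["conv1.w"][0] << "\n";  // 0.5
  return 0;
}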
+ +#include "paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h" +#include +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { + +void QuantDequantMkldnnPass::MarkSkipQuantizedOps( + ir::Graph* graph, const std::unordered_set& skip_ops) const { + VLOG(3) << "mark skip quantized ops"; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + if (skip_ops.count(op_node->Name())) { + auto* op_desc = op_node->Op(); + if (!op_desc->HasAttr("quantization_type")) { + bool is_quantized_op = true; + for (auto* node_input : op_node->inputs) { + for (auto* node_input_input : node_input->inputs) { + if (!node_input_input->IsOp()) continue; + if (node_input_input->Name().find("quantize_dequantize") == + std::string::npos) { + is_quantized_op = false; + break; + } + } + if (!is_quantized_op) break; + } + + if (!is_quantized_op) { + op_node->Op()->SetAttr("skip_quant", 1); + } + } + } + } +} + +void QuantDequantMkldnnPass::MarkSkipQuantizedPool2d(ir::Graph* graph) const { + VLOG(3) << "mark avg pool2d as skip quantized op"; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + if (op_node->Name() == "pool2d") { + auto* op_desc = op_node->Op(); + auto pool_type = + BOOST_GET_CONST(std::string, op_desc->GetAttr("pooling_type")); + if (pool_type == "avg") { + op_node->Op()->SetAttr("skip_quant", 1); + } + } + } +} + +void QuantDequantMkldnnPass::CollectInfoFromFake( + ir::Graph* graph, Scope* scope, + const std::unordered_set& fake_dequantize_types, + std::unordered_map>* weight_thresholds) + const { + VLOG(3) << "gather weight_thresholds from fake dequantized ops"; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + if (fake_dequantize_types.count(op_node->Name())) { + auto* op_desc = op_node->Op(); + auto x_var_name = op_desc->Input("X")[0]; + + if (op_desc->HasAttr("max_range")) { + const float max_range = + BOOST_GET_CONST(float, op_desc->GetAttr("max_range")); + std::vector thresholds = {127 * 127 / max_range}; + weight_thresholds->insert(std::make_pair(x_var_name, thresholds)); + } else { + auto scale_name = op_desc->Input("Scales")[0]; + auto* var = scope->FindVar(scale_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound( + "The Scales variable [%s] of dequantize op is not found.", + var)); + + auto* scale_tensor = var->GetMutable(); + auto* scale_data = scale_tensor->data(); + std::vector thresholds{}; + for (int i = 0; i < scale_tensor->numel(); i++) { + thresholds.push_back(scale_data[i]); + } + weight_thresholds->insert(std::make_pair(x_var_name, thresholds)); + } + } + } +} + +void QuantDequantMkldnnPass::CollectInputScalesFromFake( + ir::Graph* graph, Scope* scope, + const std::unordered_set& fake_quantize_types, + std::unordered_map>* var_quant_scales) + const { + VLOG(3) << "gather input scales from fake quantized ops"; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + if (op_node->Name() == "fake_quantize_dequantize_moving_average_abs_max" || + fake_quantize_types.count(op_node->Name())) { + auto* op_desc = op_node->Op(); + const int bit_length = + BOOST_GET_CONST(int, op_desc->GetAttr("bit_length")); + PADDLE_ENFORCE_EQ(bit_length, 8, 
platform::errors::InvalidArgument( + "Unsupported number quantization " + "bits: %d, only 8 is supported now.", + bit_length)); + + auto x_var_name = op_desc->Input("X")[0]; + auto scale_name = op_desc->Input("InScale")[0]; + auto out_var_name = op_desc->Output("Out")[0]; + auto* var = scope->FindVar(scale_name); + PADDLE_ENFORCE_NOT_NULL( + var, + platform::errors::NotFound( + "The InScale variable [%s] of quantize op is not found.", var)); + + auto* scale_tensor = var->GetMutable(); + auto* scale_data = scale_tensor->data(); + float scale = 1.0 / scale_data[0]; + if (std::isinf(scale) || std::isnan(scale)) { + scale = 0.0; + } + + if (!var_quant_scales->count(x_var_name)) { + std::vector scale_v = {scale}; + var_quant_scales->insert(std::make_pair(x_var_name, scale_v)); + } + + if (!var_quant_scales->count(out_var_name)) { + std::vector scale_v = {scale}; + var_quant_scales->insert(std::make_pair(out_var_name, scale_v)); + } + } + } +} + +void QuantDequantMkldnnPass::CollectOutputScalesFromAttr( + ir::Graph* graph, + std::unordered_map>* var_quant_scales) + const { + VLOG(3) << "gather output scales from op's attr"; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + auto* op_desc = op_node->Op(); + if (op_desc->HasAttr("out_threshold")) { + const float attr_scale = + BOOST_GET_CONST(float, op_desc->GetAttr("out_threshold")); + if (attr_scale == 0.0) continue; + float scale = 1.0 / attr_scale; + std::vector scale_v = {scale}; + + auto var_name_map = op_desc->Outputs(); + for (auto iter = var_name_map.begin(); iter != var_name_map.end(); + ++iter) { + for (auto var_name : iter->second) { + var_quant_scales->insert(std::make_pair(var_name, scale_v)); + } + } + } + } +} + +void QuantDequantMkldnnPass::CollectFakeQuantizeOps( + ir::Graph* graph, Node* op_node, + std::unordered_set* nodes2rm) const { + auto* op_desc = op_node->Op(); + auto x_var_name = op_desc->Input("X")[0]; + auto in_scale_name = op_desc->Input("InScale")[0]; + auto out_var_name = op_desc->Output("Out")[0]; + auto out_scale_name = op_desc->Output("OutScale")[0]; + + Node* fake_quant_in = nullptr; + Node* fake_quant_in_scale = nullptr; + for (auto* node_input : op_node->inputs) { + if (node_input->Name() == x_var_name) { + fake_quant_in = node_input; + break; + } else if (node_input->Name() == in_scale_name) { + fake_quant_in_scale = node_input; + break; + } + } + + Node* fake_quant_out = nullptr; + Node* fake_quant_out_scale = nullptr; + for (auto* node_output : op_node->outputs) { + if (node_output->Name() == out_var_name) { + fake_quant_out = node_output; + break; + } else if (node_output->Name() == out_scale_name) { + fake_quant_out_scale = node_output; + break; + } + } + + PADDLE_ENFORCE_NOT_NULL( + fake_quant_in, + platform::errors::NotFound( + "The input var [%s] of quantize op is not found.", x_var_name)); + PADDLE_ENFORCE_NOT_NULL( + fake_quant_out, + platform::errors::NotFound( + "The output var [%s] of quantize op is not found.", out_var_name)); + + std::string input_act_name = fake_quant_in->Var()->Name(); + std::string output_act_name = fake_quant_out->Var()->Name(); + auto outlinks = fake_quant_out->outputs; + for (auto* next_node : outlinks) { + if (!next_node->IsOp()) continue; + next_node->Op()->RenameInput(output_act_name, input_act_name); + IR_NODE_LINK_TO(fake_quant_in, next_node); + } + + nodes2rm->insert(op_node); + nodes2rm->insert(fake_quant_in_scale); + nodes2rm->insert(fake_quant_out); + nodes2rm->insert(fake_quant_out_scale); +} + +void 
QuantDequantMkldnnPass::CollectFakeDequantizeOps( + ir::Graph* graph, Node* op_node, + std::unordered_set* nodes2rm) const { + auto* op_desc = op_node->Op(); + auto x_var_name = op_desc->Input("X")[0]; + auto out_var_name = op_desc->Output("Out")[0]; + + Node* fake_dequant_in = nullptr; + for (auto* node_input : op_node->inputs) { + if (node_input->Name() == x_var_name) { + fake_dequant_in = node_input; + break; + } + } + + Node* fake_dequant_out = nullptr; + for (auto* node_output : op_node->outputs) { + if (node_output->Name() == out_var_name) { + fake_dequant_out = node_output; + break; + } + } + + PADDLE_ENFORCE_NOT_NULL( + fake_dequant_in, + platform::errors::NotFound( + "The input var [%s] of dequantize op is not found.", x_var_name)); + PADDLE_ENFORCE_NOT_NULL( + fake_dequant_out, + platform::errors::NotFound( + "The output var [%s] of dequantize op is not found.", out_var_name)); + + std::string input_act_name = fake_dequant_in->Var()->Name(); + std::string output_act_name = fake_dequant_out->Var()->Name(); + auto outlinks = fake_dequant_out->outputs; + for (auto* next_node : outlinks) { + next_node->Op()->RenameInput(output_act_name, input_act_name); + IR_NODE_LINK_TO(fake_dequant_in, next_node); + } + + nodes2rm->insert(op_node); + nodes2rm->insert(fake_dequant_out); +} + +void QuantDequantMkldnnPass::RemoveFakeOps( + ir::Graph* graph, + const std::unordered_set& fake_quantize_types, + const std::unordered_set& fake_dequantize_types, + const std::unordered_set& fake_quantize_dequantize_types) + const { + VLOG(3) << "remove fake quantize and dequantize ops"; + + std::unordered_set nodes2rm = {}; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + if (fake_quantize_types.count(op_node->Name())) { + CollectFakeQuantizeOps(graph, op_node, &nodes2rm); + } else if (fake_dequantize_types.count(op_node->Name())) { + CollectFakeDequantizeOps(graph, op_node, &nodes2rm); + } else if (fake_quantize_dequantize_types.count(op_node->Name())) { + CollectFakeDequantizeOps(graph, op_node, &nodes2rm); + } + } + + GraphSafeRemoveNodes(graph, nodes2rm); +} + +void QuantDequantMkldnnPass::TransposeWeight(Tensor* input) const { + const auto in_dims = input->dims(); + std::vector out_dim_v; + std::vector axis; + for (int i = in_dims.size() - 1; i >= 0; i--) { + axis.push_back(i); + out_dim_v.push_back(in_dims[i]); + } + + const auto out_dims = phi::make_ddim(out_dim_v); + const int rank = axis.size(); + auto in_stride = phi::stride(in_dims); + auto out_stride = phi::stride(out_dims); + const int count = input->numel(); + + Tensor trans_tensor; + trans_tensor.Resize(out_dims); + float* trans_data = trans_tensor.mutable_data(platform::CPUPlace()); + float* in_data = input->mutable_data(platform::CPUPlace()); + + for (int64_t out_idx = 0; out_idx < count; ++out_idx) { + int64_t in_idx = 0; + int64_t tmp_idx = out_idx; + for (int i = 0; i < rank; ++i) { + const int64_t coordinate = tmp_idx / out_stride[i]; + tmp_idx -= coordinate * out_stride[i]; + in_idx += coordinate * in_stride[axis[i]]; + } + trans_data[out_idx] = in_data[in_idx]; + } + + input->Resize(out_dims); + for (int i = 0; i < input->numel(); i++) { + in_data[i] = trans_data[i]; + } +} + +bool QuantDequantMkldnnPass::IsInt8Weight( + Node* op_node, Scope* scope, const std::string& weight_name) const { + auto* op_desc = op_node->Op(); + auto var_name = op_desc->Input(weight_name)[0]; + auto* var = scope->FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound( + 
"The input persistable [%s] var of [%s] op is not found.", + var_name, op_desc->Type())); + auto* weight_tensor = var->GetMutable(); + auto* weight_data = weight_tensor->data(); + bool is_int8 = true; + for (int i = 0; i < weight_tensor->numel(); i++) { + if (weight_data[i] - static_cast(weight_data[i]) != 0) { + is_int8 = false; + break; + } + } + return is_int8; +} + +void QuantDequantMkldnnPass::DequantizeOpWeights( + Node* op_node, Scope* scope, const std::string& weight_name, + const std::string& output_name, + const std::unordered_map>& + weight_thresholds) const { + auto* op_desc = op_node->Op(); + std::string weight_var_name = op_desc->Input(weight_name)[0]; + std::string output_var_name = op_desc->Output(output_name)[0]; + + std::vector scales; + auto iter = weight_thresholds.find(output_var_name); + if (iter != weight_thresholds.end()) { + scales = iter->second; + } else { + PADDLE_THROW(paddle::platform::errors::Fatal( + "Could not find threshold information for [%s] var, please check if " + "the model is correct.", + output_var_name)); + } + + auto* var = scope->FindVar(weight_var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound( + "The input persistable [%s] var of [%s] op is not found.", + weight_var_name, op_desc->Type())); + auto* weight_tensor = var->GetMutable(); + const auto weight_dims = weight_tensor->dims(); + + const int size = scales.size(); + if (size == 1 || size == weight_dims[0]) { + auto* weight_data = + weight_tensor->mutable_data(platform::CPUPlace()); + for (int i = 0; i < weight_tensor->numel(); i++) { + weight_data[i] /= 127; + } + + TransposeWeight(weight_tensor); + + if (size == 1) { + for (int i = 0; i < weight_tensor->numel(); i++) { + weight_data[i] *= scales[0]; + } + } else { + for (int i = 0; i < weight_tensor->numel(); i++) { + weight_data[i] *= scales[i % size]; + } + } + + TransposeWeight(weight_tensor); + } else if (weight_dims.size() > 1 && size == weight_dims[1]) { + auto* weight_data = + weight_tensor->mutable_data(platform::CPUPlace()); + for (int i = 0; i < weight_tensor->numel(); i++) { + weight_data[i] /= 127; + } + + int step_n = 1; + for (int i = 1; i < weight_dims.size(); i++) { + step_n *= weight_dims[i]; + } + int step_c = step_n / size; + for (int i = 0; i < weight_dims[0]; i++) { + int begin_n = i * step_n; + for (int j = begin_n; j < begin_n + step_n; j++) { + for (int k = 0; k < size; k++) { + int begin_c = k * step_c; + for (int m = begin_c; m < begin_c + step_c; m++) { + weight_data[m] *= scales[k]; + } + } + } + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The size of weight scales vector (%d) does not " + "match the dimensions (%d) of the weights tensor %s.", + size, weight_tensor->dims().size(), weight_var_name)); + } + + weight_tensor->Resize(weight_dims); +} + +void QuantDequantMkldnnPass::DequantizeWeights( + ir::Graph* graph, Scope* scope, + const std::unordered_map>& + weight_thresholds) const { + VLOG(3) << "dequantize weight for ops which has weight"; + + if (weight_thresholds.empty()) { + VLOG(3) + << "No need to dequantize weights because weight_thresholds is empty."; + return; + } + + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + if (op_node->Name() == "conv2d" || op_node->Name() == "depthwise_conv2d") { + if (IsInt8Weight(op_node, scope, "Filter")) { + DequantizeOpWeights(op_node, scope, "Filter", "Output", + weight_thresholds); + } + } else if (op_node->Name() == "mul" || op_node->Name() == "matmul" || + 
op_node->Name() == "matmul_v2") { + if (IsInt8Weight(op_node, scope, "Y")) { + DequantizeOpWeights(op_node, scope, "Y", "Out", weight_thresholds); + } + } + } +} + +void QuantDequantMkldnnPass::UpdateActivations(ir::Graph* graph) const { + VLOG(3) << "update conv2d or depthwise_conv2d fused activation"; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + + if (op_node->Name() == "conv2d" || op_node->Name() == "depthwise_conv2d") { + auto* op_desc = op_node->Op(); + if (!op_desc->HasAttr("fuse_activation")) { + std::string activation; + if (op_desc->GetAttrIfExists("fuse_relu")) { + activation = "relu"; + } else if (op_desc->GetAttrIfExists("fuse_brelu")) { + activation = "relu6"; + float alpha = 6.0; + if (op_desc->HasAttr("fuse_brelu_threshold")) { + alpha = BOOST_GET_CONST(float, + op_desc->GetAttr("fuse_brelu_threshold")); + } + op_node->Op()->SetAttr("fuse_alpha", alpha); + } + op_node->Op()->SetAttr("fuse_activation", activation); + } + } + } +} + +void QuantDequantMkldnnPass::RemoveCtrlVars(ir::Graph* graph) const { + VLOG(3) << "remove control flow variable"; + std::unordered_set nodes2rm = {}; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (op_node->IsCtrlVar()) { + nodes2rm.insert(op_node); + } + } + + GraphSafeRemoveNodes(graph, nodes2rm); +} + +void QuantDequantMkldnnPass::ApplyImpl(ir::Graph* graph) const { + VLOG(3) << "Convert paddle slim quantized model to mkldnn quantized model."; + const std::string pattern_name = "quant_dequant_mkldnn_pass"; + FusePassBase::Init(pattern_name, graph); + + const std::unordered_set skip_ops = { + "conv2d", "depthwise_conv2d", "mul", "matmul", "matmul_v2"}; + + const std::unordered_set fake_quantize_types = { + "fake_quantize_moving_average_abs_max", "fake_quantize_range_abs_max"}; + + const std::unordered_set fake_dequantize_types = { + "fake_dequantize_max_abs", "fake_channel_wise_dequantize_max_abs"}; + + const std::unordered_set fake_quantize_dequantize_types = { + "fake_quantize_dequantize_abs_max", + "fake_quantize_dequantize_moving_average_abs_max", + "fake_channel_wise_quantize_dequantize_abs_max"}; + + std::unordered_map> weight_thresholds{}; + std::unordered_map> var_quant_scales{}; + + auto* scope = param_scope(); + MarkSkipQuantizedOps(graph, skip_ops); + MarkSkipQuantizedPool2d(graph); + CollectInfoFromFake(graph, scope, fake_dequantize_types, &weight_thresholds); + CollectInputScalesFromFake(graph, scope, fake_quantize_types, + &var_quant_scales); + CollectOutputScalesFromAttr(graph, &var_quant_scales); + RemoveFakeOps(graph, fake_quantize_types, fake_dequantize_types, + fake_quantize_dequantize_types); + DequantizeWeights(graph, scope, weight_thresholds); + UpdateActivations(graph); + RemoveCtrlVars(graph); + + // save var_quant_scales in the first op's attr + // for compute_propagate_scales_mkldnn_pass + SaveInfoInTheFirstOp(graph, "has_quant_info", "var_quant_scales", + var_quant_scales); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(quant_dequant_mkldnn_pass, + paddle::framework::ir::QuantDequantMkldnnPass); + +REGISTER_PASS_CAPABILITY(quant_dequant_mkldnn_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("conv2d", 1) + .EQ("fc", 0) + .LE("conv2d_transpose", 2) + .EQ("fake_quantize_abs_max", 0) + .EQ("fake_quantize_range_abs_max", 0) + .EQ("fake_quantize_moving_average_abs_max", 0) + .LE("fake_channel_wise_quantize_abs_max", 1) + 
.EQ("fake_dequantize_max_abs", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h new file mode 100644 index 00000000000000..a9442f707402d9 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h @@ -0,0 +1,91 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class QuantDequantMkldnnPass : public FusePassBase { + public: + QuantDequantMkldnnPass() = default; + virtual ~QuantDequantMkldnnPass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + + private: + void MarkSkipQuantizedOps( + ir::Graph* graph, const std::unordered_set& skip_ops) const; + + void MarkSkipQuantizedPool2d(ir::Graph* graph) const; + + void CollectInfoFromFake( + ir::Graph* graph, Scope* scope, + const std::unordered_set& fake_dequantize_types, + std::unordered_map>* weight_thresholds) + const; + + void CollectInputScalesFromFake( + ir::Graph* graph, Scope* scope, + const std::unordered_set& fake_quantize_types, + std::unordered_map>* var_quant_scales) + const; + + void CollectOutputScalesFromAttr( + ir::Graph* graph, + std::unordered_map>* var_quant_scales) + const; + + void CollectFakeQuantizeOps(ir::Graph* graph, Node* op_node, + std::unordered_set* nodes2rm) const; + + void CollectFakeDequantizeOps( + ir::Graph* graph, Node* op_node, + std::unordered_set* nodes2rm) const; + + void RemoveFakeOps( + ir::Graph* graph, + const std::unordered_set& fake_quantize_types, + const std::unordered_set& fake_dequantize_types, + const std::unordered_set& fake_quantize_dequantize_types) + const; + + bool IsInt8Weight(Node* op_node, Scope* scope, + const std::string& weight_name) const; + + void TransposeWeight(Tensor* input) const; + + void DequantizeOpWeights( + Node* op_node, Scope* scope, const std::string& weight_name, + const std::string& output_name, + const std::unordered_map>& + weight_thresholds) const; + + void DequantizeWeights( + ir::Graph* graph, Scope* scope, + const std::unordered_map>& + weight_thresholds) const; + + void UpdateActivations(ir::Graph* graph) const; + + void RemoveCtrlVars(ir::Graph* graph) const; +}; +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index a704411f3bb713..59703332efe959 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -39,6 +39,7 @@ constexpr size_t kPrepareWorkQueueIdx = 2; void AsyncWorkQueue::AddTask(const OpFuncType& op_func_type, std::function fn) { + VLOG(4) << "Add task: " << static_cast(op_func_type) << " "; // NOTE(zhiqiu): use thhe second queue of size of, so only one thread is 
used. if (FLAGS_new_executor_sequential_run) { VLOG(4) << "FLAGS_new_executor_sequential_run:" diff --git a/paddle/fluid/framework/new_executor/workqueue/event_count.h b/paddle/fluid/framework/new_executor/workqueue/event_count.h index 893c6d2d54ac72..7a826c3990713d 100644 --- a/paddle/fluid/framework/new_executor/workqueue/event_count.h +++ b/paddle/fluid/framework/new_executor/workqueue/event_count.h @@ -54,6 +54,7 @@ #include #include #include +#include "glog/logging.h" namespace paddle { namespace framework { @@ -255,6 +256,7 @@ class EventCount { std::unique_lock lock(w->mu); while (w->state != Waiter::kSignaled) { w->state = Waiter::kWaiting; + VLOG(10) << "Go to wait " << &(w->cv); w->cv.wait(lock); } } @@ -270,7 +272,10 @@ class EventCount { w->state = Waiter::kSignaled; } // Avoid notifying if it wasn't waiting. - if (state == Waiter::kWaiting) w->cv.notify_one(); + if (state == Waiter::kWaiting) { + VLOG(10) << "Go to notify " << &(w->cv); + w->cv.notify_one(); + } } } }; diff --git a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h index 384498584c66a6..44953fa192e270 100644 --- a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h @@ -53,7 +53,6 @@ class ThreadPoolTempl { all_coprimes_.reserve(num_threads_); for (int i = 1; i <= num_threads_; ++i) { all_coprimes_.emplace_back(); - all_coprimes_.back().push_back(i); ComputeCoprimes(i, &(all_coprimes_.back())); } for (int i = 0; i < num_threads_; i++) { @@ -130,8 +129,11 @@ class ThreadPoolTempl { // this. We expect that such scenario is prevented by program, that is, // this is kept alive while any threads can potentially be in Schedule. if (!t.f) { - if (num_tasks > num_threads_ - blocked_.load(std::memory_order_relaxed)) { + if (num_tasks > num_threads_ - blocked_) { + VLOG(6) << "Add task, Notify"; ec_.Notify(false); + } else { + VLOG(6) << "Add task, No Notify"; } } else { num_tasks_.fetch_sub(1, std::memory_order_relaxed); @@ -376,17 +378,21 @@ class ThreadPoolTempl { ec_.CancelWait(); return false; } + + // Number of blocked threads is used as termination condition. + // If we are shutting down and all worker threads blocked without work, + // that's we are done. + blocked_++; + // Now do a reliable emptiness check. int victim = NonEmptyQueueIndex(); if (victim != -1) { ec_.CancelWait(); *t = thread_data_[victim].queue.PopBack(); + blocked_--; return true; } - // Number of blocked threads is used as termination condition. - // If we are shutting down and all worker threads blocked without work, - // that's we are done. - blocked_++; + if (done_ && blocked_ == static_cast(num_threads_)) { ec_.CancelWait(); // Almost done, but need to re-check queues. diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 6af07caaf88b2a..e6577f662ae7b2 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1333,7 +1333,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // NOTE(Liu-xiandong): Determine whether the selected kernel is valid // If not, use the kernel registered in fluid. And if the fluid do not // contains the related heterogeneous kernel, use phi CPU kernel. 
-#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) +#if defined(PADDLE_WITH_XPU) bool is_xpu_unsupport = paddle::platform::is_xpu_place(kernel_type_->place_) && !paddle::platform::is_xpu_support_op(type_, *kernel_type_.get()) || @@ -1373,7 +1373,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope, #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) || is_xpu_unsupport #endif - ) { +#if defined(PADDLE_WITH_XPU_KP) + || (is_xpu_unsupport && !is_xpu_kp_support) +#endif + ) { auto pt_cpu_kernel_key = FallBackToCpu(*kernel_type_.get(), pt_kernel_key, *this); pt_kernel_.reset( diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index b56d113937d69f..0ad5e808b1d1aa 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -263,7 +263,10 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) || is_xpu_unsupport #endif - ) { +#if defined(PADDLE_WITH_XPU_KP) + || (is_xpu_unsupport && !is_xpu_kp_support) +#endif + ) { if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op.Type())) { auto pt_cpu_kernel_key = FallBackToCpu(expected_kernel_key, pt_kernel_key, op); diff --git a/paddle/fluid/operators/arg_max_op_xpu.cc b/paddle/fluid/operators/arg_max_op_xpu.cc index ba2ef81b5cdf1d..e2acd84bd4e9db 100644 --- a/paddle/fluid/operators/arg_max_op_xpu.cc +++ b/paddle/fluid/operators/arg_max_op_xpu.cc @@ -28,12 +28,15 @@ class ArgMaxXPUKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); auto dtype = ctx.Attr("dtype"); PADDLE_ENFORCE_EQ( - (dtype < 0 || dtype == 3), true, + (dtype < 0 || dtype == 2 || dtype == 3), true, platform::errors::InvalidArgument( - "The attribute of dtype in xpu argmin/argmax must be [%s], but " + "The attribute of dtype in xpu argmin/argmax must be [%s] or [%s], " + "but " "received [%s]", paddle::framework::DataTypeToString( framework::proto::VarType::INT64), + paddle::framework::DataTypeToString( + framework::proto::VarType::INT32), paddle::framework::DataTypeToString( static_cast(dtype)))); diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index eb51215790bbcd..0afe09ec028e34 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -19,15 +19,15 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; using CUDA = paddle::platform::CUDADeviceContext; -#define REGISTER_CAST_CUDA_BASE(op_name, ...) \ - REGISTER_OP_CUDA_KERNEL( \ - op_name, ops::CastOpKernel, \ - ops::CastOpKernel, ops::CastOpKernel, \ - ops::CastOpKernel, ops::CastOpKernel, \ - ops::CastOpKernel, ops::CastOpKernel, \ - ops::CastOpKernel, \ - ops::CastOpKernel>, \ - ops::CastOpKernel>, ##__VA_ARGS__); - // See [ why register transfer_dtype_op alias with cast_op? ] in cast_op.cc -REGISTER_CAST_CUDA_BASE(transfer_dtype, ops::CastOpKernel) +REGISTER_OP_CUDA_KERNEL(transfer_dtype, ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel>, + ops::CastOpKernel>, + ops::CastOpKernel); diff --git a/paddle/fluid/operators/conv_base_helper.h b/paddle/fluid/operators/conv_base_helper.h new file mode 100644 index 00000000000000..c664d1935fe2e3 --- /dev/null +++ b/paddle/fluid/operators/conv_base_helper.h @@ -0,0 +1,99 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/conv_search_cache.h" +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/phi/backends/gpu/gpu_context.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DataLayout = platform::DataLayout; +using framework::AlgorithmsCache; +using framework::ConvSearchCache; + +template +using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; + +// As the basic for SearchAlgorithm struct. +template +struct SearchAlgorithm {}; + +// As the container of searchAlgorithm::Find() result. +template +struct SearchResult { + public: + AlgoT algo = static_cast(0); + float time = -1.f; + size_t workspace_size = 0; +}; + +// As the container of conv relevant descriptors. +template +struct ConvArgsBase { + HandleT handle; + platform::TensorDescriptor idesc, odesc; + platform::FilterDescriptor wdesc; + platform::ConvolutionDescriptor cdesc; + const framework::Tensor *x, *w, *o; + DataT cudnn_dtype; + + // strides + std::vector s; + // paddings + std::vector p; + // dilations + std::vector d; + + ConvArgsBase(const framework::Tensor* x, const framework::Tensor* w, + const framework::Tensor* o, const std::vector s, + const std::vector p, const std::vector d, DataT dtype) + : x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {} +}; + +static inline void GetNCDHW(const framework::DDim& dims, + const DataLayout& layout, int* N, int* C, int* D, + int* H, int* W) { + *N = dims[0]; + *C = layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; + int i = layout == DataLayout::kNCHW ? 0 : 1; + if (dims.size() == 5) { + *D = dims[2 - i]; + *H = dims[3 - i]; + *W = dims[4 - i]; + } else { + *D = 1; + *H = dims[2 - i]; + *W = dims[3 - i]; + } +} + +template +static std::ostream& operator<<(std::ostream& out, const std::vector& v) { + out << "["; + for (auto const& tmp : v) out << tmp << ","; + out << "]"; + return out; +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 4e6fda3d09a071..3c29c60b215655 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -14,44 +14,15 @@ limitations under the License. 
*/ #pragma once -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/conv_search_cache.h" -#include "paddle/fluid/framework/operator_kernel_configs.h" -#include "paddle/fluid/operators/conv_cudnn_op_cache.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/operators/conv_base_helper.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using DataLayout = platform::DataLayout; -template -using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; -using framework::AlgorithmsCache; -static inline void GetNCDHW(const framework::DDim& dims, - const DataLayout& layout, int* N, int* C, int* D, - int* H, int* W) { - *N = dims[0]; - *C = layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; - int i = layout == DataLayout::kNCHW ? 0 : 1; - if (dims.size() == 5) { - *D = dims[2 - i]; - *H = dims[3 - i]; - *W = dims[4 - i]; - } else { - *D = 1; - *H = dims[2 - i]; - *W = dims[3 - i]; - } -} +using ConvArgs = ConvArgsBase; template static void RemovePaddingSlice(const phi::GPUContext& context, @@ -68,121 +39,103 @@ static void RemovePaddingSlice(const phi::GPUContext& context, extents[i] = new_out_dims[i]; } - int start; for (size_t i = 0; i < axes.size(); ++i) { - start = starts[i]; + int start = starts[i]; if (start < 0) { start = (start + in_dims[axes[i]]); } start = std::max(start, 0); offsets[axes[i]] = start; } + auto in_t = framework::EigenTensor::From( *input); - auto out_t = framework::EigenTensor::From( *out, new_out_dims); - EigenSlice, T, D>::Eval(place, out_t, in_t, - offsets, extents); + + phi::funcs::EigenSlice, T, D>::Eval( + place, out_t, in_t, offsets, extents); } -template -std::ostream& operator<<(std::ostream& out, const std::vector& v) { - out << "["; - for (auto const& tmp : v) out << tmp << ","; - out << "]"; - return out; +static inline double ToMegaBytes(size_t bytes) { + return static_cast(bytes) / (1 << 20); } -inline int MaxBwdFilterAlgos(cudnnHandle_t cudnn_handle) { - int max_algos = 0; -#if CUDNN_VERSION_MIN(7, 0, 1) - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( - cudnn_handle, &max_algos)); -#endif - return max_algos; +static inline bool UseFixedWorkspace() { + return FLAGS_conv_workspace_size_limit >= 0; } -template -void ChooseAlgoByWorkspace(PerfType* perf_results, size_t perf_num, - size_t workspace_byte, AlgoType* algo) { - for (size_t i = 0; i < perf_num; ++i) { - auto result = perf_results[i]; - if (result.status == CUDNN_STATUS_SUCCESS && - result.memory < workspace_byte) { - *algo = result.algo; - VLOG(3) << " algo: " << result.algo << ", time: " << result.time - << " ms, wksp = " << result.memory - << ", status = " << result.status; - return; - } +static size_t CaclWorkspaceLimitInBytes(const phi::GPUContext& ctx) { + if (!UseFixedWorkspace()) { + int device_id = platform::GetCurrentDeviceId(); + int64_t allocated = memory::StatGetCurrentValue("Allocated", device_id); + int64_t reserved = memory::StatGetCurrentValue("Reserved", device_id); + int64_t availble = platform::GpuAvailableMemToAlloc(); + int64_t cur_workspace_size = ctx.cudnn_workspace_handle().WorkspaceSize(); + VLOG(3) << "[memory] allocated=" << 
ToMegaBytes(allocated) + << " MB, reserved=" << ToMegaBytes(reserved) + << " MB, available_to_alloc=" << ToMegaBytes(availble) + << " MB, current_workspace_size=" << ToMegaBytes(cur_workspace_size) + << " MB."; + return std::max(std::max(availble, cur_workspace_size), + reserved - allocated); + } else { + return FLAGS_conv_workspace_size_limit * 1024 * 1024; } - VLOG(3) << "Can not find alog that requires memory < " - << static_cast(workspace_byte) / (1 << 20) << " MB"; } -template -void ChooseAlgo(const std::vector& perf_results, - size_t workspace_byte, AlgoType* algo) { - VLOG(3) << "=========BwdFilterAlgo Perf result========="; - for (const auto& result : perf_results) { - auto math_type_str = "False"; - if (result.mathType == CUDNN_TENSOR_OP_MATH) { - math_type_str = "True"; - } - VLOG(3) << " algo: " << result.algo << ", TensorCore: " << math_type_str - << ", time: " << result.time << " ms" - << ", wksp = " << result.memory << ", status = " << result.status; +template +std::string GetPerfResultString(std::string prefix, + const std::vector& perf_results, + int actual_algo_count, size_t workspace_limit) { + std::ostringstream out; + out << prefix << " (workspace limit=" << ToMegaBytes(workspace_limit) + << " MB):\n"; + for (int i = 0; i < actual_algo_count; ++i) { + const auto& result = perf_results[i]; + auto math_type_str = (result.mathType == CUDNN_TENSOR_OP_MATH) ? "T" : "F"; + out << " algo=" << result.algo << ": tensor_core=" << math_type_str + << ", time=" << result.time + << " ms, memory=" << ToMegaBytes(result.memory) + << " MB, status=" << result.status << "\n"; } + return out.str(); +} - for (size_t i = 0; i != perf_results.size(); ++i) { - const auto& result = perf_results[i]; +template +void ChooseAlgoByWorkspace(const std::vector& perf_results, + size_t workspace_limit, + SearchResult* algo_result) { + for (size_t i = 0; i < perf_results.size(); ++i) { + auto result = perf_results[i]; if (result.status == CUDNN_STATUS_SUCCESS && - (result.memory <= workspace_byte)) { - if ((result.mathType == CUDNN_TENSOR_OP_MATH) && - (i != perf_results.size() - 1)) { - const auto& next_result = perf_results[i + 1]; - if (next_result.status == CUDNN_STATUS_SUCCESS && - next_result.algo == result.algo && - next_result.memory == result.memory && - next_result.mathType != CUDNN_TENSOR_OP_MATH && - next_result.time < 1.01 * result.time) { - // Skip over this result- it's not really a Tensor Core algo. - // Because it is only 1% performance difference. - // Prefer to choose the next equivalent non-Tensor Core algo. 
- continue; - } - } - *algo = result.algo; - auto math_type_str = "0"; - if (result.mathType == CUDNN_TENSOR_OP_MATH) { - math_type_str = "1"; - } - VLOG(3) << " choose algo: " << result.algo << ", TC: " << math_type_str - << ", time: " << result.time << " ms" - << ", wksp = " << result.memory << ", status = " << result.status; - break; + result.memory < workspace_limit) { + algo_result->algo = result.algo; + algo_result->time = result.time; + algo_result->workspace_size = result.memory; + VLOG(3) << " algo=" << result.algo << ", time=" << result.time + << " ms, memory=" << ToMegaBytes(result.memory) + << " MB (limit=" << ToMegaBytes(workspace_limit) + << " MB), status=" << result.status; + return; } } + VLOG(3) << "Can not find an algorithm that requires memory < " + << ToMegaBytes(workspace_limit) << " MB"; } -using framework::ConvSearchCache; - static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype, const platform::ConvolutionDescriptor& cdesc) { #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) - auto& dev_ctx = ctx; - if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { + if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); VLOG(5) << "use cudnn_tensor_op_math"; #if CUDA_VERSION >= 11000 #if CUDNN_VERSION_MIN(8, 1, 0) - } else if (dev_ctx.GetComputeCapability() >= 80 && - dtype == CUDNN_DATA_BFLOAT16) { + } else if (ctx.GetComputeCapability() >= 80 && dtype == CUDNN_DATA_BFLOAT16) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); #endif // CUDNN_VERSION_MIN(8, 1, 0) @@ -198,76 +151,49 @@ static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype, #endif } -struct ConvArgs { - cudnnHandle_t handle; - platform::TensorDescriptor idesc, odesc; - platform::FilterDescriptor wdesc; - platform::ConvolutionDescriptor cdesc; - const framework::Tensor *x, *w, *o; - cudnnDataType_t cudnn_dtype; - - // strides - std::vector s; - // paddings - std::vector p; - // dilations - std::vector d; - - ConvArgs(const framework::Tensor* x, const framework::Tensor* w, - const framework::Tensor* o, const std::vector s, - const std::vector p, const std::vector d, - cudnnDataType_t dtype) - : x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {} -}; - -template -struct SearchAlgorithm {}; - template <> struct SearchAlgorithm { - using perf_t = cudnnConvolutionFwdAlgoPerf_t; - using algo_t = cudnnConvolutionFwdAlgo_t; + using PerfT = cudnnConvolutionFwdAlgoPerf_t; + using AlgoT = cudnnConvolutionFwdAlgo_t; template - static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, const phi::GPUContext& ctx) { + static SearchResult Find(const ConvArgs& args, bool exhaustive_search, + bool deterministic, + const phi::GPUContext& ctx) { + SearchResult result; auto dtype = platform::CudnnDataType::type; - bool has_got_workspace_size = true; - size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; - size_t workspace_size = 0; - algo_t algo; + size_t workspace_size_limit = CaclWorkspaceLimitInBytes(ctx); SetConvMathType(ctx, dtype, args.cdesc); if (!exhaustive_search && !deterministic) { #if CUDNN_VERSION >= 7001 - int perf_count; + int actual_perf_count; int best_algo_idx = 0; - std::unique_ptr perf_results(new perf_t[kNUM_CUDNN_FWD_ALGS]); + std::vector perf_results(kNUM_CUDNN_FWD_ALGS); PADDLE_ENFORCE_GPU_SUCCESS( 
platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), kNUM_CUDNN_FWD_ALGS, - &perf_count, perf_results.get())); - algo = (perf_results.get())[best_algo_idx].algo; - workspace_size = (perf_results.get())[best_algo_idx].memory; + &actual_perf_count, perf_results.data())); + result.algo = perf_results[best_algo_idx].algo; + result.workspace_size = perf_results[best_algo_idx].memory; - if (workspace_size > workspace_size_limit) { + if (result.workspace_size > workspace_size_limit) { #if CUDNN_VERSION >= 8000 // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8 - ChooseAlgoByWorkspace(perf_results.get(), - kNUM_CUDNN_FWD_ALGS, - workspace_size_limit, &algo); + ChooseAlgoByWorkspace(perf_results, workspace_size_limit, + &result); #else - VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " - "the workspace size request(" - << workspace_size << ") exceeds the limit(" + VLOG(3) << "Fallback to non-v7 method to find conv algorithm " + "becasue the workspace size request(" + << result.workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm( args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + workspace_size_limit, &(result.algo))); #endif } #else @@ -276,30 +202,30 @@ struct SearchAlgorithm { args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + workspace_size_limit, &(result.algo))); #endif - VLOG(3) << "choose algo " << algo; } else if (deterministic) { - algo = static_cast(1); + result.algo = static_cast(1); } else { - auto& dev_ctx = ctx; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetForward()); - + auto workspace_handle = ctx.cudnn_workspace_handle(); auto x_dims = phi::vectorize(args.x->dims()); auto w_dims = phi::vectorize(args.w->dims()); - VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:" << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p" << args.p << ", args.d" << args.d; - algo = algo_cache.GetAlgorithm( + AlgorithmsCache& algo_cache = + *(framework::ConvSearchCache::Instance().GetForward()); + + result.algo = algo_cache.GetAlgorithm( x_dims, w_dims, args.s, args.p, args.d, 0, static_cast(args.cudnn_dtype), [&]() { int returned_algo_count; - std::array perf_stat; + std::vector perf_results(kNUM_CUDNN_FWD_ALGS); + size_t max_workspace_size = + FindMaxWorkspaceSize(args, workspace_size_limit); + VLOG(4) << "max_workspace_size=" << ToMegaBytes(max_workspace_size) + << " MB"; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( @@ -308,25 +234,28 @@ struct SearchAlgorithm { args.wdesc.desc(), args.w->data(), args.cdesc.desc(), args.odesc.desc(), const_cast(args.o->data()), kNUM_CUDNN_FWD_ALGS, &returned_algo_count, - perf_stat.data(), cudnn_workspace_ptr, - workspace_size_limit)); + perf_results.data(), cudnn_workspace_ptr, + max_workspace_size)); }; - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); - - VLOG(3) << "FwdAlgo Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = perf_stat[i]; - VLOG(3) << stat.algo 
<< ": " << stat.status << " " << stat.time - << " " << stat.memory; - } - return perf_stat[0].algo; + workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size, + UseFixedWorkspace()); + + VLOG(4) << GetPerfResultString( + "[Exhaustive Search] FwdAlgo Perf result", perf_results, + returned_algo_count, workspace_size_limit); + result.time = perf_results[0].time; + return perf_results[0].algo; }); } - VLOG(3) << "choose algo " << algo; - return algo; + VLOG(3) << "[cuDNN Convolution] exhaustive_search=" << exhaustive_search + << ", deterministic=" << deterministic + << ", choose algo=" << result.algo << ", workspace=" + << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB"; + return result; } - static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + static size_t GetWorkspaceSize(const ConvArgs& args, + cudnnConvolutionFwdAlgo_t algo) { size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( @@ -334,68 +263,84 @@ struct SearchAlgorithm { args.cdesc.desc(), args.odesc.desc(), algo, &workspace_size)); return workspace_size; } + + private: + static size_t FindMaxWorkspaceSize(const ConvArgs& args, + size_t workspace_size_limit) { + if (!UseFixedWorkspace()) { + size_t max_workspace_size = 0; + for (size_t algo = 0; algo < kNUM_CUDNN_FWD_ALGS; ++algo) { + size_t workspace_size = 0; + auto status = + platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + args.handle, args.idesc.desc(), args.wdesc.desc(), + args.cdesc.desc(), args.odesc.desc(), + static_cast(algo), &workspace_size); + if (status == CUDNN_STATUS_SUCCESS) { + max_workspace_size = std::max(workspace_size, max_workspace_size); + } + } + return std::min(max_workspace_size, workspace_size_limit); + } else { + return workspace_size_limit; + } + } }; template <> struct SearchAlgorithm { - using perf_t = cudnnConvolutionBwdDataAlgoPerf_t; - using algo_t = cudnnConvolutionBwdDataAlgo_t; + using PerfT = cudnnConvolutionBwdDataAlgoPerf_t; + using AlgoT = cudnnConvolutionBwdDataAlgo_t; template - static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, const phi::GPUContext& ctx) { + static SearchResult Find(const ConvArgs& args, bool exhaustive_search, + bool deterministic, + const phi::GPUContext& ctx) { + SearchResult result; auto dtype = platform::CudnnDataType::type; - size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; - size_t workspace_size = 0; - bool has_got_workspace_size = true; - algo_t algo; + size_t workspace_size_limit = CaclWorkspaceLimitInBytes(ctx); SetConvMathType(ctx, dtype, args.cdesc); if (!exhaustive_search && !deterministic) { #if CUDNN_VERSION >= 7001 - int perf_count; + int actual_perf_count; int best_algo_idx = 0; - std::unique_ptr perf_results( - new perf_t[kNUM_CUDNN_BWD_DATA_ALGS]); + std::vector perf_results(kNUM_CUDNN_BWD_DATA_ALGS); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7( args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), kNUM_CUDNN_BWD_DATA_ALGS, - &perf_count, perf_results.get())); - algo = (perf_results.get())[best_algo_idx].algo; + &actual_perf_count, perf_results.data())); + result.algo = perf_results[best_algo_idx].algo; #if CUDNN_VERSION < 7500 int stride_dim = args.x->dims().size() - 2; bool blacklist = std::any_of(args.s.begin(), args.s.begin() + stride_dim, [=](int n) { return n != 1; }); - if (blacklist && (static_cast( - perf_results[best_algo_idx].algo) == + if 
(blacklist && (perf_results[best_algo_idx].algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || - static_cast( - perf_results[best_algo_idx].algo) == + perf_results[best_algo_idx].algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) { - algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; } #endif - workspace_size = GetWorkspaceSize(args, algo); - if (workspace_size > workspace_size_limit) { - has_got_workspace_size = false; + result.workspace_size = GetWorkspaceSize(args, result.algo); + if (result.workspace_size > workspace_size_limit) { #if CUDNN_VERSION >= 8000 // cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8 - ChooseAlgoByWorkspace(perf_results.get(), - kNUM_CUDNN_BWD_DATA_ALGS, - workspace_size_limit, &algo); + ChooseAlgoByWorkspace(perf_results, workspace_size_limit, + &result); #else VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " "the workspace size request(" - << workspace_size << ") exceeds the limit(" + << result.workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + workspace_size_limit, &(result.algo))); #endif } #else @@ -404,29 +349,29 @@ struct SearchAlgorithm { args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + workspace_size_limit, &(result.algo))); #endif } else if (deterministic) { - return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; } else { - auto& dev_ctx = ctx; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardData()); - + auto workspace_handle = ctx.cudnn_workspace_handle(); auto x_dims = phi::vectorize(args.x->dims()); auto w_dims = phi::vectorize(args.w->dims()); - VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t" << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p" << args.p << ", args.d" << args.d; - algo = algo_cache.GetAlgorithm( + AlgorithmsCache& algo_cache = + *(framework::ConvSearchCache::Instance().GetBackwardData()); + result.algo = algo_cache.GetAlgorithm( x_dims, w_dims, args.s, args.p, args.d, 0, static_cast(args.cudnn_dtype), [&]() { int returned_algo_count; - std::array perf_stat; + std::vector perf_results(kNUM_CUDNN_BWD_DATA_ALGS); + size_t max_workspace_size = + FindMaxWorkspaceSize(args, workspace_size_limit); + VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size) + << " MB"; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( @@ -437,26 +382,28 @@ struct SearchAlgorithm { args.cdesc.desc(), args.idesc.desc(), const_cast(args.x->data()), kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count, - perf_stat.data(), cudnn_workspace_ptr, - workspace_size_limit)); + perf_results.data(), cudnn_workspace_ptr, + max_workspace_size)); }; - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); - - VLOG(3) << "BwdDataAlgo Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = perf_stat[i]; - VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time - << " " << stat.memory; - } - - return 
perf_stat[0].algo; + workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size, + UseFixedWorkspace()); + + VLOG(3) << GetPerfResultString( + "[Exhaustive Search] BwdDataAlgo Perf result", perf_results, + returned_algo_count, workspace_size_limit); + result.time = perf_results[0].time; + return perf_results[0].algo; }); } - VLOG(3) << "choose algo " << algo; - return algo; + VLOG(3) << "[cuDNN Convolution] exhaustive_search=" << exhaustive_search + << ", deterministic=" << deterministic + << ", choose algo=" << result.algo << ", workspace=" + << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB"; + return result; } - static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + static size_t GetWorkspaceSize(const ConvArgs& args, + cudnnConvolutionBwdDataAlgo_t algo) { size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( @@ -464,57 +411,75 @@ struct SearchAlgorithm { args.cdesc.desc(), args.idesc.desc(), algo, &workspace_size)); return workspace_size; } + + private: + static size_t FindMaxWorkspaceSize(const ConvArgs& args, + size_t workspace_size_limit) { + if (!UseFixedWorkspace()) { + size_t max_workspace_size = 0; + for (size_t algo = 0; algo < kNUM_CUDNN_BWD_DATA_ALGS; ++algo) { + size_t workspace_size = 0; + auto status = + platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + args.handle, args.wdesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.idesc.desc(), + static_cast(algo), + &workspace_size); + if (status == CUDNN_STATUS_SUCCESS) { + max_workspace_size = std::max(workspace_size, max_workspace_size); + } + } + return std::min(max_workspace_size, workspace_size_limit); + } else { + return workspace_size_limit; + } + } }; template <> struct SearchAlgorithm { - using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t; - using algo_t = cudnnConvolutionBwdFilterAlgo_t; + using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t; + using AlgoT = cudnnConvolutionBwdFilterAlgo_t; template - static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, const phi::GPUContext& ctx) { + static SearchResult Find(const ConvArgs& args, bool exhaustive_search, + bool deterministic, + const phi::GPUContext& ctx) { platform::CUDAGraphCaptureModeGuard guard; + SearchResult result; auto dtype = platform::CudnnDataType::type; - size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; - size_t workspace_size = 0; - bool has_got_workspace_size = true; + size_t workspace_size_limit = CaclWorkspaceLimitInBytes(ctx); SetConvMathType(ctx, dtype, args.cdesc); - algo_t algo; if (!exhaustive_search && !deterministic) { #if CUDNN_VERSION >= 7001 - using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t; - int perf_count; + int actual_perf_count; int best_algo_idx = 0; - std::unique_ptr perf_results( - new perf_t[kNUM_CUDNN_BWD_FILTER_ALGS]); + std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7( args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS, - &perf_count, perf_results.get())); - algo = (perf_results.get())[best_algo_idx].algo; - workspace_size = (perf_results.get())[best_algo_idx].memory; + &actual_perf_count, perf_results.data())); + result.algo = perf_results[best_algo_idx].algo; + result.workspace_size = perf_results[best_algo_idx].memory; - if (workspace_size > workspace_size_limit) { - workspace_size = 
workspace_size_limit; + if (result.workspace_size > workspace_size_limit) { #if CUDNN_VERSION >= 8000 // cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8 - ChooseAlgoByWorkspace(perf_results.get(), - kNUM_CUDNN_BWD_FILTER_ALGS, - workspace_size_limit, &algo); + ChooseAlgoByWorkspace(perf_results, workspace_size_limit, + &result); #else VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " "the workspace size request(" - << workspace_size << ") exceeds the limit(" + << result.workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + workspace_size_limit, &(result.algo))); #endif } #else @@ -523,28 +488,32 @@ struct SearchAlgorithm { args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); + workspace_size_limit, &(result.algo))); #endif } else if (deterministic) { - return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + result.algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; } else { - auto& dev_ctx = ctx; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardFilter()); - + auto workspace_handle = ctx.cudnn_workspace_handle(); auto x_dims = phi::vectorize(args.x->dims()); auto w_dims = phi::vectorize(args.w->dims()); - VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:" << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p" << args.p << ", args.d" << args.d; + + AlgorithmsCache& algo_cache = + *(framework::ConvSearchCache::Instance().GetBackwardFilter()); + if (dtype != CUDNN_DATA_HALF) { - algo = algo_cache.GetAlgorithm( + result.algo = algo_cache.GetAlgorithm( x_dims, w_dims, args.s, args.p, args.d, 0, static_cast(args.cudnn_dtype), [&]() { int returned_algo_count; - std::array perf_stat; + std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); + size_t max_workspace_size = + FindMaxWorkspaceSize(args, workspace_size_limit); + VLOG(3) << "max_workspace_size=" + << ToMegaBytes(max_workspace_size) << " MB"; + auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: @@ -554,29 +523,26 @@ struct SearchAlgorithm { args.cdesc.desc(), args.wdesc.desc(), const_cast(args.w->data()), kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count, - perf_stat.data(), cudnn_workspace_ptr, - workspace_size_limit)); + perf_results.data(), cudnn_workspace_ptr, + max_workspace_size)); }; - workspace_handle.RunFuncSync(cudnn_find_func, - workspace_size_limit); - - VLOG(3) - << "BwdFilterAlgo Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = perf_stat[i]; - VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time - << " " << stat.memory; - } - return perf_stat[0].algo; + workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size, + UseFixedWorkspace()); + + VLOG(3) << GetPerfResultString( + "[Exhaustive Search] BwdFilterAlgo Perf result", perf_results, + returned_algo_count, workspace_size_limit); + result.time = perf_results[0].time; + return perf_results[0].algo; }); } else { - auto max_algos = MaxBwdFilterAlgos(args.handle); - algo = algo_cache.GetAlgorithm( + 
result.algo = algo_cache.GetAlgorithm( x_dims, w_dims, args.s, args.p, args.d, 0, static_cast(args.cudnn_dtype), [&]() { - algo_t chosen_algo; - std::vector perf_results(max_algos); + SearchResult algo_result; int actual_algos = 0; + std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnFindConvolutionBackwardFilterAlgorithm( @@ -585,17 +551,21 @@ struct SearchAlgorithm { perf_results.size(), &actual_algos, perf_results.data())); perf_results.resize(actual_algos); - ChooseAlgo(perf_results, workspace_size_limit, - &chosen_algo); - return chosen_algo; + ChooseAlgo(perf_results, workspace_size_limit, &algo_result); + result.time = algo_result.time; + return algo_result.algo; }); } } - VLOG(3) << "choose algo " << algo; - return algo; + VLOG(3) << "[cuDNN Convolution] exhaustive_search=" << exhaustive_search + << ", deterministic=" << deterministic + << ", choose algo=" << result.algo << ", workspace=" + << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB"; + return result; } - static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + static size_t GetWorkspaceSize(const ConvArgs& args, + cudnnConvolutionBwdFilterAlgo_t algo) { platform::CUDAGraphCaptureModeGuard guard; size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( @@ -604,6 +574,69 @@ struct SearchAlgorithm { args.cdesc.desc(), args.wdesc.desc(), algo, &workspace_size)); return workspace_size; } + + private: + static size_t FindMaxWorkspaceSize(const ConvArgs& args, + size_t workspace_size_limit) { + if (!UseFixedWorkspace()) { + size_t max_workspace_size = 0; + for (size_t algo = 0; algo < kNUM_CUDNN_BWD_FILTER_ALGS; ++algo) { + size_t workspace_size = 0; + auto status = + platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( + args.handle, args.idesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.wdesc.desc(), + static_cast(algo), + &workspace_size); + if (status == CUDNN_STATUS_SUCCESS) { + max_workspace_size = std::max(workspace_size, max_workspace_size); + } + } + return std::min(max_workspace_size, workspace_size_limit); + } else { + return workspace_size_limit; + } + } + + static void ChooseAlgo(const std::vector& perf_results, + size_t workspace_limit, + SearchResult* algo_result) { + VLOG(3) << GetPerfResultString( + "[Exhaustive Search] BwdFilterAlgo Perf result", perf_results, + perf_results.size(), workspace_limit); + + for (size_t i = 0; i != perf_results.size(); ++i) { + const auto& result = perf_results[i]; + if (result.status == CUDNN_STATUS_SUCCESS && + (result.memory <= workspace_limit)) { + if ((result.mathType == CUDNN_TENSOR_OP_MATH) && + (i != perf_results.size() - 1)) { + const auto& next_result = perf_results[i + 1]; + if (next_result.status == CUDNN_STATUS_SUCCESS && + next_result.algo == result.algo && + next_result.memory == result.memory && + next_result.mathType != CUDNN_TENSOR_OP_MATH && + next_result.time < 1.01 * result.time) { + // Skip this result: it is not really a Tensor Core algo, because + // the next result is the same algo without Tensor Core math and + // runs within 1% of its time. Prefer that equivalent algo instead. 
+ continue; + } + } + algo_result->algo = result.algo; + algo_result->time = result.time; + auto math_type_str = "0"; + if (result.mathType == CUDNN_TENSOR_OP_MATH) { + math_type_str = "1"; + } + VLOG(3) << " choose algo: " << result.algo + << ", TC: " << math_type_str << ", time: " << result.time + << " ms, wksp = " << result.memory + << ", status = " << result.status; + break; + } + } + } }; } // namespace operators diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h index 291e5f92f322cb..af67d857e0eb7c 100644 --- a/paddle/fluid/operators/conv_cudnn_op_cache.h +++ b/paddle/fluid/operators/conv_cudnn_op_cache.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -DECLARE_uint64(conv_workspace_size_limit); +DECLARE_int64(conv_workspace_size_limit); DECLARE_bool(cudnn_exhaustive_search); DECLARE_int64(cudnn_exhaustive_search_times); diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h index 66f71869384783..abc7be7fb8b8ae 100644 --- a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/fluid/operators/conv_miopen_helper.h @@ -14,42 +14,12 @@ limitations under the License. */ #pragma once -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/conv_search_cache.h" -#include "paddle/fluid/framework/operator_kernel_configs.h" -#include "paddle/fluid/operators/conv_cudnn_op_cache.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/fluid/operators/conv_base_helper.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using DataLayout = platform::DataLayout; -template -using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; -using framework::AlgorithmsCache; -static inline void GetNCDHW(const framework::DDim& dims, - const DataLayout& layout, int* N, int* C, int* D, - int* H, int* W) { - *N = dims[0]; - *C = layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; - int i = layout == DataLayout::kNCHW ? 
0 : 1; - if (dims.size() == 5) { - *D = dims[2 - i]; - *H = dims[3 - i]; - *W = dims[4 - i]; - } else { - *D = 1; - *H = dims[2 - i]; - *W = dims[3 - i]; - } -} +using ConvArgs = ConvArgsBase; template static void RemovePaddingSlice(const phi::GPUContext& context, @@ -66,9 +36,8 @@ static void RemovePaddingSlice(const phi::GPUContext& context, extents[i] = new_out_dims[i]; } - int start; for (size_t i = 0; i < axes.size(); ++i) { - start = starts[i]; + int start = starts[i]; if (start < 0) { start = (start + in_dims[axes[i]]); } @@ -85,41 +54,6 @@ static void RemovePaddingSlice(const phi::GPUContext& context, out_t.device(place) = in_t.slice(offsets, extents); } -template -std::ostream& operator<<(std::ostream& out, const std::vector& v) { - out << "["; - for (auto const& tmp : v) out << tmp << ","; - out << "]"; - return out; -} - -using framework::ConvSearchCache; - -struct ConvArgs { - miopenHandle_t handle; - platform::TensorDescriptor idesc, odesc; - platform::FilterDescriptor wdesc; - platform::ConvolutionDescriptor cdesc; - const framework::Tensor *x, *w, *o; - miopenDataType_t cudnn_dtype; - - // strides - std::vector s; - // paddings - std::vector p; - // dilations - std::vector d; - - ConvArgs(const framework::Tensor* x, const framework::Tensor* w, - const framework::Tensor* o, const std::vector s, - const std::vector p, const std::vector d, - miopenDataType_t dtype) - : x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {} -}; - -template -struct SearchAlgorithm {}; - template <> struct SearchAlgorithm { using perf_t = miopenConvAlgoPerf_t; diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 83ca9ace20d054..6af8c925ff5802 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -38,43 +38,9 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/functors.h" -DECLARE_bool(use_curand); - namespace paddle { namespace operators { -template -struct DstMaskGenerator { - const float dropout_prob_; - const bool is_upscale_in_train_; - using MT = typename details::MPTypeTrait::Type; - MT factor; - HOSTDEVICE inline DstMaskGenerator(const float dropout_prob, - const bool is_upscale_in_train) - : dropout_prob_(dropout_prob), is_upscale_in_train_(is_upscale_in_train) { - factor = static_cast(1.0f / (1.0f - dropout_prob_)); - } - - HOSTDEVICE inline void operator()(OutT* dst, const T1* src_val, - const T2* rand, int num) const { - static constexpr int kCount = - phi::funcs::uniform_distribution::kReturnsCount; -// 0 ~ kCount -1 is dist , kCount ~ 2 * kCount - 1 is mask -#pragma unroll - for (int i = 0; i < kCount; i++) { - if (rand[i] < dropout_prob_) { - dst[i] = static_cast(0); - dst[i + kCount] = dst[i]; - } else { - dst[i] = is_upscale_in_train_ - ? 
static_cast(static_cast(src_val[i]) * factor) - : static_cast(src_val[i]); - dst[i + kCount] = static_cast(1); - } - } - } -}; - template struct DstMaskFunctor { const float retain_prob_; @@ -113,7 +79,7 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, const T* src, MaskType* mask, T* dst, bool is_upscale_in_train, uint64_t increment, - size_t main_offset, bool use_curand) { + size_t main_offset) { size_t idx = static_cast(BLOCK_ID_X * BLOCK_NUM_X); static constexpr int kCount = phi::funcs::uniform_distribution::kReturnsCount; @@ -135,76 +101,41 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, int deal_size = BLOCK_NUM_X * kCount; size_t fix = idx * kCount; - if (use_curand) { - auto dst_functor = - DstMaskFunctor(1.0f - dropout_prob, is_upscale_in_train); - for (; fix < main_offset; fix += stride) { - kps::ReadData(&dst_mask[0], src + fix, deal_size); - kps::ElementwiseRandom(&rands[0], Rand(), - &state); - // dst - kps::OperatorTernary>( - &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount); - kps::WriteData(dst + fix, &dst_mask[0], - deal_size); - // mask - kps::ElementwiseUnary( - &mask_result[0], &dst_mask[kCount], Cast()); - kps::WriteData(mask + fix, &mask_result[0], - deal_size); - if (fix > idx * kCount + 1) { - __syncthreads(); - } - } - int remainder = n - fix; - if (remainder > 0) { - kps::ReadData(&dst_mask[0], src + fix, remainder); - kps::ElementwiseRandom(&rands[0], Rand(), - &state); - // dst - kps::OperatorTernary>( - &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount); - kps::WriteData(dst + fix, &dst_mask[0], remainder); - // mask - kps::ElementwiseUnary( - &mask_result[0], &dst_mask[kCount], Cast()); - kps::WriteData(mask + fix, &mask_result[0], - remainder); + + auto dst_functor = + DstMaskFunctor(1.0f - dropout_prob, is_upscale_in_train); + for (; fix < main_offset; fix += stride) { + kps::ReadData(&dst_mask[0], src + fix, deal_size); + kps::ElementwiseRandom(&rands[0], Rand(), + &state); + // dst + kps::OperatorTernary>( + &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount); + kps::WriteData(dst + fix, &dst_mask[0], deal_size); + // mask + kps::ElementwiseUnary( + &mask_result[0], &dst_mask[kCount], Cast()); + kps::WriteData(mask + fix, &mask_result[0], + deal_size); + if (fix > idx * kCount + 1) { __syncthreads(); } - } else { - auto dst_functor = - DstMaskGenerator(dropout_prob, is_upscale_in_train); - for (; fix < main_offset; fix += stride) { - kps::ReadData(&dst_mask[0], src + fix, deal_size); - kps::ElementwiseRandom(&rands[0], Rand(), - &state); - // dst - kps::OperatorTernary>( - &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount); - kps::WriteData(dst + fix, &dst_mask[0], - deal_size); - // mask - kps::ElementwiseUnary( - &mask_result[0], &dst_mask[kCount], Cast()); - kps::WriteData(mask + fix, &mask_result[0], - deal_size); - } - int remainder = n - fix; - if (remainder > 0) { - kps::ReadData(&dst_mask[0], src + fix, remainder); - kps::ElementwiseRandom(&rands[0], Rand(), - &state); - // dst - kps::OperatorTernary>( - &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount); - kps::WriteData(dst + fix, &dst_mask[0], remainder); - // mask - kps::ElementwiseUnary( - &mask_result[0], &dst_mask[kCount], Cast()); - kps::WriteData(mask + fix, &mask_result[0], - remainder); - } + } + int remainder = n - fix; + if (remainder > 0) { + kps::ReadData(&dst_mask[0], src + fix, remainder); + kps::ElementwiseRandom(&rands[0], Rand(), + &state); + // dst + kps::OperatorTernary>( + 
&dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount); + kps::WriteData(dst + fix, &dst_mask[0], remainder); + // mask + kps::ElementwiseUnary( + &mask_result[0], &dst_mask[kCount], Cast()); + kps::WriteData(mask + fix, &mask_result[0], + remainder); + __syncthreads(); } } @@ -251,13 +182,11 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, size_t grid_size = gpu_config.GetGridSize(); size_t block_size = gpu_config.GetBlockSize(); - if (FLAGS_use_curand) { - int64_t device_id = dev_ctx.GetPlace().GetDeviceId(); - const auto& prop = platform::GetDeviceProperties(device_id); - size_t max_grid_size = prop.maxThreadsPerMultiProcessor * - prop.multiProcessorCount / block_size; - grid_size = std::min(grid_size, max_grid_size); - } + int64_t device_id = dev_ctx.GetPlace().GetDeviceId(); + const auto& prop = platform::GetDeviceProperties(device_id); + size_t max_grid_size = prop.maxThreadsPerMultiProcessor * + prop.multiProcessorCount / block_size; + grid_size = std::min(grid_size, max_grid_size); auto offset = ((x_numel - 1) / (grid_size * block_size * kVecSize) + 1) * kVecSize; @@ -268,7 +197,7 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, VectorizedRandomGenerator<<>>( size, seed_data, dropout_prob, x_data, mask_data, y_data, - upscale_in_train, increment, main_offset, FLAGS_use_curand); + upscale_in_train, increment, main_offset); } else { if (upscale_in_train) { // todo: can y share with data with x directly? diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index d85d47f5461313..ae448b7ff2c8b9 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -305,7 +305,7 @@ __global__ void ChannelClipAndQuantKernelQuantAxisN( int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; for (int64_t i = idx; i < n; i += blockDim.x * gridDim.x) { T s = scale[(i / quant_stride) % nScale]; - T inv_s = 1.0 / s; + T inv_s = inverse(s); T x = in[i]; T v = x > s ? s : x; v = v < -s ? -s : v; diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index 39b42ec194c3ba..bd7134f2f33542 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -16,8 +16,6 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -DECLARE_uint64(conv_workspace_size_limit); - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 00ce10bfe3bccb..552649279e9118 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -11,21 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include #include -#include #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/fill_constant_op.h" - -#include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" -DECLARE_bool(use_curand); - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index c08f1920205daa..bb8031b0cc4e6c 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -419,23 +419,6 @@ __global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale, } } -template -__global__ void VectorizedGetDsDbCUDAKernel(int imsize, const T* x, const T* dy, - T* ds, T* db) { - int i = blockIdx.x; - AccT ds_sum = static_cast(0); - AccT db_sum = static_cast(0); - x += i * imsize; - const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); - - phi::Array ins; - ins[0] = x; - ins[1] = dy; - ThreadReduce(ins, imsize, input_offset, &db_sum, - &ds_sum); - ReduceMeanAndVar(db, ds, db_sum, ds_sum, 1); -} - template __global__ void ScalarGetDsDbCUDAKernel(int imsize, const T* x, const T* dy, T* ds, T* db) { @@ -622,25 +605,17 @@ class GroupNormGradKernel int flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; if (data_layout == DataLayout::kNCHW) { - using AccT = typename details::MPTypeTrait::Type; - constexpr int vec_size = sizeof(float4) / sizeof(T); const int max_num_threads = 1024; - int max_block_size = std::min(imsize / vec_size, max_num_threads); + int max_block_size = std::min(imsize, max_num_threads); int block_size_nchw = 1; while (block_size_nchw < max_block_size) { block_size_nchw *= 2; } block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); dim3 blocks(block_size_nchw); - if (imsize < vec_size * block_size_nchw) { - ScalarGetDsDbCUDAKernel< - T><<>>( - imsize, x_data, dy_data, ds_data, db_data); - } else { - VectorizedGetDsDbCUDAKernel< - T, AccT, vec_size><<>>( - imsize, x_data, dy_data, ds_data, db_data); - } + ScalarGetDsDbCUDAKernel< + T><<>>( + imsize, x_data, dy_data, ds_data, db_data); if (d_scale || d_bias) { const int block = 256; diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc index 5599debbf38714..1cd59672f97fc3 100644 --- a/paddle/fluid/operators/linspace_op.cc +++ b/paddle/fluid/operators/linspace_op.cc @@ -67,7 +67,7 @@ class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; DECLARE_INFER_SHAPE_FUNCTOR(linspace, LinspaceInferShapeFunctor, - PD_INFER_META(phi::LinspaceInferMeta)); + PD_INFER_META(phi::LinspaceRawInferMeta)); REGISTER_OPERATOR( linspace, ops::LinspaceOp, ops::LinspaceOpMaker, paddle::framework::EmptyGradOpMaker, diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc index 503d3ec33762fb..de999035fa5d87 100644 --- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/operators/fill_constant_op.h" +#include "paddle/fluid/platform/mkldnn_helper.h" namespace paddle { namespace operators { @@ -42,7 +43,7 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel { } tensor->set_layout(DataLayout::kMKLDNN); - tensor->set_format(dnnl::memory::format_tag::oihw); + tensor->set_format(platform::GetPlainMKLDNNFormat(tensor->dims().size())); } }; } // namespace operators diff --git a/paddle/fluid/operators/mul_op_xpu.cc b/paddle/fluid/operators/mul_op_xpu.cc index 6ef41e059c7d99..7410b3b607c82e 100644 --- a/paddle/fluid/operators/mul_op_xpu.cc +++ b/paddle/fluid/operators/mul_op_xpu.cc @@ -19,6 +19,8 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/xpu_api_wrapper.h" +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { @@ -28,6 +30,8 @@ using framework::Tensor; template class MulXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& context) const override { const Tensor* x = context.Input("X"); @@ -62,14 +66,15 @@ class MulXPUKernel : public framework::OpKernel { const T* data_b = y_matrix.data(); T* data_c = z->data(); auto& dev_ctx = context.template device_context(); - int ret = xpu::fc_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, - alpha, data_a, data_b, beta, data_c); - PADDLE_ENFORCE_EQ( - ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); + + int ret = xpu_fc_wrapper( + dev_ctx.x_context(), reinterpret_cast(data_a), + reinterpret_cast(data_b), + reinterpret_cast(data_c), m, n, k, trans_a, trans_b, nullptr, + nullptr, nullptr, k, n, n, alpha, beta, nullptr, + xpu::Activation_t::LINEAR); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "xpu_fc_wrapper"); + if (z_dim.size() != 2) { z->Resize(z_dim); } @@ -78,6 +83,8 @@ class MulXPUKernel : public framework::OpKernel { template class MulGradXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& ctx) const override { int x_num_col_dims = ctx.template Attr("x_num_col_dims"); @@ -126,14 +133,14 @@ class MulGradXPUKernel : public framework::OpKernel { const T* data_a = dout->data(); const T* data_b = y_matrix.data(); T* data_c = dx_matrix.data(); - int ret = - xpu::gemm_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, alpha, - data_a, lda, data_b, ldb, beta, data_c, ldc); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check " - "where Baidu Kunlun Card is properly installed.", - ret)); + + int ret = xpu_fc_wrapper( + dev_ctx.x_context(), reinterpret_cast(data_a), + reinterpret_cast(data_b), + reinterpret_cast(data_c), m, n, k, trans_a, trans_b, + nullptr, nullptr, nullptr, lda, ldb, ldc, alpha, beta, nullptr, + xpu::Activation_t::LINEAR); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "xpu_fc_wrapper"); } if (dy) { @@ -159,14 +166,14 @@ class MulGradXPUKernel : public framework::OpKernel { const T* data_a = x_matrix.data(); const T* data_b = dout->data(); T* data_c = dy_matrix.data(); - int ret = - xpu::gemm_int16(dev_ctx.x_context(), trans_a, trans_b, m, n, k, alpha, - data_a, lda, data_b, ldb, beta, data_c, ldc); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - 
platform::errors::External( - "XPU API return wrong value[%d], please check " - "where Baidu Kunlun Card is properly installed.", - ret)); + + int ret = xpu_fc_wrapper( + dev_ctx.x_context(), reinterpret_cast(data_a), + reinterpret_cast(data_b), + reinterpret_cast(data_c), m, n, k, trans_a, trans_b, + nullptr, nullptr, nullptr, lda, ldb, ldc, alpha, beta, nullptr, + xpu::Activation_t::LINEAR); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "xpu_fc_wrapper"); } } }; @@ -175,9 +182,12 @@ class MulGradXPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OP_XPU_KERNEL( - mul, ops::MulXPUKernel); + mul, ops::MulXPUKernel, + ops::MulXPUKernel); REGISTER_OP_XPU_KERNEL( - mul_grad, ops::MulGradXPUKernel) + mul_grad, ops::MulGradXPUKernel, + ops::MulGradXPUKernel) #endif diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc index 21ca26f49f653d..1724aedbe9b249 100644 --- a/paddle/fluid/operators/pixel_shuffle_op.cc +++ b/paddle/fluid/operators/pixel_shuffle_op.cc @@ -82,42 +82,6 @@ class PixelShuffleGradMaker : public framework::SingleGradOpMaker { class PixelShuffleGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::NotFound("Input(Out@Grad) should not be null")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::NotFound("Output(X@Grad) should not be null")); - - auto do_dims = ctx->GetInputDim(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ(do_dims.size(), 4, - platform::errors::InvalidArgument( - "Input should be a 4-D tensor of format [N, C, H, W] " - "or [N, H, W, C], but got %u.", - do_dims.size())); - - auto upscale_factor = ctx->Attrs().Get("upscale_factor"); - - const std::string data_format = - ctx->Attrs().Get("data_format"); - const bool channel_last = (data_format == "NHWC"); - - auto dx_dims = do_dims; - dx_dims[0] = do_dims[0]; - - if (!channel_last) { - dx_dims[1] = do_dims[1] * (upscale_factor * upscale_factor); - dx_dims[2] = do_dims[2] / upscale_factor; - dx_dims[3] = do_dims[3] / upscale_factor; - } else { - dx_dims[1] = do_dims[1] / upscale_factor; - dx_dims[2] = do_dims[2] / upscale_factor; - dx_dims[3] = do_dims[3] * (upscale_factor * upscale_factor); - } - ctx->SetOutputDim(framework::GradVarName("X"), dx_dims); - } }; } // namespace operators @@ -132,7 +96,11 @@ REGISTER_OPERATOR(pixel_shuffle, ops::PixelShuffleOp, ops::PixelShuffleOpMaker, ops::PixelShuffleGradMaker, PixelShuffleInferShapeFunctor); -REGISTER_OPERATOR(pixel_shuffle_grad, ops::PixelShuffleGradOp); +DECLARE_INFER_SHAPE_FUNCTOR(pixel_shuffle_grad, + PixelShuffleGradInferShapeFunctor, + PD_INFER_META(phi::PixelShuffleGradInferMeta)); +REGISTER_OPERATOR(pixel_shuffle_grad, ops::PixelShuffleGradOp, + PixelShuffleGradInferShapeFunctor); REGISTER_OP_VERSION(pixel_shuffle) .AddCheckpoint( diff --git a/paddle/fluid/operators/rnn_op_xpu.cc b/paddle/fluid/operators/rnn_op_xpu.cc index 2dee4e889f739e..c75c24ab0abc2c 100644 --- a/paddle/fluid/operators/rnn_op_xpu.cc +++ b/paddle/fluid/operators/rnn_op_xpu.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -114,6 +115,9 @@ class RnnXPUKernel : public framework::OpKernel { if (dropout_mask->numel() != output->numel()) dropout_mask->clear(); } dropout_mask->mutable_data(output->dims(), ctx.GetPlace()); + auto& dev_ctx = ctx.template device_context(); + phi::funcs::SetConstant ones; + ones(dev_ctx, dropout_mask, static_cast(1)); PADDLE_ENFORCE_EQ( mode, "LSTM", @@ -190,7 +194,6 @@ class RnnXPUKernel : public framework::OpKernel { seq_len_tensor = operators::GetDataFromTensor(sequence_length); } - auto& dev_ctx = ctx.template device_context(); int state_offset = pre_state[0]->dims()[1] * pre_state[0]->dims()[2]; for (int i = 0; i < num_layers; i++) { diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 1c22e60fa87aa7..55c24e213d58b0 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -16,9 +16,11 @@ limitations under the License. */ #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/bfloat16.h" +#include "paddle/phi/infermeta/nullary.h" namespace paddle { namespace operators { @@ -122,74 +124,6 @@ class UniformRandomOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "UniformRandomOp"); - - PADDLE_ENFORCE_LT( - ctx->Attrs().Get("min"), ctx->Attrs().Get("max"), - platform::errors::InvalidArgument( - "The uniform_random's min must less then max. But received min = " - "%f great than or equal max = %f.", - ctx->Attrs().Get("min"), ctx->Attrs().Get("max"))); - PADDLE_ENFORCE_GE(ctx->Attrs().Get("diag_num"), 0, - platform::errors::InvalidArgument( - "The uniform_random's diag_num must greater than or " - "equal 0. But recevied diag_num (%d) < 0.", - ctx->Attrs().Get("diag_num"))); - PADDLE_ENFORCE_GE(ctx->Attrs().Get("diag_step"), 0, - platform::errors::InvalidArgument( - "The uniform_random's diag_step must greater than or " - "equal 0. But recevied diag_step (%d) < 0.", - ctx->Attrs().Get("diag_step"))); - - if (ctx->HasInputs("ShapeTensorList")) { - // top prority shape - auto inputs_name = ctx->Inputs("ShapeTensorList"); - PADDLE_ENFORCE_GT(inputs_name.size(), 0, - platform::errors::InvalidArgument( - "Input(ShapeTensorList)'size of " - "Op(uniform_random) can't be zero." - "Please check the Attr(shape)'s size of" - "Op(fluid.layers.uniform_random).)")); - auto out_dims = std::vector(inputs_name.size(), -1); - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); - - return; - } - auto &shape = ctx->Attrs().Get>("shape"); - if (ctx->HasInput("ShapeTensor") && shape.empty()) { - auto shape_dims = ctx->GetInputDim("ShapeTensor"); - PADDLE_ENFORCE_EQ( - shape_dims.size(), 1, - platform::errors::InvalidArgument( - "ShapeError: Input(ShapeTensor)' dimension size of " - "Op(uniform_random) must be 1." 
- "But received ShapeTensor's dimensions = %d, shape = [%s]", - shape_dims.size(), shape_dims)); - int num_ele = 1; - for (int i = 0; i < shape_dims.size(); ++i) { - num_ele *= shape_dims[i]; - } - auto vec_dims = std::vector(num_ele, -1); - auto out_dims = phi::make_ddim(vec_dims); - ctx->SetOutputDim("Out", out_dims); - return; - } - - PADDLE_ENFORCE_EQ(shape.empty(), false, - platform::errors::InvalidArgument( - "if there is no Input(ShapeTensorList) and no " - "Input(ShapeTensor),the " - "attr(shape) information must " - "be set by Attr(shape).")); - std::vector tensor_shape; - tensor_shape.reserve(shape.size()); - for (auto dim : shape) { - tensor_shape.push_back(static_cast(dim)); - } - ctx->SetOutputDim("Out", phi::make_ddim(tensor_shape)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -274,12 +208,16 @@ class UniformRandomOpVarTypeInference : public framework::VarTypeInference { } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(uniform_random, UniformRandomInferShapeFunctor, + PD_INFER_META(phi::UniformRandomInferMeta)); + REGISTER_OPERATOR( uniform_random, paddle::operators::UniformRandomOp, paddle::operators::UniformRandomOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::operators::UniformRandomOpVarTypeInference); + paddle::operators::UniformRandomOpVarTypeInference, + UniformRandomInferShapeFunctor); REGISTER_OP_CPU_KERNEL( uniform_random_batch_size_like, diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index b941dc21c3ab21..ae846f4cae6fba 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -19,11 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #if defined(__NVCC__) || defined(__HIPCC__) -DECLARE_bool(use_curand); -#include -#include #include -#include #include "paddle/fluid/framework/generator.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" @@ -146,39 +142,6 @@ struct UniformGenerator { } }; -template -struct UniformGeneratorOffset { - T min_, max_; - unsigned int seed_; - T diag_val_; - unsigned int diag_num_; - unsigned int diag_step_; - int offset_; - __host__ __device__ UniformGeneratorOffset(T min, T max, int seed, - int diag_num, int diag_step, - T diag_val, int offset) - : min_(min), - max_(max), - seed_(seed), - diag_num_(diag_num), - diag_step_(diag_step), - diag_val_(diag_val), - offset_(offset) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n + offset_); - T out = dist(rng); - unsigned int remainder = n % (diag_step_ + 1); - if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { - out = diag_val_; - } - return out; - } -}; - template void UniformRandom(const framework::ExecutionContext& context, framework::Tensor* tensor) { @@ -205,19 +168,10 @@ void UniformRandom(const framework::ExecutionContext& context, int device_id = context.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && seed_flag) { - if (FLAGS_use_curand) { - using MT = typename details::MPTypeTrait::Type; - phi::funcs::uniform_distribution dist; - phi::funcs::uniform_real_transform trans(min, max); - 
phi::funcs::distribution_and_transform(dev_cxt, tensor, dist, trans); - } else { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - auto func = - UniformGeneratorOffset(min, max, seed_offset.first, diag_num, - diag_step, diag_val, gen_offset); - phi::IndexKernel>(dev_cxt, tensor, func); - } + using MT = typename details::MPTypeTrait::Type; + phi::funcs::uniform_distribution dist; + phi::funcs::uniform_real_transform trans(min, max); + phi::funcs::distribution_and_transform(dev_cxt, tensor, dist, trans); } else { auto func = UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index a671381d07ff3d..89e3b74bb3acae 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -188,6 +188,8 @@ class RecordedGpuMallocHelper { if (UNLIKELY(malloc_managed_memory)) { result = cudaMallocManaged(ptr, size); } else { + VLOG(10) << "[cudaMalloc] size=" << static_cast(size) / (1 << 20) + << " MB"; result = cudaMalloc(ptr, size); } #endif @@ -226,6 +228,8 @@ class RecordedGpuMallocHelper { if (err != hipErrorDeinitialized) { #else auto err = cudaFree(ptr); + VLOG(10) << "[cudaFree] size=" << static_cast(size) / (1 << 20) + << " MB"; if (err != cudaErrorCudartUnloading) { #endif PADDLE_ENFORCE_GPU_SUCCESS(err); diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index 1a3e600058b3b1..7ae3b2303decd7 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -474,6 +474,7 @@ void Compiler::LowerOptimizer(const Scope* scope) { auto adam_mode = AdamModeFromStr(adam_mode_, ipu_strategy_->use_no_bias_optimizer); auto weight_decay_mode_ = ipu_strategy_->weight_decay_mode; + auto scaled_optimizer_state_ = ipu_strategy_->scaled_optimizer_state; if (weight_decay_mode_.empty()) { weight_decay_mode_ = BOOST_GET_CONST( std::string, op_desc->GetAttr("weight_decay_mode")); @@ -492,7 +493,7 @@ void Compiler::LowerOptimizer(const Scope* scope) { auto optimizer_instance = std::make_unique( optimizer_value, adam_mode, weight_decay_mode, popart::DataType::UNDEFINED, accl1_type, accl2_type, - clip_norm_settings); + clip_norm_settings, scaled_optimizer_state_); for (int i = 0; i < weight_decay_vars.size(); i++) { optimizer_instance->insertSpecific( weight_decay_vars[i], @@ -511,11 +512,10 @@ void Compiler::LowerOptimizer(const Scope* scope) { popart::OptimizerValue(loss_scaling, true), popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode, popart::DataType::UNDEFINED, accl1_type, accl2_type, - clip_norm_settings); + clip_norm_settings, scaled_optimizer_state_); } }; - if (adam_mode == popart::AdamMode::Lamb || - adam_mode == popart::AdamMode::LambNoBias) { + if (adam_mode == popart::AdamMode::Lamb) { const std::map> optimizer_value = {{"defaultLearningRate", {0.0, false}}, {"defaultBeta1", {beta1, false}}, @@ -526,7 +526,26 @@ void Compiler::LowerOptimizer(const Scope* scope) { auto eval_optimizer = std::make_unique( optimizer_value, adam_mode, weight_decay_mode, popart::DataType::UNDEFINED, popart::DataType::FLOAT, - popart::DataType::FLOAT, clip_norm_settings); + popart::DataType::FLOAT, clip_norm_settings, + scaled_optimizer_state_); + for (int i = 0; i < weight_decay_vars.size(); i++) { + eval_optimizer->insertSpecific(weight_decay_vars[i], + {{"weightDecay", {0.0, false}}}); + 
} + resources_->eval_optimizer = std::move(eval_optimizer); + } else if (adam_mode == popart::AdamMode::LambNoBias) { + const std::map> optimizer_value = + {{"defaultLearningRate", {0.0, false}}, + {"defaultBeta1", {1.0, false}}, + {"defaultBeta2", {1.0, false}}, + {"defaultEps", {eps, true}}, + {"lossScaling", {loss_scaling, true}}, + {"defaultMaxWeightNorm", {mwn, true}}}; + auto eval_optimizer = std::make_unique( + optimizer_value, adam_mode, weight_decay_mode, + popart::DataType::UNDEFINED, popart::DataType::FLOAT, + popart::DataType::FLOAT, clip_norm_settings, + scaled_optimizer_state_); for (int i = 0; i < weight_decay_vars.size(); i++) { eval_optimizer->insertSpecific(weight_decay_vars[i], {{"weightDecay", {0.0, false}}}); @@ -542,7 +561,8 @@ void Compiler::LowerOptimizer(const Scope* scope) { popart::OptimizerValue(loss_scaling, true), popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode, popart::DataType::UNDEFINED, popart::DataType::FLOAT, - popart::DataType::FLOAT, clip_norm_settings); + popart::DataType::FLOAT, clip_norm_settings, + scaled_optimizer_state_); } } else if (type == "adaptive") { auto alpha = BOOST_GET_CONST(float, op_desc->GetAttr("alpha")); diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index 6172d4d7dc6800..f52499a8d8fda4 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -67,6 +67,7 @@ IpuStrategy::IpuStrategy() { ADD_BOOL_OPTION(transfer_cast_op); ADD_BOOL_OPTION(use_no_bias_optimizer); ADD_BOOL_OPTION(enable_distribution); + ADD_BOOL_OPTION(scaled_optimizer_state); ADD_UINT64_OPTION(num_ipus); ADD_UINT64_OPTION(batches_per_step); ADD_UINT64_OPTION(micro_batch_size); diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index 786e2419cc0be9..1802eb16e58955 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -37,13 +37,13 @@ class IpuStrategy { // training flag, true for training bool is_training = true; - // average sharding, debugging used + // Average sharding, debugging used bool need_avg_shard = false; - // flag for fp16, true for pure fp16 + // Flag for fp16, true for pure fp16 bool enable_fp16 = false; - // enable transfer cast Op target from fp32 to fp16 in fp16 mode + // Enable transfer cast Op target from fp32 to fp16 in fp16 mode bool transfer_cast_op = true; // The mode of Adam/Lamb optimizer @@ -51,33 +51,35 @@ class IpuStrategy { // true: The Adam_No_Bias/Lamb_No_Bias optimizer from PopART bool use_no_bias_optimizer = false; - // enable distributed computing for POD128 or POD256 + // Enable distributed computing for POD128 or POD256 bool enable_distribution = false; + // Enable Scaled optimizer state only for Adam and Lamb + bool scaled_optimizer_state = false; + // Number ipus total needed, local_replica * ipu_per_replica int num_ipus = 1; - // batches per step + // Batches per step int batches_per_step = 1; - // micro batch-size + // Micro batch-size int micro_batch_size = 1; - // random seed + // Random seed std::uint64_t random_seed = std::numeric_limits::max(); - // TODO(alleng) remove this param - // available memory proportion, 0.0f for disable + // Available memory proportion, 0.0f for disable float available_memory_proportion = 0.0f; - // loss scaling, currently we can't get loss scaling from + // Loss scaling, currently we can't get loss scaling from // 
optimizer_extract_pass, so we have to set it here float loss_scaling = 1.0f; - // defaultMaxWeightNorm for adam optimizer + // DefaultMaxWeightNorm for adam optimizer float max_weight_norm = 65504.0f; - // file path for dumping compiled model in onnx format + // File path for dumping compiled model in onnx format std::string onnx_dump_path; // Data type to use for tensor that stores first-order momentum optimizer @@ -106,7 +108,7 @@ class IpuStrategy { // popart pattern manager popart::Patterns popart_patterns; - // custom ops + // Custom ops std::vector custom_ops; public: diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc index 9a907cf5e880ff..444b55959cf221 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc @@ -157,7 +157,6 @@ Node *softmax_handler(Graph *graph, Node *node) { Node *scale_handler(Graph *graph, Node *node) { auto *op = node->Op(); - auto scale_ = BOOST_GET_CONST(float, op->GetAttr("scale")); auto bias_ = BOOST_GET_CONST(float, op->GetAttr("bias")); auto bias_after_scale_ = BOOST_GET_CONST(bool, op->GetAttr("bias_after_scale")); @@ -191,6 +190,7 @@ Node *scale_handler(Graph *graph, Node *node) { } } } else { + auto scale_ = BOOST_GET_CONST(float, op->GetAttr("scale")); if (is_float_equal(bias_, 0.0) && is_float_equal(scale_, 1.0)) { return CreateBaseOp(graph, node, "popart_identity", {GetInputVarNode("X", node)}, node->outputs, {}); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc index a529a34e6d71ac..a08fbaa26d9eda 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc @@ -95,6 +95,21 @@ Node *pool2d_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto pooling_type = BOOST_GET_CONST(std::string, op->GetAttr("pooling_type")); auto global_pooling = BOOST_GET_CONST(bool, op->GetAttr("global_pooling")); + if (op->HasAttr("adaptive")) { + auto adaptive = BOOST_GET_CONST(bool, op->GetAttr("adaptive")); + if (adaptive) { + auto ksize = BOOST_GET_CONST(std::vector, op->GetAttr("ksize")); + if (ksize[0] != 1 || ksize[1] != 1) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support pool_size=1 with adaptive mode.")); + } + // adaptive maxpool op is max_pool2d_with_index. Only process avgpool + // here. 
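+        // Note: with adaptive=true, ksize is the output spatial size, so ksize
+        // of {1, 1} makes adaptive average pooling equivalent to a global
+        // average pool, which is why it is lowered to popart_globalaveragepool. 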
+ return CreateBaseOp(graph, node, "popart_globalaveragepool", node->inputs, + node->outputs); + } + } + if (global_pooling) { if (pooling_type == "max") { return CreateBaseOp(graph, node, "popart_globalmaxpool", node->inputs, @@ -159,6 +174,17 @@ Node *pool2d_handler(Graph *graph, Node *node) { } } +Node *max_pool2d_with_index_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto ksize = BOOST_GET_CONST(std::vector, op->GetAttr("ksize")); + if (ksize[0] != 1 || ksize[1] != 1) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support pool_size=1 with adaptive mode.")); + } + return CreateBaseOp(graph, node, "popart_globalmaxpool", node->inputs, + {GetOutputVarNode("Out", node)}); +} + Node *group_norm_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon")); @@ -304,6 +330,7 @@ Node *dropout_handler(Graph *graph, Node *node) { } // namespace paddle REGISTER_HANDLER(pool2d, pool2d_handler); +REGISTER_HANDLER(max_pool2d_with_index, max_pool2d_with_index_handler); REGISTER_HANDLER(batch_norm, batch_norm_handler); REGISTER_HANDLER(group_norm, group_norm_handler); REGISTER_HANDLER(instance_norm, instance_norm_handler); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc index 4c086bffb240ed..55c25bce159313 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc @@ -331,7 +331,7 @@ Node *shape_handler(Graph *graph, Node *node) { Node *slice_handler(Graph *graph, Node *node) { auto *op = node->Op(); Node *starts = nullptr; - if (!op->Input("StartsTensor").empty()) { + if (!op->HasAttr("starts")) { starts = GetInputVarNode("StartsTensor", node); } else { auto starts_ = BOOST_GET_CONST(std::vector, op->GetAttr("starts")); @@ -341,7 +341,7 @@ Node *slice_handler(Graph *graph, Node *node) { starts = starts->outputs[0]; } Node *ends = nullptr; - if (!op->Input("EndsTensor").empty()) { + if (!op->HasAttr("ends")) { ends = GetInputVarNode("EndsTensor", node); } else { auto ends_ = BOOST_GET_CONST(std::vector, op->GetAttr("ends")); diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 15db243f751a65..08a7f080069570 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -70,8 +70,10 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"dropout_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"elementwise_add_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, @@ -249,6 +251,8 @@ XPUOpMap& get_kl2_ops() { {"momentum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, + {"mul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"nearest_interp_v2", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"nearest_interp_v2_grad", diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index f3934c7d8713b2..904e4854ba6b45 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -522,8 +522,8 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : phi::GPUContext(place) { cuda_stream_.reset(new stream::CUDAStream(phi::GPUContext::stream(), place)); auto& instance = memory::allocation::AllocatorFacade::Instance(); instance.SetDefaultStream(place, phi::GPUContext::stream()); - workspace_.reset( - new phi::DnnWorkspaceHandle(instance.GetAllocator(place).get())); + workspace_.reset(new phi::DnnWorkspaceHandle( + instance.GetAllocator(place).get(), stream())); } CUDADeviceContext::~CUDADeviceContext() = default; @@ -623,7 +623,8 @@ phi::DnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const { return phi::DnnWorkspaceHandle( memory::allocation::AllocatorFacade::Instance() .GetAllocator(GetPlace()) - .get()); + .get(), + stream()); } return phi::GPUContext::cudnn_workspace_handle(); } diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h index 57f45a40165d7a..463329d32c9361 100644 --- a/paddle/fluid/platform/device_event.h +++ b/paddle/fluid/platform/device_event.h @@ -29,7 +29,7 @@ using ::paddle::platform::kCPU; USE_EVENT(kCPU) USE_EVENT_WAIT(kCPU, kCPU) -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) USE_EVENT(kCUDA); USE_EVENT_WAIT(kCUDA, kCUDA) USE_EVENT_WAIT(kCPU, kCUDA) diff --git a/paddle/fluid/platform/device_event_gpu.cc b/paddle/fluid/platform/device_event_gpu.cc index a811a5b9c130dc..f42ccc5a1db54e 100644 --- a/paddle/fluid/platform/device_event_gpu.cc +++ b/paddle/fluid/platform/device_event_gpu.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/platform/device_event_base.h" #include "paddle/fluid/platform/event.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) namespace paddle { namespace platform { struct CUDADeviceEventWrapper { diff --git a/paddle/fluid/platform/device_event_test.cc b/paddle/fluid/platform/device_event_test.cc index 96e89f9257dd23..d9f744b26256b1 100644 --- a/paddle/fluid/platform/device_event_test.cc +++ b/paddle/fluid/platform/device_event_test.cc @@ -75,6 +75,58 @@ TEST(DeviceEvent, CUDA) { } #endif +#ifdef PADDLE_WITH_HIP +#include + +TEST(DeviceEvent, CUDA) { + VLOG(1) << "In Test"; + using paddle::platform::CUDAPlace; + + auto& pool = DeviceContextPool::Instance(); + auto place = CUDAPlace(0); + auto* context = + static_cast(pool.Get(place)); + + ASSERT_NE(context, nullptr); + // case 1. test for event_creator + DeviceEvent event(place); + ASSERT_NE(event.GetEvent().get(), nullptr); + bool status = event.Query(); + ASSERT_EQ(status, true); + // case 2. test for event_recorder + event.Record(context); + status = event.Query(); + ASSERT_EQ(status, false); + // case 3. test for event_finisher + event.Finish(); + status = event.Query(); + ASSERT_EQ(status, true); + + // case 4. test for event_waiter + float *src_fp32, *dst_fp32; + int size = 1000000 * sizeof(float); + hipMallocHost(reinterpret_cast(&src_fp32), size); + hipMalloc(reinterpret_cast(&dst_fp32), size); + hipMemcpyAsync(dst_fp32, src_fp32, size, hipMemcpyHostToDevice, + context->stream()); + event.Record(context); // step 1. record it + status = event.Query(); + ASSERT_EQ(status, false); + + event.Wait(kCUDA, context); // step 2. 
add streamWaitEvent + status = event.Query(); + ASSERT_EQ(status, false); // async + + event.Wait(kCPU, context); // step 3. EventSynchronize + status = event.Query(); + ASSERT_EQ(status, true); // sync + + // release resource + hipFree(dst_fp32); + hipFreeHost(src_fp32); +} +#endif + TEST(DeviceEvent, CPU) { using paddle::platform::CPUPlace; auto place = CPUPlace(); diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 4e47c130c7252f..8209c0a5d6f8e9 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -161,10 +161,9 @@ PADDLE_DEFINE_EXPORTED_bool( * increased. * Users need to balance memory and speed. */ -PADDLE_DEFINE_EXPORTED_uint64( - conv_workspace_size_limit, - paddle::platform::kDefaultConvWorkspaceSizeLimitMB, - "cuDNN convolution workspace limit in MB unit."); +PADDLE_DEFINE_EXPORTED_int64(conv_workspace_size_limit, paddle::platform::kDefaultConvWorkspaceSizeLimitMB, "cuDNN convolution workspace limit in MB unit."); /** * CUDNN related FLAG @@ -545,8 +544,6 @@ PADDLE_DEFINE_EXPORTED_double( */ PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run"); -PADDLE_DEFINE_EXPORTED_bool(use_curand, false, "Random OP use CURAND"); - /** * Debug related FLAG * Name: FLAGS_call_stack_level diff --git a/paddle/fluid/pybind/eager_py_layer.cc b/paddle/fluid/pybind/eager_py_layer.cc index cade856b3607a5..605056e7af2b5e 100644 --- a/paddle/fluid/pybind/eager_py_layer.cc +++ b/paddle/fluid/pybind/eager_py_layer.cc @@ -231,6 +231,10 @@ PyObject* pylayer_method_apply(PyObject* cls, PyObject* args, auto outputs = PyObject_Call(forward_fn, forward_args, kwargs); egr::Controller::Instance().SetHasGrad(trace_backward); if (!outputs) { + Py_XDECREF(forward_args); + Py_XDECREF(kwargs_value_list); + Py_XDECREF(backward_function); + Py_XDECREF(forward_fn); return nullptr; } @@ -367,6 +371,14 @@ PyObject* pylayer_method_apply(PyObject* cls, PyObject* args, VLOG(6) << "PyLayer construct backward node finish..."; } + if (!PyTuple_Check(outputs)) { + Py_XDECREF(outputs_tuple); + } + Py_XDECREF(forward_args); + Py_XDECREF(kwargs_value_list); + Py_XDECREF(backward_function); + Py_XDECREF(forward_fn); + + return outputs; EAGER_CATCH_AND_THROW_RETURN_NULL } diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index 330719762ae087..8d8301689521b8 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -28,7 +28,6 @@ limitations under the License.
*/ #include #include -#include "paddle/fluid/distributed/common/sparse_sharding_merge.h" #include "paddle/fluid/distributed/index_dataset/index_sampler.h" #include "paddle/fluid/distributed/index_dataset/index_wrapper.h" #include "paddle/fluid/distributed/ps/service/communicator/communicator.h" @@ -49,7 +48,6 @@ using paddle::distributed::GraphNode; using paddle::distributed::GraphPyServer; using paddle::distributed::GraphPyClient; using paddle::distributed::FeatureNode; -using paddle::distributed::ShardingMerge; namespace paddle { namespace pybind { @@ -93,12 +91,6 @@ void BindPSHost(py::module* m) { .def("to_string", &distributed::PSHost::ToString); } -void BindSparseShardingTools(py::module* m) { - py::class_(*m, "ShardingMerge") - .def(py::init<>()) - .def("merge", &ShardingMerge::Merge); -} - void BindCommunicatorContext(py::module* m) { py::class_(*m, "CommContext") .def( diff --git a/paddle/fluid/pybind/fleet_py.h b/paddle/fluid/pybind/fleet_py.h index 4dc0f002ad3c1d..206a69f5a80197 100644 --- a/paddle/fluid/pybind/fleet_py.h +++ b/paddle/fluid/pybind/fleet_py.h @@ -36,6 +36,5 @@ void BindIndexNode(py::module* m); void BindTreeIndex(py::module* m); void BindIndexWrapper(py::module* m); void BindIndexSampler(py::module* m); -void BindSparseShardingTools(py::module* m); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 7df6d8f7f791c8..e09c205db14e79 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -2182,6 +2182,7 @@ void BindImperative(py::module *m_ptr) { m.def("varbase_copy", &VarBaseCopy); m.def("varbase_copy", &VarBaseCopy); m.def("varbase_copy", &VarBaseCopy); + m.def("varbase_copy", &VarBaseCopy); m.def("varbase_copy", &VarBaseCopy); m.def( @@ -2341,6 +2342,11 @@ void BindImperative(py::module *m_ptr) { const py::args args, const py::kwargs kwargs) { return imperative::PyLayerApply(place, cls, args, kwargs); }); + m.def("pylayer_apply", + [](const platform::CustomPlace &place, const py::object &cls, + const py::args args, const py::kwargs kwargs) { + return imperative::PyLayerApply(place, cls, args, kwargs); + }); #if defined(PADDLE_WITH_CUDA) m.def("to_uva_tensor", diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc index fe1f27226bad4b..79529fca7d1be9 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc @@ -63,6 +63,27 @@ void BindPSGPUWrapper(py::module* m) { .def("finalize", &framework::PSGPUWrapper::Finalize, py::call_guard()); } // end PSGPUWrapper +#ifdef PADDLE_WITH_PSLIB +void BindAfsWrapper(py::module* m) { + py::class_>( + *m, "AfsWrapper") + .def(py::init([]() { return std::make_shared(); })) + .def("init", &framework::AfsWrapper::init, + py::call_guard()) + .def("list", &framework::AfsWrapper::list, + py::call_guard()) + .def("mkdir", &framework::AfsWrapper::mkdir, + py::call_guard()) + .def("exist", &framework::AfsWrapper::exist, + py::call_guard()) + .def("download", &framework::AfsWrapper::download, + py::call_guard()) + .def("upload", &framework::AfsWrapper::upload, + py::call_guard()) + .def("remove", &framework::AfsWrapper::remove, + py::call_guard()); +} +#endif #endif } // end namespace pybind } // end namespace paddle diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.h b/paddle/fluid/pybind/ps_gpu_wrapper_py.h index ba4f146389ed3e..22cd5ef0fd1496 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.h +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.h 
@@ -24,6 +24,9 @@ namespace pybind { #ifdef PADDLE_WITH_HETERPS void BindPSGPUWrapper(py::module* m); +#ifdef PADDLE_WITH_PSLIB +void BindAfsWrapper(py::module* m); +#endif #endif } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 44abf3357d63d0..0427fcece0b8b4 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -845,6 +845,10 @@ PYBIND11_MODULE(core_noavx, m) { [](framework::Tensor &self, const std::string &layout) { self.set_layout(StringToDataLayout(layout)); }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::CustomPlace &place) { + self.mutable_data(place); + }) .def("_alloc_float", [](framework::Tensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); @@ -873,6 +877,10 @@ PYBIND11_MODULE(core_noavx, m) { [](framework::Tensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); }) + .def("_alloc_int", + [](framework::Tensor &self, paddle::platform::CustomPlace &place) { + self.mutable_data(place); + }) .def("_alloc_int", [](framework::Tensor &self, paddle::platform::XPUPlace &place) { self.mutable_data(place); @@ -901,6 +909,12 @@ PYBIND11_MODULE(core_noavx, m) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) + .def("_mutable_data", + [](framework::Tensor &self, paddle::platform::CustomPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) .def("_mutable_data", [](framework::Tensor &self, paddle::platform::XPUPlace &place, paddle::framework::proto::VarType::Type type) { @@ -934,6 +948,8 @@ PYBIND11_MODULE(core_noavx, m) { }) .def("_copy_from", &TensorCopyFrom, py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) + .def("_copy_from", &TensorCopyFrom, + py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) .def("_copy_from", &TensorCopyFrom, py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) .def("_copy_from", &TensorCopyFrom, @@ -948,6 +964,8 @@ PYBIND11_MODULE(core_noavx, m) { py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) + .def("set", SetTensorFromPyArray, + py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, @@ -1985,6 +2003,19 @@ All parameter, weight, gradient are variables in Paddle. "Please recompile or reinstall Paddle with NPU support.")); #else return new paddle::platform::NPUDeviceContext(place); +#endif + }) + .def_static("create", + [](paddle::platform::CustomPlace& place) + -> paddle::platform::DeviceContext* { +#ifndef PADDLE_WITH_CUSTOM_DEVICE + PADDLE_THROW( + platform::errors::PermissionDenied( + "Cannot use CustomPlace in CPU/GPU/XPU version, " + "Please recompile or reinstall Paddle with " + "CustomDevice support.")); +#else + return new paddle::platform::CustomDeviceContext(place); #endif }) .def_static("create", @@ -2722,6 +2753,12 @@ All parameter, weight, gradient are variables in Paddle. 
pybind11::gil_scoped_release release; self.Run(scope, place); }) + .def("run", + [](OperatorBase &self, const Scope &scope, + const platform::CustomPlace &place) { + pybind11::gil_scoped_release release; + self.Run(scope, place); + }) .def("type", [](const OperatorBase &op) -> std::string { return op.Type(); }) .def("outputs", @@ -4458,6 +4495,9 @@ All parameter, weight, gradient are variables in Paddle. #endif #ifdef PADDLE_WITH_HETERPS BindPSGPUWrapper(&m); +#ifdef PADDLE_WITH_PSLIB + BindAfsWrapper(&m); +#endif #endif BindGlooWrapper(&m); BindBoxHelper(&m); @@ -4504,7 +4544,6 @@ All parameter, weight, gradient are variables in Paddle. BindTreeIndex(&m); BindIndexWrapper(&m); BindIndexSampler(&m); - BindSparseShardingTools(&m); #endif } } // namespace pybind diff --git a/paddle/infrt/api/CMakeLists.txt b/paddle/infrt/api/CMakeLists.txt index 27d736cfdf7aa0..6d4604edee6a01 100644 --- a/paddle/infrt/api/CMakeLists.txt +++ b/paddle/infrt/api/CMakeLists.txt @@ -7,3 +7,5 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/infrt_api_test.cc.in ${CMAKE_CURRENT_ # Disable temporarily for the external-kernel's mkldnn is outdate cc_test_tiny(test_infrt_api SRCS infrt_api_test.cc DEPS infrt ${MLIR_IR_LIBS}) +# TODO(inference): remove after optimize weight unfold. +set_tests_properties(test_infrt_api PROPERTIES TIMEOUT 200) diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc index 2e8b64f768f139..8b4b14a3ca08ba 100644 --- a/paddle/infrt/api/infrt_api.cc +++ b/paddle/infrt/api/infrt_api.cc @@ -17,12 +17,14 @@ #include #include #include +#include #include +#include +#include #include #include -#include "mlir/Pass/PassManager.h" #include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/dense_tensor.h" @@ -48,8 +50,16 @@ #include "paddle/infrt/kernel/test_kernels.h" #include "paddle/infrt/tensor/tensor_map.h" +#include "paddle/infrt/dialect/infrt/pass/infrt_weights_unfold_pass.h" + #if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT) #include "paddle/infrt/kernel/tensorrt/registry.h" + +#include "paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h" +#include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" +#include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" +#include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" +#include "paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h" #endif using namespace infrt::host_context; // NOLINT @@ -233,17 +243,34 @@ int InfRtPredictor::Init(const InfRtConfig& config) { #endif // INFRT_WITH_GPU && INFRT_WITH_TRT #endif - auto module_op = impl_->module_gen_.ImportPaddleModel(config.model_dir(), - config.param_dir()); + mlir::ModuleOp module_op; + if (config.tensorrt_enabled()) { + module_op = impl_->module_gen_.ImportPaddleModel( + config.model_dir(), config.param_dir(), false); + } else { + module_op = impl_->module_gen_.ImportPaddleModel(config.model_dir(), + config.param_dir()); + } context->loadAllAvailableDialects(); ::mlir::PassManager pm(context); - ::mlir::OpPassManager& phi_pass_manager = pm.nest<::mlir::FuncOp>(); - std::vector<::infrt::Place> valid_places = {{::infrt::TargetType::CPU, - ::infrt::PrecisionType::FLOAT32, - ::infrt::LayoutType::NCHW}}; - phi_pass_manager.addPass(CreatePhiOpCvtPass(valid_places)); - phi_pass_manager.addPass(CreateInfrtOpFusePass()); + ::mlir::OpPassManager& pass_manager = pm.nest<::mlir::FuncOp>(); + if (config.tensorrt_enabled()) { + pass_manager.addPass(::infrt::CreateInfrtWeightsUnfoldPass()); + 
pass_manager.addPass(::infrt::trt::CreateTrtOpTellerPass()); + pass_manager.addPass(::infrt::trt::CreateTrtGraphFusePass()); + pass_manager.addPass(::infrt::trt::CreateTrtGraphSplitPass(1)); + pass_manager.addPass(::infrt::trt::CreateTrtOpConverterPass()); + pass_manager.addPass(::infrt::trt::CreateTrtTypeConvertPass()); + pass_manager.addPass(::mlir::createCanonicalizerPass()); + } else { + std::vector<::infrt::Place> valid_places = { + {::infrt::TargetType::CPU, + ::infrt::PrecisionType::FLOAT32, + ::infrt::LayoutType::NCHW}}; + pass_manager.addPass(CreatePhiOpCvtPass(valid_places)); + pass_manager.addPass(CreateInfrtOpFusePass()); + } if (mlir::failed(pm.run(module_op))) { std::cout << "\npass failed!\n" << std::endl; return 4; diff --git a/paddle/infrt/api/infrt_api.h b/paddle/infrt/api/infrt_api.h index cf14cab3c066e2..231f496bb89d19 100644 --- a/paddle/infrt/api/infrt_api.h +++ b/paddle/infrt/api/infrt_api.h @@ -26,6 +26,9 @@ class InfRtConfig { std::string param_dir_; std::vector shared_libs_; + // TODO(wilber): Design an easy-to-use interface. + bool tensorrt_enabled_{false}; + public: InfRtConfig() = default; void set_model_dir(const std::string& model_dir) { model_dir_ = model_dir; } @@ -39,6 +42,11 @@ class InfRtConfig { } const std::vector& shared_libs() const { return shared_libs_; } + // TODO(wilber): Design an easy-to-use interface. + void enable_tensorrt() { tensorrt_enabled_ = true; } + void disable_tensorrt() { tensorrt_enabled_ = false; } + bool tensorrt_enabled() const { return tensorrt_enabled_; } + virtual ~InfRtConfig() = default; }; diff --git a/paddle/infrt/api/infrt_api_test.cc.in b/paddle/infrt/api/infrt_api_test.cc.in index 6323b6a540a31b..13635ddaaab2f1 100644 --- a/paddle/infrt/api/infrt_api_test.cc.in +++ b/paddle/infrt/api/infrt_api_test.cc.in @@ -57,4 +57,47 @@ TEST(InfRtPredictor, predictor) { ASSERT_EQ(output->dims(), ::phi::DDim({16, 10})); } +#ifdef INFRT_WITH_TRT +TEST(InfRtPredictor, trt_predictor) { + std::vector shared_libs; + + InfRtConfig config; + config.enable_tensorrt(); + + config.set_model_dir("@CMAKE_BINARY_DIR@/models/resnet50/model.pdmodel"); + config.set_param_dir("@CMAKE_BINARY_DIR@/models/resnet50/model.pdiparams"); + + std::unique_ptr predictor = CreateInfRtPredictor(config); + + ::infrt::backends::CpuPhiAllocator cpu_allocator; + ::phi::DenseTensor* input = predictor->GetInput(0); + input->Resize({2, 3, 256, 256}); + input->AllocateFrom(&cpu_allocator, ::phi::DataType::FLOAT32); + auto* input_data = reinterpret_cast(input->data()); + for (int i = 0; i < input->numel(); i++) input_data[i] = 1.0; + predictor->Run(); + + // get and print output tensor + auto* output = predictor->GetOutput(0); + + ASSERT_EQ(output->dims(), ::phi::DDim({2, 1000})); + const std::vector true_vals { + -3.319006264209747314e-01, -1.418896913528442383e+00, + -6.934890151023864746e-01, -1.498023152351379395e+00, + 3.078042864799499512e-01, -1.340998053550720215e+00, + 3.508620023727416992e+00, 2.274388313293457031e+00, + -1.321727275848388672e+00, -8.888689428567886353e-02, + -3.319006264209747314e-01, -1.418896913528442383e+00, + -6.934890151023864746e-01, -1.498023152351379395e+00, + 3.078042864799499512e-01, -1.340998053550720215e+00, + 3.508620023727416992e+00, 2.274388313293457031e+00, + -1.321727275848388672e+00, -8.888689428567886353e-02 + }; + + for (size_t i = 0; i < true_vals.size(); i+=100) { + CHECK_NEAR(output->data()[i*100], true_vals[i], 1e-5); + } +} +#endif + } // namespace infrt diff --git a/paddle/infrt/backends/tensorrt/trt_utils.h 
b/paddle/infrt/backends/tensorrt/trt_utils.h index c23d4608bb33fc..b2d5659fd25206 100644 --- a/paddle/infrt/backends/tensorrt/trt_utils.h +++ b/paddle/infrt/backends/tensorrt/trt_utils.h @@ -50,7 +50,8 @@ inline nvinfer1::Dims VecToDims(const std::vector& vec) { assert(false); } // Pick first nvinfer1::Dims::MAX_DIMS elements - nvinfer1::Dims dims{std::min(static_cast(vec.size()), limit), {}}; + nvinfer1::Dims dims; + dims.nbDims = std::min(static_cast(vec.size()), limit); std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d)); return dims; } diff --git a/paddle/infrt/dialect/infrt/common/types.h b/paddle/infrt/dialect/infrt/common/types.h index 2ebe2b8ccdba69..5bd1f40262b47c 100644 --- a/paddle/infrt/dialect/infrt/common/types.h +++ b/paddle/infrt/dialect/infrt/common/types.h @@ -39,15 +39,12 @@ enum class PrecisionType : uint8_t { }; struct Place { - TargetType target; - PrecisionType precision; - LayoutType layout; + TargetType target = TargetType::UNK; + PrecisionType precision = PrecisionType::UNK; + LayoutType layout = LayoutType::UNK; Place(TargetType tar, PrecisionType pre, LayoutType lay) : target(tar), precision(pre), layout(lay) {} - Place() - : target(TargetType::UNK), - precision(PrecisionType::UNK), - layout(LayoutType::UNK) {} + Place() = default; }; llvm::Optional GetTargetType(llvm::StringRef key); diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td index c4707c367bc801..2078ebb1442ffc 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td @@ -34,7 +34,8 @@ def CreateHostInitedDenseTensorOp : PDT_Op<"create_host_inited_dense_tensor.f32" I64ArrayAttr:$dims, LayoutAttr:$layout, I64ArrayAttr:$lod, - F32ArrayAttr:$values + F32ArrayAttr:$values, + DefaultValuedAttr:$run_once ); let results = (outs DenseTensor:$output); } diff --git a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc index e3fdd5ae5bb9fb..e9b426a5088fc7 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc +++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc @@ -36,12 +36,35 @@ #include "paddle/phi/ops/compat/signatures.h" namespace { + +infrt::Place ParsePlaceFromStr(const std::string &key) { + size_t first_index = key.find_first_of('-'); + size_t second_index = key.find_last_of('-'); + if (first_index != second_index) { + llvm::Optional tar = + infrt::GetTargetType(key.substr(0, first_index)); + llvm::Optional pre = infrt::GetPrecisionType( + key.substr(first_index + 1, second_index - first_index - 1)); + llvm::Optional lay = + infrt::GetLayoutType(key.substr(second_index + 1)); + if (tar && pre && lay) { + return infrt::Place(tar.getValue(), pre.getValue(), lay.getValue()); + } + } + LOG(FATAL) << "Can't parse infrt::Place from string:" << key; + return infrt::Place(); +} + class PhiOpConvertPass : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "PhiOpConvertPass"; } void runOnFunction() override; - PhiOpConvertPass(); + + /// Initialize the valid_places_ by the valid_places_options_ while + /// valid_places_options_ has values. + mlir::LogicalResult initialize(mlir::MLIRContext *context) override; + PhiOpConvertPass() {} explicit PhiOpConvertPass(const std::vector &valid_places) : valid_places_(valid_places) {} @@ -56,14 +79,35 @@ class PhiOpConvertPass void convertStage(); void dispatchStage(); - // Force a specified data format for all layout sensitive operations. 
- Option valid_places_options_{ + ListOption valid_places_options_{ *this, "valid-targets", - llvm::cl::desc("Set the valid target, [CPU-FP32-NCHW]")}; + llvm::cl::desc( + "Set the valid targets, such as: CPU-FP32-NCHW,GPU-FP32-NCHW"), + llvm::cl::MiscFlags::CommaSeparated}; std::vector valid_places_; }; + +/// Initialize valid_places_ from the valid-targets command-line option when +/// it is provided. +mlir::LogicalResult PhiOpConvertPass::initialize(mlir::MLIRContext *context) { + if (valid_places_options_.hasValue()) { + VLOG(4) << "Start parsing valid_places from command line:"; + if (!valid_places_.empty()) { + LOG(WARNING) << "Found valid places from command line, current value will " + "be overwritten."; + valid_places_.clear(); + } + for (auto &val : *valid_places_options_) { + VLOG(4) << "place string:" << val; + valid_places_.emplace_back(ParsePlaceFromStr(val)); + } + VLOG(4) << "End parsing valid_places from command line:"; + } + return mlir::success(); +} + // Implementation of the PhiOpConvertPass. void PhiOpConvertPass::runOnFunction() { convertStage(); @@ -191,7 +235,16 @@ void PhiOpConvertPass::dispatchStage() { .output(); phi_context[infrt::TargetType::CPU] = context_value; } break; - case infrt::TargetType::GPU: + case infrt::TargetType::GPU: { + auto context_value = + builder + .create( + kernel_op.getLoc(), + infrt::phi::ContextType::get(kernel_op.getContext(), + infrt::TargetType::GPU)) + .output(); + phi_context[infrt::TargetType::GPU] = context_value; + } break; case infrt::TargetType::UNK: default: LOG(FATAL) << "Unsupported TargetType"; @@ -237,17 +290,6 @@ void PhiOpConvertPass::dispatchStage() { } } -PhiOpConvertPass::PhiOpConvertPass() { - if (!valid_places_options_.hasValue()) { - valid_places_.emplace_back(infrt::TargetType::CPU, - infrt::PrecisionType::FLOAT32, - infrt::LayoutType::NCHW); - return; - } - - LOG(FATAL) << "To be done for specifying places in command line"; -} - void PhiOpConvertPass::getDependentDialects( mlir::DialectRegistry &registry) const { registry.insert(); @@ -265,7 +307,3 @@ std::unique_ptr infrt::CreatePhiOpCvtPass( std::vector valid_places) { return std::make_unique(valid_places); } - -std::unique_ptr infrt::CreatePhiOpCvtPass() { - return std::make_unique(); -} diff --git a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h index c426bbf11518b2..a0e74426a40974 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h +++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h @@ -23,6 +23,4 @@ namespace infrt { */ std::unique_ptr CreatePhiOpCvtPass(std::vector valid_places); -std::unique_ptr CreatePhiOpCvtPass(); - } // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_exec.cc b/paddle/infrt/dialect/tensorrt/trt_exec.cc index 837ca2093747c3..2682a744bb056f 100644 --- a/paddle/infrt/dialect/tensorrt/trt_exec.cc +++ b/paddle/infrt/dialect/tensorrt/trt_exec.cc @@ -81,7 +81,7 @@ int main(int argc, char** argv) { trt_pass_manager.addPass(std::make_unique()); trt_pass_manager.addPass(std::make_unique(1)); trt_pass_manager.addPass(std::make_unique()); - trt_pass_manager.addPass(infrt::trt::createTrtTypeConvertPass()); + trt_pass_manager.addPass(infrt::trt::CreateTrtTypeConvertPass()); trt_pass_manager.addPass(::mlir::createCanonicalizerPass()); if (mlir::failed(pm.run(*module))) { std::cout << "\npass failed!\n" << std::endl; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc index
55964b77e21ca6..bbe9a76e87b005 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc @@ -181,5 +181,10 @@ void TRTGraphFusePass::runOnFunction() { // TODO(wilber): Implement a toposort for efficiency. // topoSortBlock(body); } + +std::unique_ptr CreateTrtGraphFusePass() { + return std::make_unique(); +} + } // namespace trt } // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h index 4c7214762303c0..515e73df854801 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h @@ -17,6 +17,9 @@ namespace infrt { namespace trt { + +std::unique_ptr CreateTrtGraphFusePass(); + /* * trtGraphFusePass. * diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc index 2136f19fd1af56..d5ce871edd1a3a 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc @@ -44,5 +44,10 @@ void TRTGraphSplitPass::runOnFunction() { graph_op.erase(); } } + +std::unique_ptr CreateTrtGraphSplitPass(size_t min_subgraph_size) { + return std::make_unique(min_subgraph_size); +} + } // namespace trt } // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h index a71b9cb6536c5f..fa101a8db027ab 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h @@ -17,6 +17,9 @@ namespace infrt { namespace trt { + +std::unique_ptr CreateTrtGraphSplitPass(size_t min_subgraph_size); + /* * trtGraphSplitPass. * diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc index e40bbd67c0b5ec..6776f01e36d19a 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc @@ -260,5 +260,9 @@ void TRTOpConverterPass::runOnOperation() { signalPassFailure(); } +std::unique_ptr CreateTrtOpConverterPass() { + return std::make_unique(); +} + } // namespace trt } // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h index 685686493c9ab6..84bc7194636386 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h @@ -20,6 +20,9 @@ namespace infrt { namespace trt { + +std::unique_ptr CreateTrtOpConverterPass(); + /* * trtOpConverterPass. 
* diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index 77c22c12854c64..d7b917385cf147 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -58,5 +58,10 @@ void TRTOpTellerPass::runOnFunction() { builder.create<::infrt::ReturnOp>(loc, op->getResults()); } } + +std::unique_ptr CreateTrtOpTellerPass() { + return std::make_unique(); +} + } // namespace trt } // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h index 47375d838a9874..566c5a45da03ad 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h @@ -17,6 +17,9 @@ namespace infrt { namespace trt { + +std::unique_ptr CreateTrtOpTellerPass(); + /* * trtOpTellerPass. * diff --git a/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc b/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc index 0ed79c79db6a2a..35c81d0230161b 100644 --- a/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc @@ -175,7 +175,7 @@ void TrtTypeConvertPass::runOnFunction() { namespace infrt { namespace trt { -std::unique_ptr createTrtTypeConvertPass() { +std::unique_ptr CreateTrtTypeConvertPass() { return std::make_unique(); } diff --git a/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h b/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h index fbc30cdbeb7675..68a15696b3e695 100644 --- a/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h @@ -19,7 +19,7 @@ namespace infrt { namespace trt { -std::unique_ptr createTrtTypeConvertPass(); +std::unique_ptr CreateTrtTypeConvertPass(); } // namespace trt } // namespace infrt diff --git a/paddle/infrt/host_context/paddle_mlir.cc b/paddle/infrt/host_context/paddle_mlir.cc index 8b7bbe13260ff2..0264920a600ffd 100644 --- a/paddle/infrt/host_context/paddle_mlir.cc +++ b/paddle/infrt/host_context/paddle_mlir.cc @@ -15,11 +15,13 @@ #include "paddle/infrt/host_context/paddle_mlir.h" #include +#include #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/pd/common/pd_ops_info.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" MLIRModelGenImpl::MLIRModelGenImpl() : context_(infrt::Global::getMLIRContext()), builder_(context_) { @@ -35,32 +37,40 @@ MLIRModelGenImpl::MLIRModelGenImpl() infrt::paddle::framework_proto::ProgramDesc MLIRModelGenImpl::ParsePaddleModel( const std::string &model_file) { + model_file_ = model_file; infrt::paddle::framework_proto::ProgramDesc program_proto = *infrt::paddle::LoadProgram(model_file); return program_proto; } -mlir::ModuleOp MLIRModelGenImpl::ImportPaddleModel( - const std::string &model_dir) { +mlir::ModuleOp MLIRModelGenImpl::ImportPaddleModel(const std::string &model_dir, + bool arg_has_map) { + model_dir_ = model_dir; infrt::paddle::framework_proto::ProgramDesc program_proto = ParsePaddleModel(model_dir + "/__model__"); - return ImportPaddleModel(program_proto); + return ImportPaddleModel(program_proto, arg_has_map); } mlir::ModuleOp MLIRModelGenImpl::ImportPaddleModel( - const std::string &model_file, const std::string ¶m_file) { + const std::string &model_file, + const std::string ¶m_file, + bool arg_has_map) { + 
model_file_ = model_file; + params_file_ = param_file; infrt::paddle::framework_proto::ProgramDesc program_proto = ParsePaddleModel(model_file); - return ImportPaddleModel(program_proto); + return ImportPaddleModel(program_proto, arg_has_map); } mlir::ModuleOp MLIRModelGenImpl::ImportPaddleModel( - const infrt::paddle::framework_proto::ProgramDesc &program) { + const infrt::paddle::framework_proto::ProgramDesc &program, + bool arg_has_map) { main_block_ = program.blocks(0); - llvm::SmallVector operandTypes = GetModelInputsType(program); + llvm::SmallVector operandTypes = + GetModelInputsType(program, arg_has_map); llvm::SmallVector resultTypes = GetModelOutputsType(program); mlir::FuncOp mainFunc = UpdateModelModule(operandTypes, resultTypes); - UpdateModelParams(program, &mainFunc); + UpdateModelParams(program, &mainFunc, arg_has_map); UpdateModelOps(program); UpdateModelOutputs(program); return module_; @@ -83,9 +93,12 @@ mlir::FuncOp MLIRModelGenImpl::UpdateModelModule( } llvm::SmallVector MLIRModelGenImpl::GetModelInputsType( - const infrt::paddle::framework_proto::ProgramDesc &program) { + const infrt::paddle::framework_proto::ProgramDesc &program, + bool arg_has_map) { llvm::SmallVector operandTypes; - operandTypes.push_back(infrt::phi::DenseTensorMapType::get(context_)); + if (arg_has_map) { + operandTypes.push_back(infrt::phi::DenseTensorMapType::get(context_)); + } for (auto &op_desc : main_block_.ops()) { if (op_desc.type() != "feed") continue; for (int var_idx = 0; var_idx < op_desc.outputs_size(); ++var_idx) { @@ -155,9 +168,14 @@ void MLIRModelGenImpl::UpdateModelOps( void MLIRModelGenImpl::UpdateModelParams( const infrt::paddle::framework_proto::ProgramDesc &program, - mlir::FuncOp *mainFunc) { + mlir::FuncOp *mainFunc, + bool arg_has_map) { // update input vars - int input_index = 1; + int input_index; + if (arg_has_map) + input_index = 1; + else + input_index = 0; for (auto &op_desc : main_block_.ops()) { if (op_desc.type() == "feed") { for (int var_idx = 0; var_idx < op_desc.outputs_size(); ++var_idx) { @@ -170,9 +188,28 @@ void MLIRModelGenImpl::UpdateModelParams( } } } + ::mlir::Value map; + if (arg_has_map) { + map = mainFunc->getArgument(0); + } else { + builder_.setInsertionPointToStart(&mainFunc->body().front()); + if (!model_dir_.empty()) { + auto load_op = builder_.create<::infrt::phi::LoadParamsOp>( + mlir::UnknownLoc::get(context_), + ::infrt::phi::DenseTensorMapType::get(context_), + builder_.getStringAttr(model_dir_)); + map = load_op.out(); + } else if (!model_file_.empty()) { + auto load_op = builder_.create<::infrt::phi::LoadCombinedParamsOp>( + mlir::UnknownLoc::get(context_), + ::infrt::phi::DenseTensorMapType::get(context_), + builder_.getStringAttr(model_file_), + builder_.getStringAttr(params_file_)); + map = load_op.out(); + } + } // update persistable tensors - ::mlir::Value map = mainFunc->getArgument(0); for (int i = 0; i < main_block_.vars_size(); i++) { auto var_desc = main_block_.vars(i); if (params_map_.find(var_desc.name()) != params_map_.end()) continue; diff --git a/paddle/infrt/host_context/paddle_mlir.h b/paddle/infrt/host_context/paddle_mlir.h index 3d79d608e702db..57bdc1b48578b2 100644 --- a/paddle/infrt/host_context/paddle_mlir.h +++ b/paddle/infrt/host_context/paddle_mlir.h @@ -37,8 +37,10 @@ class MLIRModelGenImpl { public: MLIRModelGenImpl(); mlir::ModuleOp ImportPaddleModel(const std::string &model_file, - const std::string ¶m_file); - mlir::ModuleOp ImportPaddleModel(const std::string &model_dir); + const std::string ¶m_file, + bool 
arg_has_map = true); + mlir::ModuleOp ImportPaddleModel(const std::string &model_dir, + bool arg_has_map = true); private: // parse paddle model file @@ -47,11 +49,13 @@ class MLIRModelGenImpl { // convert paddle model proto into paddle dialect module mlir::ModuleOp ImportPaddleModel( - const infrt::paddle::framework_proto::ProgramDesc &program); + const infrt::paddle::framework_proto::ProgramDesc &program, + bool arg_has_map); // get inputs and outputs info from program_desc llvm::SmallVector GetModelInputsType( - const infrt::paddle::framework_proto::ProgramDesc &program); + const infrt::paddle::framework_proto::ProgramDesc &program, + bool arg_has_map); llvm::SmallVector GetModelOutputsType( const infrt::paddle::framework_proto::ProgramDesc &program); // create main function module @@ -63,7 +67,8 @@ class MLIRModelGenImpl { // convert persistable params and inputs variable into mlir domain void UpdateModelParams( const infrt::paddle::framework_proto::ProgramDesc &program, - mlir::FuncOp *mainFunc); + mlir::FuncOp *mainFunc, + bool arg_has_map); // register model outpus into params_map_ void UpdateModelOutputs( const infrt::paddle::framework_proto::ProgramDesc &program); @@ -80,11 +85,16 @@ class MLIRModelGenImpl { void RegisterOpOutputVars(const infrt::paddle::framework_proto::OpDesc &op_, mlir::Operation *mlir_op_); + private: mlir::MLIRContext *context_; mlir::OpBuilder builder_; mlir::ModuleOp module_; infrt::paddle::framework_proto::BlockDesc main_block_; + std::string model_dir_{}; + std::string model_file_{}; + std::string params_file_{}; + std::map params_map_; }; diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc index 928209ab182e62..848ff28faffc71 100644 --- a/paddle/infrt/kernel/phi/registry.cc +++ b/paddle/infrt/kernel/phi/registry.cc @@ -46,7 +46,7 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) { registry->AddKernel( "phi_dt.create_host_inited_dense_tensor.f32", INFRT_KERNEL(infrt::kernel::phi::CreateHostInitedDenseTensorF32), - {"dims", "lod", "layout", "values"}); + {"dims", "lod", "layout", "values", "run_once"}); registry->AddKernel("phi_dt.fill_dense_tensor.f32", INFRT_KERNEL(infrt::kernel::phi::FillDenseTensorF32), diff --git a/paddle/infrt/tests/dialect/phi/phi_pass.mlir b/paddle/infrt/tests/dialect/phi/phi_pass.mlir index 784ead5b2a0e35..0d9e312ce0bfdf 100644 --- a/paddle/infrt/tests/dialect/phi/phi_pass.mlir +++ b/paddle/infrt/tests/dialect/phi/phi_pass.mlir @@ -1,4 +1,4 @@ -// RUN: infrtopt -phi-op-convert -infrt-op-fuse %s +// RUN: infrtopt -phi-op-convert=valid-targets=CPU-FP32-NCHW -infrt-op-fuse %s // CHECK-LABEL: @ops func @ops(%a:!infrt.lod_tensor, %b:!infrt.lod_tensor) { diff --git a/paddle/infrt/tests/dialect/phi/resnet50.mlir.in b/paddle/infrt/tests/dialect/phi/resnet50.mlir.in index 2803ebb41cfd7b..3591a62e88ed06 100644 --- a/paddle/infrt/tests/dialect/phi/resnet50.mlir.in +++ b/paddle/infrt/tests/dialect/phi/resnet50.mlir.in @@ -444,7 +444,7 @@ module { %387 = "pd.flatten_contiguous_range"(%386) {start_axis = 1 : si32, stop_axis = 3 : si32} : (!infrt.dense_tensor) -> !infrt.dense_tensor %388 = "pd.matmul_v2"(%387, %245) {trans_x = false, trans_y = false} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor %389 = "pd.elementwise_add"(%388, %30) {axis = 1 : si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor - infrt.return %270 : !infrt.dense_tensor + infrt.return %389 : !infrt.dense_tensor } func @main() { diff --git a/paddle/phi/api/lib/api_custom_impl.cc 
b/paddle/phi/api/lib/api_custom_impl.cc index f559027fdd4b02..637c3b9107a7d6 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -475,6 +475,54 @@ std::tuple momentum_impl( return api_output; } +std::vector unbind_impl(const Tensor& input, int axis) { + auto kernel_key_set = ParseKernelKeyByInputArgs(input); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + + Backend kernel_backend = kernel_key.backend(); + DataLayout kernel_layout = kernel_key.layout(); + DataType kernel_data_type = kernel_key.dtype(); + + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "unbind", {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << "unbind API kernel key: [" << kernel_backend << ", " + << kernel_layout << ", " << kernel_data_type << "]"; + VLOG(6) << "unbind API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); + + auto dense_input = PrepareData(input, kernel.InputAt(0), {}); + + // Calculate the number of out tensors + auto input_shape = input.dims(); + if (axis < 0) { + axis = input_shape.size() + axis; + } + auto out_num = input_shape[axis]; + + std::vector out; + auto dense_outs = SetKernelOutput(out_num, kernel_backend, &out); + std::vector meta_outs; + meta_outs.reserve(out_num); + std::vector meta_out_ptrs; + meta_out_ptrs.reserve(out_num); + for (int64_t i = 0; i < out_num; ++i) { + meta_outs.push_back(dense_outs[i]); + meta_out_ptrs.push_back(&meta_outs.back()); + } + + phi::UnbindInferMeta(MakeMetaTensor(*dense_input), axis, meta_out_ptrs); + + using kernel_signature = void (*)(const phi::DeviceContext&, + const phi::DenseTensor&, + int, + std::vector&); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, *dense_input, axis, dense_outs); + + return out; +} + ////////////////// Backward(grad) api impls ////////////////////// // TODO(chenweihang): the original sum grad op can support higher-level @@ -966,5 +1014,135 @@ std::vector meshgrid_grad_impl( return api_output; } +std::vector multi_dot_grad_impl(const std::vector& x, + const Tensor& out_grad) { + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; + + if (kernel_backend == Backend::UNDEFINED || + kernel_layout == DataLayout::UNDEFINED || + kernel_data_type == DataType::UNDEFINED) { + auto kernel_key_set = ParseKernelKeyByInputArgs(x, out_grad); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + if (kernel_backend == Backend::UNDEFINED) { + kernel_backend = kernel_key.backend(); + } + if (kernel_layout == DataLayout::UNDEFINED) { + kernel_layout = kernel_key.layout(); + } + if (kernel_data_type == DataType::UNDEFINED) { + kernel_data_type = kernel_key.dtype(); + } + } + + VLOG(6) << "multi_dot_grad API kernel key: [" << kernel_backend << ", " + << kernel_layout << ", " << kernel_data_type << "]"; + const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "multi_dot_grad", {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << "multi_dot_grad API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); + + auto input_x_vec = PrepareData(x, kernel.InputAt(0), {}); + std::vector input_x(input_x_vec->size()); + for (size_t i = 0; i < input_x.size(); ++i) { + input_x[i] = &input_x_vec->at(i); + } + auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {}); + + size_t out_number = input_x.size(); + std::vector 
api_output; + auto kernel_out = SetKernelOutput(out_number, kernel_backend, &api_output); + + auto x_meta_vec = MakeMetaTensor(input_x); + std::vector x_metas(x_meta_vec.size()); + for (size_t i = 0; i < x_meta_vec.size(); ++i) { + x_metas[i] = &x_meta_vec[i]; + } + + std::vector meta_outs; + meta_outs.reserve(out_number); + std::vector meta_out_ptrs; + meta_out_ptrs.reserve(out_number); + for (size_t i = 0; i < out_number; ++i) { + meta_outs.push_back(kernel_out[i]); + meta_out_ptrs.push_back(&meta_outs.back()); + } + + phi::MultiDotGradInferMeta( + x_metas, MakeMetaTensor(*input_out_grad), meta_out_ptrs); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const std::vector&, + const phi::DenseTensor&, + std::vector&); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, input_x, *input_out_grad, kernel_out); + + return api_output; +} + +std::vector multiplex_grad_impl(const std::vector& inputs, + const Tensor& ids, + const Tensor& out_grad) { + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; + + if (kernel_backend == Backend::UNDEFINED || + kernel_layout == DataLayout::UNDEFINED || + kernel_data_type == DataType::UNDEFINED) { + auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + if (kernel_backend == Backend::UNDEFINED) { + kernel_backend = kernel_key.backend(); + } + if (kernel_layout == DataLayout::UNDEFINED) { + kernel_layout = kernel_key.layout(); + } + if (kernel_data_type == DataType::UNDEFINED) { + kernel_data_type = kernel_key.dtype(); + } + } + + VLOG(6) << "multiplex_grad API kernel key: [" << kernel_backend << ", " + << kernel_layout << ", " << kernel_data_type << "]"; + const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "multiplex_grad", {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << "multiplex_grad API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); + + auto input_ids = PrepareData(ids, kernel.InputAt(0), {}); + auto input_out_grad = PrepareData(out_grad, kernel.InputAt(1), {}); + + auto out_number = inputs.size(); + std::vector api_output; + auto kernel_out = SetKernelOutput(out_number, kernel_backend, &api_output); + + std::vector meta_outs; + meta_outs.reserve(out_number); + std::vector meta_out_ptrs; + meta_out_ptrs.reserve(out_number); + for (size_t i = 0; i < out_number; ++i) { + meta_outs.push_back(kernel_out[i]); + meta_out_ptrs.push_back(&meta_outs.back()); + } + + phi::MultiplexGradInferMeta(MakeMetaTensor(*input_ids), + MakeMetaTensor(*input_out_grad), + meta_out_ptrs); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + std::vector&); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, *input_ids, *input_out_grad, kernel_out); + + return api_output; +} + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index 4745782d914cab..0e360ce4a993f0 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -14,6 +14,8 @@ limitations under the License. 
*/ #pragma once +#include + #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/place.h" @@ -60,6 +62,8 @@ std::vector split_impl(const Tensor& x, const IntArray& num_or_sections, const Scalar& axis); +std::vector meshgrid_impl(const std::vector& inputs); + std::tuple momentum_impl( const Tensor& param, const Tensor& grad, @@ -73,6 +77,8 @@ std::tuple momentum_impl( bool multi_precision, float rescale_grad); +std::vector unbind_impl(const Tensor& input, int axis); + ////////////////// Backward(grad) api impls ////////////////////// std::vector add_n_grad_impl(const std::vector& x, @@ -105,9 +111,15 @@ Tensor real_grad_impl(const Tensor& x); std::vector stack_grad_impl(const std::vector& x, const Tensor& out_grad, int axis); -std::vector meshgrid_impl(const std::vector& inputs); std::vector meshgrid_grad_impl(const std::vector& inputs, const std::vector& outputs_grad); +std::vector multi_dot_grad_impl(const std::vector& x, + const Tensor& out_grad); + +std::vector multiplex_grad_impl(const std::vector& inputs, + const Tensor& ids, + const Tensor& out_grad); + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 0394835aa8b700..ff238b79978655 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -12,6 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + #include "paddle/phi/backends/gpu/gpu_context.h" #include #include @@ -155,6 +156,39 @@ static void StreamCallbackFunc(gpuStream_t stream, } // namespace internal +void DnnWorkspaceHandle::RunFuncSync( + const std::function& cudnn_func, + size_t required_workspace_bytes, + bool use_cached_allocation) { + bool need_realloc = required_workspace_bytes > WorkspaceSize(); + if (need_realloc && !use_cached_allocation) { + void* workspace_ptr = nullptr; + size_t size = ((required_workspace_bytes + 255) >> 8) << 8; + std::lock_guard guard(*mtx_); +#ifdef PADDLE_WITH_HIP + auto status = hipMalloc(&workspace_ptr, size); +#else + auto status = cudaMalloc(&workspace_ptr, size); +#endif + if (status == gpuSuccess) { + cudnn_func(workspace_ptr); + phi::backends::gpu::GpuStreamSync(stream_); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipFree(workspace_ptr)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaFree(workspace_ptr)); +#endif + return; + } + } + + RunFunc(cudnn_func, required_workspace_bytes); + if (need_realloc) { + // Release the workspace allocated in this running. 
+ ResetWorkspace(); + } +} + void DnnWorkspaceHandle::ResetWorkspace() { allocation_ = nullptr; } void DnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) { @@ -295,13 +329,13 @@ struct GPUContext::Impl { void InitDnnWorkspace() { PD_CHECK(allocator_ != nullptr, "the device allocator for gpu context is nullptr."); - workspace_ = new DnnWorkspaceHandle(allocator_); + workspace_ = new DnnWorkspaceHandle(allocator_, stream_); } void DestoryInternalWorkspace() { if (owned_ && workspace_ != nullptr) { delete workspace_; - stream_ = nullptr; + workspace_ = nullptr; } } @@ -313,7 +347,7 @@ struct GPUContext::Impl { DnnWorkspaceHandle GetDnnWorkspace() { PD_CHECK(allocator_ != nullptr, "the device allocator for gpu context is nullptr."); - return DnnWorkspaceHandle(allocator_); + return DnnWorkspaceHandle(allocator_, stream_); } void InitStream() { diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index cd08da1c0f2f80..ffae1f1f1353e4 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/backends/gpu/gpu_helper.h" +#include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/device_context.h" @@ -28,8 +29,8 @@ namespace phi { class DnnWorkspaceHandle { public: - explicit inline DnnWorkspaceHandle(Allocator* allocator) - : allocator_(allocator) { + inline DnnWorkspaceHandle(Allocator* allocator, gpuStream_t stream) + : allocator_(allocator), stream_(stream) { mtx_.reset(new std::mutex()); } @@ -48,11 +49,9 @@ class DnnWorkspaceHandle { * running the function. 
Currently this function is only used when cudnn * exhaustive searching and callers have to guarantee that the input function * is host blocking */ - inline void RunFuncSync(const std::function& cudnn_func, - size_t required_workspace_bytes) { - RunFunc(cudnn_func, required_workspace_bytes); - ResetWorkspace(); - } + void RunFuncSync(const std::function& cudnn_func, + size_t required_workspace_bytes, + bool use_cached_allocation = true); inline size_t WorkspaceSize() { if (allocation_ == nullptr) { @@ -70,7 +69,8 @@ class DnnWorkspaceHandle { private: Allocator::AllocationPtr allocation_{nullptr}; - Allocator* allocator_{nullptr}; + Allocator* allocator_{nullptr}; // Not owned + gpuStream_t stream_{nullptr}; // Not owned std::unique_ptr mtx_; }; diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index a9e12f5d81ed08..5543bee144b3bf 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -55,6 +55,8 @@ enum class Backend : uint8_t { // paddle kernel primitives backend KPS, + IPU, + // end of backend types NUM_BACKENDS, @@ -121,6 +123,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { case Backend::KPS: os << "KPS"; break; + case Backend::IPU: + os << "IPU"; + break; default: { size_t device_type_id_ = static_cast(backend) - static_cast(Backend::NUM_BACKENDS); @@ -155,6 +160,8 @@ inline Backend StringToBackend(const char* backend_cstr) { return Backend::GPUDNN; } else if (s == std::string("KPS")) { return Backend::KPS; + } else if (s == std::string("IPU")) { + return Backend::IPU; } else { return static_cast(static_cast(Backend::NUM_BACKENDS) + phi::GetOrRegisterGlobalDeviceTypeId(s)); diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index c08dfa64c7f1bf..43febb2ac0430d 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -38,6 +38,8 @@ Backend TransToPhiBackend(const phi::Place& place) { return Backend::XPU; } else if (allocation_type == phi::AllocationType::NPU) { return Backend::NPU; + } else if (allocation_type == phi::AllocationType::IPU) { + return Backend::IPU; } else if (allocation_type == phi::AllocationType::CUSTOM) { return static_cast( static_cast(Backend::NUM_BACKENDS) + diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 49e416fd0152dc..81d3cb9ddf0f4d 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -308,6 +308,38 @@ void MeshgridGradInferMeta(const std::vector& inputs, } } +void MultiDotGradInferMeta(const std::vector& x, + const MetaTensor& out_grad, + std::vector x_grad) { + PADDLE_ENFORCE_EQ( + x.size(), + x_grad.size(), + errors::InvalidArgument( + "Number of Inputs(X) should be equal with Outputs(X@Grad)." 
+ "But received Inputs(X)' size = %d , Outputs(X@Grad)' size = %d.", + x.size(), + x_grad.size())); + for (size_t i = 0; i < x.size(); i++) { + if (x_grad[i] != nullptr) { + x_grad[i]->set_dims(x[i]->dims()); + x_grad[i]->share_lod(*x[i]); + } + } +} + +void MultiplexGradInferMeta(const MetaTensor& ids, + const MetaTensor& out_grad, + std::vector ins_grad) { + PADDLE_ENFORCE_NE( + ins_grad.empty(), + true, + errors::InvalidArgument("Output(X@Grad) should not be null.")); + auto dout_dim = out_grad.dims(); + for (auto in_grad : ins_grad) { + in_grad->set_dims(dout_dim); + } +} + void NllLossGradInferMeta(const MetaTensor& x, const MetaTensor& label, paddle::optional weight, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index eff3731bf22536..058ff7541cd8b7 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -139,6 +139,14 @@ void MeshgridGradInferMeta(const std::vector& inputs, const std::vector& outputs_grad, std::vector inputs_grad); +void MultiDotGradInferMeta(const std::vector& x, + const MetaTensor& out_grad, + std::vector x_grad); + +void MultiplexGradInferMeta(const MetaTensor& ids, + const MetaTensor& out_grad, + std::vector ins_grad); + void NllLossGradInferMeta(const MetaTensor& input, const MetaTensor& label, paddle::optional weight, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 298ad14f9e04b6..2139605fb20486 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -64,6 +64,16 @@ static void BinarySameInputDimsCheck(const MetaTensor& x, } } +// Used in MatrixRankTolInferMeta +static DDim CheckAndGetOutputDim(const DDim& dim_x) { + auto x_vec = phi::vectorize(dim_x); + if (x_vec.size() == 2) { + return phi::make_ddim({1}); + } + x_vec.erase(x_vec.end() - 2, x_vec.end()); + return phi::make_ddim(x_vec); +} + } // namespace detail void AllValueCompareInferMeta(const MetaTensor& x, @@ -1465,6 +1475,47 @@ void MatmulWithFlattenInferMeta(const MetaTensor& x, out->share_lod(x); } +void MatrixRankTolInferMeta(const MetaTensor& x, + const MetaTensor& atol_tensor, + bool use_default_tol, + bool hermitian, + MetaTensor* out) { + auto dim_x = x.dims(); + PADDLE_ENFORCE_GE( + dim_x.size(), + 2, + phi::errors::InvalidArgument("The dims of input must be greater than 2")); + + if (hermitian) { + int rows = dim_x[dim_x.size() - 2]; + int cols = dim_x[dim_x.size() - 1]; + PADDLE_ENFORCE_EQ(rows, + cols, + phi::errors::InvalidArgument( + "if hermitian == true, matrix should be n*n")); + } + DDim dim_x_batch = detail::CheckAndGetOutputDim(dim_x); + auto dim_tol = atol_tensor.dims(); + if (dim_x_batch == dim_tol) { + out->set_dims(dim_x_batch); + } else { + int max_dim = std::max(dim_x_batch.size(), dim_tol.size()); + int axis = std::abs(dim_x_batch.size() - dim_tol.size()); + std::vector x_batch_dims_array(max_dim); + std::vector tol_dims_array(max_dim); + std::vector out_dims_array(max_dim); + phi::funcs::GetBroadcastDimsArrays(dim_x_batch, + dim_tol, + x_batch_dims_array.data(), + tol_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + out->set_dims(phi::make_ddim(out_dims_array)); + } + out->share_lod(x); +} + void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { auto dim_x = x.dims(); auto dim_vec = vec.dims(); diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 70c3c9dfe849de..192fa214c905fe 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -218,6 +218,12 @@ void 
MatmulWithFlattenInferMeta(const MetaTensor& x, int y_num_col_dims, MetaTensor* out); +void MatrixRankTolInferMeta(const MetaTensor& x, + const MetaTensor& atol_tensor, + bool use_default_tol, + bool hermitian, + MetaTensor* out); + void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out); void PReluInferMeta(const MetaTensor& x, diff --git a/paddle/phi/infermeta/nullary.cc b/paddle/phi/infermeta/nullary.cc index f76e7910d77b59..3a99103eda5c23 100644 --- a/paddle/phi/infermeta/nullary.cc +++ b/paddle/phi/infermeta/nullary.cc @@ -63,6 +63,18 @@ void RandpermInferMeta(int n, DataType dtype, MetaTensor* out) { out->set_dtype(dtype); } +void UniformRandomInferMeta(const IntArray& shape, + DataType dtype, + float min, + float max, + int seed, + MetaTensor* out) { + auto out_dims = phi::make_ddim(shape.GetData()); + out->set_dims(out_dims); + out->set_dtype(dtype); + out->set_layout(DataLayout::NCHW); +} + void RandintInferMeta( int low, int high, const IntArray& shape, DataType dtype, MetaTensor* out) { PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/phi/infermeta/nullary.h b/paddle/phi/infermeta/nullary.h index f84ac01d002d31..8d952d842c0c44 100644 --- a/paddle/phi/infermeta/nullary.h +++ b/paddle/phi/infermeta/nullary.h @@ -65,4 +65,11 @@ void TruncatedGaussianRandomInferMeta(const std::vector& shape, DataType dtype, MetaTensor* out); +void UniformRandomInferMeta(const IntArray& shape, + DataType dtype, + float min, + float max, + int seed, + MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 3e4aa7b4448e3f..c692b6c8fcd13e 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -276,10 +276,10 @@ void LerpInferMeta(const MetaTensor& x, out->share_lod(x); } -void LinspaceInferMeta(const MetaTensor& start, - const MetaTensor& stop, - const MetaTensor& number, - MetaTensor* out) { +void LinspaceRawInferMeta(const MetaTensor& start, + const MetaTensor& stop, + const MetaTensor& number, + MetaTensor* out) { auto s_dims = start.dims(); PADDLE_ENFORCE_EQ( (s_dims.size() == 1) && (s_dims[0] == 1), @@ -305,6 +305,14 @@ void LinspaceInferMeta(const MetaTensor& start, out->set_dtype(start.dtype()); } +void LinspaceInferMeta(const MetaTensor& start, + const MetaTensor& stop, + const MetaTensor& number, + DataType dtype, + MetaTensor* out) { + LinspaceRawInferMeta(start, stop, number, out); +} + void NllLossRawInferMeta(const MetaTensor& input, const MetaTensor& label, paddle::optional weight, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 00e49811688acc..83505f2c2fadae 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -65,9 +65,15 @@ void LerpInferMeta(const MetaTensor& x, const MetaTensor& weight, MetaTensor* out); +void LinspaceRawInferMeta(const MetaTensor& start, + const MetaTensor& stop, + const MetaTensor& number, + MetaTensor* out); + void LinspaceInferMeta(const MetaTensor& start, const MetaTensor& stop, const MetaTensor& number, + DataType dtype, MetaTensor* out); void NllLossRawInferMeta(const MetaTensor& input, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index e0ea637074c202..a47fc698777f7f 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -31,6 +31,18 @@ limitations under the License. 
*/ namespace phi { +namespace detail { +// Used in MatrixRankInferMeta +static DDim CheckAndGetOutputDim(const DDim& dim_x) { + auto x_vec = phi::vectorize(dim_x); + if (x_vec.size() == 2) { + return phi::make_ddim({1}); + } + x_vec.erase(x_vec.end() - 2, x_vec.end()); + return phi::make_ddim(x_vec); +} +} // namespace detail + void ArgMinMaxInferMeta(const MetaTensor& x, int64_t axis, bool keepdims, @@ -901,6 +913,29 @@ void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out) { out->set_dtype(x.dtype()); } +void MatrixRankInferMeta(const MetaTensor& x, + bool use_default_tol, + bool hermitian, + MetaTensor* out) { + auto dim_x = x.dims(); + PADDLE_ENFORCE_GE( + dim_x.size(), + 2, + phi::errors::InvalidArgument("The dims of input must be greater than 2")); + + if (hermitian) { + int rows = dim_x[dim_x.size() - 2]; + int cols = dim_x[dim_x.size() - 1]; + PADDLE_ENFORCE_EQ(rows, + cols, + phi::errors::InvalidArgument( + "if hermitian == true, matrix should be n*n")); + } + DDim dim_x_batch = detail::CheckAndGetOutputDim(dim_x); + out->set_dims(dim_x_batch); + out->share_lod(x); +} + void MaxOutInferMeta(const MetaTensor& x, int groups, int axis, @@ -1280,6 +1315,36 @@ void PixelShuffleInferMeta(const MetaTensor& x, out->set_dims(output_dims); } +void PixelShuffleGradInferMeta(const MetaTensor& out_grad, + int upscale_factor, + const std::string& data_format, + MetaTensor* x_grad) { + auto do_dims = out_grad.dims(); + PADDLE_ENFORCE_EQ(do_dims.size(), + 4, + phi::errors::InvalidArgument( + "Input should be a 4-D tensor of format [N, C, H, W] " + "or [N, H, W, C], but got %u.", + do_dims.size())); + + const bool channel_last = (data_format == "NHWC"); + + auto dx_dims = do_dims; + dx_dims[0] = do_dims[0]; + + if (!channel_last) { + dx_dims[1] = do_dims[1] * (upscale_factor * upscale_factor); + dx_dims[2] = do_dims[2] / upscale_factor; + dx_dims[3] = do_dims[3] / upscale_factor; + } else { + dx_dims[1] = do_dims[1] / upscale_factor; + dx_dims[2] = do_dims[2] / upscale_factor; + dx_dims[3] = do_dims[3] * (upscale_factor * upscale_factor); + } + x_grad->set_dims(dx_dims); + x_grad->set_dtype(out_grad.dtype()); +} + void PNormInferMeta(const MetaTensor& x, float porder, int axis, @@ -2429,7 +2494,7 @@ void TransposeGradInferMeta(const MetaTensor& x, void UnbindInferMeta(const MetaTensor& x, int axis, - std::vector* outs) { + std::vector outs) { auto in_dims = x.dims(); std::vector out_dim; axis = axis < 0 ? 
in_dims.size() + axis : axis; @@ -2438,11 +2503,11 @@ void UnbindInferMeta(const MetaTensor& x, } auto out_dims = phi::make_ddim(out_dim); - for (size_t i = 0; i < outs->size(); ++i) { - (*outs)[i].set_dtype(x.dtype()); - (*outs)[i].set_dims(out_dims); - (*outs)[i].set_layout(x.layout()); - (*outs)[i].share_lod(x); + for (size_t i = 0; i < outs.size(); ++i) { + outs[i]->set_dtype(x.dtype()); + outs[i]->set_dims(out_dims); + outs[i]->set_layout(x.layout()); + outs[i]->share_lod(x); } } @@ -2866,4 +2931,5 @@ void WhereIndexInferMeta(const MetaTensor& condition, MetaTensor* out) { } // namespace phi PD_REGISTER_INFER_META_FN(copy_to, phi::CopyToInferMeta); +PD_REGISTER_INFER_META_FN(flatten, phi::FlattenInferMeta); PD_REGISTER_INFER_META_FN(split, phi::SplitInferMeta); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 5106c6f4487336..c49e4c88dd8991 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -142,6 +142,11 @@ void LogsumexpInferMeta(const MetaTensor& input, void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out); +void MatrixRankInferMeta(const MetaTensor& x, + bool use_default_tol, + bool hermitian, + MetaTensor* out); + void MaxOutInferMeta(const MetaTensor& x, int groups, int axis, @@ -195,6 +200,11 @@ void PixelShuffleInferMeta(const MetaTensor& x, const std::string& data_format, MetaTensor* out); +void PixelShuffleGradInferMeta(const MetaTensor& out_grad, + int upscale_factor, + const std::string& data_format, + MetaTensor* x_grad); + void PNormInferMeta(const MetaTensor& x, float porder, int axis, @@ -365,7 +375,7 @@ void TrilTriuInferMeta(const MetaTensor& x, void UnbindInferMeta(const MetaTensor& x, int axis, - std::vector* outs); + std::vector outs); void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out); diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index d4b832cef0bd25..937024d450a363 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -62,7 +62,7 @@ kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_re kernel_library(rnn_kernel DEPS ${COMMON_KERNEL_DEPS} concat_and_split_functor lstm_compute gru_compute) kernel_library(rnn_grad_kernel DEPS ${COMMON_KERNEL_DEPS} concat_and_split_functor lstm_compute gru_compute) kernel_library(warpctc_kernel DEPS ${COMMON_KERNEL_DEPS} phi_dynload_warpctc sequence_padding sequence_scale) -kernel_library(warpctc_grad_kernel DEPS ${COMMON_KERNEL_DEPS} sequence_padding sequence_scale) +kernel_library(warpctc_grad_kernel DEPS ${COMMON_KERNEL_DEPS} phi_dynload_warpctc sequence_padding sequence_scale) # 4. 
auto parse and build kernel targets by cmake register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS} ${COMMON_BAISC_KERNELS} ) diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index 82e168a3c630b3..065d0188522679 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -197,6 +197,7 @@ DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, threshold); DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, lambda); DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, threshold); DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, beta); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Logit, eps); DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, t_min, t_max); diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc index a330227fcfafd2..5eafc869fa551a 100644 --- a/paddle/phi/kernels/assign_kernel.cc +++ b/paddle/phi/kernels/assign_kernel.cc @@ -22,16 +22,23 @@ namespace phi { +template +void AssignRawKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + Copy(dev_ctx, x, x.place(), false, out); +} + template void AssignKernel(const Context& dev_ctx, paddle::optional x, DenseTensor* out) { - if (x.get_ptr()) { - if (!x.is_initialized()) { + if (x) { + if (!x->IsInitialized()) { return; } auto& x_tensor = *x.get_ptr(); - Copy(dev_ctx, x_tensor, x_tensor.place(), false, out); + AssignRawKernel(dev_ctx, x_tensor, out); } } @@ -104,6 +111,12 @@ void AssignValueKernel(const Context& dev_ctx, } // namespace phi +PD_REGISTER_GENERAL_KERNEL(assign_raw, + CPU, + ALL_LAYOUT, + phi::AssignRawKernel, + ALL_DTYPE) {} + PD_REGISTER_GENERAL_KERNEL( assign, CPU, ALL_LAYOUT, phi::AssignKernel, ALL_DTYPE) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); @@ -123,6 +136,11 @@ PD_REGISTER_KERNEL(assign_value, int64_t) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_GENERAL_KERNEL(assign_raw, + GPU, + ALL_LAYOUT, + phi::AssignRawKernel, + ALL_DTYPE) {} PD_REGISTER_GENERAL_KERNEL( assign, GPU, ALL_LAYOUT, phi::AssignKernel, ALL_DTYPE) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); diff --git a/paddle/phi/kernels/assign_kernel.h b/paddle/phi/kernels/assign_kernel.h index f1f3f024205a10..437a2a0c189e86 100644 --- a/paddle/phi/kernels/assign_kernel.h +++ b/paddle/phi/kernels/assign_kernel.h @@ -21,6 +21,11 @@ namespace phi { +template +void AssignRawKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out); + // In order to be compatible with the `AsDispensable` input in the original // assign op maker, the input parameter here needs to be dispensable, but // this looks weird diff --git a/paddle/phi/kernels/autotune/CMakeLists.txt b/paddle/phi/kernels/autotune/CMakeLists.txt index b933e0993deef2..f1702d883b9f03 100644 --- a/paddle/phi/kernels/autotune/CMakeLists.txt +++ b/paddle/phi/kernels/autotune/CMakeLists.txt @@ -1,6 +1,6 @@ if (WITH_GPU) - nv_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) - nv_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) + nv_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) + nv_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) elseif (WITH_ROCM) hip_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) hip_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) diff --git a/paddle/phi/kernels/cpu/rnn_kernel.cc b/paddle/phi/kernels/cpu/rnn_kernel.cc index 4d3976b0aba687..cae97eb0764533 
100644 --- a/paddle/phi/kernels/cpu/rnn_kernel.cc +++ b/paddle/phi/kernels/cpu/rnn_kernel.cc @@ -832,11 +832,13 @@ void RnnKernel(const Context& dev_ctx, DenseTensor* dropout_state, std::vector state, DenseTensor* reserve) { - if (dropout_state->IsInitialized()) { - if (dropout_state->numel() != out->numel()) dropout_state->clear(); + if (!is_test) { + if (dropout_state->IsInitialized()) { + if (dropout_state->numel() != out->numel()) dropout_state->clear(); + } + const auto& out_dim = out->dims(); + Full(dev_ctx, {out_dim.Get(), out_dim.size()}, 1, dropout_state); } - const auto& out_dim = out->dims(); - Full(dev_ctx, {out_dim.Get(), out_dim.size()}, 1, dropout_state); // init the output and allocate the memory dev_ctx.template Alloc(out); diff --git a/paddle/phi/kernels/cpu/size_kernel.cc b/paddle/phi/kernels/cpu/size_kernel.cc index ff34ef26f6bd3a..71ebf9cdc09f79 100644 --- a/paddle/phi/kernels/cpu/size_kernel.cc +++ b/paddle/phi/kernels/cpu/size_kernel.cc @@ -22,6 +22,7 @@ PD_REGISTER_KERNEL(size, CPU, ALL_LAYOUT, phi::SizeKernel, + int16_t, int, int64_t, phi::dtype::float16, diff --git a/paddle/phi/kernels/cpu/transpose_kernel.cc b/paddle/phi/kernels/cpu/transpose_kernel.cc index a80196e7f80e1b..5dc4866e1efc33 100644 --- a/paddle/phi/kernels/cpu/transpose_kernel.cc +++ b/paddle/phi/kernels/cpu/transpose_kernel.cc @@ -75,6 +75,7 @@ PD_REGISTER_KERNEL(transpose, double, int32_t, int64_t, + phi::dtype::float16, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu index 79d8a7b0f3444b..edcf29e2d88d38 100644 --- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu +++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu @@ -14,8 +14,6 @@ #include "paddle/phi/kernels/bernoulli_kernel.h" -#include -#include #ifdef __NVCC__ #include #endif @@ -32,35 +30,8 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/transform.h" - -DECLARE_bool(use_curand); - namespace phi { -template -struct BernoulliCudaFunctor { - unsigned int seed_; - unsigned int offset_; - __host__ __device__ BernoulliCudaFunctor(unsigned int seed, - unsigned int offset) - : seed_(seed), offset_(offset) {} - - __host__ __device__ T operator()(const unsigned int n, const T p) const { - // NOTE(zhiqiu): currently, PADDLE_ENFORCE in cuda kernel may print several - // lines of error messages if, and it should be refined. 
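// Sketch: the remaining curand/hiprand path in this file seeds one Philox state per
// thread from the generator's (seed, offset) pair and draws four uniforms per call,
// so no per-element thrust engine or discard is needed. Illustrative only; the names
// below are hypothetical and this is not the kernel added by the patch.
#include <curand_kernel.h>

__global__ void bernoulli_sketch(int64_t n, uint64_t seed, uint64_t offset,
                                 const float* p, float* out) {
  int64_t i = 4 * (static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x);
  curandStatePhilox4_32_10_t state;
  curand_init(seed, i, offset, &state);  // counter-based: cheap per-thread init
  float4 r = curand_uniform4(&state);    // four uniform draws at once
  float u[4] = {r.x, r.y, r.z, r.w};
  for (int j = 0; j < 4 && i + j < n; ++j) {
    out[i + j] = static_cast<float>(u[j] <= p[i + j]);
  }
}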
- PADDLE_ENFORCE(p >= 0.0 && p <= 1.0, - "The probability should be >=0 and <= 1, but got %f", - p); - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(0.0, 1.0); - rng.discard(n + offset_); - return static_cast(dist(rng) < p); - } -}; - // 'curand_uniform4/hiprand_uniform4' generate 4 random number each time template __global__ void bernoulli_cuda_kernel( @@ -100,30 +71,16 @@ void BernoulliKernel(const Context& ctx, auto gen_cuda = ctx.GetGenerator(); - if (FLAGS_use_curand) { - auto seed_offset = gen_cuda->IncrementOffset(12); - uint64_t seed = seed_offset.first; - uint64_t offset = seed_offset.second; + auto seed_offset = gen_cuda->IncrementOffset(12); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; - auto gpu_config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, 4); - size_t grid_size = gpu_config.GetGridSize(); - size_t block_size = gpu_config.GetBlockSize(); + auto gpu_config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, 4); + size_t grid_size = gpu_config.GetGridSize(); + size_t block_size = gpu_config.GetBlockSize(); - bernoulli_cuda_kernel<<>>( - numel, seed, offset, x_data, out_data); - } else { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = numel * seed_offset.second; - paddle::platform::Transform trans; - thrust::counting_iterator index_sequence_begin(0); - trans(ctx, - index_sequence_begin, - index_sequence_begin + numel, - x_data, - out_data, - BernoulliCudaFunctor(static_cast(seed_offset.first), - static_cast(gen_offset))); - } + bernoulli_cuda_kernel<<>>( + numel, seed, offset, x_data, out_data); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/cumsum_kernel.cu b/paddle/phi/kernels/gpu/cumsum_kernel.cu index a253e6f4ad290b..e04f2b5f876581 100644 --- a/paddle/phi/kernels/gpu/cumsum_kernel.cu +++ b/paddle/phi/kernels/gpu/cumsum_kernel.cu @@ -222,25 +222,28 @@ void CumsumKernel(const Context& dev_ctx, // Use thrust for parallel acceleration when the input size is equal to the // length of the ‘axis’ dimension. 
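// Sketch of the pattern used in the cumsum hunk below: bind thrust to the caller's
// stream and scan through reverse iterators instead of copying into a device_vector
// and reversing the result afterwards. Standalone illustration, assuming float data
// already resident on the device; not part of the patch itself.
#include <cstdint>
#include <cuda_runtime.h>
#include <thrust/device_ptr.h>
#include <thrust/iterator/reverse_iterator.h>
#include <thrust/scan.h>
#include <thrust/system/cuda/execution_policy.h>

void ReverseInclusiveScanSketch(const float* in, float* out, int64_t size,
                                cudaStream_t stream) {
  auto policy = thrust::cuda::par.on(stream);  // run on the caller's stream
  thrust::reverse_iterator<thrust::device_ptr<const float>> rin(
      thrust::device_pointer_cast(in) + size);
  thrust::reverse_iterator<thrust::device_ptr<float>> rout(
      thrust::device_pointer_cast(out) + size);
  // back-to-front scan with no temporary reversed copy
  thrust::inclusive_scan(policy, rin, rin + size, rout);
}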
if (size == out_dims[axis]) { +#ifdef __HIPCC__ + const auto& policy = thrust::hip::par.on(dev_ctx.stream()); +#else + const auto& policy = thrust::cuda::par.on(dev_ctx.stream()); +#endif if (reverse) { - thrust::device_ptr dev_ptr = - thrust::device_pointer_cast(in_data); - thrust::device_vector vec(dev_ptr, dev_ptr + size); + thrust::reverse_iterator> reversed_in( + thrust::device_pointer_cast(in_data) + size); + thrust::reverse_iterator> reversed_out( + thrust::device_pointer_cast(out_data) + size); if (exclusive) { thrust::exclusive_scan( - thrust::device, vec.rbegin(), vec.rend(), out_data); + policy, reversed_in, reversed_in + size, reversed_out); } else { thrust::inclusive_scan( - thrust::device, vec.rbegin(), vec.rend(), out_data); + policy, reversed_in, reversed_in + size, reversed_out); } - thrust::reverse(thrust::device, out_data, out_data + size); } else { if (exclusive) { - thrust::exclusive_scan( - thrust::device, in_data, in_data + size, out_data); + thrust::exclusive_scan(policy, in_data, in_data + size, out_data); } else { - thrust::inclusive_scan( - thrust::device, in_data, in_data + size, out_data); + thrust::inclusive_scan(policy, in_data, in_data + size, out_data); } } return; diff --git a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu index e159e5916cff2b..96ebc0353ef245 100644 --- a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu +++ b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu @@ -14,10 +14,7 @@ #include "paddle/phi/kernels/gaussian_random_kernel.h" -#include -#include #include -#include #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/dense_tensor.h" @@ -27,8 +24,6 @@ #include "paddle/fluid/framework/generator.h" -DECLARE_bool(use_curand); - namespace phi { template @@ -83,21 +78,11 @@ void GaussianRandomKernel(const Context& dev_ctx, auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && seed_flag) { - if (FLAGS_use_curand) { - using MT = typename phi::dtype::MPTypeTrait::Type; - funcs::normal_distribution dist; - funcs::normal_transform trans(static_cast(mean), - static_cast(std)); - funcs::distribution_and_transform(dev_ctx, tensor, dist, trans); - } else { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - auto func = GaussianGenerator(static_cast(mean), - static_cast(std), - seed_offset.first, - gen_offset); - IndexKernel>(dev_ctx, tensor, func); - } + using MT = typename phi::dtype::MPTypeTrait::Type; + funcs::normal_distribution dist; + funcs::normal_transform trans(static_cast(mean), + static_cast(std)); + funcs::distribution_and_transform(dev_ctx, tensor, dist, trans); } else { auto func = GaussianGenerator(static_cast(mean), static_cast(std), seed); diff --git a/paddle/phi/kernels/gpu/multinomial_kernel.cu b/paddle/phi/kernels/gpu/multinomial_kernel.cu index ee5f843b18a90a..ef6cd1323a9df8 100644 --- a/paddle/phi/kernels/gpu/multinomial_kernel.cu +++ b/paddle/phi/kernels/gpu/multinomial_kernel.cu @@ -18,11 +18,6 @@ limitations under the License. */ #include "paddle/phi/kernels/multinomial_kernel.h" -#include -#include -#include -#include - #ifdef __NVCC__ #include "cub/cub.cuh" #endif @@ -44,12 +39,6 @@ namespace cub = hipcub; #include "paddle/phi/kernels/funcs/multinomial_functor.h" #include "paddle/phi/kernels/top_k_kernel.h" -// See Note [ Why still include the fluid headers? 
] -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/transform.h" - -DECLARE_bool(use_curand); - namespace phi { template @@ -74,32 +63,6 @@ __global__ void NormalizeProbability(T* norm_probs, } } -template -__global__ void GetCumulativeProbs(T* norm_probs_data, - int64_t num_distributions, - int64_t num_categories, - T* cumulative_probs_data) { - int id = blockIdx.x; - thrust::inclusive_scan(thrust::device, - norm_probs_data + id * num_categories, - norm_probs_data + (id + 1) * num_categories, - cumulative_probs_data + id * num_categories); -} - -template -struct RandomGeneratorCudaFunctor { - unsigned int seed_; - __host__ __device__ RandomGeneratorCudaFunctor(int seed) : seed_(seed) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(0.0, 1.0); - rng.discard(n); - return dist(rng); - } -}; - template __device__ int binarySearchFunctor(T* cumulative_probs_data, T* norm_probs_data, @@ -130,7 +93,6 @@ __device__ int binarySearchFunctor(T* cumulative_probs_data, template __global__ void sampleMultinomialWithReplacement( - T* rng_data, const int64_t num_samples, int64_t* out_data, const int64_t num_distributions, @@ -138,10 +100,9 @@ __global__ void sampleMultinomialWithReplacement( T* cumulative_probs_data, T* norm_probs_data, uint64_t seed, - uint64_t offset, - bool use_curand) { + uint64_t offset) { // use binary search to get the selected category sample id. - // let cumulative_probs_data[id-1] < rng_data < cumulative_probs_data[id]. + // let cumulative_probs_data[id-1] < rng_number < cumulative_probs_data[id]. size_t idx = gridDim.x * blockDim.x * blockIdx.y + blockDim.x * blockIdx.x + threadIdx.x; @@ -151,10 +112,7 @@ __global__ void sampleMultinomialWithReplacement( int sample = blockIdx.x * blockDim.x + threadIdx.x; for (int dist = blockIdx.y; dist < num_distributions; dist += gridDim.y) { if (sample < num_samples) { - T rng_number = rng_data[sample + dist * num_samples]; - if (use_curand) { - rng_number = static_cast(curand_uniform4(&state).x); - } + T rng_number = static_cast(curand_uniform4(&state).x); // Find the bucket that a uniform random number lies in int selected_category = binarySearchFunctor(cumulative_probs_data + dist * num_categories, @@ -182,10 +140,7 @@ void MultinomialKernel(const Context& dev_ctx, const int64_t num_distributions = in_rank > 1 ? in_dims[in_rank - 2] : 1; // If replacement is False, it's not a replaceable sample. Every category - // can - // be used only once. So after every sample, probability of the distribution - // will change. The implementation can't be parallelizable. Thus, call CPU - // implementation ``funcs::MultinomialFunctor`` to sample the distribution. + // can be used only once. 
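// Sketch of the idea behind the replacement=False branch below: after validating the
// weights on the host, drawing k distinct categories proportionally to w reduces to
// the "exponential race" (Gumbel top-k) trick referenced by the gumbel softmax note:
// draw E_i ~ Exp(1), form keys w_i / E_i, and keep the k largest. Host-side
// illustration only; the helper name is hypothetical and not part of the patch.
#include <algorithm>
#include <numeric>
#include <random>
#include <vector>

std::vector<int> SampleWithoutReplacementSketch(const std::vector<double>& w, int k,
                                                std::mt19937& rng) {
  std::exponential_distribution<double> exp1(1.0);
  std::vector<double> key(w.size());
  for (size_t i = 0; i < w.size(); ++i) key[i] = w[i] / exp1(rng);  // w_i / E_i
  std::vector<int> idx(w.size());
  std::iota(idx.begin(), idx.end(), 0);
  // the k largest keys correspond to k distinct categories drawn without replacement
  std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                    [&](int a, int b) { return key[a] > key[b]; });
  return std::vector<int>(idx.begin(), idx.begin() + k);
}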
if (!replacement) { int64_t in_data_numel = x.numel(); int64_t out_data_numel = out->numel(); @@ -202,76 +157,50 @@ void MultinomialKernel(const Context& dev_ctx, in_data_numel * sizeof(T), cudaMemcpyDeviceToHost); #endif - if (FLAGS_use_curand) { - for (size_t i = 0; i < num_distributions; ++i) { - int zero_num = 0; - for (size_t j = 0; j < num_categories; ++j) { - T weight = cpu_in_data[i * num_distributions + j]; - PADDLE_ENFORCE_GE( - weight, - 0, - errors::InvalidArgument( - "Each element of multinomial'input must >= 0, but got %f.", - weight)); - if (weight == static_cast(0)) { - zero_num++; - } + for (size_t i = 0; i < num_distributions; ++i) { + int zero_num = 0; + for (size_t j = 0; j < num_categories; ++j) { + T weight = cpu_in_data[i * num_distributions + j]; + PADDLE_ENFORCE_GE( + weight, + 0, + errors::InvalidArgument( + "Each element of multinomial'input must >= 0, but got %f.", + weight)); + if (weight == static_cast(0)) { + zero_num++; } - int valid_samples = num_categories - zero_num; - PADDLE_ENFORCE_LE( - num_samples, - valid_samples, - errors::InvalidArgument("When replacement=False, 'num_samples' " - "must less than or eaqual to the number of " - "positive item of input")); } - - // Refer to [gumbel softmax algorithm] - DenseTensor rand = EmptyLike(dev_ctx, x); - T* rand_data = rand.data(); - funcs::uniform_distribution dist; - funcs::exponential_transform trans(1.0); - funcs::distribution_and_transform(dev_ctx, &rand, dist, trans); - - funcs::ForRange for_range(dev_ctx, x.numel()); - for_range([rand_data, in_data] __device__(size_t idx) { - rand_data[idx] = in_data[idx] / rand_data[idx]; - }); - - if (num_samples == 1) { - ArgMaxKernel( - dev_ctx, rand, -1, true, false, 3 /*proto::VarType::INT64*/, out); - } else { - std::vector out_dim_vec = vectorize(out->dims()); - DenseTensor value = Empty(dev_ctx, IntArray(out_dim_vec)); - TopkKernel( - dev_ctx, rand, Scalar(num_samples), -1, true, true, &value, out); - } - return; + int valid_samples = num_categories - zero_num; + PADDLE_ENFORCE_LE( + num_samples, + valid_samples, + errors::InvalidArgument("When replacement=False, 'num_samples' " + "must less than or eaqual to the number of " + "positive item of input")); } - funcs::MultinomialFunctor(dev_ctx, - cpu_out_data, - cpu_in_data, - num_samples, - replacement, - num_categories, - num_distributions); - -#ifdef PADDLE_WITH_HIP - hipMemcpy(out_data, - cpu_out_data, - out_data_numel * sizeof(int64_t), - hipMemcpyHostToDevice); -#else - cudaMemcpy(out_data, - cpu_out_data, - out_data_numel * sizeof(int64_t), - cudaMemcpyHostToDevice); -#endif - - delete[] cpu_in_data; - delete[] cpu_out_data; + // Refer to [gumbel softmax algorithm] + DenseTensor rand = EmptyLike(dev_ctx, x); + T* rand_data = rand.data(); + funcs::uniform_distribution dist; + funcs::exponential_transform trans(1.0); + funcs::distribution_and_transform(dev_ctx, &rand, dist, trans); + + funcs::ForRange for_range(dev_ctx, x.numel()); + for_range([rand_data, in_data] __device__(size_t idx) { + rand_data[idx] = in_data[idx] / rand_data[idx]; + }); + + if (num_samples == 1) { + ArgMaxKernel( + dev_ctx, rand, -1, true, false, 3 /*proto::VarType::INT64*/, out); + } else { + std::vector out_dim_vec = vectorize(out->dims()); + DenseTensor value = Empty(dev_ctx, IntArray(out_dim_vec)); + TopkKernel( + dev_ctx, rand, Scalar(num_samples), -1, true, true, &value, out); + } return; } @@ -322,44 +251,18 @@ void MultinomialKernel(const Context& dev_ctx, auto* cumulative_probs_data = dev_ctx.template 
Alloc(&cumulative_probs_tensor); - if (FLAGS_use_curand) { - // 'phi::funcs::InclusiveScan' has higher accuracy than - // 'thrust::inclusive_scan' - funcs::InclusiveScan>( - /*in*/ norm_probs_data, - /*out*/ cumulative_probs_data, - /*outer_dim*/ static_cast(num_distributions), - /*mid_dim*/ static_cast(num_categories), - /*inner_dim*/ static_cast(1), - /*init*/ static_cast(0), - std::plus(), - /*reverse=*/false, - dev_ctx); - } else { - dim3 block_cumsum(1); - dim3 grid_cumsum(num_distributions); - GetCumulativeProbs<<>>( - norm_probs_data, - num_distributions, - num_categories, - cumulative_probs_data); - } - - // Generate random number for each sample. - std::random_device rd; - auto seed = rd(); - - DenseTensor rng_data_tensor; - rng_data_tensor.Resize({num_distributions, num_samples}); - auto* rng_data = dev_ctx.template Alloc(&rng_data_tensor); - - thrust::counting_iterator index_sequence_begin(0); - paddle::platform::Transform trans; - trans(dev_ctx, - index_sequence_begin, - index_sequence_begin + num_distributions * num_samples, - rng_data, - RandomGeneratorCudaFunctor(seed)); + // 'phi::funcs::InclusiveScan' has higher accuracy than + // 'thrust::inclusive_scan' + funcs::InclusiveScan>( + /*in*/ norm_probs_data, + /*out*/ cumulative_probs_data, + /*outer_dim*/ static_cast(num_distributions), + /*mid_dim*/ static_cast(num_categories), + /*inner_dim*/ static_cast(1), + /*init*/ static_cast(0), + std::plus(), + /*reverse=*/false, + dev_ctx); // Sample the multinomial distributions. dim3 block(128); @@ -376,7 +279,6 @@ void MultinomialKernel(const Context& dev_ctx, auto seed_offset = gen_cuda->IncrementOffset(increment); sampleMultinomialWithReplacement<<>>( - rng_data, num_samples, out_data, num_distributions, @@ -384,8 +286,7 @@ void MultinomialKernel(const Context& dev_ctx, cumulative_probs_data, norm_probs_data, seed_offset.first, - seed_offset.second, - FLAGS_use_curand); + seed_offset.second); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/randint_kernel.cu b/paddle/phi/kernels/gpu/randint_kernel.cu index 01885050022688..90eaea6a0868c1 100644 --- a/paddle/phi/kernels/gpu/randint_kernel.cu +++ b/paddle/phi/kernels/gpu/randint_kernel.cu @@ -23,8 +23,6 @@ // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/memory/memcpy.h" -DECLARE_bool(use_curand); - namespace phi { template @@ -37,37 +35,9 @@ void RandintRawKernel(const Context& dev_ctx, DenseTensor* out) { out->Resize(phi::make_ddim(shape.GetData())); T* data = dev_ctx.template Alloc(out); - if (FLAGS_use_curand) { - funcs::uniform_distribution dist; - funcs::uniform_int_transform trans(low, high); - funcs::distribution_and_transform(dev_ctx, out, dist, trans); - } else { - DenseTensor tmp; - tmp.Resize(phi::make_ddim(shape.GetData())); - T* tmp_data = dev_ctx.template HostAlloc(&tmp); - - std::shared_ptr engine; - if (seed) { - engine = std::make_shared(); - engine->seed(seed); - } else { - engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); - } - - std::uniform_int_distribution dist(low, high - 1); - auto numel = out->numel(); - for (int64_t i = 0; i < numel; ++i) { - tmp_data[i] = dist(*engine); - } - - paddle::memory::Copy( - out->place(), - data, - tmp.place(), - tmp_data, - numel * paddle::experimental::SizeOf(out->dtype()), - 0); - } + funcs::uniform_distribution dist; + funcs::uniform_int_transform trans(low, high); + funcs::distribution_and_transform(dev_ctx, out, dist, trans); } template diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu index 678b580beca2f6..4e488ed470df92 100644 --- a/paddle/phi/kernels/gpu/randperm_kernel.cu +++ b/paddle/phi/kernels/gpu/randperm_kernel.cu @@ -84,91 +84,65 @@ __global__ void SwapRepeatKernel( template void RandpermRawKernel( const Context& dev_ctx, int n, DataType dtype, int seed, DenseTensor* out) { - if (FLAGS_use_curand) { - DenseTensor key; - RandintKernel(dev_ctx, - std::numeric_limits::min(), - std::numeric_limits::max(), - IntArray({n}), - phi::DataType::INT32, - &key); - DenseTensor key_out = Empty(dev_ctx, IntArray({n})); - - DenseTensor range = Empty(dev_ctx, IntArray({n})); - T* range_data = range.data(); - funcs::ForRange for_range(dev_ctx, n); - for_range([range_data] __device__(size_t idx) { - range_data[idx] = static_cast(idx); - }); - - out->Resize(phi::make_ddim({n})); - T* out_data = dev_ctx.template Alloc(out); - - // Refer to [Algorithm of randperm] https://osf.io/af2hy/ to - // improve performance of radix sort. - double n_d = static_cast(n); - int begin_bit = 0; - int end_bit = - std::ceil(std::log2(n_d - (6 * n_d * n_d + 1) / (12 * std::log(0.9)))); - - size_t temp_storage_bytes = 0; - cub::DeviceRadixSort::SortPairs(nullptr, - temp_storage_bytes, - key.data(), - key_out.data(), - range.data(), - out_data, - n, - begin_bit, - end_bit < 32 ? end_bit : 32, - dev_ctx.stream()); - - auto d_temp_storage = paddle::memory::Alloc(dev_ctx, temp_storage_bytes); - cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(), - temp_storage_bytes, - key.data(), - key_out.data(), - range.data(), - out_data, - n, - begin_bit, - end_bit < 32 ? 
end_bit : 32, - dev_ctx.stream()); - - auto gen_cuda = dev_ctx.GetGenerator(); - auto seed_offset = gen_cuda->IncrementOffset(n); - uint64_t seed = seed_offset.first; - uint64_t offset = seed_offset.second; - - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n); - SwapRepeatKernel<<>>( - key_out.data(), out_data, n, seed, offset); - } else { - DenseTensor tmp; - tmp.Resize(phi::make_ddim({n})); - T* tmp_data = dev_ctx.template HostAlloc(&tmp); - - std::shared_ptr engine; - if (seed) { - engine = std::make_shared(); - engine->seed(seed); - } else { - engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); - } - - for (int i = 0; i < n; ++i) { - tmp_data[i] = static_cast(i); - } - std::shuffle(tmp_data, tmp_data + n, *engine); - - T* out_data = dev_ctx.template Alloc(out); - auto size = out->numel() * paddle::experimental::SizeOf(out->dtype()); - paddle::memory::Copy( - out->place(), out_data, tmp.place(), tmp_data, size, 0); - } + DenseTensor key; + RandintKernel(dev_ctx, + std::numeric_limits::min(), + std::numeric_limits::max(), + IntArray({n}), + phi::DataType::INT32, + &key); + DenseTensor key_out = Empty(dev_ctx, IntArray({n})); + + DenseTensor range = Empty(dev_ctx, IntArray({n})); + T* range_data = range.data(); + funcs::ForRange for_range(dev_ctx, n); + for_range([range_data] __device__(size_t idx) { + range_data[idx] = static_cast(idx); + }); + + out->Resize(phi::make_ddim({n})); + T* out_data = dev_ctx.template Alloc(out); + + // Refer to [Algorithm of randperm] https://osf.io/af2hy/ to + // improve performance of radix sort. + double n_d = static_cast(n); + int begin_bit = 0; + int end_bit = + std::ceil(std::log2(n_d - (6 * n_d * n_d + 1) / (12 * std::log(0.9)))); + + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairs(nullptr, + temp_storage_bytes, + key.data(), + key_out.data(), + range.data(), + out_data, + n, + begin_bit, + end_bit < 32 ? end_bit : 32, + dev_ctx.stream()); + + auto d_temp_storage = paddle::memory::Alloc(dev_ctx, temp_storage_bytes); + cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(), + temp_storage_bytes, + key.data(), + key_out.data(), + range.data(), + out_data, + n, + begin_bit, + end_bit < 32 ? 
end_bit : 32, + dev_ctx.stream()); + + auto gen_cuda = dev_ctx.GetGenerator(); + auto seed_offset = gen_cuda->IncrementOffset(n); + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n); + SwapRepeatKernel<<>>( + key_out.data(), out_data, n, seed_offset.first, seed_offset.second); } template diff --git a/paddle/phi/kernels/gpu/size_kernel.cu b/paddle/phi/kernels/gpu/size_kernel.cu index 17a39944eb04f5..7051fb78c7587f 100644 --- a/paddle/phi/kernels/gpu/size_kernel.cu +++ b/paddle/phi/kernels/gpu/size_kernel.cu @@ -22,6 +22,7 @@ PD_REGISTER_KERNEL(size, GPU, ALL_LAYOUT, phi::SizeKernel, + int16_t, int, int64_t, phi::dtype::float16, diff --git a/paddle/phi/kernels/gpu/uniform_random_kernel.cu b/paddle/phi/kernels/gpu/uniform_random_kernel.cu index 2cabde0bbf9425..a4aea10cfe762f 100644 --- a/paddle/phi/kernels/gpu/uniform_random_kernel.cu +++ b/paddle/phi/kernels/gpu/uniform_random_kernel.cu @@ -14,14 +14,13 @@ #include "paddle/phi/kernels/uniform_random_kernel.h" +#include #include "gflags/gflags.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" -DECLARE_bool(use_curand); - namespace phi { template @@ -54,43 +53,6 @@ struct UniformGenerator { } }; -template -struct UniformGeneratorOffset { - T min_, max_; - unsigned int seed_; - T diag_val_; - unsigned int diag_num_; - unsigned int diag_step_; - int offset_; - __host__ __device__ UniformGeneratorOffset(T min, - T max, - int seed, - int diag_num, - int diag_step, - T diag_val, - int offset) - : min_(min), - max_(max), - seed_(seed), - diag_num_(diag_num), - diag_step_(diag_step), - diag_val_(diag_val), - offset_(offset) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n + offset_); - T out = dist(rng); - unsigned int remainder = n % (diag_step_ + 1); - if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { - out = diag_val_; - } - return out; - } -}; - template void UniformRandomRawKernel(const Context& dev_ctx, const IntArray& shape, @@ -114,23 +76,10 @@ void UniformRandomRawKernel(const Context& dev_ctx, auto generator = dev_ctx.GetGenerator(); if (generator->GetIsInitPy() && seed_flag) { - if (FLAGS_use_curand) { - using MT = typename kps::details::MPTypeTrait::Type; - funcs::uniform_distribution dist; - funcs::uniform_real_transform trans(min, max); - funcs::distribution_and_transform(dev_ctx, out, dist, trans); - } else { - auto seed_offset = generator->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - auto func = UniformGeneratorOffset(min, - max, - seed_offset.first, - diag_num, - diag_step, - diag_val, - gen_offset); - IndexKernel>(dev_ctx, out, func); - } + using MT = typename kps::details::MPTypeTrait::Type; + funcs::uniform_distribution dist; + funcs::uniform_real_transform trans(min, max); + funcs::distribution_and_transform(dev_ctx, out, dist, trans); } else { auto func = UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu index 9c5e77d5fd8466..74525e63f476b2 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu @@ -289,21 +289,17 @@ void ConvCudnnGradGradKernel( dtype}; #ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t fwd_algo1 = static_cast(0); - 
miopenConvFwdAlgorithm_t fwd_algo2 = static_cast(0); - miopenConvBwdDataAlgorithm_t data_algo = - static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); + paddle::operators::SearchResult fwd_result1; + paddle::operators::SearchResult fwd_result2; + paddle::operators::SearchResult data_result; + paddle::operators::SearchResult + filter_result; #else - cudnnConvolutionFwdAlgo_t fwd_algo1 = - static_cast(0); - cudnnConvolutionFwdAlgo_t fwd_algo2 = - static_cast(0); - cudnnConvolutionBwdDataAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); + paddle::operators::SearchResult fwd_result1; + paddle::operators::SearchResult fwd_result2; + paddle::operators::SearchResult data_result; + paddle::operators::SearchResult + filter_result; #endif auto layout = paddle::platform::GetCudnnTensorFormat( @@ -332,13 +328,13 @@ void ConvCudnnGradGradKernel( using search1 = paddle::operators::SearchAlgorithm; workspace_size = search1::GetWorkspaceSize(args1); - fwd_algo1 = search1::Find( + fwd_result1.algo = search1::Find( args1, exhaustive_search, false, workspace_size, ctx); #else using search1 = paddle::operators::SearchAlgorithm; - fwd_algo1 = search1::Find(args1, exhaustive_search, false, ctx); - workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1); + fwd_result1 = search1::Find(args1, exhaustive_search, false, ctx); + workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo); #endif } @@ -360,14 +356,14 @@ void ConvCudnnGradGradKernel( paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - fwd_algo2 = search2::Find( + fwd_result2.algo = search2::Find( args2, exhaustive_search, false, workspace_size, ctx); #else using search2 = paddle::operators::SearchAlgorithm; - fwd_algo2 = search2::Find(args2, exhaustive_search, false, ctx); - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2, fwd_algo2)); + fwd_result2 = search2::Find(args2, exhaustive_search, false, ctx); + workspace_size = std::max( + workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo)); #endif } } @@ -389,15 +385,15 @@ void ConvCudnnGradGradKernel( using search3 = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); - filter_algo = search3::Find( + filter_result.algo = search3::Find( args3, exhaustive_search, deterministic, workspace_size, ctx); #else using search3 = paddle::operators::SearchAlgorithm; - filter_algo = + filter_result = search3::Find(args3, exhaustive_search, deterministic, ctx); - workspace_size = - std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); + workspace_size = std::max( + workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); #endif } @@ -419,14 +415,15 @@ void ConvCudnnGradGradKernel( using search4 = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_algo = search4::Find( + data_result.algo = search4::Find( args4, exhaustive_search, deterministic, workspace_size, ctx); #else using search4 = paddle::operators::SearchAlgorithm; - data_algo = search4::Find(args4, exhaustive_search, deterministic, ctx); - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); + data_result = + search4::Find(args4, exhaustive_search, deterministic, ctx); + workspace_size = std::max( + workspace_size, search4::GetWorkspaceSize(args4, data_result.algo)); 
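// Sketch of the paddle::operators::SearchResult wrapper that replaces the bare
// cudnn/miopen algorithm enums throughout these conv kernels: Find() hands back the
// chosen algorithm together with bookkeeping such as the measured time that is
// VLOG'ed in the conv grad kernel. The real definition lives in the fluid conv
// helper headers and may differ; this is only an assumed shape for readability.
template <typename AlgoT>
struct SearchResultSketch {
  AlgoT algo = static_cast<AlgoT>(0);  // algorithm selected by the search
  float time = -1.0f;                  // time measured when exhaustive search runs
  size_t workspace_size = 0;           // workspace required by the chosen algorithm
};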
#endif } @@ -471,7 +468,7 @@ void ConvCudnnGradGradKernel( args1.wdesc.desc(), w, args1.cdesc.desc(), - fwd_algo1, + fwd_result1.algo, &beta, args1.odesc.desc(), transformed_ddy_channel, @@ -492,7 +489,7 @@ void ConvCudnnGradGradKernel( args1.wdesc.desc(), w + i * group_offset_filter, args1.cdesc.desc(), - fwd_algo1, + fwd_result1.algo, workspace_ptr, workspace_size, &beta, @@ -517,7 +514,7 @@ void ConvCudnnGradGradKernel( args2.wdesc.desc(), ddw, args2.cdesc.desc(), - fwd_algo2, + fwd_result2.algo, &beta, args2.odesc.desc(), transformed_ddy_channel, @@ -538,7 +535,7 @@ void ConvCudnnGradGradKernel( args2.wdesc.desc(), ddw + i * group_offset_filter, args2.cdesc.desc(), - fwd_algo2, + fwd_result2.algo, workspace_ptr, workspace_size, &alpha, @@ -568,7 +565,7 @@ void ConvCudnnGradGradKernel( args3.idesc.desc(), ddx, args3.cdesc.desc(), - filter_algo, + filter_result.algo, &beta, args3.wdesc.desc(), dw, @@ -589,7 +586,7 @@ void ConvCudnnGradGradKernel( args3.odesc.desc(), transformed_dy_channel + i * group_offset_out, args3.cdesc.desc(), - filter_algo, + filter_result.algo, workspace_ptr, workspace_size, &beta, @@ -615,7 +612,7 @@ void ConvCudnnGradGradKernel( args4.wdesc.desc(), ddw, args4.cdesc.desc(), - data_algo, + data_result.algo, &beta, args4.idesc.desc(), transformed_dx, @@ -636,7 +633,7 @@ void ConvCudnnGradGradKernel( args4.odesc.desc(), transformed_dy_channel + i * group_offset_out, args4.cdesc.desc(), - data_algo, + data_result.algo, workspace_ptr, workspace_size, &beta, diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index e09c33380b307d..3696ab08ea83e6 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -322,17 +322,16 @@ void ConvCudnnGradKernel(const Context& ctx, int group_offset_in = i_c / groups * i_h * i_w * i_d; int group_offset_out = o_c / groups * o_h * o_w * o_d; int group_offset_filter = transformed_filter_channel.numel() / groups; + // ------------------- cudnn backward algorithm --------------------- #ifdef PADDLE_WITH_HIP - miopenConvBwdDataAlgorithm_t data_algo = - static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); + paddle::operators::SearchResult bwd_result; + paddle::operators::SearchResult + filter_result; #else - cudnnConvolutionBwdDataAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); + paddle::operators::SearchResult bwd_result; + paddle::operators::SearchResult + filter_result; #endif // input data workspace_size size_t workspace_size_d = 0; @@ -368,14 +367,14 @@ void ConvCudnnGradKernel(const Context& ctx, paddle::operators::SearchAlgorithm; workspace_size_d = std::max(workspace_size_d, search1::GetWorkspaceSize(args1)); - data_algo = search1::Find( + bwd_result.algo = search1::Find( args1, exhaustive_search, deterministic, workspace_size_d, ctx); #else using search1 = paddle::operators::SearchAlgorithm; - data_algo = search1::Find(args1, exhaustive_search, deterministic, ctx); - workspace_size_d = - std::max(workspace_size_d, search1::GetWorkspaceSize(args1, data_algo)); + bwd_result = search1::Find(args1, exhaustive_search, deterministic, ctx); + workspace_size_d = std::max( + workspace_size_d, search1::GetWorkspaceSize(args1, bwd_result.algo)); #endif } @@ -397,15 +396,17 @@ void ConvCudnnGradKernel(const Context& ctx, paddle::operators::SearchAlgorithm; workspace_size_w = std::max(workspace_size_w, search2::GetWorkspaceSize(args2)); - filter_algo = 
search2::Find( + filter_result.algo = search2::Find( args2, exhaustive_search, deterministic, workspace_size_w, ctx); #else using search2 = paddle::operators::SearchAlgorithm; - filter_algo = + filter_result = search2::Find(args2, exhaustive_search, deterministic, ctx); - workspace_size_w = std::max(workspace_size_w, - search2::GetWorkspaceSize(args2, filter_algo)); + VLOG(3) << "filter algo: " << filter_result.algo << ", time " + << filter_result.time; + workspace_size_w = std::max( + workspace_size_w, search2::GetWorkspaceSize(args2, filter_result.algo)); #endif } @@ -439,7 +440,7 @@ void ConvCudnnGradKernel(const Context& ctx, args1.wdesc.desc(), filter_data, args1.cdesc.desc(), - data_algo, + bwd_result.algo, &beta, args1.idesc.desc(), temp_tensor_data, @@ -471,7 +472,7 @@ void ConvCudnnGradKernel(const Context& ctx, args1.wdesc.desc(), filter_data, args1.cdesc.desc(), - data_algo, + bwd_result.algo, &beta, args1.idesc.desc(), transformed_input_grad_data, @@ -494,7 +495,7 @@ void ConvCudnnGradKernel(const Context& ctx, args1.odesc.desc(), output_grad_data + i * group_offset_out, args1.cdesc.desc(), - data_algo, + bwd_result.algo, cudnn_workspace_ptr, workspace_size_d, &beta, @@ -554,7 +555,7 @@ void ConvCudnnGradKernel(const Context& ctx, args2.idesc.desc(), input_data, args2.cdesc.desc(), - filter_algo, + filter_result.algo, &beta, args2.wdesc.desc(), filter_grad_data, @@ -575,7 +576,7 @@ void ConvCudnnGradKernel(const Context& ctx, args2.odesc.desc(), output_grad_data + i * group_offset_out, args2.cdesc.desc(), - filter_algo, + filter_result.algo, cudnn_workspace_ptr, workspace_size_w, &beta_filter, @@ -626,6 +627,39 @@ void Conv3DCudnnGradKernel(const Context& dev_ctx, filter_grad); } +template +void DepthwiseConvCudnnGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + ConvCudnnGradKernel(dev_ctx, + input, + filter, + out_grad, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + input_grad, + filter_grad); +} + } // namespace phi #ifdef PADDLE_WITH_HIP @@ -642,6 +676,13 @@ PD_REGISTER_KERNEL(conv3d_grad, phi::Conv3DCudnnGradKernel, float, phi::dtype::float16) {} + +PD_REGISTER_KERNEL(depthwise_conv2d_grad, + GPUDNN, + ALL_LAYOUT, + phi::DepthwiseConvCudnnGradKernel, + float, + phi::dtype::float16) {} #else #if CUDNN_VERSION_MIN(8, 1, 0) PD_REGISTER_KERNEL(conv2d_grad, diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index c2970cc8cde751..d40cbecaee6d51 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -18,7 +18,6 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/fluid/framework/eigen.h" #ifdef PADDLE_WITH_HIP #include "paddle/fluid/operators/conv_miopen_helper.h" #else @@ -68,7 +67,6 @@ void ConvCudnnKernel(const Context& ctx, "FLAGS_cudnn_deterministic True at same time.")); const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - auto dtype = paddle::platform::CudnnDataType::type; #ifdef PADDLE_WITH_HIP @@ -309,17 +307,17 @@ void 
ConvCudnnKernel(const Context& ctx, size_t workspace_size = 0; // final workspace to allocate. // ------------------- cudnn conv algorithm --------------------- #ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t algo{}; + paddle::operators::SearchResult fwd_result; using search = paddle::operators::SearchAlgorithm; workspace_size = search::GetWorkspaceSize(args); - algo = search::Find( + fwd_result.algo = search::Find( args, exhaustive_search, deterministic, workspace_size, ctx); #else - cudnnConvolutionFwdAlgo_t algo{}; + paddle::operators::SearchResult fwd_result; using search = paddle::operators::SearchAlgorithm; - algo = search::Find(args, exhaustive_search, deterministic, ctx); - workspace_size = search::GetWorkspaceSize(args, algo); + fwd_result = search::Find(args, exhaustive_search, deterministic, ctx); + workspace_size = search::GetWorkspaceSize(args, fwd_result.algo); #endif #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) @@ -328,7 +326,7 @@ void ConvCudnnKernel(const Context& ctx, // in forward computation, so change the algorithm to CUDNN_CONVOLUTION_\ // FWD_ALGO_IMPLICIT_GEMM manually. if (groups > 1) { - algo = static_cast(0); + fwd_result.algo = static_cast(0); } #endif @@ -352,7 +350,7 @@ void ConvCudnnKernel(const Context& ctx, args.wdesc.desc(), filter_data, args.cdesc.desc(), - algo, + fwd_result.algo, &beta, args.odesc.desc(), output_data, @@ -373,7 +371,7 @@ void ConvCudnnKernel(const Context& ctx, args.wdesc.desc(), filter_data + i * group_offset_filter, args.cdesc.desc(), - algo, + fwd_result.algo, workspace_ptr, workspace_size, &beta, @@ -418,6 +416,35 @@ void Conv3DCudnnKernel(const Context& dev_ctx, out); } +template +void DepthwiseConvCudnnKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* out) { + ConvCudnnKernel(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + out); +} + } // namespace phi #ifdef PADDLE_WITH_HIP @@ -434,6 +461,14 @@ PD_REGISTER_KERNEL(conv3d, phi::Conv3DCudnnKernel, float, phi::dtype::float16) {} + +PD_REGISTER_KERNEL(depthwise_conv2d, + GPUDNN, + ALL_LAYOUT, + phi::DepthwiseConvCudnnKernel, + float, + phi::dtype::float16) {} + #else #if CUDNN_VERSION_MIN(8, 1, 0) PD_REGISTER_KERNEL(conv2d, diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu index 2893bd74b1bce6..601ac43eeefd3c 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu @@ -188,11 +188,13 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, dtype}; #ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t data_algo{}; - miopenConvBwdWeightsAlgorithm_t filter_algo{}; + paddle::operators::SearchResult fwd_result; + paddle::operators::SearchResult + filter_result; #else - cudnnConvolutionFwdAlgo_t data_algo{}; - cudnnConvolutionBwdFilterAlgo_t filter_algo{}; + paddle::operators::SearchResult fwd_result; + paddle::operators::SearchResult + filter_result; #endif auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout); @@ -218,14 +220,14 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, using 
search1 = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); - data_algo = + fwd_result.algo = search1::Find(args1, false, deterministic, workspace_size, ctx); #else using search1 = paddle::operators::SearchAlgorithm; - data_algo = search1::Find(args1, false, deterministic, ctx); - workspace_size = - std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); + fwd_result = search1::Find(args1, false, deterministic, ctx); + workspace_size = std::max( + workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo)); #endif } @@ -245,14 +247,14 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, using search2 = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - filter_algo = + filter_result.algo = search2::Find(args2, false, deterministic, workspace_size, ctx); #else using search2 = paddle::operators::SearchAlgorithm; - filter_algo = search2::Find(args2, false, deterministic, ctx); - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2, filter_algo)); + filter_result = search2::Find(args2, false, deterministic, ctx); + workspace_size = std::max( + workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo)); #endif } @@ -278,7 +280,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, args1.wdesc.desc(), filter_data + filter_offset * g, args1.cdesc.desc(), - data_algo, + fwd_result.algo, &beta, args1.odesc.desc(), dx_data + x_offset * g, @@ -295,7 +297,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, args1.wdesc.desc(), filter_data + filter_offset * g, args1.cdesc.desc(), - data_algo, + fwd_result.algo, cudnn_workspace, workspace_size, &beta, @@ -338,7 +340,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, args2.idesc.desc(), dout_data + dout_offset * g, args2.cdesc.desc(), - filter_algo, + filter_result.algo, &beta, args2.wdesc.desc(), dfilter_data + filter_offset * g, @@ -355,7 +357,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, args2.odesc.desc(), x_data + x_offset * g, args2.cdesc.desc(), - filter_algo, + filter_result.algo, cudnn_workspace, workspace_size, &beta, @@ -653,22 +655,17 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( dilations_, dtype}; #ifdef PADDLE_WITH_HIP - miopenConvBwdDataAlgorithm_t bwd_algo1 = - static_cast(0); - miopenConvBwdDataAlgorithm_t bwd_algo2 = - static_cast(0); - miopenConvFwdAlgorithm_t data_algo = static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); + paddle::operators::SearchResult bwd_result1; + paddle::operators::SearchResult bwd_result2; + paddle::operators::SearchResult + filter_result; + paddle::operators::SearchResult fwd_result; #else - cudnnConvolutionBwdDataAlgo_t bwd_algo1 = - static_cast(0); - cudnnConvolutionBwdDataAlgo_t bwd_algo2 = - static_cast(0); - cudnnConvolutionFwdAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); + paddle::operators::SearchResult bwd_result1; + paddle::operators::SearchResult bwd_result2; + paddle::operators::SearchResult + filter_result; + paddle::operators::SearchResult fwd_result; #endif auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW); @@ -696,13 +693,13 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( using search1 = paddle::operators::SearchAlgorithm; workspace_size = search1::GetWorkspaceSize(args1); - bwd_algo1 = + bwd_result1.algo = search1::Find(args1, false, 
deterministic, workspace_size, ctx); #else using search1 = paddle::operators::SearchAlgorithm; - bwd_algo1 = search1::Find(args1, false, deterministic, ctx); - workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1); + bwd_result1 = search1::Find(args1, false, deterministic, ctx); + workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo); #endif ddfilter_ = ddfilter.data(); @@ -720,14 +717,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( using search2 = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - bwd_algo2 = + bwd_result2.algo = search2::Find(args2, false, deterministic, workspace_size, ctx); #else using search2 = paddle::operators::SearchAlgorithm; - bwd_algo2 = search2::Find(args2, false, deterministic, ctx); - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2, bwd_algo2)); + bwd_result2 = search2::Find(args2, false, deterministic, ctx); + workspace_size = std::max( + workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo)); #endif } @@ -736,9 +733,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args3.handle = handle; args3.idesc.set(transformed_dout, iwo_group); args3.wdesc.set(*dfilter, layout, iwo_group); - args3.odesc.set(transformed_ddx_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, @@ -749,14 +744,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( using search3 = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); - filter_algo = + filter_result.algo = search3::Find(args3, false, deterministic, workspace_size, ctx); #else using search3 = paddle::operators::SearchAlgorithm; - filter_algo = search3::Find(args3, false, deterministic, ctx); - workspace_size = - std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); + filter_result = search3::Find(args3, false, deterministic, ctx); + workspace_size = std::max( + workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); #endif } @@ -777,14 +772,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( using search4 = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_algo = + fwd_result.algo = search4::Find(args4, false, deterministic, workspace_size, ctx); #else using search4 = paddle::operators::SearchAlgorithm; - data_algo = search4::Find(args4, false, deterministic, ctx); - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); + fwd_result = search4::Find(args4, false, deterministic, ctx); + workspace_size = std::max( + workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo)); #endif } @@ -831,7 +826,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args1.wdesc.desc(), filter_ + i * group_offset_filter, args1.cdesc.desc(), - bwd_algo1, + bwd_result1.algo, &beta, args1.idesc.desc(), transformed_ddout_channel_ + i * group_offset_out, @@ -850,7 +845,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args1.odesc.desc(), ddx_ + i * group_offset_in, args1.cdesc.desc(), - bwd_algo1, + bwd_result1.algo, workspace_ptr, workspace_size, &beta, @@ -877,7 +872,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args2.wdesc.desc(), ddfilter_ + i * group_offset_filter, args2.cdesc.desc(), - bwd_algo2, + bwd_result2.algo, &beta, args2.idesc.desc(), conv_x_ddfilter_data + i * group_offset_out, @@ -908,7 +903,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args2.odesc.desc(), x_ + i * group_offset_in, 
args2.cdesc.desc(), - bwd_algo2, + bwd_result2.algo, workspace_ptr, workspace_size, &alpha, @@ -964,7 +959,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args3.idesc.desc(), transformed_dout_channel_ + i * group_offset_out, args3.cdesc.desc(), - filter_algo, + filter_result.algo, &beta, args3.wdesc.desc(), dfilter_ + i * group_offset_filter, @@ -983,7 +978,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args3.odesc.desc(), ddx_ + i * group_offset_in, args3.cdesc.desc(), - filter_algo, + filter_result.algo, workspace_ptr, workspace_size, &beta, @@ -1009,7 +1004,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args4.wdesc.desc(), ddfilter_ + i * group_offset_filter, args4.cdesc.desc(), - data_algo, + fwd_result.algo, &beta, args4.odesc.desc(), transformed_dx_ + i * group_offset_in, @@ -1028,7 +1023,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( args4.wdesc.desc(), ddfilter_ + i * group_offset_filter, args4.cdesc.desc(), - data_algo, + fwd_result.algo, workspace_ptr, workspace_size, &beta, diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu index 5de2df4a70c88e..ce02a00162b579 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu @@ -217,16 +217,19 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, c_groups); #ifdef PADDLE_WITH_HIP + paddle::operators::SearchResult bwd_result; using search = paddle::operators::SearchAlgorithm; workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); - algo = search::Find(args, false, deterministic, workspace_size, ctx); + bwd_result.algo = + search::Find(args, false, deterministic, workspace_size, ctx); #else + paddle::operators::SearchResult bwd_result; using search = paddle::operators::SearchAlgorithm; - algo = search::Find(args, false, deterministic, ctx); + bwd_result = search::Find(args, false, deterministic, ctx); workspace_size = - std::max(workspace_size, search::GetWorkspaceSize(args, algo)); + std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo)); #endif // ------------------- cudnn conv transpose forward --------------------- @@ -247,7 +250,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, args.wdesc.desc(), filter_data + filter_offset * g, args.cdesc.desc(), - algo, + bwd_result.algo, &beta, args.idesc.desc(), transformed_out_data + out_offset * g, @@ -264,7 +267,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, args.odesc.desc(), x_data + x_offset * g, args.cdesc.desc(), - algo, + bwd_result.algo, cudnn_workspace, workspace_size, &beta, diff --git a/paddle/phi/kernels/impl/conv_cudnn_impl.h b/paddle/phi/kernels/impl/conv_cudnn_impl.h index 93bc5b64adc170..5cf59fe01920aa 100644 --- a/paddle/phi/kernels/impl/conv_cudnn_impl.h +++ b/paddle/phi/kernels/impl/conv_cudnn_impl.h @@ -36,7 +36,7 @@ #include "paddle/phi/kernels/funcs/batch_norm_utils.h" DECLARE_bool(cudnn_deterministic); -DECLARE_uint64(conv_workspace_size_limit); +DECLARE_int64(conv_workspace_size_limit); DECLARE_bool(cudnn_exhaustive_search); namespace phi { diff --git a/paddle/phi/kernels/impl/multi_dot_kernel_impl.h b/paddle/phi/kernels/impl/multi_dot_kernel_impl.h index 0833e94fe2c189..039b056200fddf 100644 --- a/paddle/phi/kernels/impl/multi_dot_kernel_impl.h +++ b/paddle/phi/kernels/impl/multi_dot_kernel_impl.h @@ -339,8 +339,8 @@ void MultiDotGradMatChainOrder(const Context& ctx, template void MultiDotGradKernel(const Context& ctx, - const DenseTensor& out_grad, const std::vector& x, + 
const DenseTensor& out_grad, std::vector x_grad) { auto ins = x; auto dout = out_grad; diff --git a/paddle/phi/kernels/multi_dot_grad_kernel.h b/paddle/phi/kernels/multi_dot_grad_kernel.h index e6d8ecd744e12a..f495c704520799 100644 --- a/paddle/phi/kernels/multi_dot_grad_kernel.h +++ b/paddle/phi/kernels/multi_dot_grad_kernel.h @@ -20,8 +20,8 @@ namespace phi { template void MultiDotGradKernel(const Context& ctx, - const DenseTensor& out_grad, const std::vector& x, + const DenseTensor& out_grad, std::vector x_grad); } // namespace phi diff --git a/paddle/phi/ops/compat/multi_dot_sig.cc b/paddle/phi/ops/compat/multi_dot_sig.cc index 598cbd980f3cc5..2e05bd6d1557ac 100644 --- a/paddle/phi/ops/compat/multi_dot_sig.cc +++ b/paddle/phi/ops/compat/multi_dot_sig.cc @@ -19,7 +19,7 @@ namespace phi { KernelSignature MultiDotGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( - "multi_dot_grad", {GradVarName("Out"), "X"}, {}, {GradVarName("X")}); + "multi_dot_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")}); } } // namespace phi diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index cc55ea82df6082..21df60e9721214 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -657,7 +657,6 @@ for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%# set start=%start:~4,10% set FLAGS_call_stack_level=2 -set FLAGS_use_curand=True dir %THIRD_PARTY_PATH:/=\%\install\openblas\lib dir %THIRD_PARTY_PATH:/=\%\install\openblas\bin dir %THIRD_PARTY_PATH:/=\%\install\zlib\bin diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index d1220e45375825..e8bde467e085d6 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -61,8 +61,6 @@ function init() { # NOTE(chenweihang): For easy debugging, CI displays the C++ error stacktrace by default export FLAGS_call_stack_level=2 - export FLAGS_use_curand=True - # set CI_SKIP_CPP_TEST if only *.py changed # In order to avoid using in some CI(such as daily performance), the current # branch must not be `${BRANCH}` which is usually develop. 
diff --git a/python/paddle/common_ops_import.py b/python/paddle/common_ops_import.py index 98974808589467..de8056f280a396 100644 --- a/python/paddle/common_ops_import.py +++ b/python/paddle/common_ops_import.py @@ -22,7 +22,7 @@ from paddle.fluid import core, dygraph_utils from paddle.fluid.data_feeder import check_type, check_dtype, check_variable_and_dtype, convert_dtype from paddle.fluid.layers import fill_constant, utils, scale -from paddle.fluid.layers.layer_function_generator import templatedoc +from paddle.tensor.layer_function_generator import templatedoc import paddle.fluid as fluid import numpy import warnings diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py index c36213282c59ce..a094529edf5756 100644 --- a/python/paddle/dataset/image.py +++ b/python/paddle/dataset/image.py @@ -54,7 +54,10 @@ if retcode != 0: cv2 = None else: - import cv2 + try: + import cv2 + except ImportError: + cv2 = None else: try: import cv2 diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index a5ea528d13450e..fbad470cb3f133 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -138,7 +138,7 @@ def _get_global_env(): # Name of the default group for init_parallel_env _default_group_name = "_default_pg" -_valid_backend_list = ['nccl', 'gloo', 'hccl'] +_valid_backend_list = ['nccl', 'gloo', 'hccl', 'heter'] _default_store = None # the default tcp store _default_backend = None @@ -234,6 +234,31 @@ def _new_process_group_impl(backend, pg = core.ProcessGroupNCCL(store, rank, world_size, group_id) elif backend == "hccl": pg = core.ProcessGroupHCCL(store, rank, world_size, group_id) + elif backend == "heter": + cluster_id = int(os.getenv("CLUSTER_ID", "-1")) + assert cluster_id >= 0, "please set the CLUSTER_ID variable." + cluster_size = os.getenv("CLUSTER_SIZE", None) + assert cluster_size, "please set the CLUSTER_SIZE variable." + cluster_size = cluster_size.split(",") + cluster_size = [int(s) for s in cluster_size] + switch_ep = os.getenv("CLUSTER_SWITCH", None) + assert switch_ep, "please set the CLUSTER_SWITCH variable." + cluster_size_cumsum = np.cumsum(cluster_size) + cluster_offset = 0 if cluster_id == 0 else cluster_size_cumsum[ + cluster_id - 1] + global_rank = cluster_offset + rank + global_world_size = cluster_size_cumsum[-1] + pg = core.ProcessGroupHeter( + store, + rank=global_rank, + world_size=global_world_size, + gid=0, + local_rank=rank, + local_size=world_size, + gloo_rank=cluster_id, + gloo_size=len(cluster_size), + with_switch=True, + switch_endpoint=switch_ep) return pg diff --git a/python/paddle/distributed/ps/utils/public.py b/python/paddle/distributed/ps/utils/public.py index b76484a3ebc111..e7edc6fd859a63 100755 --- a/python/paddle/distributed/ps/utils/public.py +++ b/python/paddle/distributed/ps/utils/public.py @@ -58,7 +58,7 @@ def logger_config(log_path, logging_name): logger = logging.getLogger(logging_name) - logger.setLevel(level=logging.DEBUG) + logger.setLevel(level=logging.WARNING) handler = logging.FileHandler( log_path, mode='a', encoding='UTF-8', delay=True) handler.setLevel(logging.INFO) diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 54a245aab81c90..193025b1864abc 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -1169,6 +1169,8 @@ def _remove_if_exist(*dicts): # add a persistable buffer. 
if name not in self._buffers: self._non_persistable_buffer_names_set.add(name) + if not value.name: + value.name = unique_name.generate('_buffers_' + name) _buffers[name] = value elif _buffers is not None and name in _buffers: # Note(Aurelius84): In Dy2stat, the value of the Buffer may be modified in diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 4659c98abccc1a..9bf245ff388b40 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -23,7 +23,7 @@ from ..framework import convert_np_dtype_to_dtype_, _in_legacy_dygraph from .. import core from .. import unique_name -from ..framework import Variable, Parameter, ParamBase, _getitem_impl_, _setitem_impl_, EagerParamBase +from ..framework import Variable, Parameter, ParamBase, _getitem_impl_, _setitem_impl_, EagerParamBase, in_dygraph_mode from .base import switch_to_static_graph from .math_op_patch import monkey_patch_math_varbase from .parallel import scale_loss @@ -99,7 +99,7 @@ def _to_static_var(self, to_parameter=False, **kwargs): # Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() only available in dygraph. # It will fail. So, for propery that different between dynamic and static graph, should not getattr(self, attr, None). - attr_not_need_keys = ['grad', 'T'] + attr_not_need_keys = ['grad', 'T', 'place', '_place_str'] if isinstance(self, (ParamBase, EagerParamBase)): attr_kwargs = self.__dict__.copy() else: @@ -798,6 +798,9 @@ def _set_grad_ivar(self, value): @framework.dygraph_only def clone(self): + if in_dygraph_mode(): + return _C_ops.final_state_assign(self) + if _in_legacy_dygraph(): output = core.VarBase() else: diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 37eff6d132d03b..b3baedc401504f 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -561,12 +561,12 @@ def __call__(self, var, block=None): if framework._non_static_mode(): if self._uniform: - limit = np.sqrt(6.0 / float(fan_in + fan_out)) + limit = math.sqrt(6.0 / float(fan_in + fan_out)) out_var = _C_ops.uniform_random('shape', out_var.shape, 'min', -limit, 'max', limit, 'seed', self._seed, 'dtype', out_dtype) else: - std = np.sqrt(2.0 / float(fan_in + fan_out)) + std = math.sqrt(2.0 / float(fan_in + fan_out)) out_var = _C_ops.gaussian_random( 'shape', out_var.shape, 'dtype', out_dtype, 'mean', 0.0, 'std', std, 'seed', self._seed) @@ -581,7 +581,7 @@ def __call__(self, var, block=None): return None else: if self._uniform: - limit = np.sqrt(6.0 / float(fan_in + fan_out)) + limit = math.sqrt(6.0 / float(fan_in + fan_out)) op = block.append_op( type="uniform_random", inputs={}, @@ -595,7 +595,7 @@ def __call__(self, var, block=None): }, stop_gradient=True) else: - std = np.sqrt(2.0 / float(fan_in + fan_out)) + std = math.sqrt(2.0 / float(fan_in + fan_out)) op = block.append_op( type="gaussian_random", outputs={"Out": out_var}, @@ -713,13 +713,13 @@ def __call__(self, var, block=None): if framework._non_static_mode(): if self._uniform: - limit = np.sqrt(6.0 / float(fan_in)) + limit = math.sqrt(6.0 / float(fan_in)) out_var = _C_ops.uniform_random('shape', out_var.shape, 'min', -limit, 'max', limit, 'seed', self._seed, 'dtype', int(out_dtype)) else: - std = np.sqrt(2.0 / float(fan_in)) + std = math.sqrt(2.0 / float(fan_in)) out_var = _C_ops.gaussian_random( 'shape', out_var.shape, 'dtype', int(out_dtype), 'mean', 0.0, 
'std', std, 'seed', self._seed) @@ -734,7 +734,7 @@ def __call__(self, var, block=None): return None else: if self._uniform: - limit = np.sqrt(6.0 / float(fan_in)) + limit = math.sqrt(6.0 / float(fan_in)) op = block.append_op( type="uniform_random", inputs={}, @@ -749,7 +749,7 @@ def __call__(self, var, block=None): stop_gradient=True) else: - std = np.sqrt(2.0 / float(fan_in)) + std = math.sqrt(2.0 / float(fan_in)) op = block.append_op( type="gaussian_random", outputs={"Out": out_var}, diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 311a6278a89f8f..68a58e8be49b84 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -5970,8 +5970,11 @@ def multiplex(inputs, index, name=None): print(res) # [array([[5., 6.], [3., 4.]], dtype=float32)] """ - if _non_static_mode(): + + if _in_legacy_dygraph(): return _C_ops.multiplex(index, inputs) + if in_dygraph_mode(): + return _C_ops.final_state_multiplex(inputs, index) helper = LayerHelper('multiplex', **locals()) check_type(inputs, 'inputs', (list), 'multiplex') diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index a63e87472ebed7..3a8dfdc858079c 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -21,7 +21,7 @@ from ..layer_helper import LayerHelper from ..param_attr import ParamAttr from ..initializer import Initializer -from ..framework import _current_expected_place, convert_np_dtype_to_dtype_, _non_static_mode, _varbase_creator, device_guard, _in_legacy_dygraph, in_dygraph_mode +from ..framework import _current_expected_place, convert_np_dtype_to_dtype_, _non_static_mode, _varbase_creator, device_guard, _in_legacy_dygraph, in_dygraph_mode, _get_paddle_place from ..framework import Variable from ..initializer import Constant from ..core import VarDesc @@ -622,12 +622,15 @@ def assign(input, output=None): # after this api. if isinstance(input, (Variable, core.VarBase)): if _non_static_mode(): - if output is None: - if _in_legacy_dygraph(): - output = core.VarBase() - else: - output = core.eager.Tensor() - _C_ops.assign(input, output) + if in_dygraph_mode() and output is None: + output = _C_ops.final_state_assign(input) + else: + if output is None: + if _in_legacy_dygraph(): + output = core.VarBase() + else: + output = core.eager.Tensor() + _C_ops.assign(input, output) else: check_dtype(input.dtype, 'input', [ 'float16', 'uint16', 'float32', 'float64', 'int32', 'int64', @@ -751,22 +754,36 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): attrs['value'] = float(value) if _non_static_mode(): - shape = utils.convert_shape_to_list(shape) - if out is None: - out = _varbase_creator(dtype=dtype) + if out is None and in_dygraph_mode(): + #Currently, final state mode don't support out is None. 
+ place = _current_expected_place() + if force_cpu: + place = core.CPUPlace() + + shape = utils.convert_shape_to_list(shape) + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + out = _C_ops.final_state_full(shape, float(value), dtype, place) + out.stop_gradient = True + return out - if isinstance(value, Variable): - if dtype in ['uint8', 'int16', 'int32', 'int64']: - attrs['str_value'] = str(int(value.numpy().item(0))) - else: - attrs['str_value'] = str(float(value.numpy().item(0))) + else: + shape = utils.convert_shape_to_list(shape) + if out is None: + out = _varbase_creator(dtype=dtype) - _C_ops.fill_constant(out, 'value', - float(value), 'force_cpu', force_cpu, 'dtype', - out.dtype, 'str_value', attrs['str_value'], - 'shape', shape) - out.stop_gradient = True - return out + if isinstance(value, Variable): + if dtype in ['uint8', 'int16', 'int32', 'int64']: + attrs['str_value'] = str(int(value.numpy().item(0))) + else: + attrs['str_value'] = str(float(value.numpy().item(0))) + + _C_ops.fill_constant(out, 'value', + float(value), 'force_cpu', force_cpu, 'dtype', + out.dtype, 'str_value', attrs['str_value'], + 'shape', shape) + out.stop_gradient = True + return out helper = LayerHelper("fill_constant", **locals()) inputs = {} @@ -1548,10 +1565,12 @@ def linspace(start, stop, num, dtype=None, name=None): if not isinstance(num, Variable): with device_guard("cpu"): tensor_num = fill_constant([1], 'int32', num) - if _non_static_mode(): + if _in_legacy_dygraph(): return _C_ops.linspace(tensor_start, tensor_stop, tensor_num, 'dtype', dtype) - + if in_dygraph_mode(): + return _C_ops.final_state_linspace(tensor_start, tensor_stop, + tensor_num, dtype) helper = LayerHelper("linspace", **locals()) start_dtype = convert_dtype(tensor_start.dtype) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6085360543e92d..3f640a73a55c50 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -596,6 +596,13 @@ foreach(TEST_OP ${TEST_OPS_WITH_GC}) py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) endforeach() +# Switch some dy2st UT to eager mode +set(TEST_EAGER_OPS test_jit_save_load test_translated_layer) +foreach(TEST_OP ${TEST_EAGER_OPS}) + list(REMOVE_ITEM TEST_OPS ${TEST_OP}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS FLAGS_enable_eager_mode=1) +endforeach() + if ((NOT WITH_GPU) AND (NOT WITH_XPU) AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) list(REMOVE_ITEM TEST_OPS "test_fleet_graph_execution_meta_optimizer") list(REMOVE_ITEM TEST_OPS "test_gen_nccl_id_op") @@ -1146,7 +1153,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_auto_parallel_save_load PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_autoconvert PROPERTIES TIMEOUT 120) set_tests_properties(test_collective_process_group PROPERTIES TIMEOUT 120) - set_tests_properties(test_eager_dist_api PROPERTIES TIMEOUT 300) + set_tests_properties(test_eager_dist_api PROPERTIES TIMEOUT 100) if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212) set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 200) diff --git a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt index 1f69abac01ac60..46af5509d244b3 100644 --- a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt +++ 
b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt @@ -6,5 +6,5 @@ foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) endforeach(TEST_OP) -set_tests_properties(test_autograd_functional_dynamic PROPERTIES TIMEOUT 100) -set_tests_properties(test_autograd_functional_static PROPERTIES TIMEOUT 100) +set_tests_properties(test_autograd_functional_dynamic PROPERTIES TIMEOUT 160) +set_tests_properties(test_autograd_functional_static PROPERTIES TIMEOUT 160) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index eeb377ff3b4a2b..ddc959a29a2ef8 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ -6,7 +6,7 @@ set(DY2ST_EAGER_TEST_ENVS ${GC_ENVS} FLAGS_enable_eager_mode=1) set(TEST_EAGER_OPS test_bmn test_break_continue test_ifelse test_loop test_mnist_amp test_mnist_pure_fp16 test_mobile_net test_program_translator test_ptb_lm test_reinforcement_learning test_resnet test_resnet_amp test_resnet_pure_fp16 test_se_resnet test_sentiment test_seq2seq - test_tsm test_word2vec test_yolov3) + test_tsm test_word2vec test_yolov3 test_bert test_cycle_gan test_lstm test_simnet test_transformer) list(REMOVE_ITEM TEST_OPS test_lac) # NOTE(Aurelius84): In case of Windows CI, if open ON_INFER, RWLOCK of Scope will # be removed and will cause some random failed in multi-thread. diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py index ae7a5885790591..0cf96b7159579f 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py @@ -14,6 +14,8 @@ from __future__ import print_function +import os +os.environ["FLAGS_enable_eager_mode"] = "0" import math import time import unittest diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py index 4e4fe69d914fad..44263b89e16168 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py @@ -113,6 +113,7 @@ def setUp(self): super(TestMKLDNNSwishDim2, self).setUp() self.attrs["use_mkldnn"] = True + self.check_eager = False def init_dtype(self): self.dtype = np.float32 @@ -284,6 +285,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} self.attrs = {"use_mkldnn": True, "beta": beta} + self.check_eager = False def init_dtype(self): self.dtype = np.float32 diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 89f8ebbd0cafbb..58d8610ee352d9 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -25,6 +25,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.framework import _test_eager_guard paddle.enable_static() @@ -1755,7 +1756,7 @@ class TestHardSwish(TestActivation): def setUp(self): self.op_type = 'hard_swish' self.init_dtype() - + self.python_api = paddle.nn.functional.hardswish skip_check_grad_ci(reason="not 
implemented yet") np.random.seed(1024) @@ -1777,7 +1778,10 @@ def test_check_grad(self): return return # not implemented yet - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) + + def test_check_output(self): + self.check_output(check_eager=True) class TestHardswishAPI(unittest.TestCase): @@ -1838,6 +1842,11 @@ def test_errors(self): name='x_fp16', shape=[12, 10], dtype='float16') F.hardswish(x_fp16) + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_dygraph_api() + self.test_errors() + class TestSoftRelu(TestActivation): def setUp(self): @@ -2931,7 +2940,9 @@ def ref_swish(x): class TestSwish(TestActivation): def setUp(self): self.op_type = "swish" + self.python_api = paddle.nn.functional.swish self.init_dtype() + self.check_eager = True np.random.seed(1024) x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) @@ -2943,7 +2954,10 @@ def setUp(self): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out') + check_eager = False + if hasattr(self, 'check_eager'): + check_eager = self.check_eager + self.check_grad(['X'], 'Out', check_eager=check_eager) class TestSwishAPI(unittest.TestCase): @@ -2978,6 +2992,10 @@ def test_dygraph_api(self): self.assertEqual(np.allclose(out_ref, r.numpy()), True) paddle.enable_static() + def test_dygraph_final_state_api(self): + with _test_eager_guard(): + self.test_dygraph_api() + def test_fluid_api(self): paddle.enable_static() with fluid.program_guard(fluid.Program()): diff --git a/python/paddle/fluid/tests/unittests/test_allclose_layer.py b/python/paddle/fluid/tests/unittests/test_allclose_layer.py index c376a5c95c3935..1e080c80367f0d 100644 --- a/python/paddle/fluid/tests/unittests/test_allclose_layer.py +++ b/python/paddle/fluid/tests/unittests/test_allclose_layer.py @@ -16,6 +16,7 @@ import paddle.fluid as fluid import unittest import numpy as np +from paddle.fluid.framework import _test_eager_guard class TestAllcloseLayer(unittest.TestCase): @@ -95,7 +96,7 @@ def test_allclose_gpu_fp64(self): with fluid.program_guard(main, startup): self.allclose_check(use_cuda=True, dtype='float64') - def test_dygraph_mode(self): + def func_dygraph_mode(self): x_1 = np.array([10000., 1e-07]).astype("float32") y_1 = np.array([10000.1, 1e-08]).astype("float32") x_2 = np.array([10000., 1e-08]).astype("float32") @@ -171,9 +172,14 @@ def test_dygraph_mode(self): x_v_5 = paddle.to_tensor(x_5) y_v_5 = paddle.to_tensor(y_5) ret_5 = paddle.allclose( - x_v_5, y_v_5, rtol=0.01, atol=0.0, name='test_8') + x_v_5, y_v_5, rtol=0.015, atol=0.0, name='test_8') self.assertEqual(ret_5.numpy()[0], True) + def test_dygraph_mode(self): + with _test_eager_guard(): + self.func_dygraph_mode() + self.func_dygraph_mode() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_assign_op.py b/python/paddle/fluid/tests/unittests/test_assign_op.py index 3dbd9311a71ed6..bfe23c621270d7 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_op.py @@ -27,30 +27,32 @@ class TestAssignOp(op_test.OpTest): def setUp(self): + self.python_api = paddle.assign self.op_type = "assign" x = np.random.random(size=(100, 10)).astype('float64') self.inputs = {'X': x} self.outputs = {'Out': x} def test_forward(self): - self.check_output() + self.check_output(check_eager=True) def test_backward(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) class 
TestAssignFP16Op(op_test.OpTest): def setUp(self): + self.python_api = paddle.assign self.op_type = "assign" x = np.random.random(size=(100, 10)).astype('float16') self.inputs = {'X': x} self.outputs = {'Out': x} def test_forward(self): - self.check_output() + self.check_output(check_eager=True) def test_backward(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) class TestAssignOpWithLoDTensorArray(unittest.TestCase): @@ -171,6 +173,8 @@ def test_assign_BasicTypes(self): def test_clone(self): paddle.disable_static() + self.python_api = paddle.clone + x = paddle.ones([2]) x.stop_gradient = False clone_x = paddle.clone(x) diff --git a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py index 426d5d463f4530..fc4ee13384b2dc 100644 --- a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py +++ b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py @@ -75,9 +75,6 @@ def test_fixed_random_number(self): if not paddle.is_compiled_with_cuda(): return - if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): - return - print("Test Fixed Random number on GPU------>") paddle.disable_static() paddle.set_device('gpu') diff --git a/python/paddle/fluid/tests/unittests/test_bincount_op.py b/python/paddle/fluid/tests/unittests/test_bincount_op.py index 851bf7b01125a3..17b04b954afe87 100644 --- a/python/paddle/fluid/tests/unittests/test_bincount_op.py +++ b/python/paddle/fluid/tests/unittests/test_bincount_op.py @@ -126,6 +126,7 @@ class TestBincountOp(OpTest): # without weights def setUp(self): self.op_type = "bincount" + self.python_api = paddle.bincount self.init_test_case() self.inputs = {"X": self.np_input} self.attrs = {"minlength": self.minlength} @@ -137,13 +138,14 @@ def init_test_case(self): self.Out = np.bincount(self.np_input, minlength=self.minlength) def test_check_output(self): - self.check_output() + self.check_output(check_eager=False) class TestCase1(TestBincountOp): # with weights(FLOAT32) def setUp(self): self.op_type = "bincount" + self.python_api = paddle.bincount self.init_test_case() self.inputs = {"X": self.np_input, "Weights": self.np_weights} self.attrs = {"minlength": self.minlength} @@ -163,6 +165,7 @@ class TestCase2(TestBincountOp): # with weights(other) def setUp(self): self.op_type = "bincount" + self.python_api = paddle.bincount self.init_test_case() self.inputs = {"X": self.np_input, "Weights": self.np_weights} self.attrs = {"minlength": self.minlength} diff --git a/python/paddle/fluid/tests/unittests/test_conj_op.py b/python/paddle/fluid/tests/unittests/test_conj_op.py index 774a29ada4a846..fe9efc301fea70 100644 --- a/python/paddle/fluid/tests/unittests/test_conj_op.py +++ b/python/paddle/fluid/tests/unittests/test_conj_op.py @@ -32,6 +32,7 @@ class TestConjOp(OpTest): def setUp(self): self.op_type = "conj" + self.python_api = paddle.tensor.conj self.init_dtype_type() self.init_input_output() self.init_grad_input_output() @@ -53,14 +54,15 @@ def init_grad_input_output(self): self.grad_in = np.conj(self.grad_out) def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad_normal(self): self.check_grad( ['X'], 'Out', user_defined_grads=[self.grad_in], - user_defined_grad_outputs=[self.grad_out]) + user_defined_grad_outputs=[self.grad_out], + check_eager=True) class TestComplexConjOp(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py 
b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py index 8ec3fecceb9600..59d196fdf55e57 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py @@ -51,9 +51,8 @@ def check_with_place(self, tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) def test_dist_train(self): - # self.check_with_place( - # "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) - print('recover later') + self.check_with_place( + "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) class TestDistMnistAsync2x2(TestFleetBase): @@ -86,9 +85,8 @@ def check_with_place(self, tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) def test_dist_train(self): - # self.check_with_place( - # "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) - print('recover later') + self.check_with_place( + "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) class TestDistCtrHalfAsync2x2(TestFleetBase): @@ -124,9 +122,8 @@ def check_with_place(self, tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) def test_dist_train(self): - # self.check_with_place( - # "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) - print('recover later') + self.check_with_place( + "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr2.py index e5e486d7068457..e73eff2acc9671 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr2.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr2.py @@ -52,9 +52,8 @@ def check_with_place(self, tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) def test_dist_train(self): - # self.check_with_place( - # "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) - print('recover later') + self.check_with_place( + "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) # @unittest.skip(reason="Skip unstable ut, reader need to be rewrite") @@ -92,9 +91,8 @@ def check_with_place(self, tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) def test_dist_train(self): - # self.check_with_place( - # "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) - print('recover later') + self.check_with_place( + "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_op.py b/python/paddle/fluid/tests/unittests/test_dist_op.py index b9b8ea92cb3a84..ad999c3feae426 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_op.py +++ b/python/paddle/fluid/tests/unittests/test_dist_op.py @@ -37,6 +37,7 @@ def dist(x, y, p): class TestDistOp(OpTest): def setUp(self): self.op_type = 'dist' + self.python_api = paddle.dist self.attrs = {} self.init_case() self.init_data_type() @@ -106,10 +107,14 @@ def get_reduce_dims(x, y): return x_grad, y_grad def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(["X", "Y"], "Out", user_defined_grads=self.gradient) + self.check_grad( + ["X", "Y"], + "Out", + user_defined_grads=self.gradient, + check_eager=True) class TestDistOpCase1(TestDistOp): @@ -174,4 +179,5 @@ def test_api(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py index 
d8a4eb8f45f7d3..3aca428ac77af4 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ -1034,9 +1034,6 @@ def test_fixed_random_number(self): if not "V100" in paddle.device.cuda.get_device_name(): return - if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): - return - print("Test Fixed Random number on V100 GPU------>") paddle.disable_static() paddle.set_device('gpu') diff --git a/python/paddle/fluid/tests/unittests/test_erfinv_op.py b/python/paddle/fluid/tests/unittests/test_erfinv_op.py index 847a868dd6ca01..5b5a7c03843166 100644 --- a/python/paddle/fluid/tests/unittests/test_erfinv_op.py +++ b/python/paddle/fluid/tests/unittests/test_erfinv_op.py @@ -28,6 +28,7 @@ class TestErfinv(OpTest): def setUp(self): self.op_type = "erfinv" + self.python_api = paddle.erfinv self.init_dtype() self.shape = [11, 17] self.x = np.random.uniform(-1, 1, size=self.shape).astype(self.dtype) @@ -42,7 +43,7 @@ def init_dtype(self): self.dtype = np.float64 def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): self.check_grad( diff --git a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py index a204c26c1b823f..70b3fda79b50fb 100644 --- a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py @@ -27,6 +27,7 @@ class TestExpandV2OpRank1(OpTest): def setUp(self): self.op_type = "expand_v2" self.init_data() + self.python_api = paddle.expand self.inputs = {'X': np.random.random(self.ori_shape).astype("float64")} self.attrs = {'shape': self.shape} diff --git a/python/paddle/fluid/tests/unittests/test_exponential_op.py b/python/paddle/fluid/tests/unittests/test_exponential_op.py index 7a3ae203be62d6..c8f4101ea5d6ba 100644 --- a/python/paddle/fluid/tests/unittests/test_exponential_op.py +++ b/python/paddle/fluid/tests/unittests/test_exponential_op.py @@ -100,9 +100,6 @@ def test_fixed_random_number(self): if not "V100" in paddle.device.cuda.get_device_name(): return - if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): - return - print("Test Fixed Random number on V100 GPU------>") paddle.disable_static() paddle.set_device('gpu') diff --git a/python/paddle/fluid/tests/unittests/test_flip.py b/python/paddle/fluid/tests/unittests/test_flip.py index 5e2aacf9cefed0..010d23bca51d73 100644 --- a/python/paddle/fluid/tests/unittests/test_flip.py +++ b/python/paddle/fluid/tests/unittests/test_flip.py @@ -67,6 +67,7 @@ def test_dygraph(self): class TestFlipOp(OpTest): def setUp(self): self.op_type = 'flip' + self.python_api = paddle.tensor.flip self.init_test_case() self.inputs = {'X': np.random.random(self.in_shape).astype('float64')} self.init_attrs() @@ -76,10 +77,10 @@ def init_attrs(self): self.attrs = {"axis": self.axis} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(["X"], "Out") + self.check_grad(["X"], "Out", check_eager=True) def init_test_case(self): self.in_shape = (6, 4, 2, 3) @@ -131,4 +132,5 @@ def init_test_case(self): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_full_op.py b/python/paddle/fluid/tests/unittests/test_full_op.py index c508d56c29a438..2442f2b681554e 100644 --- a/python/paddle/fluid/tests/unittests/test_full_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_full_op.py @@ -23,6 +23,7 @@ import paddle.fluid as fluid import paddle from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.framework import _test_eager_guard # Test python API @@ -75,6 +76,61 @@ def test_api(self): assert np.array_equal(res_6, np.full([1, 2], 1.1, dtype="float32")) assert np.array_equal(res_7, np.full([1, 2], 1.1, dtype="float32")) + def test_api_eager(self): + with fluid.dygraph.base.guard(): + with _test_eager_guard(): + positive_2_int32 = fluid.layers.fill_constant([1], "int32", 2) + + positive_2_int64 = fluid.layers.fill_constant([1], "int64", 2) + out_1 = paddle.full( + shape=[1, 2], dtype="float32", fill_value=1.1) + + out_2 = paddle.full( + shape=[1, positive_2_int32.item()], + dtype="float32", + fill_value=1.1) + + out_3 = paddle.full( + shape=[1, positive_2_int64.item()], + dtype="float32", + fill_value=1.1) + + out_4 = paddle.full( + shape=[1, 2], dtype="float32", fill_value=1.2) + + out_5 = paddle.full( + shape=[1, 2], dtype="float32", fill_value=1.1) + + out_6 = paddle.full( + shape=[1, 2], dtype=np.float32, fill_value=1.1) + + val = fluid.layers.fill_constant( + shape=[1], dtype=np.float32, value=1.1) + out_7 = paddle.full( + shape=[1, 2], dtype=np.float32, fill_value=val) + + assert np.array_equal( + out_1, np.full( + [1, 2], 1.1, dtype="float32")) + assert np.array_equal( + out_2, np.full( + [1, 2], 1.1, dtype="float32")) + assert np.array_equal( + out_3, np.full( + [1, 2], 1.1, dtype="float32")) + assert np.array_equal( + out_4, np.full( + [1, 2], 1.2, dtype="float32")) + assert np.array_equal( + out_5, np.full( + [1, 2], 1.1, dtype="float32")) + assert np.array_equal( + out_6, np.full( + [1, 2], 1.1, dtype="float32")) + assert np.array_equal( + out_7, np.full( + [1, 2], 1.1, dtype="float32")) + class TestFullOpError(unittest.TestCase): def test_errors(self): diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py index 738441a46d377e..4fca8b9f2a1182 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py @@ -342,9 +342,6 @@ def test_fixed_random_number(self): if not "V100" in paddle.device.cuda.get_device_name(): return - if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): - return - def _check_random_value(dtype, expect, expect_mean, expect_std): x = paddle.randn([32, 3, 1024, 1024], dtype=dtype) actual = x.numpy() diff --git a/python/paddle/fluid/tests/unittests/test_graph_khop_sampler.py b/python/paddle/fluid/tests/unittests/test_graph_khop_sampler.py index b8071222ac7729..6e6175d669515d 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_khop_sampler.py +++ b/python/paddle/fluid/tests/unittests/test_graph_khop_sampler.py @@ -46,7 +46,7 @@ def setUp(self): self.sample_sizes = [5, 5] self.dst_src_dict = dst_src_dict - def test_sample_result(self): + def func_sample_result(self): paddle.disable_static() row = paddle.to_tensor(self.row) colptr = paddle.to_tensor(self.colptr) @@ -79,13 +79,25 @@ def test_sample_result(self): # Ensure the correct sample neighbors. 
self.assertTrue(np.sum(in_neighbors) == in_neighbors.shape[0]) - def test_uva_sample_result(self): + def test_sample_result(self): + with fluid.framework._test_eager_guard(): + self.func_sample_result() + self.func_sample_result() + + def func_uva_sample_result(self): paddle.disable_static() if paddle.fluid.core.is_compiled_with_cuda(): - row = paddle.fluid.core.to_uva_tensor( - self.row.astype(self.row.dtype)) - sorted_eid = paddle.fluid.core.to_uva_tensor( - self.sorted_eid.astype(self.sorted_eid.dtype)) + row = None + if fluid.framework.in_dygraph_mode(): + row = paddle.fluid.core.eager.to_uva_tensor( + self.row.astype(self.row.dtype), 0) + sorted_eid = paddle.fluid.core.eager.to_uva_tensor( + self.sorted_eid.astype(self.sorted_eid.dtype), 0) + else: + row = paddle.fluid.core.to_uva_tensor( + self.row.astype(self.row.dtype)) + sorted_eid = paddle.fluid.core.to_uva_tensor( + self.sorted_eid.astype(self.sorted_eid.dtype)) colptr = paddle.to_tensor(self.colptr) nodes = paddle.to_tensor(self.nodes) @@ -114,6 +126,11 @@ def test_uva_sample_result(self): in_neighbors = np.isin(edge_src_n.numpy(), self.dst_src_dict[n]) self.assertTrue(np.sum(in_neighbors) == in_neighbors.shape[0]) + def test_uva_sample_result(self): + with fluid.framework._test_eager_guard(): + self.func_uva_sample_result() + self.func_uva_sample_result() + def test_sample_result_static_with_eids(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): diff --git a/python/paddle/fluid/tests/unittests/test_gumbel_softmax_op.py b/python/paddle/fluid/tests/unittests/test_gumbel_softmax_op.py index e423404d07fb10..7c706eabd1d7a6 100644 --- a/python/paddle/fluid/tests/unittests/test_gumbel_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_gumbel_softmax_op.py @@ -17,6 +17,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid import Program, program_guard +from paddle.fluid.framework import _test_eager_guard paddle.enable_static() @@ -177,12 +178,17 @@ def test_check_api(self): self.assertEqual(out_np.sum(), self.count_expected) # test dygrapg api - paddle.disable_static() - x = paddle.to_tensor(self.x) - y = paddle.nn.functional.gumbel_softmax(x, hard=True) - out_np = np.array(y) - self.assertEqual(out_np.sum(), self.count_expected) - paddle.enable_static() + with paddle.fluid.dygraph.base.guard(): + x = paddle.to_tensor(self.x) + y = paddle.nn.functional.gumbel_softmax(x, hard=True) + out_np = np.array(y) + self.assertEqual(out_np.sum(), self.count_expected) + + with _test_eager_guard(): + x = paddle.to_tensor(self.x) + y = paddle.nn.functional.gumbel_softmax(x, hard=True) + out_np = np.array(y) + self.assertEqual(out_np.sum(), self.count_expected) class TestGumbelSoftmaxOpError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_kron_op.py b/python/paddle/fluid/tests/unittests/test_kron_op.py index d6db4c2f074a90..f4d013b7c6a3ea 100644 --- a/python/paddle/fluid/tests/unittests/test_kron_op.py +++ b/python/paddle/fluid/tests/unittests/test_kron_op.py @@ -21,11 +21,13 @@ import paddle import paddle.fluid as fluid import paddle.fluid.dygraph as dg +from paddle.fluid.framework import _test_eager_guard class TestKronOp(OpTest): def setUp(self): self.op_type = "kron" + self.python_api = paddle.kron self.dtype = self._init_dtype() x = np.random.uniform(size=(10, 10)).astype(self.dtype) y = np.random.uniform(size=(10, 10)).astype(self.dtype) @@ -37,21 +39,22 @@ def _init_dtype(self): return "float64" def test_check_output(self): - self.check_output() + 
self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X', 'Y'], 'Out') + self.check_grad(['X', 'Y'], 'Out', check_eager=True) def test_check_grad_ignore_x(self): - self.check_grad(['Y'], 'Out', no_grad_set=set('X')) + self.check_grad(['Y'], 'Out', no_grad_set=set('X'), check_eager=True) def test_check_grad_ignore_y(self): - self.check_grad(['X'], 'Out', no_grad_set=set('Y')) + self.check_grad(['X'], 'Out', no_grad_set=set('Y'), check_eager=True) class TestKronOp2(TestKronOp): def setUp(self): self.op_type = "kron" + self.python_api = paddle.kron self.dtype = self._init_dtype() x = np.random.uniform(size=(5, 5, 4)).astype(self.dtype) y = np.random.uniform(size=(10, 10)).astype(self.dtype) @@ -63,6 +66,7 @@ def setUp(self): class TestKronOp3(TestKronOp): def setUp(self): self.op_type = "kron" + self.python_api = paddle.kron self.dtype = self._init_dtype() x = np.random.uniform(size=(10, 10)).astype(self.dtype) y = np.random.uniform(size=(5, 5, 4)).astype(self.dtype) @@ -101,10 +105,16 @@ def test_case_with_output(self): c, = exe.run(main, feed={'a': a, 'b': b}, fetch_list=[out_var]) np.testing.assert_allclose(c, np.kron(a, b)) + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_case() + self.test_case_with_output() + class TestComplexKronOp(OpTest): def setUp(self): self.op_type = "kron" + self.python_api = paddle.kron self.x_shape = np.array([10, 10]) self.y_shape = np.array([3, 35]) self.out_shape = self.x_shape * self.y_shape @@ -160,14 +170,15 @@ def get_grad_y_by_numpy(self): return grad_y def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad_normal(self): self.check_grad( ['X', 'Y'], 'Out', user_defined_grads=[self.grad_x, self.grad_y], - user_defined_grad_outputs=[self.grad_out]) + user_defined_grad_outputs=[self.grad_out], + check_eager=True) def test_check_grad_ingore_x(self): self.check_grad( @@ -175,7 +186,8 @@ def test_check_grad_ingore_x(self): 'Out', no_grad_set=set("X"), user_defined_grads=[self.grad_y], - user_defined_grad_outputs=[self.grad_out]) + user_defined_grad_outputs=[self.grad_out], + check_eager=True) def test_check_grad_ingore_y(self): self.check_grad( @@ -183,7 +195,8 @@ def test_check_grad_ingore_y(self): 'Out', no_grad_set=set('Y'), user_defined_grads=[self.grad_x], - user_defined_grad_outputs=[self.grad_out]) + user_defined_grad_outputs=[self.grad_out], + check_eager=True) class TestKronOpTypePromotion(TestComplexKronOp): diff --git a/python/paddle/fluid/tests/unittests/test_linalg_cond.py b/python/paddle/fluid/tests/unittests/test_linalg_cond.py index 9e3edd82681bca..42fb2fbc578bfa 100644 --- a/python/paddle/fluid/tests/unittests/test_linalg_cond.py +++ b/python/paddle/fluid/tests/unittests/test_linalg_cond.py @@ -18,6 +18,7 @@ import numpy as np import paddle import paddle.static as static +from paddle.fluid.framework import _test_eager_guard p_list_n_n = ("fro", "nuc", 1, -1, np.inf, -np.inf) p_list_m_n = (None, 2, -2) @@ -89,16 +90,21 @@ def test_out(self): class API_TestDygraphCond(unittest.TestCase): - def test_out(self): + def func_out(self): paddle.disable_static() # test calling results of 'cond' in dynamic mode x_list_n_n, x_list_m_n = gen_input() test_dygraph_assert_true(self, x_list_n_n, p_list_n_n + p_list_m_n) test_dygraph_assert_true(self, x_list_m_n, p_list_m_n) + def test_out(self): + with _test_eager_guard(): + self.func_out() + self.func_out() + class TestCondAPIError(unittest.TestCase): - def test_dygraph_api_error(self): + def 
func_dygraph_api_error(self): paddle.disable_static() # test raising errors when 'cond' is called in dygraph mode p_list_error = ('fro_', '_nuc', -0.7, 0, 1.5, 3) @@ -113,6 +119,11 @@ def test_dygraph_api_error(self): x_tensor = paddle.to_tensor(x) self.assertRaises(ValueError, paddle.linalg.cond, x_tensor, p) + def test_dygraph_api_error(self): + with _test_eager_guard(): + self.func_dygraph_api_error() + self.func_dygraph_api_error() + def test_static_api_error(self): paddle.enable_static() # test raising errors when 'cond' is called in static mode @@ -149,13 +160,18 @@ def test_static_empty_input_error(self): class TestCondEmptyTensorInput(unittest.TestCase): - def test_dygraph_empty_tensor_input(self): + def func_dygraph_empty_tensor_input(self): paddle.disable_static() # test calling results of 'cond' when input is an empty tensor in dynamic mode x_list_n_n, x_list_m_n = gen_empty_input() test_dygraph_assert_true(self, x_list_n_n, p_list_n_n + p_list_m_n) test_dygraph_assert_true(self, x_list_m_n, p_list_m_n) + def test_dygraph_empty_tensor_input(self): + with _test_eager_guard(): + self.func_dygraph_empty_tensor_input() + self.func_dygraph_empty_tensor_input() + if __name__ == "__main__": paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_linear.py b/python/paddle/fluid/tests/unittests/test_linear.py index 9d07a80da15dbf..6b00a86e3e9009 100644 --- a/python/paddle/fluid/tests/unittests/test_linear.py +++ b/python/paddle/fluid/tests/unittests/test_linear.py @@ -73,6 +73,22 @@ def test_error(self, place=paddle.CPUPlace()): np.testing.assert_array_almost_equal(res_f, res_nn) np.testing.assert_array_almost_equal(res_nn, res_np) + def test_weight_init(self): + if not paddle.is_compiled_with_cuda(): + return + paddle.seed(100) + linear = paddle.nn.Linear( + 2, 3, weight_attr=paddle.nn.initializer.Normal(0, 1.)) + paddle.nn.utils._stride_column(linear.weight) + expect = [[1.4349908, -0.8099171, -2.64788], + [-1.4981681, -1.1784115, -0.023253186]] + self.assertTrue(np.allclose(linear.weight.numpy(), expect)) + + linear = paddle.nn.Linear(2, 3) + expect = [[0.73261100, 0.43836895, 0.07908206], + [0.85075015, -1.04724526, 0.64371765]] + self.assertTrue(np.allclose(linear.weight.numpy(), expect)) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_linspace.py b/python/paddle/fluid/tests/unittests/test_linspace.py index 54846e6a14bd2b..65a6c21fb0720d 100644 --- a/python/paddle/fluid/tests/unittests/test_linspace.py +++ b/python/paddle/fluid/tests/unittests/test_linspace.py @@ -21,11 +21,13 @@ import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard from paddle.fluid import core +from paddle.fluid.framework import _test_eager_guard class TestLinspaceOpCommonCase(OpTest): def setUp(self): self.op_type = "linspace" + self.python_api = paddle.linspace dtype = 'float32' self.inputs = { 'Start': np.array([0]).astype(dtype), @@ -37,12 +39,13 @@ def setUp(self): self.outputs = {'Out': np.arange(0, 11).astype(dtype)} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) class TestLinspaceOpReverseCase(OpTest): def setUp(self): self.op_type = "linspace" + self.python_api = paddle.linspace dtype = 'float32' self.inputs = { 'Start': np.array([10]).astype(dtype), @@ -54,12 +57,13 @@ def setUp(self): self.outputs = {'Out': np.arange(10, -1, -1).astype(dtype)} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) class 
TestLinspaceOpNumOneCase(OpTest): def setUp(self): self.op_type = "linspace" + self.python_api = paddle.linspace dtype = 'float32' self.inputs = { 'Start': np.array([10]).astype(dtype), @@ -71,7 +75,7 @@ def setUp(self): self.outputs = {'Out': np.array(10, dtype=dtype)} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) class TestLinspaceAPI(unittest.TestCase): @@ -123,6 +127,11 @@ def test_imperative(self): self.assertEqual((out2.numpy() == np_out2).all(), True) self.assertEqual((out3.numpy() == np_out3).all(), True) + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_variable_input2() + self.test_imperative() + class TestLinspaceOpError(unittest.TestCase): def test_errors(self): diff --git a/python/paddle/fluid/tests/unittests/test_logit_op.py b/python/paddle/fluid/tests/unittests/test_logit_op.py index 9254996eb44631..9b46039da13b10 100644 --- a/python/paddle/fluid/tests/unittests/test_logit_op.py +++ b/python/paddle/fluid/tests/unittests/test_logit_op.py @@ -16,6 +16,7 @@ import numpy as np from op_test import OpTest import paddle +from paddle.fluid.framework import _test_eager_guard np.random.seed(10) @@ -37,6 +38,7 @@ def logit_grad(x, eps=1e-8): class TestLogitOp(OpTest): def setUp(self): self.op_type = 'logit' + self.python_api = paddle.logit self.dtype = np.float64 self.shape = [120] self.eps = 1e-8 @@ -52,10 +54,11 @@ def set_attrs(self): pass def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], ['Out'], user_defined_grads=[self.x_grad]) + self.check_grad( + ['X'], ['Out'], user_defined_grads=[self.x_grad], check_eager=True) class TestLogitShape(TestLogitOp): @@ -106,6 +109,11 @@ def test_errors(self): x = paddle.fluid.data(name='X2', shape=[100], dtype='float32') self.assertRaises(TypeError, paddle.logit, x, dtype='int32') + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_check_api() + self.test_errors() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_matrix_rank_op.py b/python/paddle/fluid/tests/unittests/test_matrix_rank_op.py index d0b84a0d7e1082..b13b3462617627 100644 --- a/python/paddle/fluid/tests/unittests/test_matrix_rank_op.py +++ b/python/paddle/fluid/tests/unittests/test_matrix_rank_op.py @@ -30,8 +30,13 @@ np.random.seed(SEED) +def matrix_rank_wraper(x, tol=None, use_default_tol=True, hermitian=False): + return paddle.linalg.matrix_rank(x, tol, hermitian) + + class TestMatrixRankOP(OpTest): def setUp(self): + self.python_api = matrix_rank_wraper self.op_type = "matrix_rank" self.init_data() self.inputs = {'X': self.x} @@ -44,7 +49,7 @@ def setUp(self): self.outputs = {'Out': self.out} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def init_data(self): self.x = np.eye(3, dtype=np.float32) @@ -110,6 +115,28 @@ def init_data(self): self.hermitian) +class TestMatrixRankOP6(TestMatrixRankOP): + def init_data(self): + self.x = np.random.rand(3, 4, 5, 6).astype(np.float32) + self.tol_tensor = None + self.tol = None + self.use_default_tol = False + self.hermitian = False + self.out = np.linalg.matrix_rank(self.x, self.tol_tensor, + self.hermitian) + + +class TestMatrixRankOP7(TestMatrixRankOP): + def init_data(self): + self.x = np.eye(200, dtype=np.float64) + self.tol_tensor = np.random.random([200, 200]).astype(self.x.dtype) + self.tol = None + self.use_default_tol = True + self.hermitian = True + self.out = 
np.linalg.matrix_rank(self.x, self.tol_tensor, + self.hermitian) + + class TestMatrixRankAPI(unittest.TestCase): def test_dygraph(self): paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/test_maxout_op.py b/python/paddle/fluid/tests/unittests/test_maxout_op.py index fac400caacdab5..4bc7b09c71e6eb 100644 --- a/python/paddle/fluid/tests/unittests/test_maxout_op.py +++ b/python/paddle/fluid/tests/unittests/test_maxout_op.py @@ -21,6 +21,7 @@ import paddle.fluid.core as core import paddle.nn.functional as F from op_test import OpTest +from paddle.fluid.framework import _test_eager_guard paddle.enable_static() np.random.seed(1) @@ -38,6 +39,7 @@ def maxout_forward_naive(x, groups, channel_axis): class TestMaxOutOp(OpTest): def setUp(self): self.op_type = "maxout" + self.python_api = paddle.nn.functional.maxout self.dtype = 'float64' self.shape = [3, 6, 2, 4] self.groups = 2 @@ -55,10 +57,10 @@ def set_attrs(self): pass def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) class TestMaxOutOpAxis0(TestMaxOutOp): @@ -144,6 +146,10 @@ def test_errors(self): x_float32 = paddle.fluid.data(name='x_float32', shape=[2, 4, 6, 8]) self.assertRaises(ValueError, F.maxout, x_float32, 2, 2) + def test_dygraph_final_state_api(self): + with _test_eager_guard(): + self.test_dygraph_api() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py index 8856624b4efc72..11c04363170766 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py +++ b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py @@ -18,6 +18,7 @@ from numpy.linalg import multi_dot from op_test import OpTest import paddle +from paddle.fluid.framework import _test_eager_guard paddle.enable_static() @@ -27,6 +28,7 @@ class TestMultiDotOp(OpTest): def setUp(self): self.op_type = "multi_dot" + self.python_api = paddle.linalg.multi_dot self.dtype = self.get_dtype() self.get_inputs_and_outputs() @@ -40,11 +42,11 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B])} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') + self.check_grad(['x0'], 'Out', check_eager=True) + self.check_grad(['x1'], 'Out', check_eager=True) #(A*B)*C @@ -57,9 +59,9 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} def test_check_grad(self): - self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') - self.check_grad(['x2'], 'Out') + self.check_grad(['x0'], 'Out', check_eager=True) + self.check_grad(['x1'], 'Out', check_eager=True) + self.check_grad(['x2'], 'Out', check_eager=True) #A*(B*C) @@ -72,9 +74,9 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} def test_check_grad(self): - self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') - self.check_grad(['x2'], 'Out') + self.check_grad(['x0'], 'Out', check_eager=True) + self.check_grad(['x1'], 'Out', check_eager=True) + self.check_grad(['x2'], 'Out', check_eager=True) class TestMultiDotOp4Mat(TestMultiDotOp): @@ -90,10 +92,10 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C, self.D])} def test_check_grad(self): - 
self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') - self.check_grad(['x2'], 'Out') - self.check_grad(['x3'], 'Out') + self.check_grad(['x0'], 'Out', check_eager=True) + self.check_grad(['x1'], 'Out', check_eager=True) + self.check_grad(['x2'], 'Out', check_eager=True) + self.check_grad(['x3'], 'Out', check_eager=True) class TestMultiDotOpFirst1D(TestMultiDotOp): @@ -143,9 +145,9 @@ def get_inputs_and_outputs(self): self.outputs = {'Out': multi_dot([self.A, self.B, self.C])} def test_check_grad(self): - self.check_grad(['x0'], 'Out') - self.check_grad(['x1'], 'Out') - self.check_grad(['x2'], 'Out') + self.check_grad(['x0'], 'Out', check_eager=True) + self.check_grad(['x1'], 'Out', check_eager=True) + self.check_grad(['x2'], 'Out', check_eager=True) class TestMultiDotOp4MatLast1D(TestMultiDotOp4Mat): @@ -260,6 +262,10 @@ def test_dygraph_without_out(self): expected_result = np.linalg.multi_dot([input_array1, input_array2]) self.assertTrue(np.allclose(expected_result, out.numpy())) + def test_dygraph_final_state_api(self): + with _test_eager_guard(): + self.test_dygraph_without_out() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multinomial_op.py b/python/paddle/fluid/tests/unittests/test_multinomial_op.py index a65a1c7e14c2bf..ecde527523d3dd 100644 --- a/python/paddle/fluid/tests/unittests/test_multinomial_op.py +++ b/python/paddle/fluid/tests/unittests/test_multinomial_op.py @@ -227,9 +227,6 @@ def test_fixed_random_number(self): if not "V100" in paddle.device.cuda.get_device_name(): return - if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): - return - print("Test Fixed Random number on V100 GPU------>") paddle.disable_static() paddle.set_device('gpu') diff --git a/python/paddle/fluid/tests/unittests/test_multiplex_op.py b/python/paddle/fluid/tests/unittests/test_multiplex_op.py index a840586d78db08..093ee86aeea6ee 100644 --- a/python/paddle/fluid/tests/unittests/test_multiplex_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiplex_op.py @@ -19,6 +19,7 @@ from op_test import OpTest import paddle import paddle.fluid as fluid +from paddle.fluid.framework import _test_eager_guard class TestMultiplexOp(OpTest): @@ -68,26 +69,26 @@ def test_errors(self): def test_list(): # the inputs type must be list - fluid.layers.multiplex(inputs=x1, index=index) + paddle.multiplex(inputs=x1, index=index) self.assertRaises(TypeError, test_list) def test_len(): - fluid.layers.multiplex(inputs=[x1], index=index) + paddle.multiplex(inputs=[x1], index=index) self.assertRaises(ValueError, test_len) def test_type(): y1 = fluid.data(name='y1', shape=[None, 2], dtype='int16') y2 = fluid.data(name='y2', shape=[None, 2], dtype='int16') - fluid.layers.multiplex(inputs=[y1, y2], index=index) + paddle.multiplex(inputs=[y1, y2], index=index) self.assertRaises(TypeError, test_type) def test_type2(): index2 = fluid.data( name='index2', shape=[None, 1], dtype='int16') - fluid.layers.multiplex(inputs=[x1, x2], index=index2) + paddle.multiplex(inputs=[x1, x2], index=index2) self.assertRaises(TypeError, test_type2) @@ -102,6 +103,30 @@ def test_multiplex_dygraph(self): res = paddle.multiplex(inputs, index) paddle.enable_static() + def test_dygraph_final_state_api(self): + with fluid.dygraph.guard(): + img1 = np.array([[1, 2], [3, 4]]).astype(np.float32) + img2 = np.array([[5, 6], [7, 8]]).astype(np.float32) + inputs = [paddle.to_tensor(img1), paddle.to_tensor(img2)] + index = paddle.to_tensor(np.array([[1], [0]]).astype(np.int32)) + 
inputs[0].stop_gradient = False + inputs[1].stop_gradient = False + res = paddle.multiplex(inputs, index) + res.backward() + with _test_eager_guard(): + inputs_eager = [paddle.to_tensor(img1), paddle.to_tensor(img2)] + index_eager = paddle.to_tensor( + np.array([[1], [0]]).astype(np.int32)) + inputs_eager[0].stop_gradient = False + inputs_eager[1].stop_gradient = False + res_eager = paddle.multiplex(inputs_eager, index_eager) + res_eager.backward() + self.assertEqual((res.numpy() == res_eager.numpy()).all(), True) + self.assertEqual((inputs[0].grad.numpy() == + inputs_eager[0].grad.numpy()).all(), True) + self.assertEqual((inputs[1].grad.numpy() == + inputs_eager[1].grad.numpy()).all(), True) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler.py b/python/paddle/fluid/tests/unittests/test_newprofiler.py index 0088687b125636..ac2b205e61128e 100755 --- a/python/paddle/fluid/tests/unittests/test_newprofiler.py +++ b/python/paddle/fluid/tests/unittests/test_newprofiler.py @@ -16,6 +16,7 @@ import unittest import numpy as np +import tempfile import paddle import paddle.profiler as profiler @@ -138,6 +139,146 @@ def test_nvprof(self): y = x / 2.0 +class TestGetProfiler(unittest.TestCase): + def test_getprofiler(self): + config_content = ''' + { + "targets": ["CPU"], + "scheduler": [3,4], + "on_trace_ready": { + "export_chrome_tracing":{ + "module": "paddle.profiler", + "use_direct": false, + "args": [], + "kwargs": { + "dir_name": "testdebug/" + } + } + }, + "timer_only": false + } + ''' + filehandle = tempfile.NamedTemporaryFile(mode='w') + filehandle.write(config_content) + filehandle.flush() + import paddle.profiler.profiler as profiler + profiler = profiler.get_profiler(filehandle.name) + x_value = np.random.randn(2, 3, 3) + x = paddle.to_tensor( + x_value, stop_gradient=False, place=paddle.CPUPlace()) + with profiler: + for i in range(5): + y = x / 2.0 + ones_like_y = paddle.ones_like(y) + profiler.step() + + # below tests are just for coverage, wrong config + # test use_direct + config_content = ''' + { + "targets": ["Cpu", "Gpu"], + "scheduler": { + "make_scheduler":{ + "module": "paddle.profiler", + "use_direct": true, + "args": [], + "kwargs": {} + } + }, + "on_trace_ready": { + "export_chrome_tracing":{ + "module": "paddle.profiler1", + "use_direct": true, + "args": [], + "kwargs": { + } + } + }, + "timer_only": false + } + ''' + filehandle = tempfile.NamedTemporaryFile(mode='w') + filehandle.write(config_content) + filehandle.flush() + import paddle.profiler.profiler as profiler + try: + profiler = profiler.get_profiler(filehandle.name) + except: + pass + + # test scheduler + config_content = ''' + { + "targets": ["Cpu", "Gpu"], + "scheduler": { + "make_scheduler":{ + "module": "paddle.profiler", + "use_direct": false, + "args": [], + "kwargs": { + "closed": 1, + "ready": 1, + "record": 2 + } + } + }, + "on_trace_ready": { + "export_chrome_tracing":{ + "module": "paddle.profiler", + "use_direct": true, + "args": [], + "kwargs": { + } + } + }, + "timer_only": false + } + ''' + filehandle = tempfile.NamedTemporaryFile(mode='w') + filehandle.write(config_content) + filehandle.flush() + import paddle.profiler.profiler as profiler + profiler = profiler.get_profiler(filehandle.name) + + # test exception + config_content = ''' + { + "targets": [1], + "scheduler": { + "make_scheduler1":{ + "module": "paddle.profiler", + "use_direct": false, + "args": [], + "kwargs": { + "closed": 1, + "ready": 1, + "record": 2 + } + } + }, + 
"on_trace_ready": { + "export_chrome_tracing1":{ + "module": "paddle.profiler", + "use_direct": false, + "args": [], + "kwargs": { + "dir_name": "testdebug/" + } + } + }, + "timer_only": 1 + } + ''' + filehandle = tempfile.NamedTemporaryFile(mode='w') + filehandle.write(config_content) + filehandle.flush() + import paddle.profiler.profiler as profiler + profiler = profiler.get_profiler(filehandle.name) + # test path error + import paddle.profiler.profiler as profiler + profiler = profiler.get_profiler('nopath.json') + + class RandomDataset(Dataset): def __init__(self, num_samples): self.num_samples = num_samples diff --git a/python/paddle/fluid/tests/unittests/test_norm_op.py b/python/paddle/fluid/tests/unittests/test_norm_op.py index 626de9b12b9c15..49e1f2533491d7 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_norm_op.py @@ -32,6 +32,7 @@ def l2_norm(x, axis, epsilon): class TestNormOp(OpTest): def setUp(self): self.op_type = "norm" + self.python_api = paddle.fluid.layers.l2_normalize self.init_test_case() self.init_dtype() x = np.random.random(self.shape).astype(self.dtype) diff --git a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py index f1a409c712fc32..06d975fe2b88f8 100644 --- a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py +++ b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py @@ -52,6 +52,7 @@ def pixel_shuffle_np(x, up_factor, data_format="NCHW"): class TestPixelShuffleOp(OpTest): def setUp(self): self.op_type = "pixel_shuffle" + self.python_api = paddle.nn.functional.pixel_shuffle self.init_data_format() n, c, h, w = 2, 9, 4, 4 @@ -73,10 +74,10 @@ def init_data_format(self): self.format = "NCHW" def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) class TestChannelLast(TestPixelShuffleOp): @@ -220,4 +221,5 @@ def error_data_format_layer(): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_poisson_op.py b/python/paddle/fluid/tests/unittests/test_poisson_op.py index 2123d4e0e7e359..f8183bb5f8db28 100644 --- a/python/paddle/fluid/tests/unittests/test_poisson_op.py +++ b/python/paddle/fluid/tests/unittests/test_poisson_op.py @@ -107,9 +107,6 @@ def test_fixed_random_number(self): if not paddle.is_compiled_with_cuda(): return - if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): - return - print("Test Fixed Random number on GPU------>") paddle.disable_static() paddle.set_device('gpu') diff --git a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py index adc42d0447f340..dc944e68c7f555 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py +++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py @@ -185,20 +185,22 @@ def test_statistic_case1(self): profiler.TracerEventType.Communication), 5) self.assertEqual(len(event_summary.items), 2) self.assertEqual(len(event_summary.userdefined_items), 1) - self.assertEqual(len(event_summary.model_perspective_items), 3) + self.assertEqual(len(event_summary.model_perspective_items), 4) self.assertEqual(len(event_summary.memory_manipulation_items), 1) self.assertEqual(event_summary.items['conv2d'].cpu_time, 15) - 
self.assertEqual(event_summary.items['conv2d'].gpu_time, 25) + self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25) self.assertEqual( event_summary.model_perspective_items['Forward'].cpu_time, 100) self.assertEqual( - event_summary.model_perspective_items['Forward'].gpu_time, 135) + event_summary.model_perspective_items['Forward'].general_gpu_time, + 135) self.assertEqual( - event_summary.model_perspective_items['Backward'].gpu_time, 0) + event_summary.model_perspective_items['Backward'].general_gpu_time, + 0) self.assertEqual( event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15) - self.assertEqual( - event_summary.memory_manipulation_items['AsyncMemcpy'].gpu_time, 60) + self.assertEqual(event_summary.memory_manipulation_items['AsyncMemcpy'] + .general_gpu_time, 60) print( profiler.profiler_statistic._build_table( statistic_data, @@ -226,31 +228,31 @@ def test_statistic_case2(self): userdefined_node = HostPythonNode('Communication Time', profiler.TracerEventType.UserDefined, 100, 110, 1000, 1001) - reduce_all_launchkernel0 = HostPythonNode( + allreduce_launchkernel0 = HostPythonNode( 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 102, 104, 1000, 1001) - nccl_reduce_all_kernel0 = DevicePythonNode( - 'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 105, 120, + nccl_allreduce_kernel0 = DevicePythonNode( + 'nccl_allreduce_kernel', profiler.TracerEventType.Kernel, 105, 120, 0, 0, 2) communication_node = HostPythonNode( 'Communication', profiler.TracerEventType.Communication, 105, 110, 1000, 1001) - reduce_all_op1 = HostPythonNode('reduce_all_op1', - profiler.TracerEventType.Operator, 105, - 108, 1000, 1001) - reduce_all_op1_infershape = HostPythonNode( - 'reduce_all_op1::infershape', - profiler.TracerEventType.OperatorInner, 105, 106, 1000, 1001) + allreduce_op1 = HostPythonNode('allreduce_op1', + profiler.TracerEventType.Operator, 105, + 108, 1000, 1001) + allreduce_op1_infershape = HostPythonNode( + 'allreduce_op1::infershape', profiler.TracerEventType.OperatorInner, + 105, 106, 1000, 1001) - reduce_all_launchkernel1 = HostPythonNode( + allreduce_launchkernel1 = HostPythonNode( 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 106, 107, 1000, 1001) - nccl_reduce_all_kernel1 = DevicePythonNode( - 'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 130, 150, + nccl_allreduce_kernel1 = DevicePythonNode( + 'nccl_allreduce_kernel', profiler.TracerEventType.Kernel, 130, 150, 0, 0, 2) backward_node = HostPythonNode('Gradient Backward', @@ -305,19 +307,19 @@ def test_statistic_case2(self): 'sync_batch_norm_memcpy', profiler.TracerEventType.Memcpy, 150, 200, 0, 0, 1) - reduce_all_node2 = HostPythonNode('reduce_all', - profiler.TracerEventType.Operator, - 230, 250, 1000, 1001) + allreduce_node2 = HostPythonNode('allreduce', + profiler.TracerEventType.Operator, 230, + 250, 1000, 1001) - reduce_all_node2_infershape = HostPythonNode( - 'reduce_all_node2::infershape', + allreduce_node2_infershape = HostPythonNode( + 'allreduce_node2::infershape', profiler.TracerEventType.OperatorInner, 231, 232, 1000, 1001) - reduce_all_launchkernel2 = HostPythonNode( + allreduce_launchkernel2 = HostPythonNode( 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 235, 240, 1000, 1001) - nccl_reduce_all_kernel2 = DevicePythonNode( - 'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 250, 280, + nccl_allreduce_kernel2 = DevicePythonNode( + 'nccl_allreduce_kernel', profiler.TracerEventType.Kernel, 250, 280, 0, 0, 2) 
root_node.children_node.append(profilerstep_node) @@ -329,12 +331,12 @@ def test_statistic_case2(self): yolonet_node.children_node.extend( [sync_batch_norm_node, userdefined_node]) userdefined_node.children_node.append(communication_node) - userdefined_node.runtime_node.append(reduce_all_launchkernel0) - reduce_all_launchkernel0.device_node.append(nccl_reduce_all_kernel0) - communication_node.children_node.append(reduce_all_op1) - reduce_all_op1.children_node.append(reduce_all_op1_infershape) - reduce_all_op1.runtime_node.append(reduce_all_launchkernel1) - reduce_all_launchkernel1.device_node.append(nccl_reduce_all_kernel1) + userdefined_node.runtime_node.append(allreduce_launchkernel0) + allreduce_launchkernel0.device_node.append(nccl_allreduce_kernel0) + communication_node.children_node.append(allreduce_op1) + allreduce_op1.children_node.append(allreduce_op1_infershape) + allreduce_op1.runtime_node.append(allreduce_launchkernel1) + allreduce_launchkernel1.device_node.append(nccl_allreduce_kernel1) conv2d_node.children_node.extend( [conv2d_infer_shape, conv2d_compute, conv2d_MemCpy]) conv2d_compute.runtime_node.append(conv2d_launchkernel) @@ -350,10 +352,10 @@ def test_statistic_case2(self): sync_batch_norm_MemCpy.runtime_node.append(sync_batch_norm_cudaMemCpy) sync_batch_norm_launchkernel.device_node.append(sync_batch_norm_kernel) sync_batch_norm_cudaMemCpy.device_node.append(sync_batch_norm_memcpy) - optimization_node.children_node.append(reduce_all_node2) - reduce_all_node2.children_node.append(reduce_all_node2_infershape) - reduce_all_node2.runtime_node.append(reduce_all_launchkernel2) - reduce_all_launchkernel2.device_node.append(nccl_reduce_all_kernel2) + optimization_node.children_node.append(allreduce_node2) + allreduce_node2.children_node.append(allreduce_node2_infershape) + allreduce_node2.runtime_node.append(allreduce_launchkernel2) + allreduce_launchkernel2.device_node.append(nccl_allreduce_kernel2) thread_tree = {'thread1001': root_node} extra_info = { 'Process Cpu Utilization': '1.02', @@ -415,20 +417,22 @@ def test_statistic_case2(self): distributed_summary.overlap_range), 85) self.assertEqual(len(event_summary.items), 4) self.assertEqual(len(event_summary.userdefined_items), 1) - self.assertEqual(len(event_summary.model_perspective_items), 3) + self.assertEqual(len(event_summary.model_perspective_items), 4) self.assertEqual(len(event_summary.memory_manipulation_items), 1) self.assertEqual(event_summary.items['conv2d'].cpu_time, 15) - self.assertEqual(event_summary.items['conv2d'].gpu_time, 25) + self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25) self.assertEqual( event_summary.model_perspective_items['Forward'].cpu_time, 100) self.assertEqual( - event_summary.model_perspective_items['Forward'].gpu_time, 315) + event_summary.model_perspective_items['Forward'].general_gpu_time, + 315) self.assertEqual( - event_summary.model_perspective_items['Backward'].gpu_time, 0) + event_summary.model_perspective_items['Backward'].general_gpu_time, + 0) self.assertEqual( event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15) - self.assertEqual( - event_summary.memory_manipulation_items['AsyncMemcpy'].gpu_time, 60) + self.assertEqual(event_summary.memory_manipulation_items['AsyncMemcpy'] + .general_gpu_time, 60) print( profiler.profiler_statistic._build_table( statistic_data, diff --git a/python/paddle/fluid/tests/unittests/test_randint_op.py b/python/paddle/fluid/tests/unittests/test_randint_op.py index 1eb99e08bb8e1b..361f4d280f70fa 100644 --- 
a/python/paddle/fluid/tests/unittests/test_randint_op.py +++ b/python/paddle/fluid/tests/unittests/test_randint_op.py @@ -198,9 +198,6 @@ def test_fixed_random_number(self): if not "V100" in paddle.device.cuda.get_device_name(): return - if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): - return - print("Test Fixed Random number on GPU------>") paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/test_randperm_op.py b/python/paddle/fluid/tests/unittests/test_randperm_op.py index 5c9ab36fa34bc3..deb0a9a082140f 100644 --- a/python/paddle/fluid/tests/unittests/test_randperm_op.py +++ b/python/paddle/fluid/tests/unittests/test_randperm_op.py @@ -155,9 +155,6 @@ def test_fixed_random_number(self): if not paddle.is_compiled_with_cuda(): return - if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): - return - print("Test Fixed Random number on GPU------>") paddle.disable_static() paddle.set_device('gpu') diff --git a/python/paddle/fluid/tests/unittests/test_segment_ops.py b/python/paddle/fluid/tests/unittests/test_segment_ops.py index e2aadbedbd07fd..90d597837a8e10 100644 --- a/python/paddle/fluid/tests/unittests/test_segment_ops.py +++ b/python/paddle/fluid/tests/unittests/test_segment_ops.py @@ -73,6 +73,17 @@ def compute_segment_min_max(x, segment_ids, pooltype="MAX"): return results, gradient / results.size +def segment_pool_split(X, SegmentIds, pooltype): + if pooltype == "SUM": + return paddle.incubate.tensor.segment_sum(X, SegmentIds) + elif pooltype == "MEAN": + return paddle.incubate.tensor.segment_mean(X, SegmentIds) + elif pooltype == "MIN": + return paddle.incubate.tensor.segment_min(X, SegmentIds) + elif pooltype == "MAX": + return paddle.incubate.tensor.segment_max(X, SegmentIds) + + class TestSegmentOps(OpTest): def set_data(self): x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) @@ -90,6 +101,8 @@ def compute(self, x, segment_ids): def prepare(self): self.op_type = "segment_pool" + self.python_api = segment_pool_split + self.python_out_sig = ["Out"] self.dtype = np.float64 self.shape = [30, 15] self.attrs = {"pooltype": "SUM"} @@ -105,10 +118,10 @@ def setUp(self): self.outputs = {'Out': result.astype(self.dtype)} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(["X"], "Out") + self.check_grad(["X"], "Out", check_eager=True) class TestSegmentSum2(TestSegmentOps): @@ -259,4 +272,5 @@ def test_dygraph(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_switch_autotune.py b/python/paddle/fluid/tests/unittests/test_switch_autotune.py index 08cf120a0366eb..1c08811d4b95c5 100644 --- a/python/paddle/fluid/tests/unittests/test_switch_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_switch_autotune.py @@ -43,6 +43,16 @@ def static_program(net, data): return loss +def set_flags(enable_autotune): + if paddle.is_compiled_with_cuda(): + if enable_autotune: + paddle.set_flags({'FLAGS_conv_workspace_size_limit': -1}) + paddle.set_flags({'FLAGS_cudnn_exhaustive_search': 1}) + else: + paddle.set_flags({'FLAGS_conv_workspace_size_limit': 512}) + paddle.set_flags({'FLAGS_cudnn_exhaustive_search': 0}) + + class TestAutoTune(unittest.TestCase): def test_autotune(self): paddle.fluid.core.disable_autotune() @@ -61,6 +71,7 @@ def check_status(self, expected_res): class TestDygraphAutoTuneStatus(TestAutoTune): def run_program(self, enable_autotune): + set_flags(enable_autotune) if 
enable_autotune: paddle.fluid.core.enable_autotune() else: @@ -87,16 +98,27 @@ def run_program(self, enable_autotune): } self.check_status(expected_res) - def test_enable_autotune(self): + def func_enable_autotune(self): self.run_program(enable_autotune=True) - def test_disable_autotune(self): + def test_enable_autotune(self): + with paddle.fluid.framework._test_eager_guard(): + self.func_enable_autotune() + self.func_enable_autotune() + + def func_disable_autotune(self): self.run_program(enable_autotune=False) + def test_disable_autotune(self): + with paddle.fluid.framework._test_eager_guard(): + self.func_disable_autotune() + self.func_disable_autotune() + class TestStaticAutoTuneStatus(TestAutoTune): def run_program(self, enable_autotune): paddle.enable_static() + set_flags(enable_autotune) if enable_autotune: paddle.fluid.core.enable_autotune() else: @@ -136,12 +158,22 @@ def run_program(self, enable_autotune): self.check_status(expected_res) paddle.disable_static() - def test_enable_autotune(self): + def func_enable_autotune(self): self.run_program(enable_autotune=True) - def test_disable_autotune(self): + def test_enable_autotune(self): + with paddle.fluid.framework._test_eager_guard(): + self.func_enable_autotune() + self.func_enable_autotune() + + def func_disable_autotune(self): self.run_program(enable_autotune=False) + def test_disable_autotune(self): + with paddle.fluid.framework._test_eager_guard(): + self.func_disable_autotune() + self.func_disable_autotune() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_unbind_op.py b/python/paddle/fluid/tests/unittests/test_unbind_op.py index e16fb6ddaacd71..43f2f3526ac0fc 100644 --- a/python/paddle/fluid/tests/unittests/test_unbind_op.py +++ b/python/paddle/fluid/tests/unittests/test_unbind_op.py @@ -17,9 +17,11 @@ import unittest import numpy as np from op_test import OpTest, convert_float_to_uint16 +import paddle import paddle.fluid as fluid import paddle.tensor as tensor from paddle.fluid import compiler, Program, program_guard, core +from paddle.fluid.framework import _test_eager_guard class TestUnbind(unittest.TestCase): @@ -39,6 +41,25 @@ def test_unbind(self): assert np.array_equal(res_1, input_1[0, 0:100]) assert np.array_equal(res_2, input_1[1, 0:100]) + def test_unbind_dygraph(self): + with fluid.dygraph.guard(): + np_x = np.random.random([2, 3]).astype("float32") + x = paddle.to_tensor(np_x) + x.stop_gradient = False + [res_1, res_2] = paddle.unbind(x, 0) + self.assertTrue(np.array_equal(res_1, np_x[0, 0:100])) + self.assertTrue(np.array_equal(res_2, np_x[1, 0:100])) + + out = paddle.add_n([res_1, res_2]) + + np_grad = np.ones(x.shape, np.float32) + out.backward() + self.assertTrue(np.array_equal(x.grad.numpy(), np_grad)) + + def test_unbind_dygraph_final_state(self): + with _test_eager_guard(): + self.test_unbind_dygraph() + class TestLayersUnbind(unittest.TestCase): def test_layers_unbind(self): @@ -157,6 +178,7 @@ def outReshape(self): class TestUnbindBF16Op(OpTest): def setUp(self): self._set_op_type() + self.python_api = paddle.unbind self.dtype = self.get_dtype() self.axis = 0 self.num = 3 diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index 41b6ed36d65ccc..0b27c616230898 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -26,6 +26,7 @@ from paddle.fluid.op import Operator import 
paddle.fluid as fluid from paddle.fluid import Program, program_guard +from paddle.fluid.framework import _test_eager_guard def output_hist(out): @@ -52,6 +53,7 @@ def output_hist_diag(out): class TestUniformRandomOp_attr_tensorlist(OpTest): def setUp(self): self.op_type = "uniform_random" + self.python_api = paddle.uniform self.new_shape = (1000, 784) shape_tensor = [] for index, ele in enumerate(self.new_shape): @@ -84,6 +86,7 @@ def init_attrs(self): class TestUniformRandomOp_attr_tensorlist_int32(OpTest): def setUp(self): self.op_type = "uniform_random" + self.python_api = paddle.uniform self.new_shape = (1000, 784) shape_tensor = [] for index, ele in enumerate(self.new_shape): @@ -110,6 +113,7 @@ def verify_output(self, outs): class TestUniformRandomOp_attr_tensor(OpTest): def setUp(self): self.op_type = "uniform_random" + self.python_api = paddle.uniform self.inputs = {"ShapeTensor": np.array([1000, 784]).astype("int64")} self.init_attrs() self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")} @@ -131,6 +135,7 @@ def verify_output(self, outs): class TestUniformRandomOp_attr_tensor_int32(OpTest): def setUp(self): self.op_type = "uniform_random" + self.python_api = paddle.uniform self.inputs = {"ShapeTensor": np.array([1000, 784]).astype("int32")} self.init_attrs() self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")} @@ -152,6 +157,7 @@ def verify_output(self, outs): class TestUniformRandomOp(OpTest): def setUp(self): self.op_type = "uniform_random" + self.python_api = paddle.uniform self.inputs = {} self.init_attrs() self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")} @@ -174,6 +180,18 @@ def verify_output(self, outs): np.allclose( hist, prob, rtol=0, atol=0.01), "hist: " + str(hist)) + def test_check_api(self): + places = self._get_places() + for place in places: + with fluid.dygraph.base.guard(place=place): + out = self.python_api(self.attrs['shape'], 'float32', + self.attrs['min'], self.attrs['max'], + self.attrs['seed']) + + def test_check_api_eager(self): + with _test_eager_guard(): + self.test_check_api() + class TestUniformRandomOpError(unittest.TestCase): def test_errors(self): @@ -573,37 +591,46 @@ def test_fixed_random_number(self): if not "V100" in paddle.device.cuda.get_device_name(): return - if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): - return - - def _check_random_value(dtype, expect, expect_mean, expect_std): - x = paddle.rand([32, 3, 1024, 1024], dtype=dtype) - actual = x.numpy() - self.assertTrue(np.allclose(actual[2, 1, 512, 1000:1010], expect)) - self.assertEqual(np.mean(actual), expect_mean) - self.assertEqual(np.std(actual), expect_std) - print("Test Fixed Random number on V100 GPU------>") paddle.disable_static() + paddle.set_device('gpu') paddle.seed(2021) + + expect_mean = 0.50000454338820143895816272561205551028251647949218750 + expect_std = 0.28867379167297479991560749112977646291255950927734375 expect = [ 0.55298901, 0.65184678, 0.49375412, 0.57943639, 0.16459608, 0.67181056, 0.03021481, 0.0238559, 0.07742096, 0.55972187 ] - expect_mean = 0.50000454338820143895816272561205551028251647949218750 - expect_std = 0.28867379167297479991560749112977646291255950927734375 - _check_random_value(core.VarDesc.VarType.FP64, expect, expect_mean, - expect_std) + out = paddle.rand([32, 3, 1024, 1024], dtype='float64').numpy() + self.assertEqual(np.mean(out), expect_mean) + self.assertEqual(np.std(out), expect_std) + self.assertTrue(np.allclose(out[2, 1, 512, 1000:1010], expect)) + expect_mean = 
0.50002604722976684570312500 + expect_std = 0.2886914908885955810546875 expect = [ 0.45320973, 0.17582087, 0.725341, 0.30849215, 0.622257, 0.46352342, 0.97228295, 0.12771158, 0.286525, 0.9810645 ] - expect_mean = 0.50002604722976684570312500 - expect_std = 0.2886914908885955810546875 - _check_random_value(core.VarDesc.VarType.FP32, expect, expect_mean, - expect_std) + out = paddle.rand([32, 3, 1024, 1024], dtype='float32').numpy() + self.assertEqual(np.mean(out), expect_mean) + self.assertEqual(np.std(out), expect_std) + self.assertTrue(np.allclose(out[2, 1, 512, 1000:1010], expect)) + + expect_mean = 25.11843109130859375 + expect_std = 43.370647430419921875 + expect = [ + 30.089634, 77.05225, 3.1201615, 68.34072, 59.266724, -25.33281, + 12.973292, 27.41127, -17.412298, 27.931019 + ] + out = paddle.empty( + [16, 16, 16, 16], dtype='float32').uniform_(-50, 100).numpy() + self.assertEqual(np.mean(out), expect_mean) + self.assertEqual(np.std(out), expect_std) + self.assertTrue(np.allclose(out[10, 10, 10, 0:10], expect)) + paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_zeropad2d.py b/python/paddle/fluid/tests/unittests/test_zeropad2d.py index 2849caf17c62d8..e2913097ae1b14 100644 --- a/python/paddle/fluid/tests/unittests/test_zeropad2d.py +++ b/python/paddle/fluid/tests/unittests/test_zeropad2d.py @@ -16,6 +16,7 @@ import unittest import numpy as np +import paddle from paddle import to_tensor from paddle.nn.functional import zeropad2d from paddle.nn import ZeroPad2D @@ -33,7 +34,7 @@ def setUp(self): self.shape = [4, 3, 224, 224] self.unsupport_dtypes = ['bool', 'int8'] - def test_unsupport_dtypes(self): + def func_unsupport_dtypes(self): """ test unsupport dtypes. """ @@ -43,6 +44,11 @@ def test_unsupport_dtypes(self): x_tensor = to_tensor(x).astype(dtype) self.assertRaises(TypeError, zeropad2d, x=x_tensor, padding=pad) + def test_unsupport_dtypes(self): + with paddle.fluid.framework._test_eager_guard(): + self.func_unsupport_dtypes() + self.func_unsupport_dtypes() + class TestZeroPad2dAPI(unittest.TestCase): """ @@ -56,7 +62,7 @@ def setUp(self): self.shape = [4, 3, 224, 224] self.support_dtypes = ['float32', 'float64', 'int32', 'int64'] - def test_support_dtypes(self): + def func_support_dtypes(self): """ test support types """ @@ -69,7 +75,12 @@ def test_support_dtypes(self): ret_res = zeropad2d(x_tensor, [pad, pad, pad, pad]).numpy() self.assertTrue(np.allclose(expect_res, ret_res)) - def test_support_pad2(self): + def test_support_dtypes(self): + with paddle.fluid.framework._test_eager_guard(): + self.func_support_dtypes() + self.func_support_dtypes() + + def func_support_pad2(self): """ test the type of 'pad' is list. """ @@ -82,7 +93,12 @@ def test_support_pad2(self): ret_res = zeropad2d(x_tensor, pad).numpy() self.assertTrue(np.allclose(expect_res, ret_res)) - def test_support_pad3(self): + def test_support_pad2(self): + with paddle.fluid.framework._test_eager_guard(): + self.func_support_pad2() + self.func_support_pad2() + + def func_support_pad3(self): """ test the type of 'pad' is tuple. """ @@ -95,7 +111,12 @@ def test_support_pad3(self): ret_res = zeropad2d(x_tensor, pad).numpy() self.assertTrue(np.allclose(expect_res, ret_res)) - def test_support_pad4(self): + def test_support_pad3(self): + with paddle.fluid.framework._test_eager_guard(): + self.func_support_pad3() + self.func_support_pad3() + + def func_support_pad4(self): """ test the type of 'pad' is paddle.Tensor. 
""" @@ -109,6 +130,11 @@ def test_support_pad4(self): ret_res = zeropad2d(x_tensor, pad_tensor).numpy() self.assertTrue(np.allclose(expect_res, ret_res)) + def test_support_pad4(self): + with paddle.fluid.framework._test_eager_guard(): + self.func_support_pad4() + self.func_support_pad4() + class TestZeroPad2DLayer(unittest.TestCase): """ @@ -124,12 +150,17 @@ def setUp(self): [[0, 0], [0, 0], [self.pad[2], self.pad[3]], [self.pad[0], self.pad[1]]]) - def test_layer(self): + def func_layer(self): self.assertTrue( np.allclose( zeropad2d(to_tensor(self.x), self.pad).numpy(), self.padLayer(to_tensor(self.x)))) + def test_layer(self): + with paddle.fluid.framework._test_eager_guard(): + self.func_layer() + self.func_layer() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py index 58a8fa3083055a..9d98ab70041e9f 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py @@ -27,104 +27,120 @@ paddle.enable_static() +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") class TestMulOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): # The input type of mul_op must be Variable. x1 = fluid.create_lod_tensor( - np.array([[-1]]), [[1]], fluid.CPUPlace()) + np.array([[-1]]), [[1]], fluid.XPUPlace(0)) x2 = fluid.create_lod_tensor( - np.array([[-1]]), [[1]], fluid.CPUPlace()) + np.array([[-1]]), [[1]], fluid.XPUPlace(0)) self.assertRaises(TypeError, fluid.layers.mul, x1, x2) - # The input dtype of mul_op must be float32 or float64. + # The input dtype of mul_op must be float32. 
x3 = fluid.layers.data(name='x3', shape=[4], dtype="int32") x4 = fluid.layers.data(name='x4', shape=[4], dtype="int32") self.assertRaises(TypeError, fluid.layers.mul, x3, x4) -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPUMulOp1(XPUOpTest): - def setUp(self): - self.op_type = "mul" - self.dtype = np.float32 - self.use_xpu = True - self.init_dtype_type() - self.inputs = { - 'X': np.random.random((3, 4, 2, 9)).astype(self.dtype), - 'Y': np.random.random((3, 6, 1, 2, 3)).astype(self.dtype) - } - self.attrs = { - 'x_num_col_dims': 2, - 'y_num_col_dims': 2, - } - result = np.dot(self.inputs['X'].reshape(3 * 4, 2 * 9), - self.inputs['Y'].reshape(3 * 6, 1 * 2 * 3)) - result = result.reshape(3, 4, 1, 2, 3) - self.outputs = {'Out': result} - - def init_dtype_type(self): - pass - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, atol=0.01) - - def test_check_grad_normal(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place( - place, ['X', 'Y'], 'Out', max_relative_error=0.1) - - def test_check_grad_ingore_x(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place( - place, ['Y'], 'Out', max_relative_error=0.1, no_grad_set=set("X")) - - def test_check_grad_ignore_y(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place( - place, ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y')) - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPUMulOp2(XPUOpTest): - def setUp(self): - self.op_type = "mul" - self.use_xpu = True - self.dtype = np.float32 - self.init_dtype_type() - self.inputs = { - 'X': np.random.random((20, 5)).astype(self.dtype), - 'Y': np.random.random((5, 21)).astype(self.dtype) - } - self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} - - def init_dtype_type(self): - self.dtype = np.float32 - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, atol=0.01) - - def test_check_grad_normal(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place( - place, ['X', 'Y'], 'Out', max_relative_error=0.1) - - def test_check_grad_ingore_x(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place( - place, ['Y'], 'Out', max_relative_error=0.1, no_grad_set=set("X")) - - def test_check_grad_ingore_y(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place( - place, ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y')) - +class XPUTestMulOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'mul' + self.use_dynamic_create_class = False + + class TestXPUMulOp1(XPUOpTest): + def setUp(self): + self.op_type = "mul" + self.dtype = self.in_type + self.inputs = { + 'X': np.random.random((3, 4, 2, 9)).astype(self.in_type_str), + 'Y': np.random.random((3, 6, 1, 2, 3)).astype(self.in_type_str) + } + self.attrs = { + 'x_num_col_dims': 2, + 'y_num_col_dims': 2, + } + result = np.dot(self.inputs['X'].reshape(3 * 4, 2 * 9), + self.inputs['Y'].reshape(3 * 6, 1 * 2 * 3)) + result = result.reshape(3, 4, 1, 2, 3) + self.outputs = {'Out': result} + + def test_check_output(self): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_output_with_place(place, atol=0.01) + + def test_check_grad_normal(self): + place = paddle.XPUPlace(0) + paddle.enable_static() + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', max_relative_error=0.1) + + def test_check_grad_ingore_x(self): + place = paddle.XPUPlace(0) + paddle.enable_static() + 
self.check_grad_with_place( + place, ['Y'], + 'Out', + max_relative_error=0.1, + no_grad_set=set("X")) + + def test_check_grad_ignore_y(self): + place = paddle.XPUPlace(0) + paddle.enable_static() + self.check_grad_with_place( + place, ['X'], + 'Out', + max_relative_error=0.1, + no_grad_set=set('Y')) + + class TestXPUMulOp2(XPUOpTest): + def setUp(self): + self.op_type = "mul" + self.use_xpu = True + self.dtype = self.in_type + self.inputs = { + 'X': np.random.random((20, 5)).astype(self.in_type_str), + 'Y': np.random.random((5, 21)).astype(self.in_type_str) + } + self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} + + def test_check_output(self): + place = paddle.XPUPlace(0) + paddle.enable_static() + self.check_output_with_place(place, atol=0.01) + + def test_check_grad_normal(self): + place = paddle.XPUPlace(0) + paddle.enable_static() + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', max_relative_error=0.1) + + def test_check_grad_ingore_x(self): + place = paddle.XPUPlace(0) + paddle.enable_static() + self.check_grad_with_place( + place, ['Y'], + 'Out', + max_relative_error=0.1, + no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + place = paddle.XPUPlace(0) + paddle.enable_static() + self.check_grad_with_place( + place, ['X'], + 'Out', + max_relative_error=0.1, + no_grad_set=set('Y')) + + +support_types = get_xpu_op_support_types('mul') +for stype in support_types: + create_test_class(globals(), XPUTestMulOp, stype) if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py index e0d208644e79e6..20a3fc69fe8d24 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py @@ -46,8 +46,9 @@ def setUp(self): self.init_dtype() self.op_type = "rnn" self.place = paddle.XPUPlace(0) - self.sequence_length = np.ones( - (self.batch_size, ), dtype=np.int32) * self.seq_length + self.sequence_length = np.array([12, 11, 10, 9, 8], dtype=np.int32) + self.num_layers = 1 + self.is_bidirec = False self.set_attrs() self.mode = "LSTM" self.is_test = False @@ -61,6 +62,10 @@ def setUp(self): high=0.1, size=(self.seq_length, self.batch_size, self.input_size)).astype(self.dtype) + input[11][1:][:] = 0 + input[10][2:][:] = 0 + input[9][3:][:] = 0 + input[8][4:][:] = 0 rnn1 = LSTM( self.input_size, @@ -126,10 +131,10 @@ def test_check_output(self): no_check_set=['Reserve', 'DropoutState']) def init_size(self): - self.seq_length = 1 - self.batch_size = 1 - self.input_size = 5 - self.hidden_size = 16 + self.seq_length = 12 + self.batch_size = 5 + self.input_size = 3 + self.hidden_size = 2 def get_weight_names(self): weight_names = [] @@ -142,38 +147,18 @@ def get_weight_names(self): return weight_names def set_attrs(self): - self.num_layers = 1 - self.is_bidirec = False + pass class TestRNNOp1(TestRNNOp): - def init_size(self): - self.seq_length = 2 - self.batch_size = 4 - self.input_size = 10 - self.hidden_size = 32 - def set_attrs(self): - self.num_layers = 1 - self.is_bidirec = False + self.sequence_length = None class TestRNNOp2(TestRNNOp): - def init_size(self): - self.seq_length = 5 - self.batch_size = 16 - self.input_size = 30 - self.hidden_size = 64 - def set_attrs(self): self.num_layers = 1 self.is_bidirec = True class TestRNNOp3(TestRNNOp): - def init_size(self): - self.seq_length = 10 - self.batch_size = 64 - self.input_size = 50 - self.hidden_size = 64 
- def set_attrs(self): self.num_layers = 2 self.is_bidirec = False @@ -188,6 +173,17 @@ def set_attrs(self): self.num_layers = 2 self.is_bidirec = True + class TestRNNOp6(TestRNNOp): + def set_attrs(self): + self.num_layers = 2 + self.is_bidirec = True + self.sequence_length = None + + class TestRNNOp7(TestRNNOp): + def set_attrs(self): + self.num_layers = 3 + self.is_bidirec = True + support_types = get_xpu_op_support_types('rnn') for stype in support_types: diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index 2f8c23187e8d13..ffd1607fe87b4c 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -53,4 +53,7 @@ from ..fluid.framework import convert_np_dtype_to_dtype_, _varbase_creator, OpProtoHolder # noqa: F401 from ..fluid.framework import _dygraph_tracer # noqa: F401 +from ..fluid.layer_helper import LayerHelper # noqa: F401 +from ..fluid.framework import in_dygraph_mode # noqa: F401 + __all__ = [] diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py index 12a88106a44cda..74b5398230dee6 100644 --- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py +++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py @@ -17,7 +17,7 @@ from paddle.fluid.clip import ClipGradByGlobalNorm from paddle.fluid.initializer import Constant from paddle.fluid.layer_helper import LayerHelper -from paddle.optimizer import Optimizer +from paddle.fluid.optimizer import Optimizer from paddle.distributed import get_rank, get_world_size from paddle.fluid.executor import global_scope from paddle.fluid.framework import name_scope @@ -42,11 +42,7 @@ def __init__(self, assert not framework._non_static_mode( ), "DistributedFusedLamb does not support dygraph mode" super(DistributedFusedLamb, self).__init__( - learning_rate=learning_rate, - parameters=parameters, - weight_decay=None, - grad_clip=None, - name=name) + learning_rate=learning_rate, grad_clip=None, name=name) self._beta1 = beta1 self._beta2 = beta2 diff --git a/python/paddle/incubate/tensor/math.py b/python/paddle/incubate/tensor/math.py index b36aaef9acf361..07dc7c1581fc49 100644 --- a/python/paddle/incubate/tensor/math.py +++ b/python/paddle/incubate/tensor/math.py @@ -52,7 +52,7 @@ def segment_sum(data, segment_ids, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_segment_pool(data, segment_idsm, "SUM")[0] + return _C_ops.final_state_segment_pool(data, segment_ids, "SUM")[0] if _in_legacy_dygraph(): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "SUM") return out @@ -109,7 +109,7 @@ def segment_mean(data, segment_ids, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_segment_pool(data, segment_idsm, "MEAN")[0] + return _C_ops.final_state_segment_pool(data, segment_ids, "MEAN")[0] if _non_static_mode(): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MEAN") return out @@ -165,7 +165,7 @@ def segment_min(data, segment_ids, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_segment_pool(data, segment_idsm, "MIN")[0] + return _C_ops.final_state_segment_pool(data, segment_ids, "MIN")[0] if _non_static_mode(): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MIN") @@ -222,7 +222,7 @@ def segment_max(data, segment_ids, name=None): """ if in_dygraph_mode(): - out, tmp = _C_ops.final_state_segment_pool(data, segment_ids, "MAX")[0] + out, tmp = _C_ops.final_state_segment_pool(data, segment_ids, "MAX") return 
out if _non_static_mode(): diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index d145b615c3d7fd..a0efdaac8ff7c6 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -28,6 +28,7 @@ import paddle from paddle import _C_ops, in_dynamic_mode from paddle.framework import core +from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode __all__ = [] @@ -386,8 +387,10 @@ def hardswish(x, name=None): out = F.hardswish(x) # [0., 5., 0.666667] """ - if in_dynamic_mode(): + if _in_legacy_dygraph(): return _C_ops.hard_swish(x) + if in_dygraph_mode(): + return _C_ops.final_state_hard_swish(x, 6, 6, 3) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'hardswish') @@ -681,10 +684,10 @@ def maxout(x, groups, axis=1, name=None): # [0.95313174 0.6228939 0.7129065 0.7087491 ] # [0.7142536 0.88725346 0.61093384 0.38833922]]]] """ - - if in_dynamic_mode(): + if _in_legacy_dygraph(): return _C_ops.maxout(x, 'groups', groups, 'axis', axis) - + if in_dygraph_mode(): + return _C_ops.final_state_maxout(x, groups, axis) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'maxout') if axis not in [1, -1, 3]: raise ValueError( @@ -1178,8 +1181,9 @@ def swish(x, name=None): x = paddle.to_tensor(np.array([-2., 0., 1.])) out = F.swish(x) # [-0.238406, 0., 0.731059] """ - - if in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_swish(x, 1.0) + if _in_legacy_dygraph(): return _C_ops.swish(x, 'beta', 1.0) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'swish') @@ -1521,6 +1525,9 @@ def gumbel_softmax(x, temperature=1.0, hard=False, axis=-1, name=None): # [0.00000000, 0.00000000, 0.00000000, 0.00001258, 0.99998736, 0.00000000]] """ + if in_dygraph_mode(): + return _C_ops.final_state_gumbel_softmax(x, temperature, hard, axis) + if in_dynamic_mode(): return _C_ops.gumbel_softmax(x, 'temperature', temperature, 'hard', hard, 'axis', axis) diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 5cbd66b7832d88..287dc7d67def88 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -1356,29 +1356,31 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): unsqueezed_dim = [1] x = unsqueeze(x, axis=unsqueezed_dim) - if in_dynamic_mode(): + if in_dygraph_mode(): if isinstance(pad, Variable): - pad = pad.numpy() + pad = pad.numpy().tolist() + out = _C_ops.final_state_pad3d(x, pad, mode, value, data_format) + else: if _in_legacy_dygraph(): + if isinstance(pad, Variable): + pad = pad.numpy().tolist() out = _C_ops.pad3d(x, "paddings", pad, "mode", mode, "value", value, "data_format", data_format, "name", name) else: - out = _C_ops.final_state_pad3d(x, pad, mode, value, data_format) - else: - attrs = {'mode': mode, 'value': value, 'data_format': data_format} - inputs = {'X': [x]} - if isinstance(pad, Variable): - inputs['Paddings'] = [pad] - attrs['paddings'] = [] - else: - attrs['paddings'] = pad + attrs = {'mode': mode, 'value': value, 'data_format': data_format} + inputs = {'X': [x]} + if isinstance(pad, Variable): + inputs['Paddings'] = [pad] + attrs['paddings'] = [] + else: + attrs['paddings'] = pad - helper = LayerHelper('pad3d', **locals()) + helper = LayerHelper('pad3d', **locals()) - dtype = helper.input_dtype(input_param_name='input') - out = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type='pad3d', inputs=inputs, 
outputs={"Out": out}, attrs=attrs) + dtype = helper.input_dtype(input_param_name='input') + out = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='pad3d', inputs=inputs, outputs={"Out": out}, attrs=attrs) if len(unsqueezed_dim) != 0: out = squeeze(out, axis=unsqueezed_dim) @@ -1531,38 +1533,50 @@ def linear(x, weight, bias=None, name=None): # [0.9440598 0.9440598 0.9440598 0.9440598 ] # [2.1077576 2.1077576 2.1077576 2.1077576 ]] """ - if in_dynamic_mode(): - pre_bias = _C_ops.matmul_v2(x, weight, 'trans_x', False, 'trans_y', - False) + if in_dygraph_mode(): + pre_bias = _C_ops.final_state_matmul(x, weight, False, False) if bias is None: return pre_bias - return _C_ops.elementwise_add(pre_bias, bias) + return _C_ops.final_state_add(pre_bias, bias) else: - helper = LayerHelper('linear', **locals()) - dtype = x.dtype + if _in_legacy_dygraph(): + pre_bias = _C_ops.matmul_v2(x, weight, 'trans_x', False, 'trans_y', + False) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'linear') - check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], 'linear') + if bias is None: + return pre_bias - inputs = {'X': [x], 'Y': [weight]} - attrs = {'trans_x': False, 'trans_y': False} - tmp = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type='matmul_v2', inputs=inputs, outputs={'Out': tmp}, attrs=attrs) - if bias is not None: - res = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type='elementwise_add', - inputs={'X': [tmp], - 'Y': [bias]}, - outputs={'Out': [res]}, - attrs={'axis': len(x.shape) - 1}) + return _C_ops.elementwise_add(pre_bias, bias) else: - res = tmp - return res + helper = LayerHelper('linear', **locals()) + dtype = x.dtype + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'linear') + check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], + 'linear') + + inputs = {'X': [x], 'Y': [weight]} + attrs = {'trans_x': False, 'trans_y': False} + tmp = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='matmul_v2', + inputs=inputs, + outputs={'Out': tmp}, + attrs=attrs) + if bias is not None: + res = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='elementwise_add', + inputs={'X': [tmp], + 'Y': [bias]}, + outputs={'Out': [res]}, + attrs={'axis': len(x.shape) - 1}) + else: + res = tmp + return res def label_smooth(label, prior_dist=None, epsilon=0.1, name=None): diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 086ae78919454e..84aadbbac649b0 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -127,8 +127,12 @@ def _conv_nd(x, x, weight, stride, padding, padding_algorithm, groups, dilation, data_format, False, -1, False) if bias is not None: - out = nn.elementwise_add(pre_bias, bias, axis=channel_dim) - return out + channel_dim = channel_dim + len( + x.shape) if channel_dim < 0 else channel_dim + tmp_bias = _C_ops.final_state_reshape( + bias, bias.shape + + [1 for i in range(len(x.shape) - channel_dim - 1)]) + return _C_ops.final_state_add(pre_bias, tmp_bias) else: return pre_bias if in_dynamic_mode(): diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py index 6a8686b612e7f3..2483eab6c053ac 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -20,7 +20,7 @@ from ...static import Variable from ...tensor.creation import assign from ...fluid 
import dygraph_utils -from ...fluid.layers.layer_function_generator import templatedoc +from ...tensor.layer_function_generator import templatedoc from ...fluid.layers.sequence_lod import sequence_mask #noqa: F401 from paddle import in_dynamic_mode diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 593cea2d2cf643..62f034c7b41498 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1795,7 +1795,7 @@ def cross_entropy(input, # 2. else # numerator: loss's weighted sum # denominator: cal the sum of weight where the sample's class_index!=ignore_index - if ignore_index != -100: + if ignore_index >= 0: out_sum = _C_ops.reduce_sum(out, 'reduce_all', True) # for each label[i],set 1 or 0, according to ignore_index # mask[i]=0, if label[i]==ignore_index @@ -1905,7 +1905,7 @@ def cross_entropy(input, if reduction == "sum": return paddle.sum(out, name=name) elif reduction == "mean": - if ignore_index != -100: + if ignore_index >= 0: out_sum = paddle.sum(out, name=name) # for each label[i],set 1 or 0, according to ignore_index # mask[i]=0, if label[i]==ignore_index diff --git a/python/paddle/nn/utils/__init__.py b/python/paddle/nn/utils/__init__.py index 8f9b55d15cad0b..8ec4e8cfd60b5a 100644 --- a/python/paddle/nn/utils/__init__.py +++ b/python/paddle/nn/utils/__init__.py @@ -14,7 +14,7 @@ from .spectral_norm_hook import spectral_norm from .weight_norm_hook import weight_norm, remove_weight_norm # noqa: F401 -from .transform_parameters import parameters_to_vector, vector_to_parameters # noqa: F401 +from .transform_parameters import parameters_to_vector, vector_to_parameters, _stride_column # noqa: F401 __all__ = [ #noqa 'weight_norm', 'remove_weight_norm', 'spectral_norm', 'parameters_to_vector', 'vector_to_parameters' ] diff --git a/python/paddle/nn/utils/transform_parameters.py b/python/paddle/nn/utils/transform_parameters.py index 99870ce29a138d..feb70e02d59881 100644 --- a/python/paddle/nn/utils/transform_parameters.py +++ b/python/paddle/nn/utils/transform_parameters.py @@ -36,6 +36,39 @@ def _inplace_reshape_dygraph(x, shape): stop_gradient=True) +@dygraph_only +def _stride_column(param): + """ + A tool function. Permutes the data of a parameter as a 'columns' stride. Currently, it only supports 2-D parameters. + + Args: + param(Tensor): The param that will be strided according to 'columns'. + + Examples: + ..
code-block:: python + + import paddle + paddle.seed(100) + + linear = paddle.nn.Linear(2, 3) + print(linear.weight) + # [[-0.31485492, -1.02896988, 0.45741916], + # [-0.65525872, -1.04643178, 1.07262802]] + + paddle.nn.utils._stride_column(linear.weight) + print(linear.weight) + # [[-0.31485492, 0.45741916, -1.04643178], + # [-1.02896988, -0.65525872, 1.07262802]] + + """ + assert len(param.shape) == 2 + shape = [param.shape[1], param.shape[0]] + with paddle.fluid.dygraph.no_grad(): + reshape_var = paddle.reshape(param, shape) + transpose_var = paddle.transpose(reshape_var, [1, 0]) + transpose_var._share_underline_tensor_to(param) + + @dygraph_only def parameters_to_vector(parameters, name=None): """ diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index c1c4f4ff8c13c9..2fae583397a8e9 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -18,6 +18,8 @@ from enum import Enum from typing import Any, Callable, Iterable, Optional, Union from warnings import warn +import importlib +import json import paddle from paddle.fluid.core import (_Profiler, _ProfilerResult, ProfilerOptions, @@ -741,3 +743,73 @@ def summary(self, op_detail=op_detail, thread_sep=thread_sep, time_unit=time_unit)) + + +def get_profiler(config_path): + try: + with open(config_path, 'r') as filehandle: + config_dict = json.load(filehandle) + except Exception as e: + print('Load config file for profiler error: {}'.format(e)) + print('Use default parameters instead.') + return Profiler() + translated_config_dict = {} + if "targets" in config_dict: + try: + translated_config_dict['targets'] = [] + for target in config_dict['targets']: + if target.lower() == "cpu": + translated_config_dict['targets'].append(ProfilerTarget.CPU) + elif target.lower() == 'gpu': + translated_config_dict['targets'].append(ProfilerTarget.GPU) + except: + print('Set targets parameter error, use default parameter instead.') + translated_config_dict['targets'] = None + if "scheduler" in config_dict: + try: + if isinstance(config_dict['scheduler'], dict): + for key, value in config_dict['scheduler'].items(): + module_path = value['module'] + use_direct = value['use_direct'] + module = importlib.import_module(module_path) + method = getattr(module, key) + if not use_direct: + translated_config_dict['scheduler'] = method( + *value['args'], **value['kwargs']) + else: + translated_config_dict['scheduler'] = method + else: + translated_config_dict['scheduler'] = [ + config_dict['scheduler'][0], config_dict['scheduler'][1] + ] + + except: + print( + 'Set scheduler parameter error, use default parameter instead.') + translated_config_dict['scheduler'] = None + if "on_trace_ready" in config_dict: + try: + if isinstance(config_dict['on_trace_ready'], dict): + for key, value in config_dict['on_trace_ready'].items(): + module_path = value['module'] + use_direct = value['use_direct'] + module = importlib.import_module(module_path) + method = getattr(module, key) + if not use_direct: + translated_config_dict['on_trace_ready'] = method( + *value['args'], **value['kwargs']) + else: + translated_config_dict['on_trace_ready'] = method + except: + print( + 'Set on_trace_ready parameter error, use default parameter instead.'
+ ) + translated_config_dict['on_trace_ready'] = None + if "timer_only" in config_dict: + if isinstance(config_dict['timer_only'], bool): + translated_config_dict['timer_only'] = config_dict['timer_only'] + else: + print( + 'Set timer_only parameter error, use default parameter instead.') + + return Profiler(**translated_config_dict) diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index 3be6088a484b81..e4d4ff8c183bca 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -28,7 +28,7 @@ TracerEventType.PythonOp, TracerEventType.PythonUserDefined ] -_CommunicationOpName = ['reduce', 'broadcast', 'rpc'] +_CommunicationOpName = ['allreduce', 'broadcast', 'rpc'] class SortedKeys(Enum): @@ -74,8 +74,10 @@ def __init__(self, hostnode): self.runtime_node = [] self.cpu_time = 0 self.self_cpu_time = 0 - self.gpu_time = 0 + self.gpu_time = 0 # kernel time self.self_gpu_time = 0 + self.general_gpu_time = 0 # besides kernel, include time of gpu events like memcpy and memset + self.self_general_gpu_time = 0 def cal_statistic(self): for child in self.children_node: @@ -86,14 +88,20 @@ def cal_statistic(self): self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns for child in self.children_node: self.gpu_time += child.gpu_time + self.general_gpu_time += child.general_gpu_time self.self_cpu_time -= (child.end_ns - child.start_ns) for rt in self.runtime_node: self.self_cpu_time -= (rt.end_ns - rt.start_ns) self.gpu_time += rt.gpu_time self.self_gpu_time += rt.gpu_time + self.general_gpu_time += rt.general_gpu_time + self.self_general_gpu_time += rt.general_gpu_time for device in self.hostnode.device_node: - self.gpu_time += (device.end_ns - device.start_ns) - self.self_gpu_time += (device.end_ns - device.start_ns) + if device.type == TracerEventType.Kernel: + self.gpu_time += (device.end_ns - device.start_ns) + self.self_gpu_time += (device.end_ns - device.start_ns) + self.general_gpu_time += (device.end_ns - device.start_ns) + self.self_general_gpu_time += (device.end_ns - device.start_ns) @property def end_ns(self): @@ -258,6 +266,8 @@ def __init__(self): self.communication_range = [] self.computation_range = [] self.overlap_range = [] + self.cpu_calls = 0 + self.gpu_calls = 0 def parse(self, nodetrees): ''' @@ -300,6 +310,8 @@ def parse(self, nodetrees): else: self.computation_range.append(( devicenode.start_ns, devicenode.end_ns)) + self.cpu_calls = len(set(self.cpu_communication_range)) + self.gpu_calls = len(set(self.gpu_communication_range)) self.cpu_communication_range = merge_self_ranges( self.cpu_communication_range, is_sorted=False) self.gpu_communication_range = merge_self_ranges( @@ -354,6 +366,9 @@ def __init__(self, name): self.min_gpu_time = float('inf') self.devices = {} self.operator_inners = {} + self.general_gpu_time = 0 + self.min_general_gpu_time = float('inf') + self.max_general_gpu_time = 0 @property def avg_cpu_time(self): @@ -363,6 +378,10 @@ def avg_cpu_time(self): def avg_gpu_time(self): return self.gpu_time / self.call + @property + def avg_general_gpu_time(self): + return self.general_gpu_time / self.call + def add_cpu_time(self, time): if time > self.max_cpu_time: self.max_cpu_time = time @@ -377,6 +396,13 @@ def add_gpu_time(self, time): self.min_gpu_time = time self.gpu_time += time + def add_general_gpu_time(self, time): + if time > self.max_general_gpu_time: + self.max_general_gpu_time = time + if time < self.min_general_gpu_time: + 
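            # note: "general" GPU time also counts non-kernel device activity
            # (memcpy/memset), unlike gpu_time, which tracks kernel time only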
self.min_general_gpu_time = time + self.general_gpu_time += time + def add_call(self): self.call += 1 @@ -384,6 +410,7 @@ def add_item(self, node): self.add_call() self.add_cpu_time(node.cpu_time) self.add_gpu_time(node.gpu_time) + self.add_general_gpu_time(node.general_gpu_time) for child in node.children_node: if child.name not in self.operator_inners: self.operator_inners[ @@ -407,6 +434,9 @@ def __init__(self, name): self.gpu_time = 0 self.max_gpu_time = 0 self.min_gpu_time = float('inf') + self.general_gpu_time = 0 + self.min_general_gpu_time = float('inf') + self.max_general_gpu_time = 0 @property def avg_cpu_time(self): @@ -416,6 +446,10 @@ def avg_cpu_time(self): def avg_gpu_time(self): return self.gpu_time / self.call + @property + def avg_general_gpu_time(self): + return self.general_gpu_time / self.call + def add_cpu_time(self, time): if time > self.max_cpu_time: self.max_cpu_time = time @@ -430,6 +464,13 @@ def add_gpu_time(self, time): self.min_gpu_time = time self.gpu_time += time + def add_general_gpu_time(self, time): + if time > self.max_general_gpu_time: + self.max_general_gpu_time = time + if time < self.min_general_gpu_time: + self.min_general_gpu_time = time + self.general_gpu_time += time + def add_call(self): self.call += 1 @@ -437,6 +478,7 @@ def add_item(self, node): self.add_call() self.add_cpu_time(node.cpu_time) self.add_gpu_time(node.gpu_time) + self.add_general_gpu_time(node.general_gpu_time) def __init__(self): self.items = {} # for operator summary @@ -478,6 +520,8 @@ def parse(self, nodetrees): self.add_model_perspective_item( child) #find first model perspective node else: + if child.type == TracerEventType.ProfileStep: + self.add_model_perspective_item(child) deque.append(child) def add_operator_item(self, operator_node): @@ -533,6 +577,8 @@ def add_model_perspective_item(self, model_perspective_node): name = 'Optimization' elif model_perspective_node.type == TracerEventType.Dataloader: name = 'Dataloader' + elif model_perspective_node.type == TracerEventType.ProfileStep: + name = 'ProfileStep' else: return if name not in self.model_perspective_items: @@ -626,7 +672,6 @@ def format_ratio(ratio, indent=0): # construct table string append(add_title(line_length, "Device Summary")) - append('Time unit: {}'.format(time_unit)) append(header_sep) append(row_format.format(*headers)) append(header_sep) @@ -661,7 +706,7 @@ def format_ratio(ratio, indent=0): return ''.join(result) ###### Print Overview Summary ###### - headers = ['Event Type', 'CPU Time', 'Ratio (%)'] + headers = ['Event Type', 'Calls', 'CPU Time', 'Ratio (%)'] row_format_list = [""] header_sep_list = [""] line_length_list = [-SPACING_SIZE] @@ -680,13 +725,13 @@ def format_ratio(ratio, indent=0): append(header_sep) append(row_format.format(*headers)) append(header_sep) - row_values = [ - 'Total Time', format_time( - total_time, unit=time_unit), format_ratio(1) - ] - append(row_format.format(*row_values)) cpu_type_time = collections.defaultdict(int) gpu_type_time = collections.defaultdict(int) + cpu_call_times = collections.defaultdict(int) + gpu_call_times = collections.defaultdict(int) + cpu_call_times.update(statistic_data.time_range_summary.call_times) + gpu_call_times.update(statistic_data.time_range_summary.call_times) + for event_type, value in statistic_data.time_range_summary.CPUTimeRangeSum.items( ): if event_type != TracerEventType.Communication: @@ -694,6 +739,19 @@ def format_ratio(ratio, indent=0): if statistic_data.distributed_summary.cpu_communication_range: 
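            # The communication ranges were already merged in DistributedSummary.parse(),
            # so sum_ranges() counts each overlapping interval only once; e.g. the
            # ranges (0, 10) and (5, 20) merge to (0, 20) and contribute 20, not 25.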
cpu_type_time[TracerEventType.Communication] = sum_ranges( statistic_data.distributed_summary.cpu_communication_range) + cpu_call_times[ + TracerEventType. + Communication] = statistic_data.distributed_summary.cpu_calls + + for event_type in [ + TracerEventType.Dataloader, TracerEventType.Forward, + TracerEventType.Backward, TracerEventType.Optimization + ]: + event_type_name = str(event_type).split('.')[1] + if event_type in cpu_call_times and event_type_name in statistic_data.event_summary.model_perspective_items: + cpu_call_times[ + event_type] = statistic_data.event_summary.model_perspective_items[ + event_type_name].call gpu_time_range = collections.defaultdict(list) for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items( @@ -706,22 +764,34 @@ def format_ratio(ratio, indent=0): if statistic_data.distributed_summary.gpu_communication_range: gpu_type_time[TracerEventType.Communication] = sum_ranges( statistic_data.distributed_summary.gpu_communication_range) + gpu_call_times[ + TracerEventType. + Communication] = statistic_data.distributed_summary.gpu_calls sorted_items = sorted( cpu_type_time.items(), key=lambda x: x[1], reverse=True) - for event_type, time in sorted_items: + event_type, time = sorted_items[0] + row_values = [ + '{}'.format(str(event_type).split('.')[1]), cpu_call_times[event_type], + format_time( + time, unit=time_unit), format_ratio(float(time) / total_time) + ] + append(row_format.format(*row_values)) + for event_type, time in sorted_items[1:]: row_values = [ - ' {}'.format(str(event_type).split('.')[1]), format_time( + ' {}'.format(str(event_type).split('.')[1]), + cpu_call_times[event_type], format_time( time, unit=time_unit), format_ratio(float(time) / total_time) ] append(row_format.format(*row_values)) append(header_sep) - headers = ['', 'GPU Time', 'Ratio (%)'] + headers = ['', 'Calls', 'GPU Time', 'Ratio (%)'] append(row_format.format(*headers)) append(header_sep) for event_type, time in gpu_type_time.items(): row_values = [ - ' {}'.format(str(event_type).split('.')[1]), format_time( + ' {}'.format(str(event_type).split('.')[1]), + gpu_call_times[event_type], format_time( time, unit=time_unit), format_ratio(float(time) / total_time) ] append(row_format.format(*row_values)) @@ -730,7 +800,7 @@ def format_ratio(ratio, indent=0): append( "Note:\nIn this table, We sum up all collected events in terms of event type.\n" "The time of events collected on host are presented as CPU Time, and as GPU Time if on device.\n" - "Ratio = CPU(GPU) Time / Total Time.\n" + "The time with ratio 100% is the base time for calculating ratio. \n" "Events with different types may overlap or inclusion, e.g. 
Operator includes OperatorInner, so the sum of ratios is not 100%.\n" "The time of events in the same type with overlap will not calculate twice, and all time is summed after merged.\n" "Example:\n" @@ -746,21 +816,21 @@ def format_ratio(ratio, indent=0): ###### Print Model Summary Report ###### model_perspective_items = statistic_data.event_summary.model_perspective_items - if model_perspective_items: + if len(model_perspective_items) > 1: all_row_values = [] - row_values = [ - 'Total Time', '-', '{} / - / - / - / {}'.format( - format_time( - total_time, unit=time_unit), format_ratio(1)), - '- / - / - / -/ -' - ] - all_row_values.append(row_values) accmulation_time = 0 - for name in ['Dataloader', 'Forward', 'Backward', 'Optimization']: + gpu_accmulation_time = 0 + gpu_total_time = 0 + for name in [ + 'ProfileStep', 'Dataloader', 'Forward', 'Backward', + 'Optimization' + ]: if name in model_perspective_items: item = model_perspective_items[name] + name = '{}'.format( + name) if 'ProfileStep' in name else ' {}'.format(name) row_values = [ - ' {}'.format(name), item.call, + '{}'.format(name), item.call, '{} / {} / {} / {} / {}'.format( format_time( item.cpu_time, unit=time_unit), @@ -783,15 +853,23 @@ def format_ratio(ratio, indent=0): format_ratio(float(item.gpu_time) / total_time)) ] all_row_values.append(row_values) - accmulation_time += item.cpu_time + if 'ProfileStep' not in name: + accmulation_time += item.cpu_time + gpu_accmulation_time += item.gpu_time + else: + gpu_total_time = item.gpu_time other_time = total_time - accmulation_time + other_gpu_time = gpu_total_time - gpu_accmulation_time row_values = [ ' Others', '-', '{} / - / - / - / {}'.format( format_time( other_time, unit=time_unit), format_ratio(float(other_time) / total_time)), - '- / - / - / - / -' + '{} / - / - / - / {}'.format( + format_time( + other_gpu_time, unit=time_unit), + format_ratio(float(other_gpu_time) / gpu_total_time)) ] all_row_values.append(row_values) # Calculate the column width @@ -835,6 +913,7 @@ def format_ratio(ratio, indent=0): append( "Note:\nIn this table, GPU time is the sum of all device(GPU) events called in the phase.\n" "Unlike overview summary, if two device(GPU) events execute on different streams with overlap time, we sum them directly here.\n" + "The time with ratio 100% is the base time for calculating ratio. 
\n" ) append('-' * line_length) append('') @@ -872,21 +951,27 @@ def format_ratio(ratio, indent=0): overlap_time = sum_ranges( statistic_data.distributed_summary.overlap_range) row_values = [ - 'Communication', format_time( + 'ProfileStep', format_time( + total_time, unit=time_unit), + format_ratio(float(total_time) / total_time) + ] + append(row_format.format(*row_values)) + row_values = [ + ' Communication', format_time( communication_time, unit=time_unit), format_ratio(float(communication_time) / total_time) ] append(row_format.format(*row_values)) row_values = [ - 'Computation', format_time( + ' Computation', format_time( computation_time, unit=time_unit), format_ratio(float(computation_time) / total_time) ] append(row_format.format(*row_values)) row_values = [ - 'Overlap', format_time( + ' Overlap', format_time( overlap_time, unit=time_unit), format_ratio(float(overlap_time) / total_time) ] @@ -896,6 +981,7 @@ def format_ratio(ratio, indent=0): "Note:\nCommunication time: Communication Event time, Communication Op time and its kernel time on gpu.\n" "Computation time: Kernel time, except kernels belong to communication(nccl kernels).\n" "Overlap time: Communication time intersects with computation time.\n" + "The time with ratio 100% is the base time for calculating ratio. \n" "Example:\n" "Communication:\n" " CPU: |_________________|\n" @@ -938,20 +1024,22 @@ def format_ratio(ratio, indent=0): items.items(), key=lambda x: x[1].min_cpu_time) elif sorted_by == SortedKeys.GPUTotal: sorted_items = sorted( - items.items(), key=lambda x: x[1].gpu_time, reverse=True) + items.items(), + key=lambda x: x[1].general_gpu_time, + reverse=True) elif sorted_by == SortedKeys.GPUAvg: sorted_items = sorted( items.items(), - key=lambda x: x[1].avg_gpu_time, + key=lambda x: x[1].avg_general_gpu_time, reverse=True) elif sorted_by == SortedKeys.GPUMax: sorted_items = sorted( items.items(), - key=lambda x: x[1].max_gpu_time, + key=lambda x: x[1].max_general_gpu_time, reverse=True) elif sorted_by == SortedKeys.GPUMin: sorted_items = sorted( - items.items(), key=lambda x: x[1].min_gpu_time) + items.items(), key=lambda x: x[1].min_general_gpu_time) for name, item in sorted_items: row_values = [ @@ -967,14 +1055,15 @@ def format_ratio(ratio, indent=0): format_ratio(float(item.cpu_time) / total_time)), '{} / {} / {} / {} / {}'.format( format_time( - item.gpu_time, unit=time_unit), + item.general_gpu_time, unit=time_unit), format_time( - item.avg_gpu_time, unit=time_unit), + item.avg_general_gpu_time, unit=time_unit), format_time( - item.max_gpu_time, unit=time_unit), + item.max_general_gpu_time, unit=time_unit), format_time( - item.min_gpu_time, unit=time_unit), - format_ratio(float(item.gpu_time) / total_time)) + item.min_general_gpu_time, unit=time_unit), + format_ratio( + float(item.general_gpu_time) / total_time)) ] all_row_values.append(row_values) if op_detail: @@ -998,18 +1087,23 @@ def format_ratio(ratio, indent=0): float(innerop_node.cpu_time) / total_time)), '{} / {} / {} / {} / {}'.format( format_time( - innerop_node.gpu_time, unit=time_unit), + innerop_node.general_gpu_time, + unit=time_unit), format_time( - innerop_node.avg_gpu_time, unit=time_unit), + innerop_node.avg_general_gpu_time, + unit=time_unit), format_time( - innerop_node.max_gpu_time, unit=time_unit), + innerop_node.max_general_gpu_time, + unit=time_unit), format_time( - innerop_node.min_gpu_time, unit=time_unit), + innerop_node.min_general_gpu_time, + unit=time_unit), format_ratio( - float(innerop_node.gpu_time) / total_time)) + 
float(innerop_node.general_gpu_time) / + total_time)) ] all_row_values.append(row_values) - for device_node_name, devicenode in innerop_node.devices.items( + for device_node_name, device_node in innerop_node.devices.items( ): if len(device_node_name) + 4 > name_column_width: device_node_name = device_node_name[: @@ -1018,21 +1112,21 @@ def format_ratio(ratio, indent=0): device_node_name += "..." row_values = [ ' {}'.format(device_node_name), - devicenode.call, '- / - / - / - / -', + device_node.call, '- / - / - / - / -', '{} / {} / {} / {} / {}'.format( format_time( - devicenode.gpu_time, unit=time_unit), + device_node.gpu_time, unit=time_unit), format_time( - devicenode.avg_gpu_time, + device_node.avg_gpu_time, unit=time_unit), format_time( - devicenode.max_gpu_time, + device_node.max_gpu_time, unit=time_unit), format_time( - devicenode.min_gpu_time, + device_node.min_gpu_time, unit=time_unit), format_ratio( - float(devicenode.gpu_time) / + float(device_node.gpu_time) / total_time)) ] all_row_values.append(row_values) @@ -1043,19 +1137,19 @@ def format_ratio(ratio, indent=0): - 5] device_node_name += "..." row_values = [ - ' {}'.format(device_node_name), devicenode.call, + ' {}'.format(device_node_name), device_node.call, '- / - / - / - / -', '{} / {} / {} / {} / {}'.format( format_time( - devicenode.gpu_time, unit=time_unit), + device_node.gpu_time, unit=time_unit), format_time( - devicenode.avg_gpu_time, unit=time_unit), + device_node.avg_gpu_time, unit=time_unit), format_time( - devicenode.max_gpu_time, unit=time_unit), + device_node.max_gpu_time, unit=time_unit), format_time( - devicenode.min_gpu_time, unit=time_unit), + device_node.min_gpu_time, unit=time_unit), format_ratio( - float(devicenode.gpu_time) / total_time)) + float(device_node.gpu_time) / total_time)) ] all_row_values.append(row_values) # Calculate the column width @@ -1123,14 +1217,14 @@ def format_ratio(ratio, indent=0): format_ratio(float(item.cpu_time) / total_time)), '{} / {} / {} / {} / {}'.format( format_time( - item.gpu_time, unit=time_unit), + item.general_gpu_time, unit=time_unit), format_time( - item.avg_gpu_time, unit=time_unit), + item.avg_general_gpu_time, unit=time_unit), format_time( - item.max_gpu_time, unit=time_unit), + item.max_general_gpu_time, unit=time_unit), format_time( - item.min_gpu_time, unit=time_unit), - format_ratio(float(item.gpu_time) / total_time)), + item.min_general_gpu_time, unit=time_unit), + format_ratio(float(item.general_gpu_time) / total_time)), ] all_row_values.append(row_values) @@ -1207,20 +1301,22 @@ def format_ratio(ratio, indent=0): items.items(), key=lambda x: x[1].min_cpu_time) elif sorted_by == SortedKeys.GPUTotal: sorted_items = sorted( - items.items(), key=lambda x: x[1].gpu_time, reverse=True) + items.items(), + key=lambda x: x[1].general_gpu_time, + reverse=True) elif sorted_by == SortedKeys.GPUAvg: sorted_items = sorted( items.items(), - key=lambda x: x[1].avg_gpu_time, + key=lambda x: x[1].avg_general_gpu_time, reverse=True) elif sorted_by == SortedKeys.GPUMax: sorted_items = sorted( items.items(), - key=lambda x: x[1].max_gpu_time, + key=lambda x: x[1].max_general_gpu_time, reverse=True) elif sorted_by == SortedKeys.GPUMin: sorted_items = sorted( - items.items(), key=lambda x: x[1].min_gpu_time) + items.items(), key=lambda x: x[1].min_general_gpu_time) for name, item in sorted_items: row_values = [ @@ -1238,14 +1334,15 @@ def format_ratio(ratio, indent=0): format_ratio(float(item.cpu_time) / total_time)), '{} / {} / {} / {} / {}'.format( format_time( - 
item.gpu_time, unit=time_unit), + item.general_gpu_time, unit=time_unit), format_time( - item.avg_gpu_time, unit=time_unit), + item.avg_general_gpu_time, unit=time_unit), format_time( - item.max_gpu_time, unit=time_unit), + item.max_general_gpu_time, unit=time_unit), format_time( - item.min_gpu_time, unit=time_unit), - format_ratio(float(item.gpu_time) / total_time)), + item.min_general_gpu_time, unit=time_unit), + format_ratio( + float(item.general_gpu_time) / total_time)), ] all_row_values.append(row_values) diff --git a/python/paddle/tensor/layer_function_generator.py b/python/paddle/tensor/layer_function_generator.py new file mode 100644 index 00000000000000..ecb13613a125e5 --- /dev/null +++ b/python/paddle/tensor/layer_function_generator.py @@ -0,0 +1,382 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import re +import functools +import warnings +import string + +from six.moves import cStringIO +from ..static import Variable +from ..fluid.proto import framework_pb2 +from ..framework import OpProtoHolder, core, convert_np_dtype_to_dtype_ +from ..framework import LayerHelper +from ..fluid.data_feeder import check_variable_and_dtype +import paddle +from paddle import _C_ops + +__all__ = [] + + +def _convert_(name): + """ + Formatting. + + Args: + name: The name/alias + + This function takes in a name and converts it to a standard format of + group1_group2. Where as per the regular expression, group1 can have + alphabets and numbers and group2 has capital alphabets. 
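    For example, ``_convert_('ReduceSum')`` returns ``reduce_sum``.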
+ + """ + s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) + return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() + + +def _type_to_str_(tp): + return framework_pb2.AttrType.Name(tp) + + +_two_dollar_pattern_ = re.compile(r"\$\$([^\$]+)\$\$") +_single_dollar_pattern_ = re.compile(r"\$([^\$]+)\$") +_two_bang_pattern_ = re.compile(r"!!([^!]+)!!") + + +def escape_math(text): + #return _two_bang_pattern_.sub( + # r'$$\1$$', + # _single_dollar_pattern_.sub(r':math:\n`\1`', + # _two_dollar_pattern_.sub(r"!!\1!!", text))) + return _two_dollar_pattern_.sub(r':math:`\1`', text) + + +def _generate_doc_string_(op_proto, + additional_args_lines=None, + skip_attrs_set=None): + """ + Generate docstring by OpProto + + Args: + op_proto (framework_pb2.OpProto): a protobuf message typed OpProto + + Returns: + str: the document string + """ + + if not isinstance(op_proto, framework_pb2.OpProto): + raise TypeError("OpProto should be `framework_pb2.OpProto`") + + buf = cStringIO() + buf.write(escape_math(op_proto.comment)) + buf.write('\nArgs:\n') + for each_input in op_proto.inputs: + line_begin = ' {0}'.format(_convert_(each_input.name)) + buf.write(line_begin) + buf.write(" (Tensor): ") + buf.write(escape_math(each_input.comment)) + if each_input.duplicable: + buf.write(" Duplicatable.") + if each_input.dispensable: + buf.write(" Optional.") + buf.write('\n') + + skip_attrs = OpProtoHolder.generated_op_attr_names() + # attr use_mkldnn and is_test also should not be visible to users. + skip_attrs.add("use_mkldnn") + skip_attrs.add("is_test") + skip_attrs.add("use_cudnn") + + if skip_attrs_set: + for t in skip_attrs_set: + skip_attrs.add(t) + + for each_attr in op_proto.attrs: + if each_attr.name in skip_attrs: + continue + buf.write(' ') + buf.write(each_attr.name) + buf.write(' (') + buf.write(_type_to_str_(each_attr.type)) + buf.write('): ') + buf.write(escape_math(each_attr.comment)) + buf.write('\n') + + if additional_args_lines is not None: + for line in additional_args_lines: + line = line.strip() + buf.write(' ') + buf.write(line) + buf.write('\n') + + if len(op_proto.outputs) != 0: + buf.write('\nReturns:\n') + buf.write(' ') + for each_opt in op_proto.outputs: + if not each_opt.intermediate: + break + buf.write(_convert_(each_opt.name)) + buf.write(' (Tensor): ') + buf.write(escape_math(each_opt.comment)) + + return buf.getvalue() + + +def generate_layer_fn(op_type): + """Register the Python layer for an Operator. + + Args: + op_type: The name of the operator to be created. + + This function takes in the operator type (sigmoid, mean , average etc) and + creates the operator functionality. + + """ + op_proto = OpProtoHolder.instance().get_op_proto(op_type) + not_intermediate_outputs = \ + [output for output in op_proto.outputs if not output.intermediate] + intermediate_outputs = \ + [output for output in op_proto.outputs if output.intermediate] + + if len(not_intermediate_outputs) != 1: + raise ValueError("Only one non intermediate output operator can be", + "automatically generated. 
{0}".format(op_type)) + + if not_intermediate_outputs[0].duplicable: + raise ValueError( + "Only non duplicable op can be automatically generated.") + + for output in intermediate_outputs: + if output.duplicable: + raise ValueError("The op can be automatically generated only when ", + "all intermediate ops are not duplicable.") + + o_name = not_intermediate_outputs[0].name + intermediate_output_names = [output.name for output in intermediate_outputs] + + def infer_and_check_dtype(op_proto, *args, **kwargs): + """ + This function performs the sanity check for dtype and + instance type. + """ + dtype = None + for ipt in op_proto.inputs: + name = _convert_(ipt.name) + val = kwargs.pop(name, []) + if not isinstance(val, list) and not isinstance(val, tuple): + val = [val] + if len(val) == 0: + if len(args) == 0: + continue + val = [args[0]] + args = args[1:] + + for each in val: + if not isinstance(each, Variable): + raise ValueError("input of {0} must be variable".format( + op_type)) + + if dtype is None: + dtype = each.dtype + elif dtype != each.dtype: + raise ValueError( + "operator {0} must input same dtype. {1} vs {2}".format( + op_type, dtype, each.dtype)) + + if dtype is None: + arg_dtype = kwargs.get("dtype") + if arg_dtype: + if not isinstance(arg_dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(arg_dtype) + else: + dtype = arg_dtype + else: + dtype = core.VarDesc.VarType.FP32 + return dtype + + def func(*args, **kwargs): + helper = LayerHelper(op_type, **kwargs) + + dtype = infer_and_check_dtype(op_proto, *args, **kwargs) + + inputs = dict() + for ipt in op_proto.inputs: + name = _convert_(ipt.name) + val = kwargs.pop(name, []) + if not isinstance(val, list) and not isinstance(val, tuple): + val = [val] + if len(val) == 0 and len(args) != 0: + val = args[0] + args = args[1:] + inputs[ipt.name] = val + + outputs = dict() + out = kwargs.pop(_convert_(o_name), []) + if out: + out_var = out[0] if (isinstance(out, list) or + isinstance(out, tuple)) else out + else: + out_var = helper.create_variable_for_type_inference(dtype=dtype) + outputs[o_name] = [out_var] + for name in intermediate_output_names: + outputs[name] = [ + helper.create_variable_for_type_inference(dtype=dtype) + ] + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs) + return helper.append_activation(out_var) + + func.__name__ = op_type + func.__doc__ = _generate_doc_string_(op_proto) + return func + + +def generate_activation_fn(op_type): + """Register the Python layer for an Operator without Attribute. + + Args: + op_type: The name of the operator to be created. + + This function takes in the operator type (sigmoid, exp , tanh etc) and + creates the operator functionality. 
+ + """ + op_proto = OpProtoHolder.instance().get_op_proto(op_type) + + def func(x, name=None): + if paddle.in_dynamic_mode(): + op = getattr(_C_ops, op_type) + return op(x) + + if op_type not in ["abs", "exp", "square"]: + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + op_type) + else: + # abs exp square ops support dtype(int32, int64, float16, float32, float64) + check_variable_and_dtype( + x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], + op_type) + + helper = LayerHelper(op_type, **locals()) + + output = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op(type=op_type, inputs={"X": x}, outputs={"Out": output}) + return output + + func.__name__ = op_type + func.__doc__ = _generate_doc_string_( + op_proto, + additional_args_lines=[ + "name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`." + ]) + return func + + +def generate_inplace_fn(inplace_op_type): + """Register the Python layer for an Inplace Operator without Attribute. + + Args: + inplace_op_type: The name of the inplace operator to be created. + + This function takes in the inplace operator type (exp_ , ceil_ etc) and + creates the operator functionality. + """ + origin_op_type = inplace_op_type[:-1] + + def func(x, name=None): + if paddle.in_dynamic_mode(): + op = getattr(_C_ops, inplace_op_type) + return op(x) + warnings.warn( + "In static mode, {}() is the same as {}() and does not perform inplace operation.". + format(inplace_op_type, origin_op_type)) + return generate_activation_fn(origin_op_type)(x, name) + + func.__name__ = inplace_op_type + func.__doc__ = """ +Inplace version of ``{0}`` API, the output Tensor will be inplaced with input ``x``. +Please refer to :ref:`api_fluid_layers_{1}`. +""".format(origin_op_type, origin_op_type) + + return func + + +def templatedoc(op_type=None): + """ + Decorator of layer function. It will use the docstring from the layer + function as the template. The template arguments are: + + * ${comment}: The operator comment written in CPP. + * ${{name}_comment}: The comment of ${name} written with AddAttr, AddOutput, + and AddInput. The ${name} is Python snake style. i.e., xxx_xxx. + * ${{name}_type}: The type of ${name}. + + Returns: + Decorated function. 
+ """ + + def trim_ending_dot(msg): + return msg.rstrip('.') + + def __impl__(func): + if op_type is None: + op_type_name = func.__name__ + else: + op_type_name = op_type + op_proto = OpProtoHolder.instance().get_op_proto(op_type_name) + tmpl = string.Template(func.__doc__) + + comment_lines = op_proto.comment.split("\n") + comment = "" + for line in comment_lines: + line = line.strip() + if len(line) != 0: + comment += escape_math(line) + comment += " " + elif len(comment) != 0: + comment += "\n \n " + + args = {"comment": trim_ending_dot(comment)} + for each_input in op_proto.inputs: + input_name = _convert_(each_input.name) + args["{0}_comment".format(input_name)] = trim_ending_dot( + each_input.comment) + args["{0}_type".format(input_name)] = "Variable" + for each_attr in op_proto.attrs: + input_name = _convert_(each_attr.name) + args["{0}_comment".format(input_name)] = trim_ending_dot( + each_attr.comment) + args["{0}_type".format(input_name)] = _type_to_str_(each_attr.type) + + for each_opt in op_proto.outputs: + output_name = _convert_(each_opt.name) + args["{0}_comment".format(output_name)] = trim_ending_dot( + each_opt.comment) + args["{0}_type".format(output_name)] = "Variable" + func.__doc__ = tmpl.substitute(args) + return func + + return __impl__ + + +def add_sample_code(func, sample_code): + """ + Append sample code for dynamically generated functions. + + Args: + func: The function of the function to be append sample code to. + sample_code: sample code session in rst format. + """ + func.__doc__ = func.__doc__ + sample_code diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 876fd5ed5e9582..509ae903f59e48 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -17,7 +17,7 @@ from ..framework import _varbase_creator, _dygraph_tracer from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype from ..static import Variable -from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode +from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _non_static_mode from ..fluid.layers import transpose, cast # noqa: F401 from ..fluid import layers import paddle @@ -551,6 +551,9 @@ def dist(x, y, p=2, name=None): out = paddle.dist(x, y, float("-inf")) print(out) # out = [0.] 
""" + if in_dygraph_mode(): + return _C_ops.final_state_dist(x, y, p) + check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'dist') check_variable_and_dtype(y, 'dtype', ['float32', 'float64'], 'dist') check_type(p, 'p', (float, int), 'dist') @@ -1284,8 +1287,26 @@ def matrix_rank(x, tol=None, hermitian=False, name=None): # [1, 1, 1, 1]] """ + if in_dygraph_mode(): + if isinstance(tol, Variable): + if tol.dtype != x.dtype: + tol_tensor = cast(tol, x.dtype) + else: + tol_tensor = tol + use_default_tol = False + return _C_ops.final_state_matrix_rank_tol( + x, tol_tensor, use_default_tol, hermitian) - if paddle.in_dynamic_mode(): + if tol is None: + tol_attr = 0.0 + use_default_tol = True + else: + tol_attr = float(tol) + use_default_tol = False + return _C_ops.final_state_matrix_rank(x, tol_attr, use_default_tol, + hermitian) + + if _in_legacy_dygraph(): if tol is None: tol_tensor = None tol_attr = 0.0 @@ -1466,10 +1487,7 @@ def bincount(x, weights=None, minlength=0, name=None): if x.dtype not in [paddle.int32, paddle.int64]: raise TypeError("Elements in Input(x) should all be integers") - # if in_dygraph_mode(): - # return _C_ops.final_state_bincount(x, weights, minlength) - - if _in_legacy_dygraph(): + if _non_static_mode(): return _C_ops.bincount(x, weights, "minlength", minlength) helper = LayerHelper('bincount', **locals()) @@ -2255,8 +2273,10 @@ def multi_dot(x, name=None): # [10, 7] """ - if paddle.in_dynamic_mode(): + if _in_legacy_dygraph(): return _C_ops.multi_dot(x) + if in_dygraph_mode(): + return _C_ops.final_state_multi_dot(x) check_type(x, 'x', (list, tuple), 'multi_dot') for id, item in enumerate(x): diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index 03d0f42d8417b8..636b2ef17c6a0e 100755 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -14,11 +14,15 @@ from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_type, check_variable_and_dtype -from ..fluid.layers.layer_function_generator import templatedoc +from .layer_function_generator import templatedoc from ..static import Variable -from ..framework import VarBase as Tensor from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode -# TODO: define logic functions of a tensor +# TODO: define logic functions of a tensor +import paddle.fluid as fluid +if fluid.framework._in_eager_mode_: + Tensor = fluid.framework.core.eager.Tensor +else: + from ..framework import VarBase as Tensor from ..fluid.layers import is_empty # noqa: F401 from ..fluid.layers import logical_and # noqa: F401 from ..fluid.layers import logical_not # noqa: F401 @@ -123,7 +127,12 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_allclose(x, y, rtol, atol, equal_nan) + # NOTE(dev): Pass tol as Tensor to fix precision loss problem, because + # C++ backend will cast it into float32 if passing float from python. + as_tensor = lambda x: paddle.to_tensor([x], dtype='float64', place='cpu') + return _C_ops.final_state_allclose(x, y, + as_tensor(rtol), + as_tensor(atol), equal_nan) if _in_legacy_dygraph(): return _C_ops.allclose(x, y, 'rtol', str(rtol), 'atol', @@ -685,7 +694,12 @@ def isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_isclose(x, y, rtol, atol, equal_nan) + # NOTE(dev): Pass tol as Tensor to fix precision loss problem, because + # C++ backend will cast it into float32 if passing float from python. 
+ as_tensor = lambda x: paddle.to_tensor([x], dtype='float64', place='cpu') + return _C_ops.final_state_isclose(x, y, + as_tensor(rtol), + as_tensor(atol), equal_nan) if _in_legacy_dygraph(): return _C_ops.isclose(x, y, 'rtol', str(rtol), 'atol', diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 7e19feba906765..389b5dbd7dbec7 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -458,6 +458,10 @@ def flip(x, axis, name=None): """ if isinstance(axis, int): axis = [axis] + + if in_dygraph_mode(): + return _C_ops.final_state_flip(x, axis) + if paddle.in_dynamic_mode(): return _C_ops.flip(x, "axis", axis) @@ -780,6 +784,8 @@ def roll(x, shifts, axis=None, name=None): axis = [] if in_dygraph_mode(): + if isinstance(shifts, paddle.Tensor): + shifts = shifts.cpu() return _C_ops.final_state_roll(x, shifts, axis) if _in_legacy_dygraph(): @@ -1469,6 +1475,9 @@ def unbind(input, axis=0): # x3.shape [3, 5] """ + if in_dygraph_mode(): + return _C_ops.final_state_unbind(input, axis) + if not isinstance(axis, (int)): raise TypeError("The type of 'axis' must be int, but received %s." % (type(axis))) @@ -1477,7 +1486,7 @@ def unbind(input, axis=0): input_shape = input.shape axis_ = axis if axis >= 0 else len(input_shape) + axis num = input_shape[axis_] - if paddle.in_dynamic_mode(): + if _in_legacy_dygraph(): return _C_ops.unbind(input, num, 'axis', axis) helper = LayerHelper("unbind", **locals()) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 9751892e70188a..3a2d08af88ff8f 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -23,56 +23,52 @@ from paddle.common_ops_import import templatedoc from paddle.common_ops_import import dygraph_utils -from paddle.tensor import cast -from paddle.tensor.attribute import _complex_to_real_dtype +from .manipulation import cast +from .creation import _complex_to_real_dtype +from .layer_function_generator import _generate_doc_string_, generate_activation_fn, generate_layer_fn + import paddle -from paddle.static import Variable -from ..framework import core -from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _non_static_mode +from ..static import Variable +from ..framework import core, in_dygraph_mode, _non_static_mode, LayerHelper +from ..fluid.framework import _in_legacy_dygraph from ..framework import _varbase_creator, convert_np_dtype_to_dtype_ -from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype -from ..fluid.layers.layer_function_generator import _generate_doc_string_, generate_activation_fn, generate_layer_fn from ..fluid.dygraph.inplace_utils import inplace_apis_in_dygraph_only # TODO: define math functions # yapf: disable -from ..fluid.layers import abs # noqa: F401 -from ..fluid.layers import acos # noqa: F401 -from ..fluid.layers import asin # noqa: F401 -from ..fluid.layers import ceil # noqa: F401 -from ..fluid.layers import ceil_ # noqa: F401 -from ..fluid.layers import cos # noqa: F401 -from ..fluid.layers import tan # noqa: F401 -from ..fluid.layers import sinh # noqa: F401 -from ..fluid.layers import cosh # noqa: F401 -from ..fluid.layers import exp # noqa: F401 -from ..fluid.layers import exp_ # noqa: F401 -from ..fluid.layers import expm1 # noqa: F401 -from ..fluid.layers import floor # noqa: F401 -from ..fluid.layers import floor_ # noqa: F401 -from ..fluid.layers import log # noqa: F401 -from 
..fluid.layers import reciprocal # noqa: F401 -from ..fluid.layers import reciprocal_ # noqa: F401 -from ..fluid.layers import round # noqa: F401 -from ..fluid.layers import round_ # noqa: F401 -from ..fluid.layers import rsqrt # noqa: F401 -from ..fluid.layers import rsqrt_ # noqa: F401 -from ..fluid.layers import scale # noqa: F401 -from ..fluid.layers import square # noqa: F401 -from ..fluid.layers import stanh # noqa: F401 -from ..fluid.layers import atan # noqa: F401 -from ..fluid.layers import erf # noqa: F401 -from ..fluid.layers import sqrt # noqa: F401 -from ..fluid.layers import sqrt_ # noqa: F401 -from ..fluid.layers import sin # noqa: F401 -from ..fluid.layers import lgamma # noqa: F401 -from ..fluid.layers import asinh # noqa: F401 -from ..fluid.layers import acosh # noqa: F401 -from ..fluid.layers import atanh # noqa: F401 - -from ..fluid.layers import multiplex # noqa: F401 -from ..fluid.layers import reduce_prod +from .ops import abs # noqa: F401 +from .ops import acos # noqa: F401 +from .ops import asin # noqa: F401 +from .ops import ceil # noqa: F401 +from .ops import ceil_ # noqa: F401 +from .ops import cos # noqa: F401 +from .ops import tan # noqa: F401 +from .ops import sinh # noqa: F401 +from .ops import cosh # noqa: F401 +from .ops import exp # noqa: F401 +from .ops import exp_ # noqa: F401 +from .ops import expm1 # noqa: F401 +from .ops import floor # noqa: F401 +from .ops import floor_ # noqa: F401 +from .ops import reciprocal # noqa: F401 +from .ops import reciprocal_ # noqa: F401 +from .ops import round # noqa: F401 +from .ops import round_ # noqa: F401 +from .ops import rsqrt # noqa: F401 +from .ops import rsqrt_ # noqa: F401 +from .ops import square # noqa: F401 +from .ops import atan # noqa: F401 +from .ops import erf # noqa: F401 +from .ops import sqrt # noqa: F401 +from .ops import sqrt_ # noqa: F401 +from .ops import sin # noqa: F401 +from .ops import lgamma # noqa: F401 +from .ops import asinh # noqa: F401 +from .ops import acosh # noqa: F401 +from .ops import atanh # noqa: F401 + + from ..fluid.layers import elementwise_sub from paddle import _C_ops @@ -92,6 +88,241 @@ ] +def log(x, name=None): + r""" + Calculates the natural log of the given input tensor, element-wise. + + .. math:: + + Out = \\ln(x) + + Args: + x (Tensor): Input Tensor. Must be one of the following types: float32, float64. + name (str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` + + + Returns: + Tensor: The natural log of the input Tensor computed element-wise. + + Examples: + + .. code-block:: python + + import paddle + + x = [[2,3,4], [7,8,9]] + x = paddle.to_tensor(x, dtype='float32') + res = paddle.log(x) + # [[0.693147, 1.09861, 1.38629], [1.94591, 2.07944, 2.19722]] + """ + if in_dygraph_mode(): + return _C_ops.final_state_log(x) + if _in_legacy_dygraph(): + return _C_ops.log(x) + + check_variable_and_dtype(x, 'x', ['float32', 'float64'], "log") + inputs = {'X': [x]} + helper = LayerHelper('log', **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference(dtype) + helper.append_op(type="log", inputs={"X": x}, outputs={"Out": out}) + return out + + +def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): + """ + Scale operator. + + Putting scale and bias to the input Tensor as following: + + ``bias_after_scale`` is True: + + .. math:: + Out=scale*X+bias + + ``bias_after_scale`` is False: + + .. 
math:: + Out=scale*(X+bias) + + Args: + x(Tensor): Input N-D Tensor of scale operator. Data type can be float32, float64, int8, int16, int32, int64, uint8. + scale(float|Tensor): The scale factor of the input, it should be a float number or a Tensor with shape [1] and data type as float32. + bias(float): The bias to be put on the input. + bias_after_scale(bool): Apply bias addition after or before scaling. It is useful for numeric stability in some circumstances. + act(str, optional): Activation applied to the output such as tanh, softmax, sigmoid, relu. + name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` + + Returns: + Tensor: Output tensor of scale operator, with shape and data type same as input. + + Examples: + .. code-block:: python + + # scale as a float32 number + import paddle + + data = paddle.randn(shape=[2,3], dtype='float32') + res = paddle.scale(data, scale=2.0, bias=1.0) + + .. code-block:: python + + # scale with parameter scale as a Tensor + import paddle + + data = paddle.randn(shape=[2, 3], dtype='float32') + factor = paddle.to_tensor([2], dtype='float32') + res = paddle.scale(data, scale=factor, bias=1.0) + + """ + + if in_dygraph_mode(): + out = _C_ops.final_state_scale(x, scale, float(bias), bias_after_scale) + return dygraph_utils._append_activation_in_dygraph(out) + if _non_static_mode(): + _scale = scale.numpy().item(0) if isinstance(scale, Variable) else scale + out = _C_ops.scale(x, 'scale', + float(_scale), 'bias', + float(bias), 'bias_after_scale', bias_after_scale) + return dygraph_utils._append_activation_in_dygraph(out) + + check_variable_and_dtype(x, "x", [ + 'float16', 'uint16', 'float32', 'float64', 'int8', 'int16', 'int32', + 'int64', 'uint8' + ], "scale") + inputs = {'X': [x]} + attrs = { + 'bias': float(bias), + 'bias_after_scale': bias_after_scale, + } + if isinstance(scale, Variable): + inputs['ScaleTensor'] = [scale] + else: + attrs['scale'] = float(scale) + helper = LayerHelper('scale', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type='scale', inputs=inputs, outputs={'Out': out}, attrs=attrs) + return helper.append_activation(out) + + +def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): + """ + stanh activation. + + .. math:: + + out = b * \\frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}} + + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + scale_a (float, optional): The scale factor a of the input. Default is 0.67. + scale_b (float, optional): The scale factor b of the output. Default is 1.7159. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. 
code-block:: python + + import paddle + + x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) + out = paddle.stanh(x, scale_a=0.67, scale_b=1.72) # [1.00616539, 1.49927628, 1.65933108, 1.70390463] + + """ + + if _non_static_mode(): + return _C_ops.stanh(x, 'scale_a', scale_a, 'scale_b', scale_b) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'stanh') + + helper = LayerHelper('stanh', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type='stanh', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'scale_a': scale_a, + 'scale_b': scale_b}) + return out + +def multiplex(inputs, index, name=None): + """ + + Based on the given index parameter, the OP selects a specific row from each input Tensor to construct the output Tensor. + + If the input of this OP contains :math:`m` Tensors, where :math:`I_{i}` means the i-th input Tensor, :math:`i` between :math:`[0,m)` . + + And :math:`O` means the output, where :math:`O[i]` means the i-th row of the output, then the output satisfies that :math:`O[i] = I_{index[i]}[i]` . + + For Example: + + .. code-block:: text + + Given: + + inputs = [[[0,0,3,4], [0,1,3,4], [0,2,4,4], [0,3,3,4]], + [[1,0,3,4], [1,1,7,8], [1,2,4,2], [1,3,3,4]], + [[2,0,3,4], [2,1,7,8], [2,2,4,2], [2,3,3,4]], + [[3,0,3,4], [3,1,7,8], [3,2,4,2], [3,3,3,4]]] + + index = [[3],[0],[1],[2]] + + out = [[3,0,3,4], # out[0] = inputs[index[0]][0] = inputs[3][0] = [3,0,3,4] + [0,1,3,4], # out[1] = inputs[index[1]][1] = inputs[0][1] = [0,1,3,4] + [1,2,4,2], # out[2] = inputs[index[2]][2] = inputs[1][2] = [1,2,4,2] + [2,3,3,4]] # out[3] = inputs[index[3]][3] = inputs[2][3] = [2,3,3,4] + + + Args: + inputs (list): The input Tensor list. The list elements are N-D Tensors of data types float32, float64, int32, int64. All input Tensor shapes should be the same and rank must be at least 2. + index (Tensor): Used to select some rows in the input Tensor to construct an index of the output Tensor. It is a 2-D Tensor with data type int32 or int64 and shape [M, 1], where M is the number of input Tensors. + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name`. + Returns: + Tensor: Output of multiplex OP, with data type being float32, float64, int32, int64. + + Examples: + + .. 
code-block:: python + + import paddle + import numpy as np + img1 = np.array([[1, 2], [3, 4]]).astype(np.float32) + img2 = np.array([[5, 6], [7, 8]]).astype(np.float32) + inputs = [paddle.to_tensor(img1), paddle.to_tensor(img2)] + index = paddle.to_tensor(np.array([[1], [0]]).astype(np.int32)) + res = paddle.multiplex(inputs, index) + print(res) # [array([[5., 6.], [3., 4.]], dtype=float32)] + + """ + if _non_static_mode(): + return _C_ops.multiplex(index, inputs) + helper = LayerHelper('multiplex', **locals()) + + check_type(inputs, 'inputs', (list), 'multiplex') + if len(inputs) < 2: + raise ValueError( + "inputs should be a list object with at least 2 elements.") + for id, x in enumerate(inputs): + check_variable_and_dtype(x, 'input[' + str(id) + ']', + ['float32', 'float64', 'int32', 'int64'], + 'multiplex') + check_variable_and_dtype(index, "index", ['int32', 'int64'], 'multiplex') + + out = helper.create_variable_for_type_inference(inputs[0].dtype) + helper.append_op( + type='multiplex', + inputs={'X': inputs, + 'Ids': index}, + outputs={'Out': [out]}) + return out + @inplace_apis_in_dygraph_only def scale_(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): """ @@ -2674,9 +2905,10 @@ def kron(x, y, name=None): # [12, 15, 18, 16, 20, 24], # [21, 24, 27, 28, 32, 36]]) """ - if paddle.in_dynamic_mode(): + if _in_legacy_dygraph(): return _C_ops.kron(x, y) - + if in_dygraph_mode(): + return _C_ops.final_state_kron(x, y) helper = LayerHelper('kron', **locals()) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'kron') check_variable_and_dtype(y, 'y', ['float16', 'float32', 'float64', 'int32', 'int64'], 'kron') @@ -2972,7 +3204,38 @@ def prod(x, axis=None, keepdim=False, dtype=None, name=None): if x.dtype != convert_np_dtype_to_dtype_(dtype): x = cast(x, dtype) - return reduce_prod(input=x, dim=axis, keep_dim=keepdim, name=name) + input = x + dim = axis + keep_dim = keepdim + if dim is not None and not isinstance(dim, list): + if isinstance(dim, tuple): + dim = list(dim) + elif isinstance(dim, int): + dim = [dim] + else: + raise TypeError( + "The type of axis must be int, list or tuple, but received {}". 
+ format(type(dim))) + if in_dygraph_mode(): + return _C_ops.final_state_reduce_prod( + input, dim if dim != None and dim != [] else [0], keep_dim, True if + dim == None or dim == [] or len(dim) == len(input.shape) else False) + + helper = LayerHelper('reduce_prod', **locals()) + check_variable_and_dtype( + input, 'input', ['float32', 'float64', 'int32', 'int64'], 'reduce_prod') + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + helper.append_op( + type='reduce_prod', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None and dim != [] else [0], + 'keep_dim': keep_dim, + 'reduce_all': True if dim == None or dim == [] or + len(dim) == len(input.shape) else False + }) + return out def sign(x, name=None): @@ -3348,6 +3611,9 @@ def conj(x, name=None): # [(4-4j), (5-5j), (6-6j)]]) """ + if in_dygraph_mode(): + return _C_ops.final_state_conj(x) + if paddle.in_dynamic_mode(): return _C_ops.conj(x) @@ -3525,9 +3791,10 @@ def logit(x, eps=None, name=None): if eps == None: eps = 0.0 - if paddle.in_dynamic_mode(): + if _in_legacy_dygraph(): return _C_ops.logit(x, 'eps', eps) - + if in_dygraph_mode(): + return _C_ops.final_state_logit(x, eps) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'logit') helper = LayerHelper("logit", **locals()) out = helper.create_variable_for_type_inference(x.dtype) @@ -3634,6 +3901,9 @@ def erfinv(x, name=None): # out: [0, 0.4769, -inf] """ + if in_dygraph_mode(): + return _C_ops.final_state_erfinv( x ) + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'erfinv') if paddle.in_dynamic_mode(): diff --git a/python/paddle/tensor/ops.py b/python/paddle/tensor/ops.py new file mode 100644 index 00000000000000..9ee59c6cfd8431 --- /dev/null +++ b/python/paddle/tensor/ops.py @@ -0,0 +1,532 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
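# This module builds the unary tensor APIs programmatically: every name in the
# __activations_noattr__ / __unary_func__ / __inplace_unary_func__ lists below is
# turned into a function by generate_activation_fn / generate_inplace_fn /
# generate_layer_fn, exported through globals(), and then documented with
# add_sample_code. A minimal sketch of the pattern, using a hypothetical
# two-op subset:
#
#     for _OP in ('sqrt', 'abs'):
#         globals()[_OP] = generate_activation_fn(_OP)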
+ +from __future__ import print_function +import os +from .layer_function_generator import generate_layer_fn, generate_activation_fn, generate_inplace_fn, add_sample_code +from ..framework import core +from ..framework import convert_np_dtype_to_dtype_ +from ..static import Variable +from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype + +__deprecated_func_name__ = { + 'tanh_shrink': 'tanhshrink', + 'logsigmoid': 'log_sigmoid' +} + +__activations_noattr__ = [ + 'sigmoid', + 'silu', + 'logsigmoid', + 'tanh_shrink', + 'softplus', + 'softsign', + 'tanh', +] + +__unary_func__ = [ + 'exp', + 'expm1', + 'atan', + 'sqrt', + 'rsqrt', + 'abs', + 'ceil', + 'floor', + 'cos', + 'tan', + 'acos', + 'sin', + 'sinh', + 'asin', + 'cosh', + 'round', + 'reciprocal', + 'square', + 'lgamma', + 'acosh', + 'asinh', + 'atanh', +] + +__inplace_unary_func__ = [ + 'exp_', + 'sqrt_', + 'rsqrt_', + 'ceil_', + 'floor_', + 'round_', + 'reciprocal_', +] + +__all__ = [] + +for _OP in set(__all__): + globals()[_OP] = generate_layer_fn(_OP) + +# It is a hot fix in some unittest using: +# fluid.layers.scale(x=x, scale=10.0, out=out_var) +# e.g.: test_program_code.py, test_dist_train.py +globals()['_scale'] = generate_layer_fn('scale') + +globals()['_elementwise_div'] = generate_layer_fn('elementwise_div') + +__all__ += __activations_noattr__ +__all__ += __unary_func__ +__all__ += __inplace_unary_func__ + +for _OP in set(__activations_noattr__): + _new_OP = _OP + if _OP in __deprecated_func_name__: + _new_OP = __deprecated_func_name__[_OP] + _func = generate_activation_fn(_OP) + globals()[_OP] = _func + +for _OP in set(__unary_func__): + _new_OP = _OP + if _OP in __deprecated_func_name__: + _new_OP = __deprecated_func_name__[_OP] + _func = generate_activation_fn(_OP) + globals()[_OP] = _func + +for _OP in set(__inplace_unary_func__): + _new_OP = _OP + if _OP in __deprecated_func_name__: + _new_OP = __deprecated_func_name__[_OP] + _func = generate_inplace_fn(_OP) + globals()[_OP] = _func + +add_sample_code(globals()["sigmoid"], r""" +Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = F.sigmoid(x) + print(out) + # [0.40131234 0.450166 0.52497919 0.57444252] + +""") + +add_sample_code(globals()["silu"], r""" +Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) + out = F.silu(x) + print(out) + # [ 0.7310586 1.7615942 2.8577224, 3.9280552 ] + +""") + +add_sample_code(globals()["logsigmoid"], r""" +Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = F.log_sigmoid(x) + print(out) + # [-0.91301525 -0.79813887 -0.64439666 -0.55435524] + +""") + +add_sample_code(globals()["exp"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.exp(x) + print(out) + # [0.67032005 0.81873075 1.10517092 1.34985881] + +""") + +add_sample_code(globals()["expm1"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.expm1(x) + print(out) + # [-0.32967997, -0.18126924, 0.10517092, 0.34985882] + +""") + +add_sample_code(globals()["tanh"], r""" +Examples: + .. 
code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.tanh(x) + print(out) + # [-0.37994896 -0.19737532 0.09966799 0.29131261] + +""") + +add_sample_code(globals()["atan"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.atan(x) + print(out) + # [-0.38050638 -0.19739556 0.09966865 0.29145679] + +""") + +add_sample_code(globals()["tanh_shrink"], r""" +Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = F.tanhshrink(x) + print(out) + # [-0.020051, -0.00262468, 0.000332005, 0.00868739] + +""") + +add_sample_code(globals()["sqrt"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([0.1, 0.2, 0.3, 0.4]) + out = paddle.sqrt(x) + print(out) + # [0.31622777 0.4472136 0.54772256 0.63245553] + +""") + +add_sample_code(globals()["rsqrt"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([0.1, 0.2, 0.3, 0.4]) + out = paddle.rsqrt(x) + print(out) + # [3.16227766 2.23606798 1.82574186 1.58113883] + +""") + +add_sample_code(globals()["abs"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.abs(x) + print(out) + # [0.4 0.2 0.1 0.3] + +""") + +add_sample_code(globals()["ceil"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.ceil(x) + print(out) + # [-0. -0. 1. 1.] + +""") + +add_sample_code(globals()["floor"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.floor(x) + print(out) + # [-1. -1. 0. 0.] + +""") + +add_sample_code(globals()["cos"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.cos(x) + print(out) + # [0.92106099 0.98006658 0.99500417 0.95533649] + +""") + +add_sample_code(globals()["tan"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.tan(x) + print(out) + # [-0.42279324, -0.20271005, 0.10033467, 0.30933627] + +""") + +add_sample_code(globals()["acos"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.acos(x) + print(out) + # [1.98231317 1.77215425 1.47062891 1.26610367] + +""") + +add_sample_code(globals()["sin"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.sin(x) + print(out) + # [-0.38941834 -0.19866933 0.09983342 0.29552021] + +""") + +add_sample_code(globals()["asin"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.asin(x) + print(out) + # [-0.41151685 -0.20135792 0.10016742 0.30469265] + +""") + +add_sample_code(globals()["cosh"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.cosh(x) + print(out) + # [1.08107237 1.02006676 1.00500417 1.04533851] + +""") + +add_sample_code(globals()["sinh"], r""" +Examples: + .. 
code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.sinh(x) + print(out) + # [-0.41075233 -0.201336 0.10016675 0.30452029] + +""") + +add_sample_code(globals()["asinh"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.asinh(x) + print(out) + # [-0.39003533, -0.19869010, 0.09983408, 0.29567307] + +""") + +add_sample_code(globals()["acosh"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([1., 3., 4., 5.]) + out = paddle.acosh(x) + print(out) + # [0. , 1.76274729, 2.06343699, 2.29243159] + +""") + +add_sample_code(globals()["atanh"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.atanh(x) + print(out) + # [-0.42364895, -0.20273256, 0.10033535, 0.30951962] + +""") + +add_sample_code(globals()["round"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.5, -0.2, 0.6, 1.5]) + out = paddle.round(x) + print(out) + # [-1. -0. 1. 2.] + +""") + +add_sample_code(globals()["reciprocal"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.reciprocal(x) + print(out) + # [-2.5 -5. 10. 3.33333333] + +""") + +add_sample_code(globals()["square"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.square(x) + print(out) + # [0.16 0.04 0.01 0.09] + +""") + +add_sample_code(globals()["lgamma"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.lgamma(x) + print(out) + # [1.31452441, 1.76149750, 2.25271273, 1.09579802] + +""") + +add_sample_code(globals()["softplus"], r""" +Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = F.softplus(x) + print(out) + # [0.513015, 0.598139, 0.744397, 0.854355] + +""") + +add_sample_code(globals()["softsign"], r""" +Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = F.softsign(x) + print(out) + # [-0.285714, -0.166667, 0.0909091, 0.230769] + +""") + +__all__ += ['erf'] + +_erf_ = generate_layer_fn('erf') + + +def erf(x, name=None): + locals_var = locals().copy() + kwargs = dict() + for name, val in locals_var.items(): + if val is not None: + kwargs[name] = val + return _erf_(**kwargs) + + +erf.__doc__ = r""" +:strong:`Erf Operator` +For more details, see [Error function](https://en.wikipedia.org/wiki/Error_function). + +Equation: + .. math:: + out = \\frac{2}{\\sqrt{\\pi}} \\int_{0}^{x}e^{- \\eta^{2}}d\\eta + +Args: + + x (Tensor): The input tensor, it's data type should be float32, float64. + +Returns: + + Tensor: The output of Erf op, dtype: float32 or float64, the same as the input, shape: the same as the input. + +Examples: + + .. 
code-block:: python + + import paddle + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.erf(x) + print(out) + # [-0.42839236 -0.22270259 0.11246292 0.32862676] +""" diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index d2e43634437205..82818d50510c9b 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -548,7 +548,14 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + shape = utils.convert_shape_to_list(shape) + return _C_ops.final_state_uniform_random(shape, dtype, + float(min), + float(max), seed, + _current_expected_place()) + + if _in_legacy_dygraph(): shape = utils.convert_shape_to_list(shape) return _C_ops.uniform_random('shape', shape, 'min', float(min), 'max', diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 589dfdb0f3e1a1..6ca61de063b552 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -167,6 +167,16 @@ func : asinh backward : asinh_grad +# assign +- api : assign + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : assign_raw + backward : assign_grad + # atan - api : atan args : (Tensor x) @@ -345,6 +355,7 @@ func : UnchangedInferMeta kernel : func : conj + backward : conj_grad - api : conv2d args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) @@ -659,6 +670,7 @@ func : FlipInferMeta kernel : func : flip + backward : flip_grad - api : floor args : (Tensor x) @@ -814,7 +826,7 @@ func : GumbelSoftmaxInferMeta kernel : func : gumbel_softmax - # backward : gumbel_softmax_grad + backward : gumbel_softmax_grad # hard_shrink - api : hard_shrink @@ -838,6 +850,16 @@ func : hard_sigmoid backward : hard_sigmoid_grad +- api : hard_swish + args : (Tensor x, float threshold = 6.0, float scale = 6.0, float offset = 3.0) + output : Tensor + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : hard_swish + backward : hard_swish_grad + # histogram - api : histogram args : (Tensor x, int64_t bins, int min, int max) @@ -949,6 +971,15 @@ data_type : x backward : kldiv_loss_grad +- api : kron + args : (Tensor x, Tensor y) + output : Tensor + infer_meta : + func : KronInferMeta + kernel : + func : kron + backward : kron_grad + - api : kthvalue args : (Tensor x, int k, int axis, bool keepdim) output : Tensor(out), Tensor(indices) @@ -1016,6 +1047,15 @@ func : lgamma backward : lgamma_grad +- api : linspace + args : (Tensor start, Tensor stop, Tensor number, DataType dtype) + output : Tensor + infer_meta : + func : LinspaceInferMeta + kernel : + func : linspace + data_type : dtype + - api : log args : (Tensor x) output : Tensor @@ -1107,6 +1147,17 @@ kernel : func : logical_xor +# logit +- api : logit + args : (Tensor x, float eps = 1e-6f) + output : Tensor + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : logit + backward : logit_grad + # logsigmoid - api : logsigmoid args : (Tensor x) @@ -1157,6 +1208,23 @@ func : matrix_power backward : matrix_power_grad +- api : matrix_rank + args : (Tensor x, float tol, bool use_default_tol=true, bool hermitian=false) + output : Tensor(out) + infer_meta : + func : MatrixRankInferMeta + param : 
[x, use_default_tol, hermitian] + kernel : + func : matrix_rank + +- api : matrix_rank_tol + args : (Tensor x, Tensor atol_tensor, bool use_default_tol=true, bool hermitian=false) + output : Tensor(out) + infer_meta : + func : MatrixRankTolInferMeta + kernel : + func : matrix_rank_tol + - api : max args : (Tensor x, int64_t[] dims={}, bool keep_dim=false) output : Tensor(out) @@ -1193,6 +1261,15 @@ func : maximum backward : maximum_grad +- api : maxout + args : (Tensor x, int groups, int axis) + output : Tensor(out) + infer_meta : + func : MaxOutInferMeta + kernel : + func : maxout + backward : maxout_grad + - api : mean args : (Tensor x, int64_t[] dims={}, bool keep_dim=false) output : Tensor(out) @@ -1269,6 +1346,15 @@ invoke : momentum_impl(param, grad, velocity, learning_rate, master_param, mu, use_nesterov, regularization_method, regularization_coeff, multi_precision, rescale_grad) optional : master_param +- api : multi_dot + args : (Tensor[] x) + output : Tensor + infer_meta : + func : MultiDotInferMeta + kernel : + func : multi_dot + backward : multi_dot_grad + # multinomial - api : multinomial args : (Tensor x, int num_samples, bool replacement) @@ -1278,6 +1364,16 @@ kernel : func : multinomial +- api : multiplex + args : (Tensor[] ins, Tensor ids) + output : Tensor + infer_meta : + func : MultiplexInferMeta + kernel : + func : multiplex + data_type : ins + backward : multiplex_grad + - api : multiply args : (Tensor x, Tensor y) output : Tensor @@ -1307,6 +1403,16 @@ optional : weight backward : nll_loss_grad +- api : norm + args : (Tensor x, int axis, float epsilon, bool is_test) + output : Tensor(out), Tensor(norm) + infer_meta : + func : NormInferMeta + kernel : + func : norm + intermediate : norm + backward : norm_grad + - api : not_equal args : (Tensor x, Tensor y, int axis = -1) output : Tensor @@ -1364,7 +1470,7 @@ func : PixelShuffleInferMeta kernel : func : pixel_shuffle - # backward : pixel_shuffle_grad + backward : pixel_shuffle_grad # poisson // no need grad - api : poisson @@ -1613,6 +1719,7 @@ func : SegmentPoolInferMeta kernel : func : segment_pool + data_type : x backward : segment_pool_grad # selu @@ -1809,6 +1916,17 @@ data_type : x backward : sum_grad +# The Python API paddle.nn.functional.swish has no `beta` argument; it may be removed later +- api : swish + args : (Tensor x, float beta=1.0) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : swish + backward : swish_grad + # take_along_axis - api : take_along_axis args : (Tensor x, Tensor index, int axis) @@ -1939,6 +2057,12 @@ backend : place data_type : dtype +- api : unbind + args : (Tensor input, int axis) + output : Tensor[] + invoke : unbind_impl(input, axis) + backward : unbind_grad + # unfold - api : unfold args : (Tensor x, int[] kernel_sizes, int[] strides, int[] paddings, int[] dilations) @@ -1949,6 +2073,18 @@ func : unfold backward : unfold_grad +- api : uniform_random + args : (IntArray shape, DataType dtype, float min, float max, int seed, Place place={}) + output : Tensor(out) + infer_meta : + func : UniformRandomInferMeta + param: [shape, dtype, min, max, seed] + kernel : + func : uniform_random + param: [shape, dtype, min, max, seed] + data_type : dtype + backend : place + # The `axis` argument of Python API paddle.unique is not vector - api : unique args : (Tensor x, bool return_index, bool return_inverse, bool return_counts, int[] axis, DataType dtype=DataType::INT64) diff --git a/python/paddle/utils/code_gen/api_base.py 
b/python/paddle/utils/code_gen/api_base.py index 38aa3e0cb0b732..275adac8b49727 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -600,7 +600,7 @@ def get_kernel_args(self, code_indent): if self.inputs['input_info'][param] == "const Tensor&": kernel_args = kernel_args + "*" + PREFIX_TENSOR_NAME + param + ", " elif self.inputs['input_info'][ - input_name] == "const std::vector&": + param] == "const std::vector&": kernel_args = kernel_args + PREFIX_TENSOR_NAME + param + ", " else: # do nothing diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 942089f18ce554..555ec600bf7e73 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -1,13 +1,3 @@ -# - backward_api : gumbel_softmax_grad -# forward : gumbel_softmax (Tensor x, float temperature, bool hard, int axis) -> Tensor(out) -# args : (Tensor out, Tensor out_grad, int axis) -# output : Tensor(x_grad) -# infer_meta : -# func : GumbelSoftmaxGradInferMeta -# param : [out, out_grad, axis] -# kernel : -# func : gumbel_softmax_grad - - backward_api : abs_grad forward : abs (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) @@ -99,6 +89,16 @@ kernel : func : asinh_grad +- backward_api : assign_grad + forward : assign (Tensor x) -> Tensor(out) + args : (Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out_grad] + kernel : + func : assign_raw + - backward_api : atan2_grad forward : atan2 (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad) @@ -217,6 +217,17 @@ args : (Tensor[] x, Tensor out_grad, Scalar axis = 0) output : Tensor[](x_grad) invoke : concat_grad_impl(x, out_grad, axis) + no_need_buffer : x + +- backward_api : conj_grad + forward : conj (Tensor x) -> Tensor(out) + args : (Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [out_grad] + kernel : + func : conj - backward_api : conv2d_grad forward : conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(out) @@ -318,7 +329,7 @@ func : UnchangedInferMeta param : [x] kernel : - func : determinant_grad + func : determinant_grad - backward_api : diagonal_grad forward : diagonal (Tensor x, int offset, int axis1, int axis2) -> Tensor(out) @@ -442,6 +453,7 @@ param : [x] kernel : func : expand_as_grad + no_need_buffer : x - backward_api : expm1_grad forward : expm1 (Tensor x) -> Tensor(out) @@ -465,6 +477,17 @@ data_type: out_grad backend: out_grad layout: out_grad + no_need_buffer : x + +- backward_api : flip_grad + forward : flip (Tensor x, int[] axis) -> Tensor(out) + args : (Tensor out_grad, int[] axis) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [out_grad] + kernel : + func : flip - backward_api : floor_grad forward : floor(Tensor x) -> Tensor(out) @@ -516,6 +539,7 @@ kernel : data_type: x func : gather_grad + no_need_buffer : x - backward_api : gather_nd_grad forward : gather_nd (Tensor x, Tensor index) -> Tensor(out) @@ -526,6 +550,7 @@ param : [x] kernel : func : gather_nd_grad + no_need_buffer : x - backward_api : gelu_grad forward : gelu(Tensor x, bool approximate) -> Tensor(out) @@ -548,6 +573,16 @@ func : graph_send_recv_grad optional: out, dst_count +- backward_api : gumbel_softmax_grad + forward : gumbel_softmax 
(Tensor x, float temperature, bool hard, int axis) -> Tensor(out) + args : (Tensor out, Tensor out_grad, int axis) + output : Tensor(x_grad) + infer_meta : + func : GumbelSoftmaxGradInferMeta + param : [out, out_grad, axis] + kernel : + func : gumbel_softmax_grad + - backward_api : hard_shrink_grad forward : hard_shrink (Tensor x, float threshold) -> Tensor(out) args : (Tensor x, Tensor out_grad, float threshold) @@ -568,6 +603,16 @@ kernel : func : hard_sigmoid_grad +- backward_api : hard_swish_grad + forward : hard_swish (Tensor x, float threshold = 6.0, float scale = 6.0, float offset = 3.0) -> Tensor(out) + args : (Tensor x, Tensor out_grad, float threshold, float scale, float offset) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : hard_swish_grad + - backward_api : huber_loss_grad forward : huber_loss (Tensor input, Tensor label, float delta) -> Tensor(out), Tensor(residual) args : (Tensor residual, Tensor out_grad, float delta) @@ -606,6 +651,7 @@ kernel : func : index_select_grad data_type : x + no_need_buffer : x - backward_api : kldiv_loss_grad forward : kldiv_loss(Tensor x, Tensor label, str reduction) -> Tensor(out) @@ -616,6 +662,18 @@ param: [x] kernel : func : kldiv_loss_grad + no_need_buffer : x + +- backward_api : kron_grad + forward : kron (Tensor x, Tensor y) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] + kernel : + func : kron_grad + data_type : out_grad - backward_api : kthvalue_grad forward : kthvalue(Tensor x, int k, int axis, bool keepdim) -> Tensor(out), Tensor(indices) @@ -728,6 +786,16 @@ kernel : func : log_softmax_grad +- backward_api : logit_grad + forward : logit (Tensor x, float eps = 1e-6f) -> Tensor(out) + args : (Tensor x, Tensor out_grad, float eps) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : logit_grad + - backward_api : logsigmoid_grad forward : logsigmoid (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) @@ -758,6 +826,7 @@ kernel : func : masked_select_grad data_type : x + no_need_buffer : x - backward_api : matmul_double_grad forward : matmul_grad (Tensor x, Tensor y, Tensor grad_out, bool transpose_x=false, bool transpose_y=false) -> Tensor(grad_x), Tensor(grad_y) @@ -841,6 +910,16 @@ kernel : func : maximum_grad +- backward_api : maxout_grad + forward : maxout(Tensor x, int groups, int axis) -> Tensor(out) + args : (Tensor x, Tensor out, Tensor out_grad, int groups, int axis) + output : Tensor(x_grad) + infer_meta : + func : GeneralUnaryGradInferMeta + param: [x] + kernel : + func : maxout_grad + - backward_api : mean_all_grad forward : mean_all(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) @@ -860,6 +939,7 @@ param: [x] kernel : func : mean_grad + no_need_buffer : x - backward_api : meshgrid_grad forward : meshgrid (Tensor[] inputs) -> Tensor[](outputs) @@ -918,6 +998,18 @@ func : modulo_grad no_need_buffer : x, y +- backward_api : multi_dot_grad + forward : multi_dot (Tensor[] x) -> Tensor(out) + args : (Tensor[] x, Tensor out_grad) + output : Tensor[](x_grad) + invoke : multi_dot_grad_impl(x, out_grad) + +- backward_api : multiplex_grad + forward : multiplex (Tensor[] ins, Tensor ids) -> Tensor(out) + args : (Tensor[] ins, Tensor ids, Tensor out_grad) + output : Tensor[](ins_grad) + invoke : multiplex_grad_impl(ins, ids, out_grad) + - backward_api : multiply_grad forward : multiply 
(Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1) @@ -949,6 +1041,16 @@ data_type : input optional : weight +- backward_api : norm_grad + forward : norm (Tensor x, int axis, float epsilon, bool is_test) -> Tensor(out), Tensor(norm) + args : (Tensor x, Tensor norm, Tensor out_grad, int axis, float epsilon, bool is_test) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : norm_grad + - backward_api : p_norm_grad forward : p_norm(Tensor x, float porder, int axis, float epsilon, bool keepdim, bool asvector=false) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, float porder, int axis, float epsilon, bool keepdim, bool asvector) @@ -969,6 +1071,15 @@ kernel : func : pad3d_grad +- backward_api : pixel_shuffle_grad + forward : pixel_shuffle (Tensor x, int upscale_factor, str data_format) -> Tensor(out) + args : (Tensor out_grad, int upscale_factor, str data_format) + output : Tensor(x_grad) + infer_meta : + func : PixelShuffleGradInferMeta + kernel : + func : pixel_shuffle_grad + - backward_api : pool2d_grad forward : pool2d(Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) @@ -1122,6 +1233,7 @@ kernel : func : roll_grad data_type : x + no_need_buffer : x - backward_api : round_grad forward : round(Tensor x) -> Tensor(out) @@ -1147,7 +1259,7 @@ forward : scale (Tensor x, Scalar scale, float bias, bool bias_after_scale) -> Tensor(out) args : (Tensor out_grad, Scalar scale=1.0, float bias=0.0, bool bias_after_scale=true) output : Tensor(x_grad) - invoke : scale(out_grad, scale, bias, bias_after_scale) + invoke : scale(out_grad, scale, 0.0, bias_after_scale) - backward_api : scatter_grad forward : scatter (Tensor x, Tensor index, Tensor updates, bool overwrite) -> Tensor(out) @@ -1180,6 +1292,8 @@ param : [x] kernel : func : segment_pool_grad + data_type : x + optional : summed_ids - backward_api : selu_grad forward : selu (Tensor x, float scale, float alpha) -> Tensor(out) @@ -1272,6 +1386,7 @@ param : [input] kernel : func : slice_grad + no_need_buffer : input - backward_api : soft_shrink_grad forward : soft_shrink (Tensor x, float lambda) -> Tensor(out) @@ -1346,6 +1461,7 @@ param : [x] kernel : func : strided_slice_grad + no_need_buffer : x - backward_api : subtract_grad forward : subtract (Tensor x, Tensor y) -> Tensor(out) @@ -1367,6 +1483,17 @@ param : [x] kernel : func : sum_grad + no_need_buffer : x + +- backward_api : swish_grad + forward : swish (Tensor x, float beta=1.0) -> Tensor(out) + args : (Tensor x, Tensor out_grad, float beta=1.0) + output : Tensor(x_grad) + infer_meta : + func : GeneralUnaryGradInferMeta + param : [x] + kernel : + func : swish_grad - backward_api : take_along_axis_grad forward : take_along_axis (Tensor x, Tensor index, int axis) -> Tensor(out) @@ -1480,6 +1607,12 @@ kernel : func : trunc_grad +- backward_api : unbind_grad + forward : unbind (Tensor input, int axis) -> Tensor[](out) + args : (Tensor[] out_grad, int axis) + output : Tensor(input_grad) + invoke : stack(out_grad, axis) + - backward_api : unfold_grad forward : unfold (Tensor x, int[] kernel_sizes, int[] strides, int[] 
paddings, int[] dilations) -> Tensor(out) args : (Tensor x, Tensor out_grad, int[] kernel_sizes, int[] strides, int[] paddings, int[] dilations) @@ -1510,3 +1643,4 @@ param : [x, y] kernel : func : where_grad + no_need_buffer : x, y diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index e0598112c822ae..49b84da01b9bb9 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -201,7 +201,7 @@ fi # infrt needs to temporarily use LOG(FATAL) during the debugging period, and will replace it with standard error format in the future. NO_INFRT_FILES=`git diff --name-only upstream/develop | grep -v "tools/\|paddle/infrt/" || true` HAS_LOG_FATAL=`git diff -U0 upstream/$BRANCH $NO_INFRT_FILES |grep "^+" |grep -o -m 1 "LOG(FATAL)" || true` -if [ ${HAS_LOG_FATAL} ] && [ "${GIT_PR_ID}" != "" ]; then +if [ ${NO_INFRT_FILES} ] && [ ${HAS_LOG_FATAL} ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="LOG(FATAL) is not recommended, because it will throw exception without standard stack information, so please use PADDLE_THROW macro here. If you have to use LOG(FATAL) here, please request chenwhql (Recommend), luotao1 or lanxianghit review and approve.\n" check_approval 1 6836917 47554610 22561442 fi diff --git a/tools/ci_op_benchmark.sh b/tools/ci_op_benchmark.sh index 0937ebe5343fcd..8e84eccc083f22 100644 --- a/tools/ci_op_benchmark.sh +++ b/tools/ci_op_benchmark.sh @@ -135,6 +135,8 @@ function load_CHANGE_OP_MAP { for change_file in ${CHANGE_OP_FILES[@]} do change_file_name=${change_file#*paddle/fluid/operators/} + change_file_name=${change_file_name#*paddle/phi/kernels/gpu/} + change_file_name=${change_file_name#*paddle/phi/kernels/gpudnn/} if [ -n "${PADDLE_FILENAME_OP_MAP[$change_file_name]}" ] then for op_name in ${PADDLE_FILENAME_OP_MAP[$change_file_name]} diff --git a/tools/infrt/skipped_phi_api.json b/tools/infrt/skipped_phi_api.json index 64fc4c618aebc3..8e2dd0f65d7d5d 100644 --- a/tools/infrt/skipped_phi_api.json +++ b/tools/infrt/skipped_phi_api.json @@ -1,4 +1,4 @@ { -"phi_apis":["conj", "dropout", "expand_as", "flatten", "nll_loss", "psroi_pool", "roi_align", "roi_pool", "label_smooth"], +"phi_apis":["conj", "dropout", "expand_as", "nll_loss", "psroi_pool", "roi_align", "roi_pool", "label_smooth"], "phi_kernels":["equal_all"] }
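
The ops.py hunk earlier in this patch registers the generated activation ops and then calls `add_sample_code` on each one to attach an `Examples:` section to its docstring. The helper itself comes from `layer_function_generator` and is not part of this diff; the sketch below is only a minimal illustration of that pattern, assuming the helper does nothing more than append the sample text to the target function's `__doc__` (the real implementation may differ), and `_demo_op` is a hypothetical stand-in for a generated op.

    .. code-block:: python

        # Minimal sketch, NOT the real helper: assumes add_sample_code simply
        # concatenates the sample text onto the wrapped function's docstring.
        def add_sample_code(func, sample_code):
            """Append an ``Examples:`` block to a generated op's docstring."""
            func.__doc__ = (func.__doc__ or "") + sample_code


        def _demo_op(x):
            """Hypothetical stand-in for a generated activation op."""
            return x


        add_sample_code(_demo_op, r"""
        Examples:
            .. code-block:: python

                import paddle

                x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
                out = _demo_op(x)
        """)

        # The attached sample now appears at the end of the docstring, which is
        # how the sigmoid/silu/exp examples in this diff reach the rendered docs.
        print(_demo_op.__doc__)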