diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index daaf06668920a1..5fbfe57cbf2961 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) set(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.su.bcebos.com/KL-SDK/klsdk-dev") - set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20221215") + set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20221227") else() set(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index f7b7c73d4deed1..9cdc0e127c8d51 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -195,6 +195,7 @@ function(create_dummy_static_lib TARGET_NAME) # the dummy target would be consisted of limit size libraries set(limit ${merge_LIMIT}) list(LENGTH merge_LIBS libs_len) + message("libs_len ${libs_len}") foreach(lib ${merge_LIBS}) list(APPEND merge_list ${lib}) list(LENGTH merge_list listlen) diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index b54f45363a00da..26b6fce08a40c6 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -739,6 +739,14 @@ def CollectBackwardInfo(self): self.backward_returns_list, ) = ParseYamlBackward(backward_args_str, backward_returns_str) + # Remove the output which is intermediate + if 'intermediate' in grad_api_contents: + backward_returns_list_new = [] + for return_item in self.backward_returns_list: + if return_item[0] not in grad_api_contents['intermediate']: + backward_returns_list_new.append(return_item) + self.backward_returns_list = backward_returns_list_new + def CollectForwardInfoFromBackwardContents(self): backward_forward_str = self.backward_forward_str @@ -1979,7 +1987,6 @@ def GenerateNodeDefinition( fill_zero_str += f"{indent}egr::EagerUtils::FillZeroForEmptyGradInput(&grads[{fwd_position}], input_metas[{fwd_position}]);\n" inplace_grad_input_str = "" - inplaced_tensor_wrapper = False inplace_check_str = "" optional_inplace_var_name = [] # Grad Ins from TensorWrappers diff --git a/paddle/fluid/eager/autograd_meta.h b/paddle/fluid/eager/autograd_meta.h index 7d87a7cbaafa8a..4d8b5ec9daacfb 100644 --- a/paddle/fluid/eager/autograd_meta.h +++ b/paddle/fluid/eager/autograd_meta.h @@ -23,7 +23,7 @@ using AbstractAutogradMeta = paddle::experimental::AbstractAutogradMeta; * * AutogradMeta is what record the backward info for tensor. When we run * computation graph eagerly, we can not build a static paddle program like - * static mode do, so we need a new method to record forward info to trace + * static graph mode do, so we need a new method to record forward info to trace * backward when we finish all forward computation. 
This require our * AutogradMeta class record following main members * diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 3c2e8bf85a7992..088847d1f6f3b9 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -429,10 +429,6 @@ if(WITH_MKLDNN) test_conv_batch_norm_mkldnn_fuse_pass SRCS mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc DEPS ${TEST_CONV_BN_PASS_DEPS}) - cc_test( - test_scale_matmul_fuse_pass - SRCS mkldnn/scale_matmul_fuse_pass_tester.cc - DEPS scale_matmul_fuse_pass) cc_test( test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc index 324f707af1ec5d..021d372c2c89aa 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc @@ -32,7 +32,11 @@ void AddVarToScope(Scope* param_scope, const DDim& dims) { auto* tensor = param_scope->Var(name)->GetMutable(); tensor->Resize(dims); - tensor->mutable_data(platform::CPUPlace()); + auto* data = tensor->mutable_data(platform::CPUPlace()); + int64_t numel = tensor->numel(); + for (int64_t i = 0; i < numel; ++i) { + data[i] = 0; + } } Scope* CreateParamScope() { diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc deleted file mode 100644 index ed6e63615f7c35..00000000000000 --- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include - -#include "paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h" - -namespace paddle { -namespace framework { -namespace ir { - -void SetOp(ProgramDesc* prog, - const std::string& type, - const std::vector& inputs, - const std::vector& outputs, - float scale = 1.0f, - float bias = 0.0f) { - auto* op = prog->MutableBlock(0)->AppendOp(); - - op->SetType(type); - if (type == "scale") { - op->SetInput("X", {inputs[0]}); - op->SetAttr("scale", scale); - op->SetAttr("bias", bias); - } else if (type == "matmul") { - op->SetAttr("transpose_X", false); - op->SetAttr("transpose_Y", false); - op->SetInput("X", {inputs[0]}); - if (inputs.size() > 1) op->SetInput("Y", {inputs[1]}); - op->SetAttr("alpha", scale); - } else { - FAIL() << "Unexpected operator type."; - } - op->SetOutput("Out", {outputs[0]}); -} - -// a->scale->b -// (b,c)->matmul->d -ProgramDesc BuildProgramDesc(float scale, float bias, float alpha) { - ProgramDesc prog; - - for (auto& v : std::vector({"a", "b", "c", "d"})) { - prog.MutableBlock(0)->Var(v); - } - SetOp(&prog, "scale", {"a"}, {"b"}, scale, bias); - SetOp(&prog, "matmul", {"b", "c"}, {"d"}, alpha); - return prog; -} - -void MainTest(const ProgramDesc& prog, - int removed_nodes_count, - const std::vector scale_in_out, - const std::vector matmul_in_out, - float alpha) { - std::unique_ptr graph(new ir::Graph(prog)); - int original_nodes_num = graph->Nodes().size(); - auto pass = PassRegistry::Instance().Get("scale_matmul_fuse_pass"); - graph.reset(pass->Apply(graph.release())); - int current_nodes_num = graph->Nodes().size(); - - for (auto* node : graph->Nodes()) { - if (node->IsOp()) { - auto* op = node->Op(); - if (op->Type() == "scale") { - EXPECT_EQ(op->Input("X")[0], scale_in_out[0]); - EXPECT_EQ(op->Output("Out")[0], scale_in_out[1]); - } else if (op->Type() == "matmul") { - EXPECT_EQ(op->Input("X")[0], matmul_in_out[0]); - EXPECT_EQ(op->Input("Y")[0], matmul_in_out[1]); - EXPECT_EQ(op->Output("Out")[0], matmul_in_out[2]); - EXPECT_EQ(op->GetAttrIfExists("alpha"), alpha); - } - } - } - EXPECT_EQ(original_nodes_num - removed_nodes_count, current_nodes_num); -} - -TEST(ScaleMatmulFusePass, scale_matmul_with_no_bias) { - auto bias = 0.0f; - auto scale = 2.34f; - auto alpha = 3.45f; - int removed_nodes_count = 2; - MainTest(BuildProgramDesc(scale, bias, alpha), - removed_nodes_count, - {}, - {"a", "c", "d"}, - scale * alpha); -} - -TEST(ScaleMatmulFusePass, scale_matmul_with_bias) { - auto bias = 1.0f; - auto scale = 2.34f; - auto alpha = 3.45f; - int removed_nodes_count = 0; - MainTest(BuildProgramDesc(scale, bias, alpha), - removed_nodes_count, - {"a", "b"}, - {"b", "c", "d"}, - alpha); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -USE_PASS(scale_matmul_fuse_pass); diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 637de3ee1d03ef..f5b430e829a13c 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -760,7 +760,7 @@ bool BuildOpFuncList(const platform::Place& place, new phi::Kernel(phi::KernelFactory::Instance().SelectKernel( phi_kernel_name, phi_cpu_kernel_key))); if (op_with_kernel->PhiKernel()->IsValid()) { - VLOG(6) << "Static mode PrepareImpl - kernel name: " + VLOG(6) << "Static graph mode PrepareImpl - kernel name: " << phi_kernel_name << " | kernel key: " << phi_cpu_kernel_key << " | kernel: " << 
*(op_with_kernel->PhiKernel()); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ae216b1e499a2f..eb7ad8ed94ec95 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1603,11 +1603,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } #endif - auto exe_ctx = ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx); // using cache if (kernel_type_.get()) { dev_ctx = pool.Get(kernel_type_->place_); } + auto exe_ctx = ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx); // TODO(Liu-xiandong): Now we are using too much if-else and hard code in XPU // device, it's ugly, and we will refactor in the future. @@ -1679,12 +1679,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope, phi_kernel_name, phi_kernel_key))); if (phi_kernel_->IsValid()) { - VLOG(6) << "Static mode ChoosePhiKernel - kernel name: " + VLOG(6) << "Static graph mode ChoosePhiKernel - kernel name: " << phi_kernel_name << " | kernel key: " << phi_kernel_key << " | kernel: " << *phi_kernel_; } else { - VLOG(6) << "Static mode ChoosePhiKernel - kernel `" << phi_kernel_name - << "` not found."; + VLOG(6) << "Static graph mode ChoosePhiKernel - kernel `" + << phi_kernel_name << "` not found."; } } else { phi_kernel_name = kernel_signature_->name; @@ -1815,7 +1815,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(platform::CPUPlace()); if (phi_kernel_->IsValid()) { - VLOG(6) << "Static mode PrepareImpl - kernel name: " + VLOG(6) << "Static graph mode PrepareImpl - kernel name: " << phi_kernel_name << " | kernel key: " << phi_cpu_kernel_key << " | kernel: " << *phi_kernel_; run_phi_kernel_ = true; @@ -2083,11 +2083,11 @@ phi::KernelKey OperatorWithKernel::ChoosePhiKernel( phi_kernel_name, phi_kernel_key))); if (phi_kernel_->IsValid()) { - VLOG(6) << "Static mode ChoosePhiKernel - kernel name: " << phi_kernel_name - << " | kernel key: " << phi_kernel_key + VLOG(6) << "Static graph mode ChoosePhiKernel - kernel name: " + << phi_kernel_name << " | kernel key: " << phi_kernel_key << " | kernel: " << *phi_kernel_; } else { - VLOG(6) << "Static mode ChoosePhiKernel - kernel `" << phi_kernel_name + VLOG(6) << "Static graph mode ChoosePhiKernel - kernel `" << phi_kernel_name << "` not found."; } return phi_kernel_key; @@ -2715,7 +2715,23 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( proto::VarType::Type dafault_data_type = static_cast(-1); proto::VarType::Type data_type = dafault_data_type; - for (auto* name : ctx.InNameList()) { + + auto in_name_list = ctx.InNameList(); + if (Info().HasOpProtoAndChecker()) { + for (auto& attr : Info().Proto().attrs()) { + auto it = + std::find_if(in_name_list.begin(), + in_name_list.end(), + [&attr](const std::string* name) { + return attr.support_tensor() && *name == attr.name(); + }); + if (it != in_name_list.end()) { + in_name_list.erase(it); + } + } + } + + for (auto* name : in_name_list) { if (ctx.InputSize(*name) == 1UL) { ParseInputDataType(ctx.InputVar(*name), *name, &data_type); } else { diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 9a93d299c002a5..2831e007d94c46 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -136,7 +136,7 @@ class Tracer { } // Note(Aurelius84): The `tmp` is used as prefix key while naming a temporary - // intermediate var both in imperative and static mode. But the + // intermediate var both in imperative and static graph mode. 
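
Note on the IndicateDataType change in paddle/fluid/framework/operator.cc above: attribute names whose proto entries have support_tensor() (e.g. Scalar/IntArray attributes promoted to tensor inputs) are now removed from the input name list before kernel data-type inference, so those auxiliary tensors no longer influence dtype selection. A minimal standalone sketch of that filtering idea, with purely illustrative names ("axes" standing in for a tensor-capable attribute), not Paddle's actual code:

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

int main() {
  // Inputs reported by the execution context; "axes" is really an attribute
  // that happens to be fed as a tensor (support_tensor == true in the proto).
  std::vector<std::string> in_name_list = {"X", "axes"};
  std::vector<std::string> tensor_attr_names = {"axes"};

  for (const auto& attr_name : tensor_attr_names) {
    auto it = std::find(in_name_list.begin(), in_name_list.end(), attr_name);
    if (it != in_name_list.end()) {
      in_name_list.erase(it);
    }
  }

  // Only the remaining inputs ("X") are inspected when picking the kernel
  // data type.
  for (const auto& name : in_name_list) {
    std::cout << name << std::endl;
  }
  return 0;
}
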
But the // `UniqueNameGenerator` in C++ and `unique_name.py` in Python doesn't share // the same auto-increment id. It will create a variable repeatedly with same // name like `tmp_0` in some cases when transform dygraph into static layers. diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc old mode 100644 new mode 100755 index 2ff82986e945ca..40a8c5ce66a2a5 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -222,6 +222,51 @@ void MakeSimpleReusePlan( } } +// Remove the inplace operation from the plan because it does not support memory +// reuse +void DelInplaceOpFromPlan( + Graph* graph, + std::unordered_map* node2cluster, + int sort_kind) { + auto topo_nodes = TopologyVarientSort( + *graph, static_cast(sort_kind)); + for (auto* op_node : topo_nodes) { + if (!op_node->IsOp()) continue; + auto input_tensors = op_node->inputs; + auto output_tensors = op_node->outputs; + + std::unordered_set in_names; + for (const Node* node : input_tensors) { + if (!node->Var()) continue; + if (node->Var()->Persistable()) continue; + std::string var = node->Name(); + in_names.insert(var); + } + + for (const Node* node : output_tensors) { + if (!node->Var()) continue; + if (node->Var()->Persistable()) continue; + std::string var = node->Name(); + if (in_names.find(var) != in_names.end()) { + // delete key + if (node2cluster->count(var)) { + node2cluster->erase(var); + } + // delete value + std::string tmp_name = ""; + for (auto it = node2cluster->begin(); it != node2cluster->end(); ++it) { + if (it->second == var) { + if (tmp_name == "") { + tmp_name = it->first; + } + it->second = tmp_name; + } + } + } + } + } +} + // NOTE The optimized opdesc doesn't match ir::Graph. void UpdateOpDescsByReuse( Graph* graph, @@ -324,6 +369,7 @@ void MemoryOptimizePass::RunImpl(Argument* argument) { CollectLifeCycle(graph, &lifecycles, sort_kind); CollectVarMemorySize(graph, &space_table); MakeSimpleReusePlan(lifecycles, space_table, &node2cluster, &cluster_size); + DelInplaceOpFromPlan(graph, &node2cluster, sort_kind); auto* pass_res_info = PassResultInfoForRuntime::Instance(); pass_res_info->Set( diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 2fe3dbe13e71af..46ec559939e8ee 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2396,6 +2396,7 @@ USE_TRT_CONVERTER(cast) USE_TRT_CONVERTER(recover_padding) USE_TRT_CONVERTER(remove_padding) USE_TRT_CONVERTER(equal); +USE_TRT_CONVERTER(not_equal); USE_TRT_CONVERTER(top_k) USE_TRT_CONVERTER(top_k_v2) USE_TRT_CONVERTER(range) diff --git a/paddle/fluid/inference/tensorrt/convert/c_allreduce_op.cc b/paddle/fluid/inference/tensorrt/convert/c_allreduce_op.cc index 73eec4395f9679..5f9dca9a0d26fe 100644 --- a/paddle/fluid/inference/tensorrt/convert/c_allreduce_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/c_allreduce_op.cc @@ -1,16 +1,16 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.h" @@ -32,8 +32,9 @@ class CAllReduceOpConverter : public OpConverter { bool test_mode) override { VLOG(4) << "convert fluid callreduce op to tensorrt layer"; if (!engine_->with_dynamic_shape()) { - PADDLE_THROW(platform::errors::Fatal( - "Unsupported static mode. Please set dynamic shape of inputs.")); + PADDLE_THROW( + platform::errors::Fatal("Unsupported static graph mode. Please set " + "dynamic shape of inputs.")); } ReduceType red_type = op_to_reduce_type[op.type()]; std::string name = op.type(); diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index b8f9b22fc7b2b1..314e5390bde827 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -142,7 +142,8 @@ void ConvertConv2d(TensorRTEngine* engine, layer, platform::errors::Fatal("TensorRT create conv2d/conv2d_transpose" " layer failed.")); - layer->setStride(nv_strides); + layer->setStrideNd(nv_strides); + layer->setPrePadding(nv_pre_paddings); if (output_padding.size() > 0) { nv_post_paddings.d[0] -= output_padding[0]; @@ -189,7 +190,7 @@ class Conv2dOpConverter : public OpConverter { TensorRTEngine::Weight& weight, TensorRTEngine::Weight& bias) -> nvinfer1::IConvolutionLayer* { auto* layer = TRT_ENGINE_ADD_LAYER(engine_, - Convolution, + ConvolutionNd, *inputs, n_output, ksize, diff --git a/paddle/fluid/inference/tensorrt/convert/equal_op.cc b/paddle/fluid/inference/tensorrt/convert/equal_op.cc index 3a9627dc99a5c3..d1b4b1c08c81b5 100644 --- a/paddle/fluid/inference/tensorrt/convert/equal_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/equal_op.cc @@ -35,7 +35,6 @@ class EqualOpConverter : public OpConverter { void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { -#if IS_TRT_VERSION_GE(8000) framework::OpDesc op_desc(op, nullptr); nvinfer1::ILayer* layer = nullptr; @@ -79,11 +78,62 @@ class EqualOpConverter : public OpConverter { layer = TRT_ENGINE_ADD_LAYER( engine_, ElementWise, *X, *Y, nvinfer1::ElementWiseOperation::kEQUAL); RreplenishLayerAndOutput(layer, "equal", {output_name}, test_mode); -#else - PADDLE_THROW( - platform::errors::Fatal("ElementWise Equal Operation is only supported " - "on TRT 8 or higher version.")); -#endif + } +}; + +class NotEqualOpConverter : public 
OpConverter { + public: + NotEqualOpConverter() {} + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) override { + framework::OpDesc op_desc(op, nullptr); + nvinfer1::ILayer* layer = nullptr; + + auto* X = engine_->GetITensor(op_desc.Input("X").front()); + auto* Y = engine_->GetITensor(op_desc.Input("Y").front()); + nvinfer1::Dims dims_x = X->getDimensions(); + nvinfer1::Dims dims_y = Y->getDimensions(); + + int axis = PADDLE_GET_CONST(int, op_desc.GetAttr("axis")); + if (axis < 0) { + axis = std::abs(dims_x.nbDims - dims_y.nbDims); + } + auto output_name = op_desc.Output("Out")[0]; + nvinfer1::IShuffleLayer* expand_layer = nullptr; + if (dims_x.nbDims > dims_y.nbDims) { + nvinfer1::Dims expand_shape; + expand_shape.nbDims = dims_x.nbDims; + for (int i = 0; i < expand_shape.nbDims; i++) { + expand_shape.d[i] = 1; + } + for (int i = 0; i < dims_y.nbDims; i++) { + expand_shape.d[i + axis] = dims_y.d[i]; + } + expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *Y); + expand_layer->setReshapeDimensions(expand_shape); + Y = expand_layer->getOutput(0); + } else if (dims_x.nbDims < dims_y.nbDims) { + nvinfer1::Dims expand_shape; + expand_shape.nbDims = dims_y.nbDims; + for (int i = 0; i < expand_shape.nbDims; i++) { + expand_shape.d[i] = 1; + } + for (int i = 0; i < dims_x.nbDims; i++) { + expand_shape.d[i + axis] = dims_x.d[i]; + } + expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); + expand_layer->setReshapeDimensions(expand_shape); + X = expand_layer->getOutput(0); + } + + layer = TRT_ENGINE_ADD_LAYER( + engine_, ElementWise, *X, *Y, nvinfer1::ElementWiseOperation::kEQUAL); + + layer = TRT_ENGINE_ADD_LAYER( + engine_, Unary, *layer->getOutput(0), nvinfer1::UnaryOperation::kNOT); + + RreplenishLayerAndOutput(layer, "not_equal", {output_name}, test_mode); } }; @@ -92,3 +142,4 @@ class EqualOpConverter : public OpConverter { } // namespace paddle REGISTER_TRT_OP_CONVERTER(equal, EqualOpConverter); +REGISTER_TRT_OP_CONVERTER(not_equal, NotEqualOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc b/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc index 2229a10c2d5f8c..28847aa5b7a307 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc @@ -28,8 +28,9 @@ class PrelnResidualBiasOpConverter : public OpConverter { bool test_mode) override { VLOG(4) << "convert fused preln_residual_bias op to tensorrt layer"; if (!engine_->with_dynamic_shape()) { - PADDLE_THROW(platform::errors::Fatal( - "Unsupported static mode. Please set dynamic shape of inputs.")); + PADDLE_THROW( + platform::errors::Fatal("Unsupported static graph mode. Please set " + "dynamic shape of inputs.")); } framework::OpDesc op_desc(op, nullptr); // Declare inputs diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index e5e344e16cbb34..66bfe56f355d90 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -119,24 +119,21 @@ struct SimpleOpTypeSetTeller : public Teller { #endif } - // In static shape mode in TRT, we can't allow that op's input is a - // 1D-tensor So we filter it here. Some op like elementwise having "Y" too, - // but that is dealt with in the specified op, here just the common case + // In static shape in Paddle-TRT, we can't allow that one op has a + // 1D intermediate tensor as input. 
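
For reference, the not_equal lowering added in equal_op.cc above is an element-wise kEQUAL followed by a logical kNOT on the boolean result. A minimal sketch of the same composition against the plain TensorRT C++ API (assuming TensorRT 8+; the helper name and surrounding setup are illustrative and not part of this patch):

#include <NvInfer.h>

// Builds x != y as equal(x, y) followed by logical NOT, mirroring what
// NotEqualOpConverter assembles via TRT_ENGINE_ADD_LAYER. The caller owns
// the network and the input tensors.
nvinfer1::ITensor* BuildNotEqual(nvinfer1::INetworkDefinition* network,
                                 nvinfer1::ITensor* x,
                                 nvinfer1::ITensor* y) {
  auto* eq = network->addElementWise(
      *x, *y, nvinfer1::ElementWiseOperation::kEQUAL);  // boolean output
  auto* neq = network->addUnary(*eq->getOutput(0),
                                nvinfer1::UnaryOperation::kNOT);
  return neq->getOutput(0);
}
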
if (!with_dynamic_shape) { - std::string X_name; auto inputs = desc.Inputs(); - if (inputs.count("X") && !desc.Input("X").empty()) { - X_name = desc.Input("X")[0]; - } else if (inputs.count("Input") && !desc.Input("Input").empty()) { - X_name = desc.Input("Input")[0]; - } - auto* block = desc.Block(); - if (block) { - auto* x_var_desc = block->FindVar(X_name); - // Can't get feed op's TensorDesc - if (op_type != "feed" && x_var_desc && !x_var_desc->Persistable()) { - const auto x_shape = x_var_desc->GetShape(); - if (x_shape.size() == 1) return false; + for (auto iter : inputs) { + for (auto var_name : iter.second) { + auto* block = desc.Block(); + if (block) { + auto* var_desc = block->FindVar(var_name); + // Can't get feed op's TensorDesc + if (op_type != "feed" && var_desc && !var_desc->Persistable()) { + const auto shape = var_desc->GetShape(); + if (shape.size() == 1) return false; + } + } } } } @@ -2341,7 +2338,7 @@ struct SimpleOpTypeSetTeller : public Teller { } #endif - if (op_type == "equal") { + if (op_type == "equal" || op_type == "not_equal") { #if !IS_TRT_VERSION_GE(8000) VLOG(3) << "compare is not supported when TensorRT < 8.0"; return false; @@ -2493,6 +2490,7 @@ struct SimpleOpTypeSetTeller : public Teller { "elementwise_max", "elementwise_floordiv", "equal", + "not_equal", "less_than", "greater_than", "logical_or", @@ -2639,6 +2637,7 @@ struct SimpleOpTypeSetTeller : public Teller { "elementwise_max", "elementwise_floordiv", "equal", + "not_equal", "less_than", "greater_than", "logical_or", diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc index cb52ac0479f762..96282ebde36f42 100644 --- a/paddle/fluid/operators/collective/c_concat_op.cu.cc +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -134,4 +134,7 @@ REGISTER_OP_CUDA_KERNEL(c_concat, ops::CConcatOpCUDAKernel, ops::CConcatOpCUDAKernel, ops::CConcatOpCUDAKernel, +#if NCCL_VERSION_CODE >= 21000 + ops::CConcatOpCUDAKernel, +#endif ops::CConcatOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_identity_op.cu.cc b/paddle/fluid/operators/collective/c_identity_op.cu.cc index 0b2f5b7eb1a90b..0ba98ab315d4db 100644 --- a/paddle/fluid/operators/collective/c_identity_op.cu.cc +++ b/paddle/fluid/operators/collective/c_identity_op.cu.cc @@ -22,4 +22,7 @@ REGISTER_OP_CUDA_KERNEL(c_identity, ops::CIdentityOpKernel, ops::CIdentityOpKernel, ops::CIdentityOpKernel, +#if NCCL_VERSION_CODE >= 21000 + ops::CIdentityOpKernel, +#endif ops::CIdentityOpKernel); diff --git a/paddle/fluid/operators/generator/generate_op.py b/paddle/fluid/operators/generator/generate_op.py index d7121a2aeb567a..a0f4cfacfde38f 100644 --- a/paddle/fluid/operators/generator/generate_op.py +++ b/paddle/fluid/operators/generator/generate_op.py @@ -131,9 +131,10 @@ def process_int_array(op_item, int_array_configs): ) if attr_item['is_support_tensor']: attr_item['typename'] = ( - data_type_map[int_array_config['data_type']] + 'int[]' if 'data_type' in int_array_config - else 'std::vector' + and int_array_config['data_type'] == 'int' + else 'int64_t[]' ) else: attr_item['data_type'] = ( @@ -153,21 +154,95 @@ def process_int_array(op_item, int_array_configs): # replace name of op and params for OpMaker -def replace_compat_name(op_op_map, forward_op_dict, backward_op_dict): - def get_op_and_op_name(op_item): +def replace_compat_name(op_fluid_map_list, forward_op_dict, backward_op_dict): + def get_phi_and_fluid_op_name(op_item): names = op_item.split('(') if len(names) == 1: return 
names[0].strip(), names[0].strip() else: return names[0].strip(), names[1].split(')')[0].strip() - def update_op_attr_name(attrs, attrs_alias_map): - for attr_item in attrs: - if attr_item['name'] in attrs_alias_map: - attr_item['name'] = attrs_alias_map[attr_item['name']] + def update_op_param_name(op_args, args_alias_map): + for item in op_args: + if item['name'] in args_alias_map: + item['name'] = args_alias_map[item['name']] + + def update_grad_args_name(op_args, args_alias_map): + for item in op_args: + if ( + item['name'].endswith('_grad') + and item['name'][:-5] in args_alias_map + ): + args_alias_map[item['name']] = ( + args_alias_map[item['name'][:-5]] + '_grad' + ) + item['name'] = args_alias_map[item['name'][:-5]] + '_grad' + + def get_param_list_alias(param_list, args_map): + return [ + args_map[param] if param in args_map else param + for param in param_list + ] - for op_args in op_op_map: - new_op_name, op_name = get_op_and_op_name(op_args['op']) + def update_common_params_name( + op_item, args_name_map, scalar_configs, int_array_configs + ): + if 'inplace' in op_item and op_item['inplace']: + inplace_map = {} + for key, val in op_item['inplace'].items(): + if key in args_map: + key = args_map[key] + if val in args_map: + val = args_map[val] + inplace_map[key] = val + op_item['inplace'] = inplace_map + if 'no_need_buffer' in op_item and op_item['no_need_buffer']: + op_item['no_need_buffer'] = get_param_list_alias( + op_item['no_need_buffer'], args_map + ) + + process_scalar(op_item, scalar_configs) + process_int_array(op_item, int_array_configs) + + if 'invoke' in op_item: + op_item['invoke']['args'] = [ + args_map[param.strip()] + if param.strip() in args_map + else param.strip() + for param in op_item['invoke']['args'].split(',') + ] + return + op_item['infer_meta']['param'] = get_param_list_alias( + op_item['infer_meta']['param'], args_name_map + ) + op_item['kernel']['param'] = get_param_list_alias( + op_item['kernel']['param'], args_name_map + ) + if op_item['kernel']['data_type']: + op_item['kernel']['data_type']['candidates'] = get_param_list_alias( + op_item['kernel']['data_type']['candidates'], args_name_map + ) + if op_item['kernel']['backend']: + op_item['kernel']['backend']['candidates'] = get_param_list_alias( + op_item['kernel']['backend']['candidates'], args_name_map + ) + if op_item['kernel']['layout']: + op_item['kernel']['layout']['candidates'] = get_param_list_alias( + op_item['kernel']['layout']['candidates'], args_name_map + ) + + def update_grad_op_compat_name(grad_op_item, args_name_map): + update_op_param_name(grad_op_item['inputs'], args_name_map) + update_op_param_name(grad_op_item['outputs'], args_name_map) + update_op_param_name(grad_op_item['attrs'], args_name_map) + update_op_param_name(grad_op_item['forward']['inputs'], args_name_map) + update_op_param_name(grad_op_item['forward']['outputs'], args_name_map) + update_op_param_name(grad_op_item['forward']['attrs'], args_name_map) + update_grad_args_name(grad_op_item['inputs'], args_map) + update_grad_args_name(grad_op_item['outputs'], args_map) + + for op_args in op_fluid_map_list: + new_op_name, op_name = get_phi_and_fluid_op_name(op_args['op']) if new_op_name not in forward_op_dict: continue forward_op_item = forward_op_dict[new_op_name] @@ -179,189 +254,102 @@ def update_op_attr_name(attrs, attrs_alias_map): scalar_configs = None int_array_configs = None - if 'scalar' in op_args: scalar_configs = op_args['scalar'] if 'int_array' in op_args: int_array_configs = op_args['int_array'] + if 'extra' 
in op_args and 'outputs' in op_args['extra']: + for out_item in forward_op_item['outputs']: + if out_item['name'] in op_args['extra']['outputs']: + out_item['is_extra'] = True - process_scalar(forward_op_item, scalar_configs) - process_int_array(forward_op_item, int_array_configs) + key_set = ['inputs', 'attrs', 'outputs'] + args_map = {} + for key in key_set: + if key in op_args: + args_map.update(op_args[key]) + for args_item in forward_op_item[key]: + if args_item['name'] in op_args[key]: + if ( + scalar_configs + and args_item['name'] in scalar_configs + ): + scalar_configs[ + op_args[key][args_item['name']] + ] = scalar_configs[args_item['name']] + if ( + int_array_configs + and args_item['name'] in int_array_configs + ): + int_array_configs[ + op_args[key][args_item['name']] + ] = int_array_configs[args_item['name']] + args_item['name'] = op_args[key][args_item['name']] + if has_backward: + for args_item in backward_op_item['forward'][key]: + if args_item['name'] in op_args[key]: + args_item['name'] = op_args[key][args_item['name']] + forward_op_item["attr_dict"] = to_named_dict(forward_op_item["attrs"]) + update_common_params_name( + forward_op_item, args_map, scalar_configs, int_array_configs + ) + + if has_backward: + update_grad_op_compat_name(backward_op_item, args_map) + update_common_params_name( + backward_op_item, args_map, scalar_configs, int_array_configs + ) + backward_op_item["attr_dict"] = to_named_dict( + backward_op_item["attrs"] + ) + + if 'backward' not in op_args: + continue - if 'backward' in op_args and has_backward: backward_op_list = op_args['backward'].split(',') - _, bw_op_name = get_op_and_op_name(backward_op_list[0]) + _, bw_op_name = get_phi_and_fluid_op_name(backward_op_list[0]) forward_op_item['backward'] = bw_op_name backward_op_item['op_name'] = bw_op_name - process_scalar(backward_op_item, scalar_configs) - process_int_array(backward_op_item, int_array_configs) - # for double grad if len(backward_op_list) > 1: ( - new_double_grad_op_name, + phi_double_grad_op_name, double_grad_op_name, - ) = get_op_and_op_name(backward_op_list[1]) - double_grad_item = backward_op_dict[new_double_grad_op_name] + ) = get_phi_and_fluid_op_name(backward_op_list[1]) + double_grad_item = backward_op_dict[phi_double_grad_op_name] backward_op_item['backward'] = double_grad_op_name double_grad_item['op_name'] = double_grad_op_name - if 'attrs' in op_args: - update_op_attr_name( - double_grad_item['attrs'], op_args['attrs'] - ) - update_op_attr_name( - double_grad_item['forward']['attrs'], op_args['attrs'] - ) - - process_scalar(double_grad_item, scalar_configs) - process_int_array(double_grad_item, int_array_configs) + update_grad_op_compat_name(double_grad_item, args_map) + update_common_params_name( + double_grad_item, + args_map, + scalar_configs, + int_array_configs, + ) + double_grad_item["attr_dict"] = to_named_dict( + double_grad_item["attrs"] + ) # for triple grad if len(backward_op_list) > 2: ( - new_triple_grad_op_name, + phi_triple_grad_op_name, triple_grad_op_name, - ) = get_op_and_op_name(backward_op_list[2]) - triple_grad_item = backward_op_dict[new_triple_grad_op_name] + ) = get_phi_and_fluid_op_name(backward_op_list[2]) + triple_grad_item = backward_op_dict[phi_triple_grad_op_name] double_grad_item['backward'] = triple_grad_op_name triple_grad_item['op_name'] = triple_grad_op_name - if 'attrs' in op_args: - update_op_attr_name( - triple_grad_item['attrs'], op_args['attrs'] - ) - update_op_attr_name( - triple_grad_item['forward']['attrs'], - op_args['attrs'], - 
) - - process_scalar(triple_grad_item, scalar_configs) - process_int_array(triple_grad_item, int_array_configs) - - key_set = ['inputs', 'attrs', 'outputs'] - args_map = {} - for key in key_set: - if key in op_args: - args_map.update(op_args[key]) - for args_item in forward_op_item[key]: - if args_item['name'] in op_args[key]: - args_item['name'] = op_args[key][args_item['name']] - if has_backward: - for args_item in backward_op_item['forward'][key]: - if args_item['name'] in op_args[key]: - args_item['name'] = op_args[key][args_item['name']] - forward_op_item['infer_meta']['param'] = [ - args_map[param] if param in args_map else param - for param in forward_op_item['infer_meta']['param'] - ] - forward_op_item['kernel']['param'] = [ - args_map[param] if param in args_map else param - for param in forward_op_item['kernel']['param'] - ] - if forward_op_item['kernel']['data_type']: - forward_op_item['kernel']['data_type']['candidates'] = [ - args_map[param] if param in args_map else param - for param in forward_op_item['kernel']['data_type'][ - 'candidates' - ] - ] - if forward_op_item['kernel']['backend']: - forward_op_item['kernel']['backend']['candidates'] = [ - args_map[param] if param in args_map else param - for param in forward_op_item['kernel']['backend']['candidates'] - ] - if forward_op_item['kernel']['layout']: - forward_op_item['kernel']['layout']['candidates'] = [ - args_map[param] if param in args_map else param - for param in forward_op_item['kernel']['layout']['candidates'] - ] - if forward_op_item['inplace']: - inplace_map = {} - for key, val in forward_op_item['inplace'].items(): - if key in args_map: - key = args_map[key] - if val in args_map: - val = args_map[val] - inplace_map[key] = val - forward_op_item['inplace'] = inplace_map - - if has_backward: - for args_item in backward_op_item['inputs']: - if args_item['name'] in args_map: - args_item['name'] = args_map[args_item['name']] - elif ( - args_item['name'].endswith('_grad') - and args_item['name'][:-5] in args_map - ): - args_map[args_item['name']] = ( - args_map[args_item['name'][:-5]] + '_grad' + update_grad_op_compat_name(triple_grad_item, args_map) + update_common_params_name( + triple_grad_item, + args_map, + scalar_configs, + int_array_configs, ) - args_item['name'] = args_map[args_item['name']] - for args_item in backward_op_item['attrs']: - if args_item['name'] in args_map: - args_item['name'] = args_map[args_item['name']] - for args_item in backward_op_item['outputs']: - if ( - args_item['name'].endswith('_grad') - and args_item['name'][:-5] in args_map - ): - args_map[args_item['name']] = ( - args_map[args_item['name'][:-5]] + '_grad' + triple_grad_item["attr_dict"] = to_named_dict( + triple_grad_item["attrs"] ) - args_item['name'] = args_map[args_item['name']] - - if 'invoke' in backward_op_item: - backward_op_item['invoke']['args'] = [ - args_map[param.strip()] - if param.strip() in args_map - else param.strip() - for param in backward_op_item['invoke']['args'].split(',') - ] - continue - - backward_op_item['infer_meta']['param'] = [ - args_map[param] if param in args_map else param - for param in backward_op_item['infer_meta']['param'] - ] - backward_op_item['kernel']['param'] = [ - args_map[param] if param in args_map else param - for param in backward_op_item['kernel']['param'] - ] - if backward_op_item['kernel']['data_type']: - backward_op_item['kernel']['data_type']['candidates'] = [ - args_map[param] if param in args_map else param - for param in backward_op_item['kernel']['data_type'][ - 'candidates' 
- ] - ] - if backward_op_item['kernel']['backend']: - backward_op_item['kernel']['backend']['candidates'] = [ - args_map[param] if param in args_map else param - for param in backward_op_item['kernel']['backend'][ - 'candidates' - ] - ] - if backward_op_item['kernel']['layout']: - backward_op_item['kernel']['layout']['candidates'] = [ - args_map[param] if param in args_map else param - for param in backward_op_item['kernel']['layout'][ - 'candidates' - ] - ] - if backward_op_item['no_need_buffer']: - backward_op_item['no_need_buffer'] = [ - args_map[param] if param in args_map else param - for param in backward_op_item['no_need_buffer'] - ] - if backward_op_item['inplace']: - inplace_map = {} - for key, val in backward_op_item['inplace'].items(): - if key in args_map: - key = args_map[key] - if val in args_map: - val = args_map[val] - inplace_map[key] = val - backward_op_item['inplace'] = inplace_map def process_invoke_op(forward_op_dict, backward_op_dict): @@ -372,6 +360,7 @@ def process_invoke_op(forward_op_dict, backward_op_dict): args_index = 0 if invoke_op in forward_op_dict: reuse_op = forward_op_dict[invoke_op] + bw_op['invoke']['func'] = reuse_op['op_name'] bw_op['invoke']['inputs'] = [] bw_op['invoke']['attrs'] = [] bw_op['invoke']['outputs'] = [] @@ -430,14 +419,14 @@ def main( forward_op_dict[op_version['op']]['version'] = op_version['version'] with open(op_compat_yaml_path, "rt") as f: - op_op_map = yaml.safe_load(f) + op_fluid_map_list = yaml.safe_load(f) for op in ops: op['op_name'] = op['name'] for bw_op in backward_ops: bw_op['op_name'] = bw_op['name'] - replace_compat_name(op_op_map, forward_op_dict, backward_op_dict) + replace_compat_name(op_fluid_map_list, forward_op_dict, backward_op_dict) # prepare for invoke case process_invoke_op(forward_op_dict, backward_op_dict) diff --git a/paddle/fluid/operators/generator/templates/operator_utils.c.j2 b/paddle/fluid/operators/generator/templates/operator_utils.c.j2 index 0b49721afcc9e0..b28c8bdc1a2978 100644 --- a/paddle/fluid/operators/generator/templates/operator_utils.c.j2 +++ b/paddle/fluid/operators/generator/templates/operator_utils.c.j2 @@ -54,6 +54,10 @@ AddOutput({{name | to_opmaker_name}}, "({{typename}}), output {{i}} of {{op_name .AsIntermediate() {%- endif %} + {%- if "is_extra" in output and output["is_extra"] %} + + .AsExtra() + {%- endif %} {%- endmacro %} {# add attribute, and process default value if needed #} @@ -115,7 +119,7 @@ KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const Argum paddle::small_vector attrs; {% for attr in op["attrs"]%} {% filter indent(2)%} - {{get_an_attr(attr)}} + {{get_an_attr(attr, kernel_args)}} {% endfilter %} {% endfor %} {{get_output_list(op["outputs"], kernel_args)}}; @@ -170,7 +174,7 @@ KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const Argum paddle::small_vector attrs; {% for attr in op["attrs"]%} {% filter indent(2)%} - {{get_an_attr(attr)}} + {{get_an_attr(attr, kernel_args)}} {% endfilter %} {% endfor %} {{get_output_list(op["outputs"], kernel_args)}}; @@ -209,8 +213,9 @@ paddle::small_vector inputs { } {%- endmacro %} -{% macro get_an_attr(attr) %}{# inline #} +{% macro get_an_attr(attr, kernel_args) %}{# inline #} {% set typename = attr["typename"] %} +{%- if attr["name"] in kernel_args %} {% set name = attr["name"] %} {% if typename is scalar %}{# scalar correspond to a dispensable input and an attr in opmaker #} attrs.emplace_back(ctx.HasInput("{{attr | to_scalar_tensor_name}}") ? 
"{{attr | to_scalar_tensor_name}}" : "{{name}}"); @@ -236,6 +241,7 @@ attrs.emplace_back( {%- else %} attrs.emplace_back("{{name}}"); {%- endif %} +{%- endif %} {%- endmacro %} {% macro get_output_list(outputs, kernel_args) %}{# inline #} @@ -502,10 +508,9 @@ OutputGrad({{name_in_forward_orig | to_opmaker_name}}) {% set name_in_forward = name[:-5] %} {% set name_in_forward_orig = input_orig_names[input_names.index(name_in_forward)]%} InputGrad({{name_in_forward_orig | to_opmaker_name}}) - {%- elif (name | to_input_name) in input_names %} - {% set name_in_forward = name | to_input_name %} - {% set name_in_forward_orig = input_orig_names[input_names.index(name_in_forward)]%} -InputGrad({{name | to_input_name | to_opmaker_name}}) + {%- elif (name) in input_names %} + {% set name_in_forward_orig = input_orig_names[input_names.index(name)]%} +Input({{name | to_opmaker_name}}) {%- endif %} {%- endmacro %} diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 6e8b962488a568..e980aa66e7ca33 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -114,11 +114,6 @@ class ReshapeOp : public framework::OperatorWithKernel { return; } - PADDLE_ENFORCE_EQ(!shape.empty(), - true, - platform::errors::InvalidArgument( - "The parameter 'shape' in ReshapeOp must be set. " - "But received 'shape' is empty.")); auto x_dims = ctx->GetInputDim("X"); auto out_dims = ValidateShape(shape, x_dims); ctx->SetOutputDim("Out", out_dims); diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index f7d3630c019d84..5f0ad7b6e29e05 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -288,8 +288,8 @@ class RunProgramOpKernel : public framework::OpKernel { auto *out_scope_vec = ctx.Output("OutScope"); std::unique_ptr inner_scope{nullptr}; if (out_scope_vec->size() == 0) { - // For cuda graph under static mode usage. - // For static mode, we cannot set value of a tensor before any run, + // For cuda graph under static graph mode usage. + // For static graph mode, we cannot set value of a tensor before any run, // the OutScope variable passed to the op actually contains nothing. // Just create a tmp scope to run the program. PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index a41b0f5f2b9963..d635feee58b586 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -145,7 +145,7 @@ class SetValueMaker : public framework::OpProtoAndCheckerMaker { AddAttr>("shape", "(vector) Shape of values.") .SetDefault({}); AddComment(R"DOC(SetValue operator. -Assignment to a phi::DenseTensor in static mode. +Assignment to a phi::DenseTensor in static graph mode. 
)DOC"); } }; diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index bfadb456312109..7b023bcdf662cc 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -195,17 +195,6 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker { } }; -class Squeeze2Op : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); - } -}; - template class SqueezeGradOpMaker : public framework::SingleGradOpMaker { public: @@ -220,32 +209,6 @@ class SqueezeGradOpMaker : public framework::SingleGradOpMaker { } }; -class Squeeze2GradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK( - context->HasInput("XShape"), "Input", "XShape", "Squeeze2Grad"); - OP_INOUT_CHECK(context->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "Squeeze2Grad"); - auto xshape_dims = context->GetInputDim("XShape"); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - context->SetOutputDim(framework::GradVarName("X"), x_dims); - context->ShareLoD("XShape", framework::GradVarName("X")); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); - } -}; - template class SqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker { public: @@ -259,82 +222,6 @@ class SqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker { } }; -// FIXME(zcd): squeeze2 adds an intermediate output(XShape) based on squeeze, -// the XShape is used to carry the shape and lod of X which will be used in -// squeeze_grad, in this way, the framework can reuse the memory of X -// immediately the squeeze2_op is finished. -// Considering compatibility issues, we could not fix squeeze2_op -class Squeeze2OpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor). The input tensor of squeeze operator."); - AddOutput("Out", "(Tensor). The output tensor of squeeze operator."); - AddOutput("XShape", - "XShape is just used to store the shape and lod of X, which will " - "be used in SqueezeGradOp.") - .AsIntermediate() - .AsExtra(); - AddAttr>("axes", - "(std::vector). List of integers," - " indicating the dimensions to squeeze.") - .SetDefault({}) - .SupportTensor(); - AddComment(R"DOC( - Squeeze2 Operator. - - Remove single-dimensional entries from the shape of a tensor. - Takes a parameter axes with a list of axes to squeeze. - If axes is not provided, all the single dimensions will be removed from the shape. - If an axis is selected with shape entry not equal to one, an error is raised. 
- - Examples: - Case 1: - Given - X.shape = (1, 3, 1, 5) - and - axes = [0] - we get: - Out.shape = (3, 1, 5) - - Case 2: - Given - X.shape = (1, 3, 1, 5) - and - axes = [] - we get: - Out.shape = (3, 5) - )DOC"); - } -}; - -template -class Squeeze2GradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("squeeze2_grad"); - grad_op->SetInput("XShape", this->Output("XShape")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -template -class Squeeze2DoubleGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("squeeze2"); - grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); - grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); - grad_op->SetOutput("XShape", this->Input("XShape")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - DECLARE_INPLACE_OP_INFERER(SqueezeInplaceInferer, {"X", "Out"}); DECLARE_INPLACE_OP_INFERER(SqueezeGradInplaceInferer, {framework::GradVarName("Out"), @@ -345,10 +232,6 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(SqueezeGradNoNeedBufferVarsInferer, "X"); namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(squeeze2, - SqueezeInferShapeFunctor, - PD_INFER_META(phi::SqueezeWithXShapeInferMeta)); - REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker, @@ -360,19 +243,6 @@ REGISTER_OPERATOR(squeeze_grad, ops::SqueezeDoubleGradOpMaker, ops::SqueezeGradNoNeedBufferVarsInferer); -REGISTER_OPERATOR(squeeze2, - ops::Squeeze2Op, - ops::Squeeze2OpMaker, - ops::Squeeze2GradOpMaker, - ops::Squeeze2GradOpMaker, - ops::SqueezeInplaceInferer, - SqueezeInferShapeFunctor); -REGISTER_OPERATOR(squeeze2_grad, - ops::Squeeze2GradOp, - ops::Squeeze2DoubleGradOpMaker, - ops::Squeeze2DoubleGradOpMaker, - ops::SqueezeGradInplaceInferer); - REGISTER_OP_CPU_KERNEL( squeeze, ops::SqueezeKernel, diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc index 8f28e0b606b035..d092c03a563984 100644 --- a/paddle/fluid/operators/unsqueeze_op.cc +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -260,83 +260,6 @@ class UnsqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker { } }; -// FIXME(zcd): unsqueeze2 adds an intermediate output(XShape) based on -// unsqueeze, the XShape is used to carry the shape and lod of X which -// will be used in unsqueeze_grad, in this way, the framework can reuse -// the memory of X immediately the unsqueeze2_op is finished. 
-// Considering compatibility issues, we could not fix unsqueeze2_op -class Unsqueeze2Op : public UnsqueezeOp { - public: - using UnsqueezeOp::UnsqueezeOp; -}; - -class Unsqueeze2OpMaker : public UnsqueezeOpMaker { - public: - void Make() override { - UnsqueezeOpMaker::Make(); - AddOutput("XShape", - "XShape is just used to store the shape and lod of X, which will " - "be used in UnsqueezeGradOp.") - .AsIntermediate() - .AsExtra(); - } -}; - -template -class Unsqueeze2GradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("unsqueeze2_grad"); - grad_op->SetInput("XShape", this->Output("XShape")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -class Unsqueeze2GradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *context) const override { - PADDLE_ENFORCE_EQ( - context->HasInput("XShape"), - true, - platform::errors::InvalidArgument("Input(XShape) shouldn't be null.")); - PADDLE_ENFORCE_EQ(context->HasInput(framework::GradVarName("Out")), - true, - platform::errors::InvalidArgument( - "Input(Out@GRAD) shouldn't be null.")); - auto xshape_dims = context->GetInputDim("XShape"); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - context->SetOutputDim(framework::GradVarName("X"), x_dims); - context->ShareLoD("XShape", framework::GradVarName("X")); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.device_context()); - } -}; - -template -class Unsqueeze2DoubleGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("unsqueeze2"); - grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); - grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); - grad_op->SetOutput("XShape", this->Input("XShape")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - DECLARE_INPLACE_OP_INFERER(UnsqueezeInplaceInferer, {"X", "Out"}); DECLARE_INPLACE_OP_INFERER(UnsqueezeGradInplaceInferer, {framework::GradVarName("Out"), @@ -345,10 +268,6 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnsqueezeGradOpNoNeedBufferVarInferer, "X"); } // namespace operators } // namespace paddle -DECLARE_INFER_SHAPE_FUNCTOR(unsqueeze2, - Unsqueeze2InferShapeFunctor, - PD_INFER_META(phi::UnsqueezeWithXShapeInferMeta)); - namespace ops = paddle::operators; REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, @@ -362,20 +281,6 @@ REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeDoubleGradOpMaker, ops::UnsqueezeGradOpNoNeedBufferVarInferer); -REGISTER_OPERATOR(unsqueeze2, - ops::Unsqueeze2Op, - ops::Unsqueeze2OpMaker, - ops::Unsqueeze2GradOpMaker, - ops::Unsqueeze2GradOpMaker, - Unsqueeze2InferShapeFunctor, - ops::UnsqueezeInplaceInferer); - -REGISTER_OPERATOR(unsqueeze2_grad, - ops::Unsqueeze2GradOp, - ops::Unsqueeze2DoubleGradOpMaker, - ops::Unsqueeze2DoubleGradOpMaker, - ops::UnsqueezeGradInplaceInferer); - REGISTER_OP_CPU_KERNEL( unsqueeze, ops::UnsqueezeKernel, diff --git 
a/paddle/fluid/pybind/eager_legacy_op_function_generator.cc b/paddle/fluid/pybind/eager_legacy_op_function_generator.cc index fff811e84ba6f6..bc5eeeea875cb0 100644 --- a/paddle/fluid/pybind/eager_legacy_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_legacy_op_function_generator.cc @@ -443,9 +443,9 @@ GenerateOpFunctions() { // In this case, output will reuse input varbase. // Dygraph mode needs to be aligned with the in-place strategy in static // mode, and the mapping relationships between output and input that have - // been defined in static mode should be used in dygraph mode. - // Find which ops need to use Inplace strategy in static mode, and get the - // mapping relationship between Inplace output and input. + // been defined in static graph mode should be used in dygraph mode. + // Find which ops need to use Inplace strategy in static graph mode, and get + // the mapping relationship between Inplace output and input. auto& infer_inplace = paddle::framework::OpInfoMap::Instance().Get(op_type).infer_inplace_; std::map inplace_map; diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index b0476db6446937..a86271dfbf532e 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -39,7 +39,8 @@ PyObject* tensor_properties_get_name(TensorObject* self, void* closure) { EAGER_TRY // NOTE(dev): [why not use egr::Controller::Instance::GernerateUniqueName()?] // Beacause Controller must holder a tracer, but 'tensor.name' maybe called - // everywhere such as static mode in @to_static, which means tracer is None. + // everywhere such as static graph mode in @to_static, which means tracer is + // None. static egr::UniqueNameGenerator name_generator; if (self->tensor.name().empty()) { self->tensor.set_name(name_generator.Generate()); diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 4caa2c207b80cc..f2d784f6d5e86d 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -473,9 +473,9 @@ GenerateOpFunctions(int split_count) { // In this case, output will reuse input varbase. // Dygraph mode needs to be aligned with the in-place strategy in static // mode, and the mapping relationships between output and input that have - // been defined in static mode should be used in dygraph mode. - // Find which ops need to use Inplace strategy in static mode, and get the - // mapping relationship between Inplace output and input. + // been defined in static graph mode should be used in dygraph mode. + // Find which ops need to use Inplace strategy in static graph mode, and get + // the mapping relationship between Inplace output and input. 
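
The comment above refers to the per-op inplace map registered for static graph mode; for example, DECLARE_INPLACE_OP_INFERER(SqueezeInplaceInferer, {"X", "Out"}) in squeeze_op.cc earlier in this diff pairs input X with the output Out that may reuse its storage. A simplified standalone sketch (not the generator's real code) of the mapping that the generated dygraph bindings reuse:

#include <iostream>
#include <map>
#include <string>

int main() {
  // Pairs an input with the output that is allowed to reuse its buffer,
  // as declared by the inplace inferer (e.g. squeeze: X -> Out).
  std::map<std::string, std::string> in_to_out = {{"X", "Out"}};
  for (const auto& kv : in_to_out) {
    std::cout << "output " << kv.second << " may reuse input " << kv.first
              << std::endl;
  }
  return 0;
}
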
auto& infer_inplace = paddle::framework::OpInfoMap::Instance().Get(op_type).infer_inplace_; std::map inplace_map; diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index aec5c7632a8663..7be9e8fb187378 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -44,11 +44,7 @@ set(PHI_DEPS get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) set(PHI_DEPS ${PHI_DEPS} ${phi_kernels}) -if(APPLE AND WITH_ARM) - cc_library(phi DEPS ${PHI_DEPS}) -else() - create_dummy_static_lib(phi LIBS ${PHI_DEPS} LIMIT 100) -endif() +cc_library(phi DEPS ${PHI_DEPS}) set(phi_extension_header_file ${CMAKE_CURRENT_SOURCE_DIR}/extension.h diff --git a/paddle/phi/api/ext/tensor_compat.h b/paddle/phi/api/ext/tensor_compat.h index 7233744c65c3fd..17c0dd3f8732dd 100644 --- a/paddle/phi/api/ext/tensor_compat.h +++ b/paddle/phi/api/ext/tensor_compat.h @@ -19,7 +19,7 @@ limitations under the License. */ // Note(chenweihang): In order to be compatible with the original custom // operator Tensor interface, only available to external users, the file -// cannot be includeed in paddle +// cannot be included in paddle namespace paddle { using Tensor = experimental::Tensor; diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index dead42d03f7bcf..8f107f02dafafa 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -1186,6 +1186,26 @@ backward : square_double_grad inplace : (out_grad -> x_grad) +- backward_op : squeeze_double_grad + forward : squeeze_grad(Tensor xshape, Tensor grad_out, IntArray axis) -> Tensor(grad_x) + args : (Tensor grad_x_grad, IntArray axis) + output : Tensor(grad_out_grad), Tensor(xshape) + invoke: squeeze(grad_x_grad, axis) + intermediate : xshape + +- backward_op : squeeze_grad + forward : squeeze(Tensor x, IntArray axis) -> Tensor(out), Tensor(xshape) + args : (Tensor xshape, Tensor out_grad, IntArray axis) + output : Tensor(x_grad) + infer_meta : + func : KernelWithXShapeInferMeta + param: [xshape] + kernel : + func : squeeze_grad + data_type : out_grad + inplace : (out_grad -> x_grad) + backward: squeeze_double_grad + - backward_op : svd_grad forward : svd (Tensor x, bool full_matrices = false) -> Tensor(u), Tensor(s), Tensor(vh) args : (Tensor x, Tensor u, Tensor vh, Tensor s, Tensor u_grad, Tensor vh_grad, Tensor s_grad, bool full_matrices) @@ -1321,6 +1341,27 @@ data_type : out_grad no_need_buffer : x +- backward_op : unsqueeze_double_grad + forward : unsqueeze_grad(Tensor xshape, Tensor grad_out, IntArray axes) -> Tensor(grad_x) + args : (Tensor grad_x_grad, IntArray axes) + output : Tensor(grad_out_grad), Tensor(xshape) + invoke : unsqueeze(grad_x_grad, axes) + intermediate : xshape + +- backward_op : unsqueeze_grad + forward : unsqueeze(Tensor x, IntArray axes) -> Tensor(out), Tensor(xshape) + args : (Tensor xshape, Tensor out_grad, IntArray axes) + output : Tensor(x_grad) + infer_meta : + func : KernelWithXShapeInferMeta + param: [xshape] + kernel : + func : unsqueeze_grad + param : [xshape, out_grad] + data_type : out_grad + inplace : (out_grad -> x_grad) + backward : unsqueeze_double_grad + - backward_op : unstack_grad forward : unstack (Tensor x, int axis=0, int num=0) -> Tensor[](out) args : (Tensor[] out_grad, int axis) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 8d7af90a90a59a..acc7b670ba5249 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -1363,24 +1363,6 @@ kernel : func : 
squared_l2_norm_grad -- backward_op : squeeze_double_grad - forward : squeeze_grad(Tensor xshape, Tensor grad_out, IntArray axis) -> Tensor(grad_x) - args : (Tensor grad_x_grad, IntArray axis) - output : Tensor(grad_out_grad) - invoke: squeeze(grad_x_grad, axis) - -- backward_op : squeeze_grad - forward : squeeze(Tensor x, IntArray axis) -> Tensor(out), Tensor(xshape) - args : (Tensor xshape, Tensor out_grad, IntArray axis) - output : Tensor(x_grad) - infer_meta : - func : KernelWithXShapeInferMeta - param: [xshape] - kernel : - func : squeeze_grad - inplace : (out_grad -> x_grad) - backward: squeeze_double_grad - - backward_op : stack_grad forward : stack (Tensor[] x, int axis) -> Tensor(out) args : (Tensor[] x, Tensor out_grad, int axis) @@ -1574,25 +1556,6 @@ func : uniform_inplace_grad inplace : (out_grad -> x_grad) -- backward_op : unsqueeze_double_grad - forward : unsqueeze_grad(Tensor xshape, Tensor grad_out, IntArray axes) -> Tensor(grad_x) - args : (Tensor grad_x_grad, IntArray axes) - output : Tensor(grad_out_grad) - invoke : unsqueeze(grad_x_grad, axes) - -- backward_op : unsqueeze_grad - forward : unsqueeze(Tensor x, IntArray axes) -> Tensor(out), Tensor(xshape) - args : (Tensor xshape, Tensor out_grad, IntArray axes) - output : Tensor(x_grad) - infer_meta : - func : KernelWithXShapeInferMeta - param: [xshape] - kernel : - func : unsqueeze_grad - param: [xshape, out_grad] - inplace : (out_grad -> x_grad) - backward : unsqueeze_double_grad - - backward_op : warpctc_grad forward : warpctc (Tensor logits, Tensor label, Tensor logits_length, Tensor labels_length, int blank, bool norm_by_times) -> Tensor(loss), Tensor(warpctcgrad) args : (Tensor logits, Tensor logits_length, Tensor warpctcgrad, Tensor loss_grad, int blank, bool norm_by_times) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index b93ca2944ab85f..6dfff5d510d65e 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -1777,18 +1777,6 @@ func : squared_l2_norm backward : squared_l2_norm_grad -- op : squeeze - args : (Tensor x, IntArray axis) - output : Tensor(out), Tensor(xshape) - infer_meta : - func : SqueezeWithXShapeInferMeta - kernel : - func : squeeze_with_xshape - inplace : (x -> out) - view: (x -> out) - intermediate : xshape - backward : squeeze_grad - - op : stack args : (Tensor[] x, int axis) output : Tensor @@ -2022,18 +2010,6 @@ data_type: x backward: unpool3d_grad -- op : unsqueeze - args : (Tensor x, IntArray axis) - output : Tensor(out), Tensor(xshape) - infer_meta : - func : UnsqueezeWithXShapeInferMeta - kernel : - func : unsqueeze_with_xshape - inplace : (x -> out) - view: (x -> out) - intermediate : xshape - backward : unsqueeze_grad - - op : update_loss_scaling_ args : (Tensor[] x, Tensor found_infinite, Tensor prev_loss_scaling, Tensor in_good_steps, Tensor in_bad_steps, int incr_every_n_steps, int decr_every_n_nan_or_inf, float incr_ratio, float decr_ratio, Scalar stop_update) output : Tensor[](out){x.size()}, Tensor(loss_scaling), Tensor(out_good_steps), Tensor(out_bad_steps) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 7e960d73bbbd71..cb6f67fbdf2664 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -1270,9 +1270,20 @@ attrs : [bool use_mkldnn = false, bool use_cudnn = false] - op : squeeze (squeeze2) - backward : squeeze_grad (squeeze2_grad) + backward : squeeze_grad (squeeze2_grad), squeeze_double_grad(squeeze2_double_grad) + inputs 
: + x : X + attrs : + axis : axes + outputs : + {out : Out, xshape : XShape} + int_array: + axis : + data_type : int + support_tensor : true extra : attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + outputs : [xshape] - op : stack backward : stack_grad @@ -1389,6 +1400,22 @@ outputs : out : Y +- op : unsqueeze (unsqueeze2) + backward : unsqueeze_grad (unsqueeze2_grad), unsqueeze_double_grad(unsqueeze2_double_grad) + inputs : + x : X + attrs : + axis : axes + outputs : + {out : Out, xshape : XShape} + int_array: + axis : + data_type : int + tensor_name : AxesTensor + tensors_name : AxesTensorList + extra : + outputs : [xshape] + - op : unstack backward : unstack_grad inputs : diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 0e85b2d8dffaf7..e5378ce07718b0 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -1054,6 +1054,19 @@ square_sr {selected_rows -> selected_rows} backward : square_grad +- op : squeeze + args : (Tensor x, IntArray axis={}) + output : Tensor(out), Tensor(xshape) + infer_meta : + func : SqueezeWithXShapeInferMeta + kernel : + func : squeeze_with_xshape + data_type : x + inplace : (x -> out) + view: (x -> out) + intermediate : xshape + backward : squeeze_grad + - op : svd args : (Tensor x, bool full_matrices = false) output : Tensor(u), Tensor(s), Tensor(vh) @@ -1149,6 +1162,19 @@ func : unfold backward : unfold_grad +- op : unsqueeze + args : (Tensor x, IntArray axis = {}) + output : Tensor(out), Tensor(xshape) + infer_meta : + func : UnsqueezeWithXShapeInferMeta + kernel : + func : unsqueeze_with_xshape + data_type : x + inplace : (x -> out) + view: (x -> out) + intermediate : xshape + backward : unsqueeze_grad + - op : unstack args : (Tensor x, int axis=0, int num=0) output : Tensor[](out){num} diff --git a/paddle/phi/kernels/fusion/README.md b/paddle/phi/kernels/fusion/README.md index 1e9e2bb7e43145..114929376d5ee9 100644 --- a/paddle/phi/kernels/fusion/README.md +++ b/paddle/phi/kernels/fusion/README.md @@ -2,7 +2,7 @@ 1. We don't recommend to implement Python API for fusion kernel - - We don't recommend to implement Python API for fusion kernel, because it contains many inputs or outputs arguments generally, it is difficult to use and understand as an Python API, we recommend to call fusion kernel by pass optimization in dy2static mode or static mode. + - We don't recommend to implement Python API for fusion kernel, because it contains many inputs or outputs arguments generally, it is difficult to use and understand as an Python API, we recommend to call fusion kernel by pass optimization in dy2static mode or static graph mode. - We also don't recommend to reuse fusion kernel in other kernel implementation, but recommended that the fusion kernel be implemented by reusing other kernels. 2. 
We don't require fusion kernel to have implementations for all devices diff --git a/paddle/phi/kernels/gpu/flip_kernel.cu b/paddle/phi/kernels/gpu/flip_kernel.cu index 6e9dbf37a91001..7945d6c8fcbafc 100644 --- a/paddle/phi/kernels/gpu/flip_kernel.cu +++ b/paddle/phi/kernels/gpu/flip_kernel.cu @@ -101,6 +101,9 @@ void FlipKernel(const Context& dev_ctx, DenseTensor* out) { const size_t total_dims = x.dims().size(); switch (total_dims) { + case 0: + LaunchFlipCudaKernel(dev_ctx, x, axis, out); + break; case 1: LaunchFlipCudaKernel(dev_ctx, x, axis, out); break; diff --git a/paddle/phi/kernels/xpu/compare_kernel.cc b/paddle/phi/kernels/xpu/compare_kernel.cc index 7e46b9da647867..a433329c924d0c 100644 --- a/paddle/phi/kernels/xpu/compare_kernel.cc +++ b/paddle/phi/kernels/xpu/compare_kernel.cc @@ -52,22 +52,30 @@ void XPUCompareKernelImpl(const Context& dev_ctx, PADDLE_ENFORCE_XDNN_SUCCESS(ret, "compare op"); } -#define DEFINE_XPU_COMPARE_KERNEL(name, functor) \ - template \ - void name##RawKernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - using XPUType = typename XPUTypeTrait::Type; \ - XPUCompareKernelImpl(dev_ctx, x, y, out, functor); \ - } \ - template \ - void name##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - DenseTensor* out) { \ - name##RawKernel(dev_ctx, x, y, -1, out); \ +#define DEFINE_XPU_COMPARE_KERNEL(name, functor) \ + template \ + void name##RawKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + using XPUType = typename XPUTypeTrait::Type; \ + auto f = [](xpu::Context* ctx, \ + const XPUType* x, \ + const XPUType* y, \ + bool* z, \ + const std::vector& xshape, \ + const std::vector& yshape) { \ + return functor(ctx, x, y, z, xshape, yshape); \ + }; \ + XPUCompareKernelImpl(dev_ctx, x, y, out, f); \ + } \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + name##RawKernel(dev_ctx, x, y, -1, out); \ } DEFINE_XPU_COMPARE_KERNEL(Equal, xpu::broadcast_equal) diff --git a/paddle/phi/kernels/xpu/elementwise_add_kernel.cc b/paddle/phi/kernels/xpu/elementwise_add_kernel.cc index 1e838acaa8a88c..0e19c59d26c91b 100644 --- a/paddle/phi/kernels/xpu/elementwise_add_kernel.cc +++ b/paddle/phi/kernels/xpu/elementwise_add_kernel.cc @@ -54,8 +54,17 @@ void AddRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - XPUElementwise( - dev_ctx, x, y, axis, out, xpu::broadcast_add); + + auto f = [](xpu::Context* ctx, + const XPUType* x, + const XPUType* y, + XPUType* z, + const std::vector& xshape, + const std::vector& yshape) { + return xpu::broadcast_add(ctx, x, y, z, xshape, yshape); + }; + + XPUElementwise(dev_ctx, x, y, axis, out, f); } } // namespace phi diff --git a/paddle/phi/kernels/xpu/elementwise_divide_grad_kernel.cc b/paddle/phi/kernels/xpu/elementwise_divide_grad_kernel.cc index 9251286e495a89..3b20874b5f312e 100644 --- a/paddle/phi/kernels/xpu/elementwise_divide_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/elementwise_divide_grad_kernel.cc @@ -35,15 +35,21 @@ void DivideGradKernel(const Context& dev_ctx, DenseTensor* dy) { using XPUType = typename XPUTypeTrait::Type; funcs::ElementwiseGradPreProcess(dout, dx); - XPUElementwiseGrad(dev_ctx, - x, - y, - dout, - axis, - dx, - dy, - xpu::broadcast_div_grad, - true); + + auto f = [](xpu::Context* ctx, + const XPUType* 
x, + const XPUType* y, + const XPUType* z, + const XPUType* dz, + XPUType* dy, + XPUType* dx, + const std::vector& xshape, + const std::vector& yshape) { + return xpu::broadcast_div_grad( + ctx, x, y, z, dz, dy, dx, xshape, yshape); + }; + + XPUElementwiseGrad(dev_ctx, x, y, dout, axis, dx, dy, f, true); } } // namespace phi diff --git a/paddle/phi/kernels/xpu/elementwise_divide_kernel.cc b/paddle/phi/kernels/xpu/elementwise_divide_kernel.cc index ebefd05a02af4d..0ec748b817effa 100644 --- a/paddle/phi/kernels/xpu/elementwise_divide_kernel.cc +++ b/paddle/phi/kernels/xpu/elementwise_divide_kernel.cc @@ -31,8 +31,16 @@ void DivideRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - XPUElementwise( - dev_ctx, x, y, axis, out, xpu::broadcast_div); + auto f = [](xpu::Context* ctx, + const XPUType* x, + const XPUType* y, + XPUType* z, + const std::vector& xshape, + const std::vector& yshape) { + return xpu::broadcast_div(ctx, x, y, z, xshape, yshape); + }; + + XPUElementwise(dev_ctx, x, y, axis, out, f); } } // namespace phi diff --git a/paddle/phi/kernels/xpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/xpu/elementwise_grad_kernel.cc index 9b1d2a6957f727..47da6b25de9201 100644 --- a/paddle/phi/kernels/xpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/elementwise_grad_kernel.cc @@ -29,15 +29,21 @@ void MaximumGradKernel(const Context& dev_ctx, DenseTensor* dx, DenseTensor* dy) { using XPUType = typename XPUTypeTrait::Type; - XPUElementwiseGrad(dev_ctx, - x, - y, - dout, - axis, - dx, - dy, - xpu::broadcast_max_grad, - true); + + auto f = [](xpu::Context* ctx, + const XPUType* x, + const XPUType* y, + const XPUType* z, + const XPUType* dz, + XPUType* dy, + XPUType* dx, + const std::vector& xshape, + const std::vector& yshape) { + return xpu::broadcast_max_grad( + ctx, x, y, z, dz, dy, dx, xshape, yshape); + }; + + XPUElementwiseGrad(dev_ctx, x, y, dout, axis, dx, dy, f, true); } template @@ -49,15 +55,21 @@ void MinimumGradKernel(const Context& dev_ctx, DenseTensor* dx, DenseTensor* dy) { using XPUType = typename XPUTypeTrait::Type; - XPUElementwiseGrad(dev_ctx, - x, - y, - dout, - axis, - dx, - dy, - xpu::broadcast_min_grad, - true); + + auto f = [](xpu::Context* ctx, + const XPUType* x, + const XPUType* y, + const XPUType* z, + const XPUType* dz, + XPUType* dy, + XPUType* dx, + const std::vector& xshape, + const std::vector& yshape) { + return xpu::broadcast_min_grad( + ctx, x, y, z, dz, dy, dx, xshape, yshape); + }; + + XPUElementwiseGrad(dev_ctx, x, y, dout, axis, dx, dy, f, true); } } // namespace phi diff --git a/paddle/phi/kernels/xpu/elementwise_kernel.cc b/paddle/phi/kernels/xpu/elementwise_kernel.cc index 87edfb22e502d2..f70f9e743a4114 100644 --- a/paddle/phi/kernels/xpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/xpu/elementwise_kernel.cc @@ -27,8 +27,16 @@ void FloorDivideRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - XPUElementwise( - dev_ctx, x, y, axis, out, xpu::broadcast_floordiv); + auto f = [](xpu::Context* ctx, + const XPUType* x, + const XPUType* y, + XPUType* z, + const std::vector& xshape, + const std::vector& yshape) { + return xpu::broadcast_floordiv(ctx, x, y, z, xshape, yshape); + }; + + XPUElementwise(dev_ctx, x, y, axis, out, f); } template @@ -38,8 +46,16 @@ void MaximumRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - XPUElementwise( - dev_ctx, x, y, axis, out, 
xpu::broadcast_max); + auto f = [](xpu::Context* ctx, + const XPUType* x, + const XPUType* y, + XPUType* z, + const std::vector& xshape, + const std::vector& yshape) { + return xpu::broadcast_max(ctx, x, y, z, xshape, yshape); + }; + + XPUElementwise(dev_ctx, x, y, axis, out, f); } template @@ -49,8 +65,16 @@ void MinimumRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - XPUElementwise( - dev_ctx, x, y, axis, out, xpu::broadcast_min); + auto f = [](xpu::Context* ctx, + const XPUType* x, + const XPUType* y, + XPUType* z, + const std::vector& xshape, + const std::vector& yshape) { + return xpu::broadcast_min(ctx, x, y, z, xshape, yshape); + }; + + XPUElementwise(dev_ctx, x, y, axis, out, f); } template @@ -60,8 +84,16 @@ void RemainderRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - XPUElementwise( - dev_ctx, x, y, axis, out, xpu::broadcast_mod); + auto f = [](xpu::Context* ctx, + const XPUType* x, + const XPUType* y, + XPUType* z, + const std::vector& xshape, + const std::vector& yshape) { + return xpu::broadcast_mod(ctx, x, y, z, xshape, yshape); + }; + + XPUElementwise(dev_ctx, x, y, axis, out, f); } template @@ -71,8 +103,16 @@ void ElementwisePowRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - XPUElementwise( - dev_ctx, x, y, axis, out, xpu::broadcast_pow); + auto f = [](xpu::Context* ctx, + const XPUType* x, + const XPUType* y, + XPUType* z, + const std::vector& xshape, + const std::vector& yshape) { + return xpu::broadcast_pow(ctx, x, y, z, xshape, yshape); + }; + + XPUElementwise(dev_ctx, x, y, axis, out, f); } } // namespace phi diff --git a/paddle/phi/kernels/xpu/elementwise_multiply_grad_kernel.cc b/paddle/phi/kernels/xpu/elementwise_multiply_grad_kernel.cc index b111630506f838..ebe190827d69d0 100644 --- a/paddle/phi/kernels/xpu/elementwise_multiply_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/elementwise_multiply_grad_kernel.cc @@ -34,15 +34,20 @@ void MultiplyGradKernel(const Context& dev_ctx, DenseTensor* dy) { using XPUType = typename XPUTypeTrait::Type; funcs::ElementwiseGradPreProcess(dout, dx); - XPUElementwiseGrad(dev_ctx, - x, - y, - dout, - axis, - dx, - dy, - xpu::broadcast_mul_grad, - true); + auto f = [](xpu::Context* ctx, + const XPUType* x, + const XPUType* y, + const XPUType* z, + const XPUType* dz, + XPUType* dy, + XPUType* dx, + const std::vector& xshape, + const std::vector& yshape) { + return xpu::broadcast_mul_grad( + ctx, x, y, z, dz, dy, dx, xshape, yshape); + }; + + XPUElementwiseGrad(dev_ctx, x, y, dout, axis, dx, dy, f, true); } } // namespace phi diff --git a/paddle/phi/kernels/xpu/elementwise_multiply_kernel.cc b/paddle/phi/kernels/xpu/elementwise_multiply_kernel.cc index e3b62d539486f8..aa7f525c9b5f15 100644 --- a/paddle/phi/kernels/xpu/elementwise_multiply_kernel.cc +++ b/paddle/phi/kernels/xpu/elementwise_multiply_kernel.cc @@ -31,8 +31,16 @@ void MultiplyRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - XPUElementwise( - dev_ctx, x, y, axis, out, xpu::broadcast_mul); + auto f = [](xpu::Context* ctx, + const XPUType* x, + const XPUType* y, + XPUType* z, + const std::vector& xshape, + const std::vector& yshape) { + return xpu::broadcast_mul(ctx, x, y, z, xshape, yshape); + }; + + XPUElementwise(dev_ctx, x, y, axis, out, f); } } // namespace phi diff --git 
a/paddle/phi/kernels/xpu/elementwise_subtract_grad_kernel.cc b/paddle/phi/kernels/xpu/elementwise_subtract_grad_kernel.cc index 0fb0ced46b8439..d22b369619d40d 100644 --- a/paddle/phi/kernels/xpu/elementwise_subtract_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/elementwise_subtract_grad_kernel.cc @@ -28,15 +28,22 @@ void SubtractGradKernel(const Context& dev_ctx, DenseTensor* dx, DenseTensor* dy) { using XPUType = typename XPUTypeTrait::Type; - phi::XPUElementwiseGrad(dev_ctx, - x, - y, - dout, - axis, - dx, - dy, - xpu::broadcast_sub_grad, - false); + + auto f = [](xpu::Context* ctx, + const XPUType* x, + const XPUType* y, + const XPUType* z, + const XPUType* dz, + XPUType* dy, + XPUType* dx, + const std::vector& xshape, + const std::vector& yshape) { + return xpu::broadcast_sub_grad( + ctx, x, y, z, dz, dy, dx, xshape, yshape); + }; + + phi::XPUElementwiseGrad( + dev_ctx, x, y, dout, axis, dx, dy, f, false); } } // namespace phi diff --git a/paddle/phi/kernels/xpu/elementwise_subtract_kernel.cc b/paddle/phi/kernels/xpu/elementwise_subtract_kernel.cc index 4e18264d713431..866d9cf6206eda 100644 --- a/paddle/phi/kernels/xpu/elementwise_subtract_kernel.cc +++ b/paddle/phi/kernels/xpu/elementwise_subtract_kernel.cc @@ -26,8 +26,16 @@ void SubtractRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - phi::XPUElementwise( - dev_ctx, x, y, axis, out, xpu::broadcast_sub); + auto f = [](xpu::Context* ctx, + const XPUType* x, + const XPUType* y, + XPUType* z, + const std::vector& xshape, + const std::vector& yshape) { + return xpu::broadcast_sub(ctx, x, y, z, xshape, yshape); + }; + + phi::XPUElementwise(dev_ctx, x, y, axis, out, f); } } // namespace phi diff --git a/paddle/phi/kernels/xpu/prod_kernel.cc b/paddle/phi/kernels/xpu/prod_kernel.cc index cf237afb227975..ebc9abc049c0e3 100644 --- a/paddle/phi/kernels/xpu/prod_kernel.cc +++ b/paddle/phi/kernels/xpu/prod_kernel.cc @@ -29,13 +29,18 @@ void ProdRawKernel(const Context& dev_ctx, bool reduce_all, DenseTensor* out) { reduce_all = recompute_reduce_all(x, dims, reduce_all); - int r = XPUReduce(dev_ctx, - x, - dims.GetData(), - keep_dim, - reduce_all, - out, - xpu::reduce_prod); + using XPUType = typename XPUTypeTrait::Type; + + auto f = [](xpu::Context* ctx, + const XPUType* x, + XPUType* y, + const std::vector& xdims, + const std::vector& reduce_dims) { + return xpu::reduce_prod(ctx, x, y, xdims, reduce_dims); + }; + + int r = XPUReduce( + dev_ctx, x, dims.GetData(), keep_dim, reduce_all, out, f); PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_prod"); } diff --git a/paddle/phi/kernels/xpu/reduce_max_kernel.cc b/paddle/phi/kernels/xpu/reduce_max_kernel.cc index 8db710a24adce8..ba4aec72cd38c8 100644 --- a/paddle/phi/kernels/xpu/reduce_max_kernel.cc +++ b/paddle/phi/kernels/xpu/reduce_max_kernel.cc @@ -29,13 +29,17 @@ void MaxRawKernel(const Context& dev_ctx, bool reduce_all, DenseTensor* out) { reduce_all = recompute_reduce_all(x, dims, reduce_all); - int r = XPUReduce(dev_ctx, - x, - dims.GetData(), - keep_dim, - reduce_all, - out, - xpu::reduce_max); + using XPUType = typename XPUTypeTrait::Type; + auto f = [](xpu::Context* ctx, + const XPUType* x, + XPUType* y, + const std::vector& xdims, + const std::vector& reduce_dims) { + return xpu::reduce_max(ctx, x, y, xdims, reduce_dims); + }; + + int r = XPUReduce( + dev_ctx, x, dims.GetData(), keep_dim, reduce_all, out, f); PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_max"); } diff --git a/paddle/phi/kernels/xpu/reduce_mean_kernel.cc 
b/paddle/phi/kernels/xpu/reduce_mean_kernel.cc index d29db35517f372..b646cd7ebfbc74 100644 --- a/paddle/phi/kernels/xpu/reduce_mean_kernel.cc +++ b/paddle/phi/kernels/xpu/reduce_mean_kernel.cc @@ -29,13 +29,18 @@ void MeanRawKernel(const Context& dev_ctx, bool reduce_all, DenseTensor* out) { reduce_all = recompute_reduce_all(x, dims, reduce_all); - int r = XPUReduce(dev_ctx, - x, - dims.GetData(), - keep_dim, - reduce_all, - out, - xpu::reduce_mean); + using XPUType = typename XPUTypeTrait::Type; + auto f = [](xpu::Context* ctx, + const XPUType* x, + XPUType* y, + const std::vector& xdims, + const std::vector& reduce_dims) { + return xpu::reduce_mean(ctx, x, y, xdims, reduce_dims); + }; + + int r = XPUReduce( + dev_ctx, x, dims.GetData(), keep_dim, reduce_all, out, f); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_mean"); } diff --git a/paddle/phi/kernels/xpu/reduce_min_kernel.cc b/paddle/phi/kernels/xpu/reduce_min_kernel.cc index e330e30becdcfe..e5294e43537b48 100644 --- a/paddle/phi/kernels/xpu/reduce_min_kernel.cc +++ b/paddle/phi/kernels/xpu/reduce_min_kernel.cc @@ -29,13 +29,18 @@ void MinRawKernel(const Context& dev_ctx, bool reduce_all, DenseTensor* out) { reduce_all = recompute_reduce_all(x, dims, reduce_all); - int r = XPUReduce(dev_ctx, - x, - dims.GetData(), - keep_dim, - reduce_all, - out, - xpu::reduce_min); + using XPUType = typename XPUTypeTrait::Type; + + auto f = [](xpu::Context* ctx, + const XPUType* x, + XPUType* y, + const std::vector& xdims, + const std::vector& reduce_dims) { + return xpu::reduce_min(ctx, x, y, xdims, reduce_dims); + }; + + int r = XPUReduce( + dev_ctx, x, dims.GetData(), keep_dim, reduce_all, out, f); PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_min"); } diff --git a/paddle/phi/kernels/xpu/reduce_sum_kernel.cc b/paddle/phi/kernels/xpu/reduce_sum_kernel.cc index 952ed101cdcb8e..ac13dc3de3e0dd 100644 --- a/paddle/phi/kernels/xpu/reduce_sum_kernel.cc +++ b/paddle/phi/kernels/xpu/reduce_sum_kernel.cc @@ -30,13 +30,17 @@ void SumRawKernel(const Context& dev_ctx, DataType out_dtype, DenseTensor* out) { reduce_all = recompute_reduce_all(x, dims, reduce_all); - int r = XPUReduce(dev_ctx, - x, - dims.GetData(), - keep_dim, - reduce_all, - out, - xpu::reduce_sum); + using XPUType = typename XPUTypeTrait::Type; + + auto f = [](xpu::Context* ctx, + const XPUType* x, + XPUType* y, + const std::vector& xdims, + const std::vector& reduce_dims) { + return xpu::reduce_sum(ctx, x, y, xdims, reduce_dims); + }; + int r = XPUReduce( + dev_ctx, x, dims.GetData(), keep_dim, reduce_all, out, f); PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum"); } diff --git a/paddle/phi/ops/compat/squeeze_sig.cc b/paddle/phi/ops/compat/squeeze_sig.cc deleted file mode 100644 index 4ca45903acfa00..00000000000000 --- a/paddle/phi/ops/compat/squeeze_sig.cc +++ /dev/null @@ -1,35 +0,0 @@ - -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
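With squeeze/unsqueeze moved to the generated ops.yaml/backward.yaml (including the new squeeze_double_grad / unsqueeze_double_grad entries) and their argument mappings expressed in op_compat.yaml, the hand-written squeeze_sig.cc / unsqueeze_sig.cc files being deleted here are no longer needed. A minimal dygraph sketch, assuming Paddle 2.4+, of the second-order path that squeeze_double_grad backs:

```python
import paddle

x = paddle.rand([2, 1, 3])
x.stop_gradient = False
y = paddle.squeeze(x, axis=1)          # forward: squeeze / squeeze_with_xshape

dy = paddle.ones_like(y)
dy.stop_gradient = False
(dx,) = paddle.grad(y, x, grad_outputs=dy, create_graph=True)  # squeeze_grad
(ddy,) = paddle.grad(dx.sum(), dy)     # grad of squeeze_grad, i.e. squeeze_double_grad
print(ddy.shape)                       # [2, 3]
```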
- -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature SqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "squeeze_with_xshape", {"X"}, {"axes"}, {"Out", "XShape"}); -} - -KernelSignature SqueezeGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "squeeze_grad", {"XShape", "Out@GRAD"}, {"axes"}, {"X@GRAD"}); -} - -} // namespace phi -PD_REGISTER_BASE_KERNEL_NAME(squeeze2, squeeze); -PD_REGISTER_BASE_KERNEL_NAME(squeeze2_grad, squeeze_grad); -PD_REGISTER_ARG_MAPPING_FN(squeeze2, phi::SqueezeOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(squeeze2_grad, phi::SqueezeGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/unsqueeze_sig.cc b/paddle/phi/ops/compat/unsqueeze_sig.cc deleted file mode 100644 index 568097298b7acc..00000000000000 --- a/paddle/phi/ops/compat/unsqueeze_sig.cc +++ /dev/null @@ -1,47 +0,0 @@ - -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature UnsqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) { - if (ctx.InputSize("AxesTensorList") > 0) { - VLOG(2) << "unsqueeze2 in AxesTensorList"; - return KernelSignature( - "unsqueeze_with_xshape", {"X"}, {"AxesTensorList"}, {"Out", "XShape"}); - } else if (ctx.InputSize("AxesTensor") > 0) { - VLOG(2) << "unsqueeze2 in AxesTensor"; - return KernelSignature( - "unsqueeze_with_xshape", {"X"}, {"AxesTensor"}, {"Out", "XShape"}); - } else { - VLOG(2) << "unsqueeze2 in axes"; - return KernelSignature( - "unsqueeze_with_xshape", {"X"}, {"axes"}, {"Out", "XShape"}); - } -} - -KernelSignature UnsqueezeGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "unsqueeze_grad", {"XShape", "Out@GRAD"}, {}, {"X@GRAD"}); -} -} // namespace phi -PD_REGISTER_BASE_KERNEL_NAME(unsqueeze2, unsqueeze); -PD_REGISTER_BASE_KERNEL_NAME(unsqueeze2_grad, unsqueeze_grad); - -PD_REGISTER_ARG_MAPPING_FN(unsqueeze2, phi::UnsqueezeOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(unsqueeze2_grad, - phi::UnsqueezeGradOpArgumentMapping); diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 74a22d47de301d..a234e4906ff83f 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1843,6 +1843,11 @@ function precise_card_test_single { for case in $(echo $testcases | tr "$|^" "\n" | awk '!/^$/') do cd ${PADDLE_ROOT}/build + + find paddle/fluid -name *.gcda | xargs rm -f + find paddle/phi -name *.gcda | xargs rm -f + find paddle/utils -name *.gcda | xargs rm -f + precise_card_test "^${case}$" $num #if test failed,continue,if test succeed ,go on @@ -1876,9 +1881,6 @@ function precise_card_test_single { fi mv python-coverage.data.* ${PADDLE_ROOT}/build/pytest/$case fi - find paddle/fluid -name *.gcda | xargs rm -f - find paddle/phi -name *.gcda | xargs rm -f - find paddle/utils -name *.gcda | xargs rm -f done } @@ 
-1988,6 +1990,10 @@ set +x fi read testcase <<< $(echo "$line"|grep -oEi "\w+$") + if [[ "$testcase" == "simple_precision_test" ]]; then + continue + fi + if [[ "$is_multicard" == "" ]]; then # trick: treat all test case with prefix "test_dist" as dist case, and would run on 2 GPUs read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist_") @@ -2032,6 +2038,8 @@ set -x mkdir -p ${PADDLE_ROOT}/build/ut_map mkdir -p ${PADDLE_ROOT}/build/pytest #run all unittest to get the coverage information of .c and .h files + precise_card_test_single "^simple_precision_test$" 1 + wait; precise_card_test_single "$single_card_tests" 1 precise_card_test_single "$single_card_tests_1" 1 precise_card_test_single "$multiple_card_tests" 2 @@ -3523,7 +3531,7 @@ function run_setup(){ export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.10/lib/ export PATH=/Library/Frameworks/Python.framework/Versions/3.10/bin/:${PATH} #after changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY" ,we can use export - export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.9/lib/libpython3.9.dylib + export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.10/bin/python3 export PYTHON_INCLUDE_DIR=/Library/Frameworks/Python.framework/Versions/3.10/include/python3.10/ export PYTHON_LIBRARY=/Library/Frameworks/Python.framework/Versions/3.10/lib/libpython3.10.dylib pip3.10 install --user -r ${PADDLE_ROOT}/python/requirements.txt @@ -3651,8 +3659,11 @@ function run_setup(){ # reset ccache zero stats for collect PR's actual hit rate ccache -z - - python setup.py $2;build_error=$? + if [ "${PYTHON_EXECUTABLE}" != "" ];then + ${PYTHON_EXECUTABLE} setup.py $2;build_error=$? + else + python setup.py $2;build_error=$? + fi # ci will collect ccache hit rate collect_ccache_hits diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 3c6ac0229d58d8..9523228eaf022b 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -119,7 +119,7 @@ if(WITH_TESTING) add_subdirectory(paddle/tests) add_subdirectory(paddle/fluid/tests) add_subdirectory(paddle/fluid/contrib/tests) - add_subdirectory(paddle/fluid/contrib/slim/tests) + add_subdirectory(paddle/static/quantization/tests) endif() if(NOT WITH_SETUP_INSTALL) diff --git a/python/paddle/common_ops_import.py b/python/paddle/common_ops_import.py index 1c2bb424dc18b8..91a3f49cdbba2a 100644 --- a/python/paddle/common_ops_import.py +++ b/python/paddle/common_ops_import.py @@ -24,7 +24,6 @@ OpProtoHolder, Variable, _dygraph_tracer, - _in_legacy_dygraph, _non_static_mode, _varbase_creator, convert_np_dtype_to_dtype_, diff --git a/python/paddle/device/cuda/graphs.py b/python/paddle/device/cuda/graphs.py index 75c3e61ad2cb87..79a274f52de64c 100644 --- a/python/paddle/device/cuda/graphs.py +++ b/python/paddle/device/cuda/graphs.py @@ -82,7 +82,7 @@ def print_to_dot_files(self, dirname, flags=None): def wrap_cuda_graph(function, mode="thread_local", memory_pool="default"): assert mode in ALL_MODES if not paddle.in_dynamic_mode(): - # static mode + # static graph mode from paddle.fluid.framework import _cuda_graph_guard global cuda_graph_id @@ -94,7 +94,7 @@ def wrap_cuda_graph(function, mode="thread_local", memory_pool="default"): memory_pool_id = CoreCUDAGraph.gen_new_memory_pool_id() else: raise ValueError( - "memory_pool should be one of default or new under static mode, but got", + "memory_pool should be one of default or new under static graph mode, but got", memory_pool, ) return _cuda_graph_guard( diff --git 
a/python/paddle/distributed/auto_parallel/constants.py b/python/paddle/distributed/auto_parallel/constants.py index 044bc78887b03c..f0c9655c81ec64 100644 --- a/python/paddle/distributed/auto_parallel/constants.py +++ b/python/paddle/distributed/auto_parallel/constants.py @@ -76,6 +76,13 @@ def set_field_default_config(category, field, default_value): set_field_default_config(AMP, "use_fp16_guard", True) set_field_default_config(AMP, "use_optimizer_fp16", False) +set_field_default_config(AMP, "enable_bf16", False) +set_field_default_config(AMP, "custom_bf16_list", []) +set_field_default_config(AMP, "custom_fp32_list", []) +set_field_default_config(AMP, "custom_fp32_varnames", []) +set_field_default_config(AMP, "use_pure_bf16", False) +set_field_default_config(AMP, "use_bf16_guard", False) + ######################################### # sharding configuration ######################################### diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index 1ef44bdb7f5b72..a202dc61acdca1 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -539,7 +539,7 @@ def _build(self, mode): paddle.enable_static() else: - # build program in static mode + # build program in static graph mode serial_main_prog = self._serial_main_progs.get(mode, None) if serial_main_prog is not None: return @@ -1617,9 +1617,7 @@ def save(self, path, training=True): fetch_vars = self._fetch_vars["predict"]['outputs'] dist_main_prog = self._dist_main_progs["predict"][self._cur_rank] if self._strategy.qat.enable and self._strategy.qat.onnx_format: - from paddle.fluid.contrib.slim.quantization import ( - QuantWeightPass, - ) + from paddle.static.quantization import QuantWeightPass self._logger.info("export quantized model.") self._logger.info( diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py index ff1518f9b8f82c..ef865dc13bb04b 100644 --- a/python/paddle/distributed/auto_parallel/operators/common.py +++ b/python/paddle/distributed/auto_parallel/operators/common.py @@ -266,8 +266,12 @@ def is_parameter_related(varname, block): varname = varname[: varname.index(".subprog_")] if ".cast_fp" in varname: varname = varname[: varname.index(".cast_fp")] + if ".cast_bf" in varname: + varname = varname[: varname.index(".cast_bf")] if ".quantized" in varname: varname = varname[: varname.index(".quantized")] + # if "@RESHARD" in varname: + # varname = varname[: varname.index("@RESHARD")] assert block._find_var_recursive(varname) var = block._var_recursive(varname) return var.is_parameter diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index e3d17b26b682f3..dca8e24bc5940a 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -376,7 +376,7 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): check_variable_and_dtype( Out_grad, 'tensor', - ['float16', 'float32', 'float64', 'int32', 'int64'], + ['float16', 'float32', 'float64', 'int32', 'int64', 'uint16'], '_c_identity', ) @@ -417,13 +417,13 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): check_variable_and_dtype( intermediate_var_0, 'x', - ['float16', 'float32', 'float64'], + ['float16', 'float32', 'float64', 'uint16'], 'linear', ) check_dtype( 
intermediate_var_0.dtype, 'dtype', - ['float16', 'float32', 'float64'], + ['float16', 'float32', 'float64', 'uint16'], 'linear', ) set_comm_op_dist_attr_for_program( @@ -835,7 +835,7 @@ def forward(ctx, *args, **kwargs): check_variable_and_dtype( X_var, 'tensor', - ['float16', 'float32', 'float64', 'int32', 'int64'], + ['float16', 'float32', 'float64', 'int32', 'int64', 'uint16'], '_c_identity', ) @@ -854,12 +854,15 @@ def forward(ctx, *args, **kwargs): intermediate_var_0.desc.set_shape(ref_shape_x) check_variable_and_dtype( - intermediate_var_0, 'x', ['float16', 'float32', 'float64'], 'linear' + intermediate_var_0, + 'x', + ['float16', 'float32', 'float64', 'uint16'], + 'linear', ) check_dtype( intermediate_var_0.dtype, 'dtype', - ['float16', 'float32', 'float64'], + ['float16', 'float32', 'float64', 'uint16'], 'linear', ) attrs = { @@ -1183,10 +1186,13 @@ def forward(ctx, *args, **kwargs): group = new_process_group(group_ranks) check_variable_and_dtype( - X_var, 'x', ['float16', 'float32', 'float64'], 'linear' + X_var, 'x', ['float16', 'float32', 'float64', 'uint16'], 'linear' ) check_dtype( - X_var.dtype, 'dtype', ['float16', 'float32', 'float64'], 'linear' + X_var.dtype, + 'dtype', + ['float16', 'float32', 'float64', 'uint16'], + 'linear', ) attrs = { 'transpose_X': trans_x, @@ -1731,7 +1737,7 @@ def forward(ctx, *args, **kwargs): check_variable_and_dtype( X_var, 'tensor', - ['float16', 'float32', 'float64', 'int32', 'int64'], + ['float16', 'float32', 'float64', 'int32', 'int64', 'uint16'], '_c_identity', ) c_identity_op = main_block.append_op( @@ -1749,12 +1755,15 @@ def forward(ctx, *args, **kwargs): intermediate_var_0.desc.set_shape(ref_shape_x) check_variable_and_dtype( - intermediate_var_0, 'x', ['float16', 'float32', 'float64'], 'linear' + intermediate_var_0, + 'x', + ['float16', 'float32', 'float64', 'uint16'], + 'linear', ) check_dtype( intermediate_var_0.dtype, 'dtype', - ['float16', 'float32', 'float64'], + ['float16', 'float32', 'float64', 'uint16'], 'linear', ) attrs = { @@ -2077,10 +2086,13 @@ def forward(ctx, *args, **kwargs): group = new_process_group(group_ranks) check_variable_and_dtype( - X_var, 'x', ['float16', 'float32', 'float64'], 'linear' + X_var, 'x', ['float16', 'float32', 'float64', 'uint16'], 'linear' ) check_dtype( - X_var.dtype, 'dtype', ['float16', 'float32', 'float64'], 'linear' + X_var.dtype, + 'dtype', + ['float16', 'float32', 'float64', 'uint16'], + 'linear', ) attrs = { 'trans_x': trans_x, @@ -2610,7 +2622,7 @@ def forward(ctx, *args, **kwargs): check_variable_and_dtype( X_var, 'tensor', - ['float16', 'float32', 'float64', 'int32', 'int64'], + ['float16', 'float32', 'float64', 'int32', 'int64', 'uint16'], '_c_identity', ) c_identity_op = main_block.append_op( @@ -2628,12 +2640,15 @@ def forward(ctx, *args, **kwargs): intermediate_var_0.desc.set_shape(ref_shape_x) check_variable_and_dtype( - intermediate_var_0, 'x', ['float16', 'float32', 'float64'], 'linear' + intermediate_var_0, + 'x', + ['float16', 'float32', 'float64', 'uint16'], + 'linear', ) check_dtype( intermediate_var_0.dtype, 'dtype', - ['float16', 'float32', 'float64'], + ['float16', 'float32', 'float64', 'uint16'], 'linear', ) # attrs = {'trans_x': False, 'trans_y': False} @@ -2965,10 +2980,13 @@ def forward(ctx, *args, **kwargs): group = new_process_group(group_ranks) check_variable_and_dtype( - X_var, 'x', ['float16', 'float32', 'float64'], 'linear' + X_var, 'x', ['float16', 'float32', 'float64', 'uint16'], 'linear' ) check_dtype( - X_var.dtype, 'dtype', ['float16', 'float32', 'float64'], 
'linear' + X_var.dtype, + 'dtype', + ['float16', 'float32', 'float64', 'uint16'], + 'linear', ) # attrs = {'trans_x': False, 'trans_y': False} attrs = { diff --git a/python/paddle/distributed/auto_parallel/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/parallelizer_v2.py index 0a05c716ded645..2ff8f0ee7d118a 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer_v2.py +++ b/python/paddle/distributed/auto_parallel/parallelizer_v2.py @@ -221,13 +221,21 @@ def _apply_pre_optimization( self._dist_context.serial_feed_vars["inputs"] + self._dist_context.serial_feed_vars["labels"] ) - if config["use_pure_fp16"]: + if config["enable_bf16"]: + auto_parallel_bf16_pass = new_pass("auto_parallel_bf16", config) + auto_parallel_bf16_pass.apply( + [main_program], [startup_program], self._pass_context + ) + loss = auto_parallel_bf16_pass.get_loss() + + elif config["use_pure_fp16"]: config["base_opt"] = optimizer auto_parallel_fp16_pass = new_pass("auto_parallel_fp16", config) auto_parallel_fp16_pass.apply( [main_program], [startup_program], self._pass_context ) loss = auto_parallel_fp16_pass.get_loss() + else: auto_parallel_amp_pass = new_pass("auto_parallel_amp", config) auto_parallel_amp_pass.apply( diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 7073758b9d52ab..767038addb7e29 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -162,7 +162,7 @@ def _new_process_group_impl( # _custom_gid provides a way for users to # set the group id, which is usually useful -# to be compatible with the static mode. +# to be compatible with the static graph mode. _custom_gid = None diff --git a/python/paddle/distributed/communication/stream/all_gather.py b/python/paddle/distributed/communication/stream/all_gather.py index 8e81a8723aac24..779a3c8f64cf71 100644 --- a/python/paddle/distributed/communication/stream/all_gather.py +++ b/python/paddle/distributed/communication/stream/all_gather.py @@ -178,10 +178,12 @@ def all_gather( tensor_or_tensor_list, tensor, group, sync_op, use_calc_stream ) else: - assert group is None, "Group can not be used in static mode for now." + assert ( + group is None + ), "Group can not be used in static graph mode for now." if paddle.is_tensor(tensor_or_tensor_list): raise RuntimeError( - "Only support passing a tensor list to `all_gather` in static mode now." + "Only support passing a tensor list to `all_gather` in static graph mode now." ) else: return _all_gather_in_static_mode( diff --git a/python/paddle/distributed/communication/stream/all_reduce.py b/python/paddle/distributed/communication/stream/all_reduce.py index 16f69764f4e616..412085b1b1720a 100644 --- a/python/paddle/distributed/communication/stream/all_reduce.py +++ b/python/paddle/distributed/communication/stream/all_reduce.py @@ -58,7 +58,7 @@ def _all_reduce_in_static_mode(tensor, op, group, sync_op, use_calc_stream): if not isinstance(ring_id, int): raise ValueError("The type of 'ring_id' for all_reduce should be int.") - # TODO: Support task and use task.wait in static mode + # TODO: Support task and use task.wait in static graph mode # Use use_calc_stream rather than sync_op helper = layer_helper.LayerHelper(op_type, **locals()) helper.append_op( @@ -123,7 +123,9 @@ def all_reduce( tensor, op, group, sync_op, use_calc_stream ) else: - assert group is None, "Group can not be used in static mode for now." + assert ( + group is None + ), "Group can not be used in static graph mode for now." 
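The asserts reworded in these collective wrappers keep the existing restriction: when the call is lowered in static graph mode, only the default global group is supported, and custom groups remain a dygraph-only feature. A minimal dygraph sketch (assuming two processes started with `python -m paddle.distributed.launch`):

```python
import paddle
import paddle.distributed as dist

dist.init_parallel_env()
data = paddle.to_tensor([float(dist.get_rank() + 1)])
dist.all_reduce(data)   # default (global) group; pass group=... only in dygraph
print(data)             # sums to [3.] on both ranks when world size is 2
```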
return _all_reduce_in_static_mode( tensor, op, group, sync_op, use_calc_stream ) diff --git a/python/paddle/distributed/communication/stream/all_to_all.py b/python/paddle/distributed/communication/stream/all_to_all.py index a5293aa46e6c3d..d64ccb742ef08c 100644 --- a/python/paddle/distributed/communication/stream/all_to_all.py +++ b/python/paddle/distributed/communication/stream/all_to_all.py @@ -200,7 +200,9 @@ def alltoall( "The output and input should be both tensor or tensor list." ) else: - assert group is None, "Group can not be used in static mode for now." + assert ( + group is None + ), "Group can not be used in static graph mode for now." return _all_to_all_in_static_mode( out_tensor_or_tensor_list, in_tensor_or_tensor_list, diff --git a/python/paddle/distributed/communication/stream/broadcast.py b/python/paddle/distributed/communication/stream/broadcast.py index 3c3e7767d0d908..cb6fbc75d15280 100644 --- a/python/paddle/distributed/communication/stream/broadcast.py +++ b/python/paddle/distributed/communication/stream/broadcast.py @@ -126,7 +126,9 @@ def broadcast(tensor, src, group=None, sync_op=True, use_calc_stream=False): tensor, src_rank_in_group, group, sync_op, use_calc_stream ) else: - assert group is None, "Group can not be used in static mode for now." + assert ( + group is None + ), "Group can not be used in static graph mode for now." return _broadcast_in_static_mode( tensor, src, group, sync_op, use_calc_stream ) diff --git a/python/paddle/distributed/communication/stream/recv.py b/python/paddle/distributed/communication/stream/recv.py index b1b66f959789dd..fcd007e6d333db 100644 --- a/python/paddle/distributed/communication/stream/recv.py +++ b/python/paddle/distributed/communication/stream/recv.py @@ -114,7 +114,9 @@ def recv(tensor, src=0, group=None, sync_op=True, use_calc_stream=False): tensor, src_rank_in_group, group, sync_op, use_calc_stream ) else: - assert group is None, "Group can not be used in static mode for now." + assert ( + group is None + ), "Group can not be used in static graph mode for now." return _recv_in_static_mode( tensor, src, group, sync_op, use_calc_stream ) diff --git a/python/paddle/distributed/communication/stream/reduce.py b/python/paddle/distributed/communication/stream/reduce.py index 391d797f3c112a..8bd81bd586a98b 100644 --- a/python/paddle/distributed/communication/stream/reduce.py +++ b/python/paddle/distributed/communication/stream/reduce.py @@ -139,7 +139,9 @@ def reduce( tensor, dst_rank_in_group, op, group, sync_op, use_calc_stream ) else: - assert group is None, "Group can not be used in static mode for now." + assert ( + group is None + ), "Group can not be used in static graph mode for now." return _reduce_in_static_mode( tensor, dst, op, group, sync_op, use_calc_stream ) diff --git a/python/paddle/distributed/communication/stream/scatter.py b/python/paddle/distributed/communication/stream/scatter.py index a75cc7c2922371..6f332fbbd6fb33 100644 --- a/python/paddle/distributed/communication/stream/scatter.py +++ b/python/paddle/distributed/communication/stream/scatter.py @@ -220,7 +220,9 @@ def scatter( use_calc_stream, ) else: - assert group is None, "Group can not be used in static mode for now." + assert ( + group is None + ), "Group can not be used in static graph mode for now." 
return _scatter_in_static_mode( tensor, diff --git a/python/paddle/distributed/communication/stream/send.py b/python/paddle/distributed/communication/stream/send.py index f4325a6c19ab19..e18a9a5738482f 100644 --- a/python/paddle/distributed/communication/stream/send.py +++ b/python/paddle/distributed/communication/stream/send.py @@ -113,7 +113,9 @@ def send(tensor, dst=0, group=None, sync_op=True, use_calc_stream=False): tensor, dst_rank_in_group, group, sync_op, use_calc_stream ) else: - assert group is None, "Group can not be used in static mode for now." + assert ( + group is None + ), "Group can not be used in static graph mode for now." return _send_in_static_mode( tensor, dst, group, sync_op, use_calc_stream ) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py index 3f41ebaa96d07a..5c97fe90a2e177 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py @@ -20,11 +20,11 @@ import paddle from paddle.common_ops_import import LayerHelper -from paddle.fluid.clip import GradientClipByNorm, append_gradient_clip_ops from paddle.fluid.dygraph import base as imperative_base from paddle.fluid.framework import in_dygraph_mode from paddle.fluid.optimizer import Momentum, Optimizer from paddle.framework import core +from paddle.nn.clip import ClipGradByNorm, append_gradient_clip_ops from paddle.static import create_global_var @@ -76,9 +76,9 @@ def __init__( self._dgc_clip_norm = None if grad_clip is not None: - if not isinstance(grad_clip, GradientClipByNorm): + if not isinstance(grad_clip, ClipGradByNorm): raise TypeError( - "The type of grad_clip should be 'GradientClipByNorm', because DGCMomentumOptimizer only support GradientClipByNorm" + "The type of grad_clip should be 'ClipGradByNorm', because DGCMomentumOptimizer only support ClipGradByNorm" ) assert isinstance(num_trainers, int), ( "The type of num_trainers should be 'int', but received %s" diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index 75f0061b2ca20b..9eca2e667a8fd8 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -15,9 +15,8 @@ import paddle from paddle import framework from paddle.autograd import no_grad -from paddle.fluid import layers -from paddle.fluid.clip import ClipGradByGlobalNorm from paddle.framework import core +from paddle.nn import ClipGradByGlobalNorm, clip from ...base.topology import ParallelMode from ...utils.hybrid_parallel_util import ( @@ -62,8 +61,8 @@ def _dygraph_clip(self, params_grads): continue merge_grad = g if g.type == core.VarDesc.VarType.SELECTED_ROWS: - merge_grad = layers.merge_selected_rows(g) - merge_grad = layers.get_tensor_from_selected_rows(merge_grad) + merge_grad = clip.merge_selected_rows(g) + merge_grad = clip.get_tensor_from_selected_rows(merge_grad) square = paddle.square(merge_grad) sum_square = paddle.sum(square) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py index b1a572d4edfc30..9a25d7c4912bac 100644 --- 
a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py @@ -30,7 +30,7 @@ import paddle.distributed as dist from paddle.distributed import ParallelMode, fleet from paddle.fluid import core -from paddle.fluid.clip import ClipGradByGlobalNorm +from paddle.nn import ClipGradByGlobalNorm from paddle.optimizer import Optimizer HybridParallelClipGrad = ( diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py index 3d3debb252d400..d99683d4814503 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py @@ -25,8 +25,8 @@ from paddle import nn from paddle.autograd import PyLayer from paddle.distributed import collective -from paddle.fluid.clip import ClipGradByGlobalNorm from paddle.fluid.framework import EagerParamBase +from paddle.nn import ClipGradByGlobalNorm from .group_sharded_storage import GradStorage from .group_sharded_utils import GroupShardedClipGrad, Type, device_guard diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py index 620540fea58761..f8c86e02b7b524 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py @@ -23,6 +23,7 @@ from paddle.fluid import core, layers from paddle.fluid.dygraph import to_variable from paddle.fluid.framework import dygraph_only +from paddle.nn import clip class Taskflow: @@ -65,8 +66,8 @@ def _dygraph_clip(self, params_grads): merge_grad = g if g.type == core.VarDesc.VarType.SELECTED_ROWS: - merge_grad = layers.get_tensor_from_selected_rows( - layers.merge_selected_rows(g) + merge_grad = clip.get_tensor_from_selected_rows( + clip.merge_selected_rows(g) ) square = paddle.square(merge_grad) sum_square = paddle.sum(square) diff --git a/python/paddle/distributed/fleet/metrics/metric.py b/python/paddle/distributed/fleet/metrics/metric.py index 8c15e47307381d..39284fa9f5a3f1 100644 --- a/python/paddle/distributed/fleet/metrics/metric.py +++ b/python/paddle/distributed/fleet/metrics/metric.py @@ -159,7 +159,7 @@ def auc(stat_pos, stat_neg, scope=None, util=None): .. code-block:: python # in model.py - similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(output, min=-15.0, max=15.0)) + similarity_norm = fluid.layers.sigmoid(paddle.clip(output, min=-15.0, max=15.0)) binary_predict = fluid.layers.concat( input=[paddle.subtract(fluid.layers.ceil(similarity_norm), similarity_norm), similarity_norm], axis=1) self.auc, batch_auc, [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg] = diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py index 4502a8ddf4122f..d348d6a8f3e2e3 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py @@ -206,7 +206,8 @@ def __init__( elif core.is_compiled_with_cuda(): self._device = "gpu" assert self._device, "Only gpu and npu are supported." - assert not in_dygraph_mode(), "Only static mode is supported." 
+ + assert not in_dygraph_mode(), "Only static graph mode is supported." op_maker = core.op_proto_and_checker_maker self._op_role = op_maker.OpRole diff --git a/python/paddle/distributed/models/moe/utils.py b/python/paddle/distributed/models/moe/utils.py index 383520f19db338..89c6add474ab6c 100644 --- a/python/paddle/distributed/models/moe/utils.py +++ b/python/paddle/distributed/models/moe/utils.py @@ -125,7 +125,7 @@ def _random_routing(topk_idx, topk_value, prob, topk=2): if in_dygraph_mode(): return _legacy_C_ops.random_routing(prob, topk_value, topk_idx) else: - raise RuntimeError("Not supporting static mode now") + raise RuntimeError("Not supporting static graph mode now") else: raise RuntimeError("only topk=2 is supported now") diff --git a/python/paddle/distributed/passes/__init__.py b/python/paddle/distributed/passes/__init__.py index 056540a4a1571f..886d29a30b48e2 100644 --- a/python/paddle/distributed/passes/__init__.py +++ b/python/paddle/distributed/passes/__init__.py @@ -18,6 +18,7 @@ from .auto_parallel_sharding import * # noqa: F403 from .auto_parallel_amp import * # noqa: F403 from .auto_parallel_fp16 import * # noqa: F403 +from .auto_parallel_bf16 import * # noqa: F403 from .auto_parallel_recompute import * # noqa: F403 from .auto_parallel_quantization import * # noqa: F403 from .auto_parallel_data_parallel_optimization import * # noqa: F403 diff --git a/python/paddle/distributed/passes/auto_parallel_bf16.py b/python/paddle/distributed/passes/auto_parallel_bf16.py new file mode 100644 index 00000000000000..3344c648244643 --- /dev/null +++ b/python/paddle/distributed/passes/auto_parallel_bf16.py @@ -0,0 +1,661 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +from paddle import static +from paddle.distributed.auto_parallel.dist_context import DistributedContext +from paddle.distributed.auto_parallel.process_group import ( + get_world_process_group, +) +from paddle.distributed.auto_parallel.utils import ( + get_loss_op, + naive_set_dist_op_attr_for_program_by_mesh_and_mapping, + set_var_dist_attr, +) +from paddle.distributed.fleet.meta_optimizers.common import OpRole +from paddle.distributed.passes.pass_base import PassBase, register_pass +from paddle.fluid import unique_name +from paddle.fluid.contrib.mixed_precision.bf16 import ( + AutoMixedPrecisionListsBF16, +) +from paddle.fluid.contrib.mixed_precision.bf16.amp_utils import ( + _dtype_to_str, + _is_in_fp32_varnames, + _valid_types, + find_op_index, + find_true_post_op, +) +from paddle.fluid.contrib.mixed_precision.fp16_utils import ( + _rename_arg, + find_true_prev_op, +) +from paddle.fluid.framework import Block +from paddle.framework import core + +from ..auto_parallel.utils import is_backward_op, is_forward_op, is_loss_op + +world_process_group = get_world_process_group() + + +class BF16State(object): + def __init__(self, block): + self._block: Block = block + self._op_bf16_dict = {} + self._var_name_dict = {} + + def _is_bf16_op(self, op_id): + return self._op_bf16_dict.get(op_id, None) + + def _build_state(self, amp_lists, dist_context): + ops = self._block.ops + dist_op_context = dist_context.dist_op_context + training = False + for op in ops: + if int(op.attr("op_role")) == 257: + training = True + + if int(op.attr("op_role")) == int(OpRole.Forward): + self._mark_black_white_op(amp_lists, op, ops) + elif int(op.attr("op_role")) == int(OpRole.Backward): + if op.desc.original_id() in dist_op_context.grad_op_id_to_op_id: + fwd_op_id = dist_op_context.grad_op_id_to_op_id[ + op.desc.original_id() + ] + if self._is_bf16_op(fwd_op_id) is True: + self._op_bf16_dict[op.desc.original_id()] = True + elif self._is_bf16_op(fwd_op_id) is False: + self._op_bf16_dict[op.desc.original_id()] = False + elif int(op.attr("op_role")) == int(OpRole.Optimize): + break + return training + + def _mark_black_white_op(self, amp_lists, op, ops): + if op.type == "create_py_reader" or op.type == "read": + return + if amp_lists.fp32_varnames is not None and _is_in_fp32_varnames( + op, amp_lists + ): + self._op_bf16_dict[op.desc.original_id()] = False + return + if op.type in amp_lists.bf16_list: + self._op_bf16_dict[op.desc.original_id()] = True + elif op.type in amp_lists.gray_list: + is_fp32_op = False + is_bf16_op = False + for in_name in op.input_names: + if in_name: + for in_var_name in op.input(in_name): + in_var = self._block.var(in_var_name) + if in_var.op is None: + continue + elif in_var.op is op: + prev_op = find_true_prev_op(ops, op, in_var_name) + if prev_op is None: + continue + else: + prev_op = in_var.op + if ( + self._op_bf16_dict.get( + prev_op.desc.original_id(), False + ) + is False + or prev_op.type in amp_lists.fp32_list + ): + is_fp32_op = True + elif ( + self._op_bf16_dict.get( + prev_op.desc.original_id(), False + ) + is True + or prev_op.type in amp_lists.bf16_list + ): + is_bf16_op = True + if is_fp32_op: + self._op_bf16_dict[op.desc.original_id()] = False + elif is_bf16_op: + self._op_bf16_dict[op.desc.original_id()] = True + else: + pass + else: + self._op_bf16_dict[op.desc.original_id()] = False + + def cast_forward_program(self, dist_context): + ops = self._block.ops + idx = 0 + while idx < len(ops): + num_cast_ops = 0 + op = ops[idx] + if int(op.attr('op_role')) == 
int(OpRole.Backward): + break + if self._is_bf16_op(op.desc.original_id()) is False: + num_cast_ops = self._insert_cast_op_forward( + op, + idx, + core.VarDesc.VarType.BF16, + core.VarDesc.VarType.FP32, + dist_context, + ) + elif self._is_bf16_op(op.desc.original_id()) is True: + if op.has_attr('use_mkldnn'): + op._set_attr('use_mkldnn', True) + op._set_attr('mkldnn_data_type', 'bfloat16') + elif ( + op.has_attr('dtype') + and op.attr('dtype') == core.VarDesc.VarType.FP32 + ): + op._set_attr('dtype', core.VarDesc.VarType.BF16) + + num_cast_ops = self._insert_cast_op_forward( + op, + idx, + core.VarDesc.VarType.FP32, + core.VarDesc.VarType.BF16, + dist_context, + ) + else: + pass + + idx += num_cast_ops + 1 + self._block._sync_with_cpp() + + def _insert_cast_op_forward( + self, op, idx, src_dtype, dst_dtype, dist_context: DistributedContext + ): + num_cast_ops = 0 + var_name_dict = {} + for in_name in op.input_names: + if src_dtype == core.VarDesc.VarType.FP32 and op.type in [ + 'batch_norm', + 'fused_bn_add_activation', + 'layer_norm', + ]: + if in_name not in {'X', 'Z'}: + continue + for in_var_name in op.input(in_name): + in_var = self._block.var(in_var_name) + if in_var.type not in _valid_types or in_var.dtype == dst_dtype: + continue + if in_var.dtype == src_dtype: + cast_name = ( + in_var.name + '.cast_' + _dtype_to_str(dst_dtype) + ) + var_name_dict[in_var.name] = cast_name + out_var = self._block.vars.get(cast_name) + consume_op_attr = dist_context.get_op_dist_attr_for_program( + op + ) + assert consume_op_attr is not None + in_var_dist_attr = consume_op_attr.get_input_dist_attr( + in_var_name + ) + if out_var is None or out_var.dtype != dst_dtype: + assert in_var_dist_attr is not None + ref_mesh = in_var_dist_attr.process_mesh + ref_mapping = in_var_dist_attr.dims_mapping + consume_op_attr.set_input_dist_attr( + cast_name, in_var_dist_attr + ) + + out_var = self._block.create_var( + name=cast_name, + dtype=dst_dtype, + persistable=False, + stop_gradient=in_var.stop_gradient, + ) + set_var_dist_attr( + dist_context, out_var, ref_mapping, ref_mesh + ) + + cast_op = self._block._insert_op_without_sync( + idx, + type="cast", + inputs={"X": in_var}, + outputs={"Out": out_var}, + attrs={ + "in_dtype": in_var.dtype, + "out_dtype": out_var.dtype, + }, + ) + naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + cast_op, ref_mesh, ref_mapping, dist_context + ) + num_cast_ops += 1 + else: + consume_op_attr.set_input_dist_attr( + cast_name, in_var_dist_attr + ) + _rename_arg(op, in_var_name, out_var.name) + else: + if op.has_attr('in_dtype'): + op._set_attr('in_dtype', dst_dtype) + self._var_name_dict[op.desc.original_id()] = var_name_dict + + if ( + src_dtype == core.VarDesc.VarType.FP32 + and dst_dtype == core.VarDesc.VarType.BF16 + ): + for out_name in op.output_names: + if ( + op.type + in ['batch_norm', 'fused_bn_add_activation', 'layer_norm'] + and out_name != 'Y' + ): + continue + for out_var_name in op.output(out_name): + out_var = self._block.var(out_var_name) + if out_var.type not in _valid_types: + continue + if out_var.dtype == core.VarDesc.VarType.FP32: + out_var.desc.set_dtype(core.VarDesc.VarType.BF16) + if op.has_attr('out_dtype'): + op._set_attr('out_dtype', core.VarDesc.VarType.BF16) + return num_cast_ops + + def cast_backward_program(self, params_grads, dist_context): + self._block._sync_with_cpp() + ops = self._block.ops + appended_grad_times = 0 + dist_op_context = dist_context.dist_op_context + loss_op = get_loss_op(self._block) + idx = find_op_index(self._block.desc, 
loss_op.desc) + 1 + while idx < len(ops): + num_cast_ops = 0 + grad_op = ops[idx] + op_dist_attr = dist_context.get_op_dist_attr_for_program(grad_op) + if is_backward_op(grad_op) and ( + is_forward_op(ops[idx - 1]) or is_loss_op(ops[idx - 1]) + ): + if not op_dist_attr.is_recompute: + appended_grad_times += 1 + if ( + grad_op.desc.original_id() + in dist_op_context.grad_op_id_to_op_id + ): + if self._is_bf16_op(grad_op.desc.original_id()) is False: + num_cast_ops = self._insert_cast_op_backward( + grad_op, + idx, + core.VarDesc.VarType.BF16, + core.VarDesc.VarType.FP32, + dist_context, + appended_grad_times, + ) + elif self._is_bf16_op(grad_op.desc.original_id()) is True: + if grad_op.has_attr('use_mkldnn'): + grad_op._set_attr('use_mkldnn', True) + grad_op._set_attr('mkldnn_data_type', 'bfloat16') + elif ( + grad_op.has_attr('dtype') + and grad_op.attr('dtype') == core.VarDesc.VarType.FP32 + ): + grad_op._set_attr('dtype', core.VarDesc.VarType.BF16) + num_cast_ops = self._insert_cast_op_backward( + grad_op, + idx, + core.VarDesc.VarType.FP32, + core.VarDesc.VarType.BF16, + dist_context, + appended_grad_times, + ) + elif grad_op.type == "sum": + in_var_name = grad_op.desc.input_arg_names()[0] + src_dtype = self._block.var(in_var_name).dtype + for in_var_name in grad_op.desc.input_arg_names(): + assert src_dtype == self._block.var(in_var_name).dtype + out_var_name = grad_op.desc.output_arg_names()[0] + out_var = self._block.var(out_var_name) + if out_var.dtype != src_dtype: + out_var.desc.set_dtype(src_dtype) + elif int(grad_op.attr("op_role")) == 257: + pass + else: + raise ValueError( + "'{}' op is not supported in the complete amp pass.".format( + grad_op.type + ) + ) + idx += num_cast_ops + 1 + self._block._sync_with_cpp() + _update_backward_cast_ops(params_grads, dist_context) + + def _insert_cast_op_backward( + self, + grad_op, + idx, + src_dtype, + dst_dtype, + dist_context, + appended_grad_times, + ): + def _keep_fp32_input(op, in_name): + op_type = op.type + if op_type in ['layer_norm_grad']: + return in_name not in {'X', 'Y@GRAD'} + return False + + def _keep_fp32_output(op, out_name): + op_type = op.type + if op_type in ['layer_norm_grad']: + return out_name != 'X@GRAD' + return False + + num_cast_ops = 0 + original_id = grad_op.desc.original_id() + dist_op_context = dist_context.dist_op_context + fwd_op_id = dist_op_context.grad_op_id_to_op_id[original_id] + for in_name in grad_op.input_names: + if src_dtype == core.VarDesc.VarType.FP32 and _keep_fp32_input( + grad_op, in_name + ): + for in_var_name in grad_op.input(in_name): + in_var = self._block._find_var_recursive(in_var_name) + assert in_var.dtype == core.VarDesc.VarType.FP32 + continue + for in_var_name in grad_op.input(in_name): + in_var = self._block._find_var_recursive(in_var_name) + if in_var.dtype == src_dtype: + consume_op_attr = dist_context.get_op_dist_attr_for_program( + grad_op + ) + if in_var_name in self._var_name_dict[fwd_op_id]: + cast_name = self._var_name_dict[fwd_op_id][in_var_name] + grad_op.desc._rename_input(in_var_name, cast_name) + in_var_dist_attr = consume_op_attr.get_input_dist_attr( + in_var_name + ) + consume_op_attr.set_input_dist_attr( + cast_name, in_var_dist_attr + ) + else: + assert ( + in_var.dtype == dst_dtype + ), "op [{}] expect input [{}] to be dtype [{}] BUT got [{}]. 
{}".format( + grad_op.type, + in_name, + dst_dtype, + in_var.dtype, + str(grad_op), + ) + + for out_name in grad_op.output_names: + if src_dtype == core.VarDesc.VarType.FP32 and _keep_fp32_output( + grad_op, out_name + ): + for out_var_name in grad_op.output(out_name): + out_var = self._block._find_var_recursive(out_var_name) + assert out_var.dtype == core.VarDesc.VarType.FP32 + continue + + for out_var_name in grad_op.output(out_name): + out_var = self._block._find_var_recursive(out_var_name) + out_var_name_prefix = out_var_name[: out_var_name.find('@')] + fwd_var = self._block._find_var_recursive(out_var_name_prefix) + if out_var.dtype != fwd_var.dtype: + out_var.desc.set_dtype(fwd_var.dtype) + + if out_var.dtype == src_dtype: + if out_var_name_prefix in self._var_name_dict[fwd_op_id]: + consume_op_attr = ( + dist_context.get_op_dist_attr_for_program(grad_op) + ) + fwd_cast_name = self._var_name_dict[fwd_op_id][ + out_var_name_prefix + ] + suffix = '' + if "@RENAME" in out_var_name: + suffix = out_var_name[ + out_var_name.find("@RENAME") : + ] + cast_name = fwd_cast_name + "@GRAD" + suffix + cast_var = self._block.vars.get(cast_name) + if cast_var is None or cast_var.dtype != dst_dtype: + grad_op.desc._rename_output(out_var_name, cast_name) + out_var_dist_attr = ( + consume_op_attr.get_output_dist_attr( + out_var_name + ) + ) + ref_mesh = out_var_dist_attr.process_mesh + ref_mapping = out_var_dist_attr.dims_mapping + consume_op_attr.set_output_dist_attr( + cast_name, out_var_dist_attr + ) + assert ref_mapping is not None + cast_var = self._block.create_var( + name=cast_name, + shape=out_var.shape, + dtype=dst_dtype, + persistable=False, + stop_gradient=out_var.stop_gradient, + ) + set_var_dist_attr( + dist_context, cast_var, ref_mapping, ref_mesh + ) + dist_op_context.grad_var_to_var[ + appended_grad_times + ][cast_name] = fwd_cast_name + + cast_op = self._block._insert_op( + idx + 1, + type="cast", + inputs={"X": cast_var}, + outputs={"Out": out_var}, + attrs={ + "in_dtype": cast_var.dtype, + "out_dtype": out_var.dtype, + "op_role": OpRole.Backward, + }, + ) + cast_op._remove_attr("op_role_var") + cast_op._remove_attr("op_namescope") + cast_op._remove_attr("with_quant_attr") + naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + cast_op, ref_mesh, ref_mapping, dist_context + ) + num_cast_ops += 1 + else: + assert out_var.dtype == dst_dtype + return num_cast_ops + + +def _update_backward_cast_ops(params_grads, dist_context): + """ + move param grad cast to the end of backward segment + in order to enabel fp16 allreduce + """ + # TODO filter optimize ops in future + + main_block = paddle.static.default_main_program().global_block() + main_block._sync_with_cpp() + + for p, g in params_grads: + op = g.op + if g.dtype == core.VarDesc.VarType.FP32 and op.type == 'cast': + if int(op.attr('op_role')) == int(OpRole.Backward) and op.has_attr( + 'op_role_var' + ): + op._remove_attr("op_role_var") + + post_ops = find_true_post_op(main_block.ops, op, g.name) + if post_ops: + raise ValueError( + "The cast op {0}'s output should not be" + "used by a non-optimize op, however, it" + "is used by {1}".format(op, post_ops[0]) + ) + + if op == main_block.ops[-1]: + continue + + # add new op in the python and cpp at the same time + new_op_desc = main_block.desc.append_op() + new_op_desc.copy_from(op.desc) + new_op = paddle.fluid.framework.Operator( + block=main_block, + desc=new_op_desc, + type=None, + inputs=None, + outputs=None, + attrs=None, + ) + main_block.ops.append(new_op) + + # dist attr + 
param_dist_attr = dist_context.get_tensor_dist_attr_for_program(p) + output_dist_attr = dist_context.get_tensor_dist_attr_for_program( + main_block.var(op.output_arg_names[0]) + ) + assert param_dist_attr is not None + assert output_dist_attr is not None + naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + new_op, + param_dist_attr.process_mesh, + param_dist_attr.dims_mapping, + dist_context, + ) + + output_dist_attr.process_mesh = param_dist_attr.process_mesh + output_dist_attr.dims_mapping = param_dist_attr.dims_mapping + + op_idx = find_op_index(main_block.desc, op.desc) + if op_idx == -1: + raise ValueError("The op {0} is not in program".format(op)) + main_block._remove_op(op_idx, sync=False) + + main_block._sync_with_cpp() + + +@register_pass("auto_parallel_bf16") +class BF16Pass(PassBase): + def __init__(self): + super().__init__() + self.set_attr("dist_context", None) + self.set_attr("custom_bf16_list", None) + self.set_attr("custom_fp32_list", None) + self.set_attr("custom_fp32_varnames", None) + self.set_attr("input_data", []) + self.set_attr("loss", None) + self.set_attr("params_grads", []) + self.set_attr("use_bf16_guard", False) + self._loss = None + + def _check_self(self): + if self.get_attr("dist_context") is None: + return False + return True + + def _check_conflict(self, other_pass): + return True + + def _apply_single_impl(self, main_program, startup_program, context): + self.dist_context = self.get_attr("dist_context") + params_grads = self.get_attr("params_grads") + + amp_lists = AutoMixedPrecisionListsBF16( + self.get_attr("custom_bf16_list"), + self.get_attr("custom_fp32_list"), + self.get_attr("custom_fp32_varnames"), + ) + + with static.program_guard(main_program, startup_program): + amp_state = BF16State(main_program.global_block()) + training = amp_state._build_state(amp_lists, self.dist_context) + amp_state.cast_forward_program(self.dist_context) + + if training: + with paddle.static.program_guard(main_program, startup_program): + amp_state.cast_backward_program(params_grads, self.dist_context) + self._scale_loss() + + def _scale_loss(self): + + main_block = paddle.static.default_main_program().global_block() + main_block._sync_with_cpp() + OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() + + loss = self.get_attr("loss") + assert loss is not None + loss_op = loss.op + loss_op_dist_attr = self.dist_context.get_op_dist_attr_for_program( + loss_op + ) + if loss.dtype != core.VarDesc.VarType.FP32: + tmp_name = unique_name.generate(loss.name + ".cast_fp32") + cast_loss = main_block.create_var( + name=tmp_name, dtype=core.VarDesc.VarType.FP32 + ) + loss_dist_attr = self.dist_context.get_tensor_dist_attr_for_program( + loss + ) + ref_mesh = loss_op_dist_attr.process_mesh + self.dist_context.set_tensor_dist_attr_for_program( + cast_loss, loss_dist_attr + ) + + loss_op_idx = find_op_index(main_block.desc, loss_op.desc) + cast_op = main_block._insert_op( + loss_op_idx + 1, + type='cast', + inputs={"X": [loss]}, + outputs={"Out": [cast_loss]}, + attrs={ + "in_dtype": loss.dtype, + "out_dtype": core.VarDesc.VarType.FP32, + "op_role": loss_op.all_attrs()[OP_ROLE_KEY], + }, + ) + + loss_op._set_attr( + OP_ROLE_KEY, core.op_proto_and_checker_maker.OpRole.Forward + ) + naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + cast_op, ref_mesh, [-1], self.dist_context + ) + first_backward_op = main_block.ops[loss_op_idx + 2] + assert ( + first_backward_op.type == "fill_constant" + and int(first_backward_op.all_attrs()[OP_ROLE_KEY]) == 257 + ) + 
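# [Editor's note, illustrative only, not part of the patch] The literal 257 that
# this file compares against op_role (in _build_state, cast_backward_program and
# the assert just above) appears to be the bitwise OR of the Backward and Loss
# role flags, i.e. the role carried by the fill_constant op that seeds the loss
# gradient. A minimal, self-contained sketch of that reading; the flag values
# below are written out as assumptions rather than imported from paddle:
OP_ROLE_BACKWARD = 0x0001  # assumed value of OpRole.Backward
OP_ROLE_LOSS = 0x0100      # assumed value of OpRole.Loss
assert OP_ROLE_BACKWARD | OP_ROLE_LOSS == 257  # the magic number used in this pass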
cast_loss_grad = main_block.create_var( + name=unique_name.generate(tmp_name + "@GRAD"), + shape=loss.shape, + dtype=core.VarDesc.VarType.FP32, + persistable=loss.persistable, + ) + set_var_dist_attr(self.dist_context, cast_loss_grad, [-1], ref_mesh) + pre_grad_name = first_backward_op.output_arg_names[0] + first_backward_op._rename_output(pre_grad_name, cast_loss_grad.name) + cast_grad_op = main_block._insert_op( + loss_op_idx + 3, + type='cast', + inputs={'X': [cast_loss_grad]}, + outputs={'Out': [pre_grad_name]}, + attrs={ + "in_dtype": core.VarDesc.VarType.FP32, + "out_dtype": core.VarDesc.VarType.FP16, + 'op_role': core.op_proto_and_checker_maker.OpRole.Backward, + }, + ) + naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + cast_grad_op, ref_mesh, [-1], self.dist_context + ) + loss = cast_loss + self._loss = loss + main_block._sync_with_cpp() + + def get_loss(self): + if self._loss: + return self._loss + else: + return self.get_attr("loss") diff --git a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py index 66f80ee9950491..8cb11270b12573 100644 --- a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py +++ b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py @@ -279,7 +279,7 @@ def _could_be_overlap(self): # NOTE current different nccl comm will use different cuda stream # so if there too many dp group there will be too many stream need to be # created and sync. - # revise here when framework support custom stream in static mode. + # revise here when framework support custom stream in static graph mode. num_dp_comm_stream = len(set(self._group_to_grad_name_map.keys())) if num_dp_comm_stream > __max_stream_num_allow__: return False diff --git a/python/paddle/distributed/passes/auto_parallel_quantization.py b/python/paddle/distributed/passes/auto_parallel_quantization.py index 924499c27355be..e7ee507c39951b 100644 --- a/python/paddle/distributed/passes/auto_parallel_quantization.py +++ b/python/paddle/distributed/passes/auto_parallel_quantization.py @@ -18,14 +18,14 @@ import paddle from paddle.fluid import core, framework -from paddle.fluid.contrib.slim.quantization import ( +from paddle.fluid.dygraph.parallel import ParallelEnv +from paddle.static.quantization import ( AddQuantDequantForInferencePass, AddQuantDequantPassV2, OutScaleForTrainingPass, QuantizationTransformPassV2, utils, ) -from paddle.fluid.dygraph.parallel import ParallelEnv from ..auto_parallel.converter import Converter from ..auto_parallel.dist_attribute import ( diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 53d50a8b4a3ed3..eaf64e6dc6c0bd 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -90,7 +90,6 @@ DistributeTranspilerConfig, ) from .lod_tensor import create_lod_tensor, create_random_int_lodtensor -from . import clip from . import profiler from . import unique_name from . import parallel_executor @@ -99,7 +98,6 @@ from .compiler import * from paddle.fluid.layers.math_op_patch import monkey_patch_variable from . 
import install_check -from .dygraph.nn import * from .dygraph.layers import * from .dygraph.base import enable_dygraph, disable_dygraph from .io import save, load, load_program_state, set_program_state @@ -165,7 +163,6 @@ 'ParamAttr', 'WeightNormParamAttr', 'DataFeeder', - 'clip', 'profiler', 'unique_name', 'Scope', diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py deleted file mode 100644 index ffaa84ed3e53c5..00000000000000 --- a/python/paddle/fluid/clip.py +++ /dev/null @@ -1,944 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import warnings - -import functools -import paddle -from . import layers -from . import framework -from . import core -from . import name_scope -from .dygraph import base as imperative_base -from .data_feeder import check_variable_and_dtype -from .framework import in_dygraph_mode -from .layer_helper import LayerHelper -from .framework import default_main_program -from paddle import _C_ops, _legacy_C_ops - -__all__ = [ - 'set_gradient_clip', - 'ErrorClipByValue', - 'ClipGradByValue', - 'ClipGradByNorm', - 'ClipGradByGlobalNorm', -] - -_clip_by_global_norm_using_mp_type_flag = False - - -def _clip_by_global_norm_using_mp_type(*args): - global _clip_by_global_norm_using_mp_type_flag - assert len(args) <= 1 - if len(args) == 1: - assert isinstance(args[0], bool) - old_value = _clip_by_global_norm_using_mp_type_flag - _clip_by_global_norm_using_mp_type_flag = args[0] - return old_value - else: - return _clip_by_global_norm_using_mp_type_flag - - -def _cast_to_mp_type_if_enabled(x): - if ( - x.dtype == core.VarDesc.VarType.FP16 - or x.dtype == core.VarDesc.VarType.BF16 - ) and _clip_by_global_norm_using_mp_type(): - return x.astype(core.VarDesc.VarType.FP32) - else: - return x - - -def _squared_l2_norm(x): - r""" - This OP returns the squared L2 norm of a tensor. - """ - - x = _cast_to_mp_type_if_enabled(x) - if ( - core.is_compiled_with_xpu() - or x.dtype == core.VarDesc.VarType.FP16 - or x.dtype == core.VarDesc.VarType.BF16 - ): - square = paddle.square(x) - sum_square = paddle.sum(square) - return sum_square - - if in_dygraph_mode(): - return _C_ops.squared_l2_norm(x) - else: - op_type = 'squared_l2_norm' - check_variable_and_dtype(x, 'x', ['float32', 'float64'], op_type) - helper = LayerHelper(op_type, **locals()) - out = helper.create_variable_for_type_inference(x.dtype) - - inputs = {"X": x} - outputs = {'Out': out} - helper.append_op(type=op_type, inputs=inputs, outputs=outputs) - return out - - -class BaseErrorClipAttr: - def __str__(self): - raise NotImplementedError() - - def _append_clip_op(self, block, grad_name): - raise NotImplementedError() - - -class ErrorClipByValue(BaseErrorClipAttr): - r""" - Clips tensor values to the range [min, max]. - - Given a tensor ``t`` (see Examples below), this operation clips its value \ - to ``min`` and ``max`` inplace. - - - Any values less than min are set to min. - - Any values greater than max are set to max. 
- - Args: - max (float): The maximum value to clip by. - min (float, optional): The minimum value to clip by. if not set by user, \ - will be set to ``-max`` by framework. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - paddle.enable_static() - BATCH_SIZE = 128 - CLIP_MAX = 2e-6 - CLIP_MIN = -1e-6 - prog = fluid.framework.Program() - with fluid.program_guard(main_program=prog): - image = fluid.layers.data( - name='x', shape=[784], dtype='float32') - hidden1 = fluid.layers.fc(input=image, size=128, act='relu') - hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu') - predict = fluid.layers.fc( - input=hidden2, size=10, act='softmax') - label = fluid.layers.data(name='y', shape=[1], dtype='int64') - cost = paddle.nn.functional.cross_entropy(input=predict, label=label, reduction='none', use_softmax=False) - avg_cost = paddle.mean(cost) - prog_clip = prog.clone() - prog_clip.block(0).var(hidden1.name)._set_error_clip( - fluid.clip.ErrorClipByValue( - max=CLIP_MAX, min=CLIP_MIN - ) - ) - """ - - def __init__(self, max, min=None): - max = float(max) - if min is None: - min = -max - else: - min = float(min) - self.max = max - self.min = min - - def __str__(self): - return "ByValue, min=%f, max=%f" % (self.min, self.max) - - def _append_clip_op(self, block, grad_name): - clip_op_desc = block.desc.append_op() - clip_op_desc.set_type("clip") - clip_op_desc.set_input("X", [grad_name]) - clip_op_desc.set_output("Out", [grad_name]) - clip_op_desc._set_attr("min", self.min) - clip_op_desc._set_attr("max", self.max) - - -def error_clip_callback(block, context): - # the context is a grad_to_var map - grad_to_var = context - op_desc = block.desc.op(block.desc.op_size() - 1) - for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]: - fwd_var = block._var_recursive(grad_to_var[grad_n]) - error_clip = getattr(fwd_var, "error_clip", None) - if not ( - error_clip is None or isinstance(error_clip, BaseErrorClipAttr) - ): - raise TypeError( - "Variable's error_clip should be an instance of BaseErrorClipAttr or None." - ) - if error_clip is not None: - error_clip._append_clip_op(block, grad_n) - - -class ClipGradBase: - def __init__(self): - super().__init__() - - def __str__(self): - raise NotImplementedError() - - @imperative_base.no_grad - def _dygraph_clip(self, params_grads): - raise NotImplementedError - - def _static_clip(self, params_grads): - raise NotImplementedError - - def __call__(self, params_grads): - if in_dygraph_mode(): - return self._dygraph_clip(params_grads) - else: - for p, g in params_grads: - if getattr(p, 'gradient_clip_attr', None) is not None: - warnings.warn( - "'set_gradient_clip' will be ineffective, because you have " - "set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' " - "is redundant and you can remove it." - ) - break - return self._static_clip(params_grads) - - def _process_context(self, context, param, grad): - raise NotImplementedError() - - def _create_operators(self, param, grad): - raise NotImplementedError() - - -class ClipGradByValue(ClipGradBase): - """ - Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max]. - - - Any values less than min are set to ``min``. - - - Any values greater than max are set to ``max``. - - The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``. - If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. 
- - Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` - (for example: :ref:`api_paddle_optimizer_SGD`). - - Note: - ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0. - Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. - - Args: - max (float): The maximum value to clip by. - min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max`` - automatically. In this case, ``max`` must be greater than 0. - - Examples: - .. code-block:: python - - import paddle - - x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(in_features=10, out_features=10, - weight_attr=paddle.ParamAttr(need_clip=True), - bias_attr=paddle.ParamAttr(need_clip=False)) - out = linear(x) - loss = paddle.mean(out) - loss.backward() - - clip = paddle.nn.ClipGradByValue(min=-1, max=1) - sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) - sdg.step() - """ - - def __init__(self, max, min=None): - super().__init__() - if min is None: - assert max > 0.0 - min = -max - self.max = float(max) - self.min = float(min) - - def __str__(self): - return "Clip Gradient By Value, min = %f, max=%f" % (self.min, self.max) - - @imperative_base.no_grad - def _dygraph_clip(self, params_grads): - params_and_grads = [] - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) - continue - new_grad = paddle.clip(x=g, min=self.min, max=self.max) - params_and_grads.append((p, new_grad)) - return params_and_grads - - def _static_clip(self, params_grads): - params_and_grads = [] - param_new_grad_name_dict = dict() - with framework.name_scope('gradient_clip'): - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) - continue - - with p.block.program._optimized_guard([p, g]): - new_grad = layers.clip(x=g, min=self.min, max=self.max) - params_and_grads.append((p, new_grad)) - param_new_grad_name_dict[p.name] = new_grad.name - _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict) - return params_and_grads - - def _process_context(self, context, param, grad): - pass - - def _create_operators(self, param, grad): - new_grad = layers.clip(x=grad, min=self.min, max=self.max) - return param, new_grad - - -class ClipGradByNorm(ClipGradBase): - r""" - Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` . - - - If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio. - - - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done. - - The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``. - If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. - - Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` - (for example: :ref:`api_paddle_optimizer_SGD`). - - The clipping formula is: - - .. math:: - Out = - \left\{ - \begin{array}{ccl} - X & & if (norm(X) \leq clip\_norm) \\ - \frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\ - \end{array} - \right. - - - where :math:`norm(X)` represents the L2 norm of :math:`X`. - - .. 
math:: - norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}} - - Note: - ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0. - Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. - - Args: - clip_norm(float): The maximum norm value. - - Examples: - .. code-block:: python - - import paddle - - x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(in_features=10, out_features=10, - weight_attr=paddle.ParamAttr(need_clip=True), - bias_attr=paddle.ParamAttr(need_clip=False)) - out = linear(x) - loss = paddle.mean(out) - loss.backward() - - clip = paddle.nn.ClipGradByNorm(clip_norm=1.0) - sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) - sdg.step() - """ - - def __init__(self, clip_norm): - super().__init__() - self.clip_norm = float(clip_norm) - - def __str__(self): - return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm - - @imperative_base.no_grad - def _dygraph_clip(self, params_grads): - params_and_grads = [] - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) - continue - new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm) - params_and_grads.append((p, new_grad)) - return params_and_grads - - def _static_clip(self, params_grads): - params_and_grads = [] - with framework.name_scope('gradient_clip'): - param_new_grad_name_dict = dict() - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) - continue - - with p.block.program._optimized_guard([p, g]): - new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm) - param_new_grad_name_dict[p.name] = new_grad.name - params_and_grads.append((p, new_grad)) - _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict) - return params_and_grads - - def _process_context(self, context, param, grad): - pass - - def _create_operators(self, param, grad): - new_grad = layers.clip_by_norm(x=grad, max_norm=self.clip_norm) - return param, new_grad - - -_allow_pure_fp16_global_norm_clip_flag = False - - -def _allow_pure_fp16_global_norm_clip(*args): - global _allow_pure_fp16_global_norm_clip_flag - if len(args) == 0: - return _allow_pure_fp16_global_norm_clip_flag - else: - assert len(args) == 1 and isinstance(args[0], bool) - old_value = _allow_pure_fp16_global_norm_clip_flag - _allow_pure_fp16_global_norm_clip_flag = args[0] - return old_value - - -class ClipGradByGlobalNorm(ClipGradBase): - r""" - Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in - :math:`t\_list` , and limit it to ``clip_norm`` . - - - If the global norm is greater than ``clip_norm`` , all elements of :math:`t\_list` will be compressed by a ratio. - - - If the global norm is less than or equal to ``clip_norm`` , nothing will be done. - - The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``. - If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. - - Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` - (for example: :ref:`api_paddle_optimizer_SGD`). - - The clipping formula is: - - .. math:: - - t\_list[i] = t\_list[i] * \frac{clip\_norm}{\max(global\_norm, clip\_norm)} - - where: - - .. 
math:: - - global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2} - - Note: - ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0. - Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. - - Args: - clip_norm (float): The maximum norm value. - group_name (str, optional): The group name for this clip. Default value is ``default_group``. - - Examples: - .. code-block:: python - - import paddle - - x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(in_features=10, out_features=10, - weight_attr=paddle.ParamAttr(need_clip=True), - bias_attr=paddle.ParamAttr(need_clip=False)) - out = linear(x) - loss = paddle.mean(out) - loss.backward() - - clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) - sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) - sdg.step() - """ - - def __init__( - self, clip_norm, group_name="default_group", auto_skip_clip=False - ): - super().__init__() - self.clip_norm = float(clip_norm) - self.group_name = group_name - assert isinstance(auto_skip_clip, bool) - self.auto_skip_clip = auto_skip_clip - - def __str__(self): - return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm) - - @imperative_base.no_grad - def _dygraph_clip(self, params_grads): - params_and_grads = [] - sum_square_list = [] - sum_square_list_fp16 = [] - sum_square_list_fp32 = [] - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - continue - merge_grad = g - - if in_dygraph_mode() and g.is_selected_rows(): - merge_grad = layers.merge_selected_rows(g) - merge_grad = merge_grad._get_tensor_from_selected_rows() - - elif g.type == core.VarDesc.VarType.SELECTED_ROWS: - merge_grad = layers.merge_selected_rows(g) - merge_grad = layers.get_tensor_from_selected_rows(merge_grad) - - sum_square = _squared_l2_norm(merge_grad) - if ( - sum_square.dtype == core.VarDesc.VarType.FP16 - or sum_square.dtype == core.VarDesc.VarType.BF16 - ): - sum_square_list_fp16.append(sum_square) - elif sum_square.dtype == core.VarDesc.VarType.FP32: - sum_square_list_fp32.append(sum_square) - else: - sum_square_list.append(sum_square) - - # all parameters have been filterd out - if ( - len(sum_square_list) - + len(sum_square_list_fp16) - + len(sum_square_list_fp32) - == 0 - ): - return params_grads - - sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32" - global_norm_var = [] - if len(sum_square_list_fp16) > 0: - global_norm_var_fp16 = paddle.add_n(sum_square_list_fp16) - global_norm_var.append(global_norm_var_fp16.astype(sum_dtype)) - if len(sum_square_list_fp32) > 0: - global_norm_var_fp32 = paddle.add_n(sum_square_list_fp32) - if sum_dtype == 'float32': - global_norm_var.append(global_norm_var_fp32) - else: - global_norm_var.append(global_norm_var_fp32.astype(sum_dtype)) - if len(sum_square_list) > 0: - global_norm_var_fp64 = paddle.add_n(sum_square_list) - global_norm_var.append(global_norm_var_fp64) - global_norm_var = paddle.add_n(global_norm_var) - global_norm_var = paddle.sqrt(global_norm_var) - max_global_norm = layers.fill_constant( - shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm - ) - - need_clip = False - if not self.auto_skip_clip: # always apply clip - need_clip = True - clip_var = paddle.divide( - x=max_global_norm, - y=paddle.maximum(x=global_norm_var, y=max_global_norm), - ) - elif global_norm_var > max_global_norm: - # only when global_norm_var > max_global_norm, grad need clip - need_clip = True - 
clip_var = paddle.divide(x=max_global_norm, y=global_norm_var) - - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) - continue - # TODO(wangxi): use inplace elementwise_mul - if need_clip: - clip_input = ( - clip_var.astype(g.dtype) - if clip_var.dtype != g.dtype - else clip_var - ) - new_grad = paddle.multiply(g, clip_input) - params_and_grads.append((p, new_grad)) - else: - params_and_grads.append((p, g)) - - return params_and_grads - - def _static_clip(self, params_grads): - params_and_grads = [] - sum_square_list = [] - sum_square_list_fp16 = [] - sum_square_list_fp32 = [] - with framework.name_scope('gradient_clip'): - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - continue - merge_grad = g - with p.block.program._optimized_guard([p, g]): - if g.type == core.VarDesc.VarType.SELECTED_ROWS: - merge_grad = layers.merge_selected_rows(g) - merge_grad = layers.get_tensor_from_selected_rows( - merge_grad - ) - sum_square = _squared_l2_norm(merge_grad) - if sum_square.dtype == core.VarDesc.VarType.FP16: - sum_square_list_fp16.append(sum_square) - elif sum_square.dtype == core.VarDesc.VarType.FP32: - sum_square_list_fp32.append(sum_square) - else: - sum_square_list.append(sum_square) - - # all parameters have been filterd out - if ( - len(sum_square_list) - + len(sum_square_list_fp16) - + len(sum_square_list_fp32) - == 0 - ): - return params_grads - - with p.block.program._optimized_guard([p, g]): - sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32" - - global_norm_var = [] - if len(sum_square_list_fp16) > 0: - global_norm_var_fp16 = layers.sums(sum_square_list_fp16) - if ( - sum_square_list_fp32 - or sum_square_list - or not _allow_pure_fp16_global_norm_clip() - ): - global_norm_var.append( - global_norm_var_fp16.astype(sum_dtype) - ) - else: - global_norm_var.append(global_norm_var_fp16) - if len(sum_square_list_fp32) > 0: - global_norm_var_fp32 = layers.sums(sum_square_list_fp32) - if sum_dtype == 'float32': - global_norm_var.append(global_norm_var_fp32) - else: - global_norm_var.append( - global_norm_var_fp32.astype(sum_dtype) - ) - if len(sum_square_list) > 0: - # fp64 - global_norm_var_other_dtype = layers.sums(sum_square_list) - global_norm_var.append(global_norm_var_other_dtype) - - global_norm_var = ( - layers.sums(global_norm_var) - if len(global_norm_var) > 1 - else global_norm_var[0] - ) - global_norm_var = paddle.sqrt(x=global_norm_var) - max_global_norm = layers.fill_constant( - shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm - ) - scale_var = paddle.divide( - x=max_global_norm, - y=paddle.maximum(x=max_global_norm, y=global_norm_var), - ) - param_new_grad_name_dict = dict() - for p, g in params_grads: - if g is None: - continue - if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) - continue - - with p.block.program._optimized_guard([p, g]): - new_g = _cast_to_mp_type_if_enabled(g) - # inplace - scale_input = ( - scale_var.astype('float16') - if new_g.dtype == core.VarDesc.VarType.FP16 - and scale_var.dtype != core.VarDesc.VarType.FP16 - else scale_var - ) - # NOTE(Yuang Liu): For pure dp with gradient merge, the p and g - # will be in different blocks with the gradient clip related ops. - # We need to handle the correct block, otherwise will encounter - # a 'NotFoundError' during compile time. 
- block = default_main_program().current_block() - block.append_op( - type='elementwise_mul', - inputs={'X': new_g, 'Y': scale_input}, - outputs={'Out': new_g}, - ) - if new_g is not g: - block.append_op( - type='cast', - inputs={'X': new_g}, - outputs={'Out': g}, - attrs={ - 'in_dtype': new_g.dtype, - 'out_dtype': g.dtype, - }, - ) - - param_new_grad_name_dict[p.name] = g.name - params_and_grads.append((p, g)) - - _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict) - return params_and_grads - - def _process_context(self, context, param, grad): - if self.group_name not in context: - context[self.group_name] = [] - context[self.group_name + "_clip_value"] = self.clip_norm - context[self.group_name + "_clip"] = layers.fill_constant( - shape=[1], dtype=grad.dtype, value=self.clip_norm - ) - else: - if not self.clip_norm == context[self.group_name + "_clip_value"]: - raise ValueError( - "All parameters' 'clip_norm' of a same group should be the same" - ) - - merge_grad = grad - if grad.type == core.VarDesc.VarType.SELECTED_ROWS: - merge_grad = layers.merge_selected_rows(grad) - merge_grad = layers.get_tensor_from_selected_rows(merge_grad) - - local_norm_var = _squared_l2_norm(merge_grad) - context[self.group_name].append(local_norm_var) - - self.context = context - - def _create_operators(self, param, grad): - group_scale_name = self.group_name + "_scale" - if group_scale_name not in self.context: - group_norm_var = layers.sums(input=self.context[self.group_name]) - group_norm_var = paddle.sqrt(x=group_norm_var) - clip_var = self.context[self.group_name + "_clip"] - group_scale_var = paddle.divide( - x=clip_var, - y=paddle.maximum(x=clip_var, y=group_norm_var), - ) - assert group_scale_var.shape == (1,) - self.context[group_scale_name] = group_scale_var - - # inplace - param.block.append_op( - type='elementwise_mul', - inputs={'X': grad, 'Y': self.context[group_scale_name]}, - outputs={'Out': grad}, - ) - - return param, grad - - -@framework.dygraph_not_support -def set_gradient_clip(clip, param_list=None, program=None): - """ - :api_attr: Static Graph - - Warning: - - This API must be used after building network, and before ``minimize`` , - and it may be removed in future releases, so it is not recommended. - It is recommended to set ``grad_clip`` when initializing the ``optimizer`` , - this is a better method to clip gradient. There are three clipping strategies: - :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` . - - To specify parameters that require gradient clip. - - Args: - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default value: None, and there is no - gradient clipping. - param_list (list(Variable), optional): Parameters that require gradient clip. - It can be a list of parameter or a list of parameter's name. - Default None, meaning that all parameters in the program will be included. - program (Program, optional): The program where parameters are located. - Default None, meaning that using :ref:`api_fluid_default_main_program` . - - Returns: - None - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - - def network(): - image = fluid.data(name='image', shape=[ - None, 28], dtype='float32') - param_attr1 = fluid.ParamAttr("fc1_param") - fc1 = fluid.layers.fc(image, size=10, param_attr=param_attr1) - param_attr2 = fluid.ParamAttr("fc2_param") - fc2 = fluid.layers.fc(fc1, size=10, param_attr=param_attr2) - loss = fluid.layers.reduce_mean(fc2) - return loss - - - # network 1: clip all parameter gradient - with fluid.program_guard(fluid.Program(), fluid.Program()): - loss = network() - fluid.clip.set_gradient_clip( - fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0)) - sgd = fluid.optimizer.SGD(learning_rate=1e-3) - sgd.minimize(loss) - - # network 2: clip parameter gradient by name - with fluid.program_guard(fluid.Program(), fluid.Program()): - loss = network() - fluid.clip.set_gradient_clip( - fluid.clip.GradientClipByValue(min=-1.0, max=1.0), - param_list=["fc1_param", "fc2_param"]) - sgd = fluid.optimizer.SGD(learning_rate=1e-3) - sgd.minimize(loss) - - # network 3: clip parameter gradient by value - with fluid.program_guard(fluid.Program(), fluid.Program()): - loss = network() - param_var1 = fluid.default_main_program().global_block().var("fc1_param") - param_var2 = fluid.default_main_program().global_block().var("fc2_param") - fluid.clip.set_gradient_clip( - fluid.clip.GradientClipByValue(min=-1.0, max=1.0), - param_list=[param_var1, param_var2]) - sgd = fluid.optimizer.SGD(learning_rate=1e-3) - sgd.minimize(loss) - - # network 4: use 'set_gradient_clip' and 'optimize(grad_clip=clip)' together - with fluid.program_guard(fluid.Program(), fluid.Program()): - loss = network() - clip1 = fluid.clip.GradientClipByValue(min=-1.0, max=1.0) - clip2 = fluid.clip.GradientClipByNorm(clip_norm=1.0) - # Set the gradient clipping strategy: clip1 - fluid.clip.set_gradient_clip(clip1) - # Set the gradient clipping strategy: clip2 - sgd = fluid.optimizer.SGD(learning_rate=1e-3, grad_clip=clip2) - sgd.minimize(loss) - # 'set_gradient_clip' will not take effect when setting has a conflict, - # and the gradient clipping strategy will be 'clip2' - - - """ - warnings.warn( - "Caution! 'set_gradient_clip' is not recommended " - "and may be deprecated in future! " - "We recommend a new strategy: set 'grad_clip' " - "when initializing the 'optimizer'. " - "This method can reduce the mistakes, please " - "refer to documention of 'optimizer'." - ) - - if not isinstance(clip, ClipGradBase): - raise TypeError( - "'clip' should be an instance of ClipGradBase's derived class" - ) - if program is None: - program = framework.default_main_program() - - for op in program.block(0).ops: - if 'op_namescope' in op.all_attrs() and "optimizer" in op.attr( - "op_namescope" - ): - warnings.warn( - "'minimize' has been invoked before, this will make 'set_gradient_clip' " - "be ineffective! Please invoke 'set_gradient_clip' before 'minimize'." - ) - break - - if param_list is None: - param_list = program.block(0).all_parameters() - if all(isinstance(elem, str) for elem in param_list): - param_list = [program.block(0).var(elem) for elem in param_list] - if not all(isinstance(elem, framework.Parameter) for elem in param_list): - raise TypeError( - "'param_list' should be a list of Parameter or basestring(parameter's name)." 
- ) - - for param in param_list: - param.gradient_clip_attr = copy.deepcopy(clip) - - -def append_gradient_clip_ops(param_grads): - context = dict() - for p, g in param_grads: - if g is None: - continue - with p.block.program._optimized_guard([p, g]), framework.name_scope( - 'gradient_clip' - ): - clip_attr = getattr(p, 'gradient_clip_attr', None) - if clip_attr is None: - return param_grads - if not isinstance(clip_attr, ClipGradBase): - raise TypeError( - "clip attribute should be an instance of GradientClipBase" - ) - - clip_attr._process_context(context=context, param=p, grad=g) - - res = [] - param_new_grad_name_dict = dict() - for p, g in param_grads: - if g is None: - continue - with p.block.program._optimized_guard([p, g]), framework.name_scope( - 'gradient_clip' - ): - param, new_grad = clip_attr._create_operators(param=p, grad=g) - param_new_grad_name_dict[param.name] = new_grad.name - res.append([param, new_grad]) - - _correct_clip_op_role_var(res, param_new_grad_name_dict) - return res - - -# change wrong mapping relation between param & grad in clip op -# Note: This function is sensitive to the time cost of the network with gradient clipping -# and should not be changed easily. If you must change, please test the time cost. -def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict): - block_id_list = [] - if len(param_new_grad_name_dict) == 0: - return - for param, grad in params_grads: - if grad is None: - continue - block_id = param.block.idx - if block_id in block_id_list: - continue - block_id_list.append(block_id) - for op in param.block.program.global_block().ops: - if ( - op.has_attr("op_namescope") - and "gradient_clip" in op.attr("op_namescope") - and op.attr('op_role_var') - ): - param_name = op.attr('op_role_var')[0] - if param_name in param_new_grad_name_dict: - correct_p_g = [ - param_name, - param_new_grad_name_dict[param_name], - ] - op._set_attr('op_role_var', correct_p_g) - - -GradientClipBase = ClipGradBase -GradientClipByValue = ClipGradByValue -GradientClipByNorm = ClipGradByNorm -GradientClipByGlobalNorm = ClipGradByGlobalNorm diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index f763e0f1d8838c..e8393c63b10536 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -751,7 +751,7 @@ def patch_getter(self, item): def patch_lr_scheduler(ipu_strategy): from paddle.optimizer.lr import LRScheduler - # For IPU dynamic graph usage, lr_var is not synced in executor as static mode do. + # For IPU dynamic graph usage, lr_var is not synced in executor as static graph mode do. # Manually set lr to ipu_strategy to update the lr. old_step = LRScheduler.step diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py index 1a509f725b5b69..5ad924b49f2157 100644 --- a/python/paddle/fluid/contrib/__init__.py +++ b/python/paddle/fluid/contrib/__init__.py @@ -18,9 +18,6 @@ from .memory_usage_calc import * from . import op_frequence from .op_frequence import * -from . import quantize -from .quantize import * -from . import slim from . import extend_optimizer from .extend_optimizer import * from . 
import model_stat @@ -36,7 +33,6 @@ __all__ += memory_usage_calc.__all__ __all__ += op_frequence.__all__ -__all__ += quantize.__all__ __all__ += extend_optimizer.__all__ __all__ += ['mixed_precision'] __all__ += layers.__all__ diff --git a/python/paddle/fluid/contrib/optimizer.py b/python/paddle/fluid/contrib/optimizer.py index d9eb208f8498af..c5dc5859e82f81 100644 --- a/python/paddle/fluid/contrib/optimizer.py +++ b/python/paddle/fluid/contrib/optimizer.py @@ -53,7 +53,7 @@ class Momentum(Optimizer): momentum (float): Momentum factor parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. use_nesterov (bool, optional): Enables Nesterov momentum, default is false. regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ diff --git a/python/paddle/fluid/contrib/quantize/__init__.py b/python/paddle/fluid/contrib/quantize/__init__.py deleted file mode 100644 index 25920f1575323e..00000000000000 --- a/python/paddle/fluid/contrib/quantize/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import quantize_transpiler -from .quantize_transpiler import * - -__all__ = quantize_transpiler.__all__ diff --git a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py deleted file mode 100644 index edd07c0ba9f368..00000000000000 --- a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py +++ /dev/null @@ -1,606 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
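# [Editor's note, illustrative only, not part of the patch] The file deleted below
# implemented simulated ("fake") quantization around an abs_max scheme: its quant()
# helper, kept further down in this hunk, projects a float tensor onto the signed
# num_bits integer grid using scale = max(|x|). The matching dequantization is the
# inverse scaling; it is shown here as an assumption, not taken verbatim from the
# removed code. A small NumPy-only sketch of the round trip:
import numpy as np

def quant(x, scale, num_bits=8):
    # same formula as the deleted helper
    return np.round(x / scale * ((1 << (num_bits - 1)) - 1))

def dequant(y, scale, num_bits=8):
    # assumed inverse of quant()
    return y * scale / ((1 << (num_bits - 1)) - 1)

x = np.array([-0.5, 0.01, 0.73], dtype=np.float32)
scale = np.max(np.abs(x))                        # 'abs_max' scale
x_hat = dequant(quant(x, scale), scale)
assert np.allclose(x, x_hat, atol=scale / 127)   # error is at most one grid step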
- -import collections -import numpy as np - -from paddle.fluid.framework import ( - default_main_program, - default_startup_program, - program_guard, -) -from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid import unique_name -from paddle.fluid import core -from paddle.fluid.initializer import Constant -from paddle.fluid.param_attr import ParamAttr -from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid.layers.nn import autoincreased_step_counter -from paddle.fluid.framework import Variable -from paddle.fluid.executor import global_scope - -__all__ = ['QuantizeTranspiler'] - -_QUANTIZABLE_OP_TYPES = ['conv2d', 'depthwise_conv2d', 'mul'] - - -def _quantized_var_name(var_name): - """ - Return quantized variable name for the input `var_name`. - """ - return "%s.quantized" % (var_name) - - -def _dequantized_var_name(var_name): - """ - Return dequantized variable name for the input `var_name`. - """ - return "%s.dequantized" % (var_name) - - -def _quantized_scale_name(var_name): - """ - Return quantized variable name for the input `var_name`. - """ - return "%s.scale" % (var_name) - - -def _original_var_name(var_name): - """ - Return the original variable name. - """ - if var_name.endswith('.quantized.dequantized'): - return var_name[: -len('.quantized.dequantized')] - if var_name.endswith('.quantized'): - return var_name[: -len('.quantized')] - if var_name.endswith('.dequantized'): - return var_name[: -len('.dequantized')] - if var_name.endswith('.scale'): - return var_name[: -len('.scale')] - else: - return var_name - - -def _is_float(v): - return isinstance(v, float) or isinstance(v, np.float32) - - -def quant(x, scale, num_bits): - y = np.round(x / scale * ((1 << (num_bits - 1)) - 1)) - return y - - -class QuantizeTranspiler: - def __init__( - self, - weight_bits=8, - activation_bits=8, - activation_quantize_type='abs_max', - weight_quantize_type='abs_max', - window_size=10000, - moving_rate=0.9, - ): - """ - Convert and rewrite the fluid Program according to weight and - activation quantization type. - - Args: - weight_bits (int): quantization bit number for weights, - the bias is not quantized. - activation_bits (int): quantization bit number for activation. - activation_quantize_type (str): quantization type for activation, - now support 'abs_max', 'range_abs_max'. If use 'abs_max' mode, - the quantization scale will be calculated dynamically each step - in both training and testing period. If use 'range_abs_max', - a static quantization scale will be calculated during training - and used in inference. - weight_quantize_type (str): quantization type for weights, - support 'abs_max'. The 'range_abs_max' usually is not used for - weight, since weights are fixed once the model is well trained. - window_size (int): the window size for 'range_abs_max' quantization. - - Examples: - - .. code-block:: python - - # the original program will be rewrite, if you don't want to - # change it, please clone at first. - # quantize_program = program.clone() - t = fluid.QuantizeTranspiler() - t.transpile(quantize_program) - - """ - self.weight_bits = weight_bits - self.activation_bits = activation_bits - quant_type = ['abs_max', 'range_abs_max', 'moving_average_abs_max'] - if weight_quantize_type not in quant_type: - raise ValueError( - "Unknown weight_quantize_type: '%s'. 
It can only be ", - "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.", - str(weight_quantize_type), - ) - if activation_quantize_type not in quant_type: - raise ValueError( - "Unknown activation_quantize_type : '%s'. It can only be ", - "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.", - str(activation_quantize_type), - ) - - self.weight_quantize_type = weight_quantize_type - self.activation_quantize_type = activation_quantize_type - - self.window_size = window_size - self.moving_rate = moving_rate - self.helper = LayerHelper(self.__class__.__name__) - self.fake_quant_op_types = [ - 'fake_quantize_abs_max', - 'fake_quantize_range_abs_max', - 'fake_quantize_moving_average_abs_max', - ] - self.fake_dequant_op_types = ['fake_dequantize_max_abs'] - self.is_test = None - self.global_step = None - - def training_transpile(self, program=None, startup_program=None): - """Rewrites a training input program in place for simulated - quantization. Insert fake quantization and de-quantization ops into - program to simulate the error introduced by quantization. And change - the gradient ops' input by using the faked quantization weights and - activation. Since the program is transformed in place, the graph - connection will change. - - Args: - program (Program): the input program to be transpile. - """ - self.is_test = False - program = default_main_program() if program is None else program - startup_program = ( - default_startup_program() - if startup_program is None - else startup_program - ) - - # marked the variable which has been quantized and dequantized. - dequanted_vars = [ - collections.OrderedDict() for _ in range(len(program.blocks)) - ] - grad_op_types = ['%s_grad' % (type) for type in _QUANTIZABLE_OP_TYPES] - - params = [p.name for p in program.global_block().iter_parameters()] - - def _transpile_forward(block, op): - idx = block.ops.index(op) - block_id = block.idx - # insert quant op and dequant op - for name in op.input_arg_names: - # if share input between ops - if name in dequanted_vars[block_id]: - dequant_var = dequanted_vars[block_id][name] - else: - var = block.var(name) - quant_bits = ( - self.weight_bits - if var.name in params - else self.activation_bits - ) - quant_type = ( - self.weight_quantize_type - if var.name in params - else self.activation_quantize_type - ) - - quant_var, scale_var = self._insert_quant_op( - block, idx, var, quant_bits, quant_type - ) - dequant_var = self._insert_dequant_op( - block, idx + 1, quant_var, scale_var, quant_bits - ) - dequanted_vars[block_id][name] = dequant_var - # rename the forward op inputs - op._rename_input(name, dequant_var.name) - - def _transpile_backward(block, op): - block_id = block.idx - no_dequanted_input_vars = True - for name in op.input_arg_names: - if name in dequanted_vars[block_id]: - dequant_var = dequanted_vars[block_id][name] - op._rename_input(name, dequant_var.name) - no_dequanted_input_vars = False - if no_dequanted_input_vars: - raise ValueError( - "There is no dequanted inputs for op %s." 
% (op.type) - ) - - with program_guard(program, startup_program): - self._create_global_step() - for block in program.blocks: - ops = list(block.ops) - block_id = block.idx - for op in ops: - # rewrite the forward ProgramDes - if op.type in _QUANTIZABLE_OP_TYPES: - _transpile_forward(block, op) - # rename the backward op inputs - if op.type in grad_op_types: - _transpile_backward(block, op) - - def _create_global_step(self): - if ( - self.weight_quantize_type == 'range_abs_max' - or self.activation_quantize_type == 'range_abs_max' - ): - self.global_step = autoincreased_step_counter() - - def freeze_program(self, program, place, scope=None): - """Freeze input training program for inference. - - Args: - program (Program): the input program to be transpile. - """ - - self.is_test = True - scope = global_scope() if scope is None else scope - program = default_main_program() if program is None else program - - persistable_vars = [ - v.name - for v in filter(lambda var: var.persistable, program.list_vars()) - ] - op_in_rename_map = [ - collections.OrderedDict() for _ in range(len(program.blocks)) - ] - op_out_rename_map = [ - collections.OrderedDict() for _ in range(len(program.blocks)) - ] - var_scale_map = [ - collections.OrderedDict() for _ in range(len(program.blocks)) - ] - - def _remove_fake_quant_and_dequant_op(block, op): - idx = block.ops.index(op) - block_id = block.idx - k = op.output('Out')[0] - v = op.input('X')[0] - if v not in op_in_rename_map[block_id]: - op_in_rename_map[block_id][k] = v - else: - op_in_rename_map[block_id][k] = op_in_rename_map[block_id][v] - block._remove_op(idx) - - def _insert_post_dequant_op(block, op): - idx = block.ops.index(op) - block_id = block.idx - max_range = None - scale_var = None - for name in op.input_arg_names: - # rename input name of the op to the input name of last op which has be removed - if name in op_in_rename_map[block_id]: - op._rename_input(name, op_in_rename_map[block_id][name]) - - scale_v = var_scale_map[block_id][_original_var_name(name)] - if _original_var_name(name) in persistable_vars: - param_range = (1 << (self.weight_bits - 1)) - 1 - act_range = (1 << (self.activation_bits - 1)) - 1 - assert _is_float(scale_v) - max_range = param_range * act_range / scale_v - else: - assert isinstance(scale_v, Variable) - scale_var = scale_v - - if len(op.output_arg_names) != 1: - raise ValueError( - "Only support one output, but op %s has" - " more than one output." 
% (op.type) - ) - out_var = block.var(op.output_arg_names[0]) - dequant_var = block.create_var( - name=_dequantized_var_name(out_var.name), - type=out_var.type, - shape=out_var.shape, - dtype=out_var.dtype, - ) - # insert fake_dequantize_op - dequant_op = block._insert_op( - idx + 1, - type="fake_dequantize_max_abs", - attrs={'max_range': float(max_range)}, - inputs={"X": out_var, 'Scale': scale_var}, - outputs={"Out": dequant_var}, - ) - op_out_rename_map[block_id][out_var.name] = dequant_var.name - return dequant_var - - def _load_var(name): - return np.array(scope.find_var(name).get_tensor()) - - def _restore_var(name, arr): - t = scope.find_var(name).get_tensor() - t.set(arr, place) - - for block in program.blocks: - ops = list(block.ops) - block_id = block.idx - for op in ops: - op_type = op.type - - # insert dequant_op after fc/conv, need to rename - # input of the followed ops(of fc/conv) to the dquant_op - for name in op.input_arg_names: - if name in op_out_rename_map[block_id]: - op._rename_input( - name, op_out_rename_map[block_id][name] - ) - - if op_type in self.fake_quant_op_types: - in_arg_name = op.input('X')[0] - if in_arg_name in persistable_vars: - if self.weight_quantize_type == 'abs_max': - param = _load_var(in_arg_name) - scale_v = np.max(np.abs(param)) - else: - scale_v = _load_var(op.output('OutScale')[0]) - var_scale_map[block_id][in_arg_name] = scale_v - else: - scale_v = block.var(op.output('OutScale')[0]) - var_scale_map[block_id][in_arg_name] = scale_v - - if in_arg_name in persistable_vars: - _remove_fake_quant_and_dequant_op(block, op) - # quantize weight and restore - param_t = _load_var(in_arg_name) - param_q_t = quant(param_t, scale_v, self.weight_bits) - _restore_var(in_arg_name, param_q_t) - - if op_type in self.fake_dequant_op_types: - _remove_fake_quant_and_dequant_op(block, op) - - if op_type in _QUANTIZABLE_OP_TYPES: - dequant_var = _insert_post_dequant_op(block, op) - - # remove the unused var in ProgramDesc - self._remove_unused_var(program) - # program = program.clone() - - def convert_to_int8(self, program, place, scope=None): - scope = global_scope() if scope is None else scope - program = default_main_program() if program is None else program - - def _load_var(name): - return np.array(scope.find_var(name).get_tensor()) - - global_block = program.global_block() - - def convert_to_int8(var): - int8_var_name = var.name + ".int8" - int8_var = global_block.create_parameter( - name=int8_var_name.encode('ascii'), - type=var.type, - dtype=core.VarDesc.VarType.INT8, - shape=var.shape, - ) - - tensor = _load_var(var.name) - - scope.var(int8_var_name) - int8_tensor = scope.find_var(int8_var_name).get_tensor() - int8_tensor.set(tensor.astype(np.int8), place) - return int8_var - - input_map = {} - for block in program.blocks: - for op in list(block.ops): - if op.type in _QUANTIZABLE_OP_TYPES: - for name in op.input_arg_names: - var = block.var(name) - if var.persistable: - if name not in input_map: - int8_var = convert_to_int8(var) - input_map[name] = int8_var.name - op._rename_input(name, input_map[name]) - self._remove_unused_var(program) - - def _remove_unused_var(self, program): - all_remove_vars = [] - for block in program.blocks: - args = [] - for op in block.ops: - args += op.input_arg_names - args += op.output_arg_names - args = list(set(args)) # vals of all left ops - var_names = block.vars.keys() # all vals - sub_block_remove_vars = [] - for var in var_names: - if var not in args: - sub_block_remove_vars.append(var) - 
all_remove_vars.append(sub_block_remove_vars) - - remove_vars = [list(set(v)) for v in all_remove_vars] - for i, block in enumerate(program.blocks): - for v in remove_vars[i]: - block._remove_var(v) - - def _insert_quant_abs_max_op(self, block, idx, var, quant_bits): - """Insert fake_quantize_abs_max op.""" - quant_var = block.create_var( - name=_quantized_var_name(var.name), - type=var.type, - shape=var.shape, - dtype=var.dtype, - ) - scale = block.create_var( - name=_quantized_scale_name(var.name), - type=var.type, - shape=var.shape, - dtype=var.dtype, - ) - quant_op = block._insert_op( - idx, - type='fake_quantize_abs_max', - attrs={'bit_length': quant_bits}, - inputs={'X': var}, - outputs={'Out': quant_var, 'OutScale': scale}, - ) - return quant_var, scale - - def _insert_quant_range_abs_max_op(self, block, idx, var, quant_bits): - """Insert fake_quantize_range_abs_max""" - quant_var = block.create_var( - name=_quantized_var_name(var.name), - type=var.type, - shape=var.shape, - dtype=var.dtype, - ) - scale = self.helper.create_parameter( - attr=ParamAttr( - name=_quantized_scale_name(var.name), - initializer=Constant(0.001), - trainable=False, - ), - shape=[1], - dtype=var.dtype, - ) - scale.stop_gradient = True - - ins = {'X': var, 'InScale': scale} - outs = {'Out': quant_var, 'OutScale': scale} - if not self.is_test: - # A global step counter variable with type int64 - scales = self.helper.create_global_variable( - name=unique_name.generate('scales'), - persistable=True, - dtype=var.dtype, - shape=[self.window_size], - ) - self.helper.set_variable_initializer( - scales, initializer=Constant(value=0) - ) - - ins['Iter'] = self.global_step - outs['OutScales'] = scales - - attrs = { - 'window_size': self.window_size, - 'bit_length': quant_bits, - 'is_test': self.is_test, - } - - quant_op = block._insert_op( - idx, - type='fake_quantize_range_abs_max', - attrs=attrs, - inputs=ins, - outputs=outs, - ) - - return quant_var, scale - - def _insert_quant_moving_average_abs_max_op( - self, block, idx, var, quant_bits - ): - """Insert fake_quantize_moving_average_abs_max""" - quant_var = block.create_var( - name=_quantized_var_name(var.name), - type=var.type, - shape=var.shape, - dtype=var.dtype, - ) - state = self.helper.create_global_variable( - name=unique_name.generate('state'), - persistable=True, - dtype=var.dtype, - shape=[1], - ) - self.helper.set_variable_initializer( - state, initializer=Constant(value=1) - ) - accum = self.helper.create_global_variable( - name=unique_name.generate('accum'), - persistable=True, - dtype=var.dtype, - shape=[1], - ) - self.helper.set_variable_initializer( - accum, initializer=Constant(value=1) - ) - scale = self.helper.create_parameter( - attr=ParamAttr( - name=_quantized_scale_name(var.name), - initializer=Constant(0.001), - trainable=False, - ), - shape=[1], - dtype=var.dtype, - ) - scale.stop_gradient = True - - ins = {'X': var, 'InScale': scale} - outs = {'Out': quant_var, 'OutScale': scale} - if not self.is_test: - ins['InState'] = state - ins['InAccum'] = accum - outs['OutState'] = state - outs['OutAccum'] = accum - - attrs = { - 'bit_length': quant_bits, - 'moving_rate': self.moving_rate, - 'is_test': self.is_test, - } - - quant_op = block._insert_op( - idx, - type='fake_quantize_moving_average_abs_max', - attrs=attrs, - inputs=ins, - outputs=outs, - ) - - return quant_var, scale - - def _insert_quant_op(self, block, idx, var, quant_bits, quant_type): - """ - Insert fake_quantize_op - """ - if quant_type == 'abs_max': - return 
self._insert_quant_abs_max_op(block, idx, var, quant_bits) - elif quant_type == 'range_abs_max': - return self._insert_quant_range_abs_max_op( - block, idx, var, quant_bits - ) - elif quant_type == 'moving_average_abs_max': - return self._insert_quant_moving_average_abs_max_op( - block, idx, var, quant_bits - ) - - def _insert_dequant_op(self, block, idx, var, scale, quant_bits): - """ - Insert fake_quantize_op - """ - dequant_var = block.create_var( - name=_dequantized_var_name(var.name), - type=var.type, - shape=var.shape, - dtype=var.dtype, - ) - # insert fake_dequantize_op - max_range = (1 << (quant_bits - 1)) - 1 - dequant_op = block._insert_op( - idx, - type="fake_dequantize_max_abs", - attrs={'max_range': float(max_range)}, - inputs={"X": var, 'Scale': scale}, - outputs={"Out": dequant_var}, - ) - return dequant_var diff --git a/python/paddle/fluid/contrib/slim/__init__.py b/python/paddle/fluid/contrib/slim/__init__.py deleted file mode 100644 index b94a21a7e406b8..00000000000000 --- a/python/paddle/fluid/contrib/slim/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/python/paddle/fluid/contrib/slim/quantization/__init__.py b/python/paddle/fluid/contrib/slim/quantization/__init__.py deleted file mode 100644 index 84c6e43cc4e31c..00000000000000 --- a/python/paddle/fluid/contrib/slim/quantization/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import quantization_pass -from .quantization_pass import * -from . import quant_int8_mkldnn_pass -from .quant_int8_mkldnn_pass import * -from . import quant2_int8_mkldnn_pass -from .quant2_int8_mkldnn_pass import * -from . import post_training_quantization -from .post_training_quantization import * -from . 
import imperative -from .imperative import * - -__all__ = [] -__all__ += quantization_pass.__all__ -__all__ += quant_int8_mkldnn_pass.__all__ -__all__ += quant2_int8_mkldnn_pass.__all__ -__all__ += post_training_quantization.__all__ -__all__ += imperative.__all__ diff --git a/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py b/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py deleted file mode 100644 index 64bb1a6c45a1c8..00000000000000 --- a/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py +++ /dev/null @@ -1,497 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import collections -import logging -import numpy as np -from .... import core -from ....framework import Program, Operator, Variable, program_guard -from ....executor import global_scope -from .... import unique_name -from ....layer_helper import LayerHelper -from ....param_attr import ParamAttr -from ....initializer import Constant -from ....log_helper import get_logger - -_logger = get_logger( - __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' -) - - -def find_next_ops(block, var_name): - """ - Find all followed ops for the input variable. - """ - res_ops = [] - for op in block.ops: - if var_name in op.input_arg_names: - res_ops.append(op) - return res_ops - - -def load_variable_data(scope, var_name): - ''' - Load variable value from scope - ''' - var_node = scope.find_var(var_name) - assert var_node is not None, "Cannot find " + var_name + " in scope." - return np.array(var_node.get_tensor()) - - -class QuantizeTranspilerV2: - def __init__( - self, - weight_bits=8, - activation_bits=8, - weight_quantize_type='abs_max', - activation_quantize_type='moving_average_abs_max', - quantizable_op_type=[ - 'conv2d', - 'depthwise_conv2d', - 'mul', - ], - skip_pattern=['skip_quant'], - ): - """ - Apply fake quant for the quantized ops. - - Args: - weight_bits(int): the bit of quantized weight. - activation_bits(int): the bit of quantized activation. - weight_quantize_type(str): the quantization type for weight. - Only support to be 'abs_max' and 'channel_wise_abs_max'. - activation_quantize_type(str): the quantization type for activation. - Only support to be 'abs_max' and 'moving_average_abs_max'. - quantizable_op_type(str): set the op type for quantization. - skip_pattern(str|list): The user-defined quantization skip pattern, which - will be presented in the name scope of an op. When the skip pattern is - detected in an op's name scope, the corresponding op will not be quantized. - """ - self._weight_bits = weight_bits - self._activation_bits = activation_bits - - assert activation_quantize_type in [ - "abs_max", - "moving_average_abs_max", - ], ( - "activation_quantize_type should be abs_max " - "or moving_average_abs_max for now." - ) - assert weight_quantize_type in [ - "abs_max", - "channel_wise_abs_max", - ], "weight_quantize_type should be abs_max or channel_wise_abs_max." 
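
Side note on the two weight quantization types validated just above: 'abs_max' uses a single scale for the whole tensor, while 'channel_wise_abs_max' computes one scale per output channel. The snippet below is a small standalone numpy sketch (not a Paddle op call) of that scale computation and of the fake quant/dequant round trip the transpilers simulate; it mirrors the quant() helper and the (1 << (bits - 1)) - 1 range used in the removed quantize_transpiler.py earlier in this diff.

import numpy as np

def fake_quant_dequant_abs_max(x, num_bits=8):
    # Per-tensor scale: the largest absolute value in the tensor.
    scale = np.max(np.abs(x))
    qmax = (1 << (num_bits - 1)) - 1          # 127 for 8 bits
    q = np.round(x / scale * qmax)            # same rounding as the quant() helper above
    return q * scale / qmax                   # dequantize back to float

def channel_wise_abs_max_scales(w, ch_axis=0):
    # Per-channel scale: reduce |w| over every axis except the channel axis.
    reduce_axes = tuple(i for i in range(w.ndim) if i != ch_axis)
    return np.max(np.abs(w), axis=reduce_axes)

w = np.random.randn(16, 3, 3, 3).astype('float32')   # e.g. a conv2d weight [out_c, in_c, kh, kw]
w_qdq = fake_quant_dequant_abs_max(w)
print(np.max(np.abs(w - w_qdq)))                      # error is at most scale / 254 (half a step)
print(channel_wise_abs_max_scales(w).shape)           # (16,) -- one scale per output channel
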
- self._activation_quantize_type = activation_quantize_type - self._weight_quantize_type = weight_quantize_type - - for op_type in quantizable_op_type: - assert op_type in [ - 'conv2d', - 'depthwise_conv2d', - 'mul', - ], "Quantize op should be ['conv2d', 'depthwise_conv2d', 'mul']" - self._quantizable_ops = quantizable_op_type - self._quantizable_grad_ops = [ - '%s_grad' % (op) for op in self._quantizable_ops - ] - - self._skip_pattern = skip_pattern - self._helper = LayerHelper(self.__class__.__name__) - - self._moving_rate = 0.9 - self._out_ch_axis1_ops = ['conv2d_transpose', 'mul', 'matmul'] - - def apply(self, program, startup_program, is_test=False): - """ - Apply quantization to fluid Program. - - Args: - program(Program): the train or test program to be quantized. - startup_program(Program): the corresponding startup_program. - is_test(bool): Whethe the program is used for test. - Returns: - None - """ - assert isinstance( - program, Program - ), "program must be the instance of Program" - assert isinstance( - startup_program, Program - ), "startup_program must be the instance of Program" - - var_rename_map = [ - collections.OrderedDict() for _ in range(len(program.blocks)) - ] - with program_guard(program, startup_program): - for block in program.blocks: - ops = list(block.ops) - for op in ops: - if op.type in self._quantizable_ops and ( - not self._is_skip_quant(op) - ): - self._transform_forward( - block, op, var_rename_map, is_test - ) - - for block in program.blocks: - ops = list(block.ops) - for op in ops: - if op.type in self._quantizable_grad_ops and ( - not self._is_skip_quant(op) - ): - self._transform_backward(block, op, var_rename_map) - - def convert(self, test_program, scope=None): - """ - Convert the test program. - Get the out scale from the moving_average_abs_max_scale op and save the - out scale into the quantized op. - Args: - test_program(Program): the test program to be converted. - scope(fluid.Scope, optional): The scope of the program, use it to load - and save variables. If scope=None, get scope by global_scope(). - """ - scope = global_scope() if scope is None else scope - - for block in test_program.blocks: - for op in block.ops: - if ( - op.has_attr("quantization_type") - and op.attr("quantization_type") == "qat_with_weight" - ): - # quant op -> var1 -> fake op -> var2 - assert len(op.output_arg_names) == 1 - var1_name = op.output_arg_names[0] - - fake_ops = find_next_ops(block, var1_name) - assert len(fake_ops) == 1 - fake_op = fake_ops[0] - assert fake_op.type == "moving_average_abs_max_scale" - - out_scale_name = fake_op.output("OutScale") - out_threshold = load_variable_data(scope, out_scale_name[0]) - op._set_attr("out_threshold", float(out_threshold)) - - var2_name = fake_op.output("Out")[0] - op._rename_output(var1_name, var2_name) - fake_op._rename_output(var2_name, var1_name) - - def _transform_forward(self, block, op, var_rename_map, is_test): - """ - Insert fake quant op before the target ops. 
- """ - op._set_attr("quantization_type", "qat_with_weight") - - # insert fake quant op before the quantized op - for in_name in op.input_arg_names: - block_id = block.idx - idx = block.ops.index(op) - - if in_name in var_rename_map[block_id]: - new_in_name = var_rename_map[block_id][in_name] - else: - in_var = block.var(in_name) - target_dtype = [ - core.VarDesc.VarType.FP32, - core.VarDesc.VarType.FP16, - ] - if in_var.dtype not in target_dtype: - continue - - quant_bits = ( - self._weight_bits - if in_var.persistable - else self._activation_bits - ) - quant_type = ( - self._weight_quantize_type - if in_var.persistable - else self._activation_quantize_type - ) - - if quant_type == "abs_max": - new_var = self._insert_abs_max_fq_op( - block, idx, in_var, quant_bits - ) - elif quant_type == "moving_average_abs_max": - new_var = self._insert_ma_abs_max_fq_op( - block, idx, in_var, quant_bits, is_test - ) - elif quant_type == "channel_wise_abs_max": - ch_axis = 1 if op.type in self._out_ch_axis1_ops else 0 - new_var = self._insert_pc_abs_max_fq_op( - block, idx, in_var, quant_bits, ch_axis - ) - else: - _logger.error( - "Don't support the quant_type: %s" % quant_type - ) - continue - - new_in_name = new_var.name - var_rename_map[block_id][in_name] = new_in_name - - op._rename_input(in_name, new_in_name) - - # insert out scale op followed the quantized op - for out_name in op.output_arg_names: - next_ops = find_next_ops(block, out_name) - - idx = block.ops.index(op) - out_var = block.var(out_name) - new_out_var = self._insert_ma_abs_max_scale_op( - block, idx + 1, out_var, is_test, True - ) - - for next_op in next_ops: - if "_grad" not in next_op.type: - next_op._rename_input(out_name, new_out_var.name) - - def _is_skip_quant(self, op): - """ - Analyse whether the op should skip quantization or not. - """ - user_skipped = False - if isinstance(self._skip_pattern, list): - user_skipped = op.has_attr("op_namescope") and any( - pattern in op.attr("op_namescope") - for pattern in self._skip_pattern - ) - elif isinstance(self._skip_pattern, str): - user_skipped = ( - op.has_attr("op_namescope") - and op.attr("op_namescope").find(self._skip_pattern) != -1 - ) - return user_skipped - - def _transform_backward(self, block, op, var_rename_map): - """ - Update the backword of the target ops. - Note: for the grad ops, only rename the input, skip rename the output. - """ - block_id = block.idx - no_dequanted_input_vars = True - for name in op.input_arg_names: - if name in var_rename_map[block_id]: - new_var_name = var_rename_map[block_id][name] - op._rename_input(name, new_var_name) - no_dequanted_input_vars = False - if no_dequanted_input_vars: - raise ValueError( - "There is no dequanted inputs for op %s." % (op.type) - ) - - def _insert_abs_max_fq_op(self, block, idx, in_var, quant_bits): - """ - Inset abs max fake quant op. 
- """ - quant_dequant_var = block.create_var( - type=in_var.type, - name="{}.quant_dequant".format(in_var.name), - shape=in_var.shape, - dtype=in_var.dtype, - ) - scale_var = self._helper.create_parameter( - attr=ParamAttr( - name="{}.quant_dequant.scale".format(in_var.name), - initializer=Constant(0.0), - trainable=False, - ), - shape=[1], - dtype=in_var.dtype, - ) - scale_var.stop_gradient = True - - inputs = {'X': in_var} - outputs = {'Out': quant_dequant_var, 'OutScale': scale_var} - attrs = {'bit_length': quant_bits} - block._insert_op( - idx, - type='fake_quantize_dequantize_abs_max', - attrs=attrs, - inputs=inputs, - outputs=outputs, - ) - return quant_dequant_var - - def _insert_ma_abs_max_fq_op(self, block, idx, in_var, quant_bits, is_test): - """ - Insert moving average abs max fake quant op. - """ - quant_dequant_var = block.create_var( - type=in_var.type, - name="{}.quant_dequant".format(in_var.name), - shape=in_var.shape, - dtype=in_var.dtype, - ) - - scale_var = self._helper.create_parameter( - attr=ParamAttr( - name="{}.quant_dequant.scale".format(in_var.name), - initializer=Constant(0.0), - trainable=False, - ), - shape=[1], - dtype=in_var.dtype, - ) - scale_var.stop_gradient = True - - if not is_test: - state_var = self._helper.create_parameter( - attr=ParamAttr( - name="{}.quant_dequant.state".format(in_var.name), - initializer=Constant(0), - trainable=False, - ), - shape=[1], - dtype=in_var.dtype, - ) - state_var.stop_gradient = True - - accum_var = self._helper.create_parameter( - attr=ParamAttr( - name="{}.quant_dequant.accum".format(in_var.name), - initializer=Constant(0), - trainable=False, - ), - shape=[1], - dtype=in_var.dtype, - ) - accum_var.stop_gradient = True - - attrs = { - 'moving_rate': self._moving_rate, - 'bit_length': quant_bits, - 'is_test': is_test, - } - inputs = {'X': in_var, 'InScale': scale_var} - outputs = {'Out': quant_dequant_var, 'OutScale': scale_var} - if not is_test: - inputs['InState'] = state_var - inputs['InAccum'] = accum_var - outputs['OutState'] = state_var - outputs['OutAccum'] = accum_var - - block._insert_op( - idx, - type='fake_quantize_dequantize_moving_average_abs_max', - attrs=attrs, - inputs=inputs, - outputs=outputs, - ) - return quant_dequant_var - - def _insert_pc_abs_max_fq_op(self, block, idx, in_var, quant_bits, ch_axis): - """ - Insert per channel abs max fake quant op. - """ - quant_dequant_var = block.create_var( - type=in_var.type, - name="{}.quant_dequant".format(in_var.name), - shape=in_var.shape, - dtype=in_var.dtype, - ) - - scale_var = self._helper.create_parameter( - attr=ParamAttr( - name="{}.quant_dequant.scale".format(in_var.name), - initializer=Constant(0.0), - trainable=False, - ), - shape=[in_var.shape[ch_axis]], - dtype=in_var.dtype, - ) - scale_var.stop_gradient = True - - inputs = {'X': in_var} - outputs = {'Out': quant_dequant_var, 'OutScale': scale_var} - attrs = {'bit_length': quant_bits, 'quant_axis': ch_axis} - block._insert_op( - idx, - type='fake_channel_wise_quantize_dequantize_abs_max', - attrs=attrs, - inputs=inputs, - outputs=outputs, - ) - return quant_dequant_var - - def _insert_ma_abs_max_scale_op( - self, block, idx, in_var, is_test, has_out_var=False - ): - """ - Insert moving average abs max scale op. 
- """ - scale_var = self._helper.create_parameter( - attr=ParamAttr( - name="{}.outscale.scale".format(in_var.name), - initializer=Constant(0.0), - trainable=False, - ), - shape=[1], - dtype=in_var.dtype, - ) - scale_var.stop_gradient = True - - attrs = {'moving_rate': self._moving_rate, 'is_test': is_test} - inputs = {'X': in_var} - outputs = {'OutScale': scale_var} - - if not is_test: - state_var = self._helper.create_parameter( - attr=ParamAttr( - name="{}.outscale.state".format(in_var.name), - initializer=Constant(0), - trainable=False, - ), - shape=[1], - dtype=in_var.dtype, - ) - state_var.stop_gradient = True - - accum_var = self._helper.create_parameter( - attr=ParamAttr( - name="{}.outscale.accum".format(in_var.name), - initializer=Constant(0), - trainable=False, - ), - shape=[1], - dtype=in_var.dtype, - ) - accum_var.stop_gradient = True - - inputs['InState'] = state_var - inputs['InAccum'] = accum_var - outputs['OutState'] = state_var - outputs['OutAccum'] = accum_var - - if has_out_var: - out_var = block.create_var( - type=in_var.type, - name="{}.tmp".format(in_var.name), - shape=in_var.shape, - dtype=in_var.dtype, - ) - - outputs['Out'] = out_var - - block._insert_op( - idx, - type='moving_average_abs_max_scale', - attrs=attrs, - inputs=inputs, - outputs=outputs, - ) - - if has_out_var: - return out_var diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py b/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py deleted file mode 100644 index 0c5c957f776441..00000000000000 --- a/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py +++ /dev/null @@ -1,207 +0,0 @@ -# copyright (c) 2018 paddlepaddle authors. all rights reserved. -# -# licensed under the apache license, version 2.0 (the "license"); -# you may not use this file except in compliance with the license. -# you may obtain a copy of the license at -# -# http://www.apache.org/licenses/license-2.0 -# -# unless required by applicable law or agreed to in writing, software -# distributed under the license is distributed on an "as is" basis, -# without warranties or conditions of any kind, either express or implied. -# see the license for the specific language governing permissions and -# limitations under the license. 
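
Before the removed tests below, one note on the state/accum parameters created by the moving-average ops above: they keep an exponential moving average of the per-batch abs-max, and the published scale is their ratio. The numpy sketch below is only a rough illustration of that bookkeeping, not the exact kernel of fake_quantize_moving_average_abs_max / moving_average_abs_max_scale.

import numpy as np

def moving_average_abs_max_update(state, accum, x, moving_rate=0.9):
    # Rough sketch of the running-scale update behind the moving-average ops above.
    state = moving_rate * state + 1.0                  # OutState
    accum = moving_rate * accum + np.max(np.abs(x))    # OutAccum
    return accum / state, state, accum                 # OutScale plus the carried state

scale, state, accum = 0.0, 0.0, 0.0   # the code above initializes state/accum with Constant(0)
for _ in range(5):
    x = np.random.randn(8, 16).astype('float32')
    scale, state, accum = moving_average_abs_max_update(state, accum, x)
print(scale)   # a smoothed abs-max of the recent batches
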
- -import os -import unittest -import random -import numpy as np -import paddle.fluid as fluid -import paddle -from paddle.fluid.framework import IrGraph -from paddle.fluid.contrib.slim.quantization.quantize_transpiler_v2 import ( - QuantizeTranspilerV2, -) -from paddle.fluid import core - -paddle.enable_static() - -os.environ["CUDA_VISIBLE_DEVICES"] = "0" -os.environ["CPU_NUM"] = "1" - - -def conv_net(img, label): - conv_pool_1 = fluid.nets.simple_img_conv_pool( - input=img, - filter_size=5, - num_filters=20, - pool_size=2, - pool_stride=2, - pool_type='max', - act="relu", - ) - conv_pool_2 = fluid.nets.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=50, - pool_size=2, - pool_stride=2, - pool_type='avg', - act="relu", - ) - with fluid.name_scope("skip_quant"): - hidden = fluid.layers.fc(input=conv_pool_1, size=100, act='relu') - prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') - loss = paddle.nn.functional.cross_entropy( - input=prediction, label=label, reduction='none', use_softmax=False - ) - avg_loss = paddle.mean(loss) - return avg_loss - - -class TestQuantizeProgramPass(unittest.TestCase): - def quantize_program( - self, - use_cuda, - seed, - activation_quant_type='abs_max', - weight_quant_type='abs_max', - for_ci=False, - ): - def build_program(main, startup, is_test): - main.random_seed = seed - startup.random_seed = seed - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - img = fluid.layers.data( - name='image', shape=[1, 28, 28], dtype='float32' - ) - label = fluid.layers.data( - name='label', shape=[1], dtype='int64' - ) - loss = conv_net(img, label) - if not is_test: - opt = fluid.optimizer.Adam(learning_rate=0.0001) - opt.minimize(loss) - return [img, label], loss - - random.seed(0) - np.random.seed(0) - - # 1 Define program - train_program = fluid.Program() - startup_program = fluid.Program() - test_program = fluid.Program() - feeds, loss = build_program(train_program, startup_program, False) - build_program(test_program, startup_program, True) - test_program = test_program.clone(for_test=True) - - if not for_ci: - train_graph = IrGraph( - core.Graph(train_program.desc), for_test=False - ) - train_graph.draw('.', 'train_program_1') - test_graph = IrGraph(core.Graph(test_program.desc), for_test=True) - test_graph.draw('.', 'test_program_1') - - # 2 Apply quantization - qt = QuantizeTranspilerV2( - activation_quantize_type=activation_quant_type, - weight_quantize_type=weight_quant_type, - ) - qt.apply(train_program, startup_program, is_test=False) - qt.apply(test_program, startup_program, is_test=True) - - # 3 Train - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - scope = fluid.Scope() - with fluid.scope_guard(scope): - exe.run(startup_program) - if not for_ci: - train_graph = IrGraph( - core.Graph(train_program.desc), for_test=False - ) - train_graph.draw('.', 'train_program_2') - test_graph = IrGraph(core.Graph(test_program.desc), for_test=True) - test_graph.draw('.', 'test_program_2') - - build_strategy = fluid.BuildStrategy() - build_strategy.memory_optimize = False - build_strategy.enable_inplace = False - build_strategy.fuse_all_reduce_ops = False - binary = fluid.CompiledProgram(train_program).with_data_parallel( - loss_name=loss.name, build_strategy=build_strategy - ) - iters = 5 - batch_size = 8 - - train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=batch_size - ) - feeder = fluid.DataFeeder(feed_list=feeds, place=place) - with 
fluid.scope_guard(scope): - for idx in range(iters): - data = next(train_reader()) - loss_v = exe.run( - binary, feed=feeder.feed(data), fetch_list=[loss] - ) - if not for_ci and idx % 20 == 0: - print('{}: {}'.format('loss', np.mean(loss_v))) - - print('{}: {}'.format('loss', np.mean(loss_v))) - - # 4 Convert - qt.convert(test_program, scope) - if not for_ci: - with fluid.scope_guard(scope): - fluid.io.save_inference_model( - './infer_model', - ['image', 'label'], - [loss], - exe, - test_program, - clip_extra=True, - ) - - def test_gpu_1(self): - if fluid.core.is_compiled_with_cuda(): - self.quantize_program( - use_cuda=True, - seed=1, - activation_quant_type='abs_max', - weight_quant_type='abs_max', - for_ci=True, - ) - - def test_gpu_2(self): - if fluid.core.is_compiled_with_cuda(): - self.quantize_program( - use_cuda=True, - seed=1, - activation_quant_type='moving_average_abs_max', - weight_quant_type='channel_wise_abs_max', - for_ci=True, - ) - - def test_cpu_1(self): - self.quantize_program( - use_cuda=False, - seed=2, - activation_quant_type='abs_max', - weight_quant_type='abs_max', - for_ci=True, - ) - - def test_cpu_2(self): - self.quantize_program( - use_cuda=False, - seed=2, - activation_quant_type='moving_average_abs_max', - weight_quant_type='channel_wise_abs_max', - for_ci=True, - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/contrib/tests/CMakeLists.txt b/python/paddle/fluid/contrib/tests/CMakeLists.txt index 48e107c4b4d7ae..42075f262b86a3 100644 --- a/python/paddle/fluid/contrib/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/tests/CMakeLists.txt @@ -25,5 +25,4 @@ set_tests_properties(test_multi_precision_fp16_train PROPERTIES TIMEOUT 120) if(APPLE) set_tests_properties(test_model_cast_to_bf16 PROPERTIES TIMEOUT 300) - set_tests_properties(test_quantize_transpiler PROPERTIES TIMEOUT 300) endif() diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py deleted file mode 100644 index 18b17ea98e8e4f..00000000000000 --- a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py +++ /dev/null @@ -1,314 +0,0 @@ -# copyright (c) 2018 paddlepaddle authors. all rights reserved. -# -# licensed under the apache license, version 2.0 (the "license"); -# you may not use this file except in compliance with the license. -# you may obtain a copy of the license at -# -# http://www.apache.org/licenses/license-2.0 -# -# unless required by applicable law or agreed to in writing, software -# distributed under the license is distributed on an "as is" basis, -# without warranties or conditions of any kind, either express or implied. -# see the license for the specific language governing permissions and -# limitations under the license. 
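
One detail worth calling out from the removed V2 test above: layers built under fluid.name_scope("skip_quant") record that string in their ops' op_namescope attribute, which is what QuantizeTranspilerV2._is_skip_quant checks. The sketch below only shows how to inspect that attribute; it assumes a 2.x static-graph build, and the variable and layer names are illustrative.

import paddle
import paddle.fluid as fluid

paddle.enable_static()
main, startup = fluid.Program(), fluid.Program()
with fluid.program_guard(main, startup):
    x = paddle.static.data(name='x', shape=[-1, 16], dtype='float32')
    with fluid.name_scope("skip_quant"):
        y = paddle.static.nn.fc(x, size=10)   # ops created here carry the scope name

for op in main.global_block().ops:
    if op.has_attr("op_namescope") and "skip_quant" in op.attr("op_namescope"):
        print("skipped from quantization:", op.type)
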
- -import numpy as np - -import unittest -import paddle -import paddle.fluid as fluid -from paddle.fluid.contrib.quantize.quantize_transpiler import _original_var_name -from paddle.fluid.contrib.quantize.quantize_transpiler import QuantizeTranspiler -import paddle - -paddle.enable_static() - - -def linear_fc(num): - data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - hidden = data - for _ in range(num): - hidden = fluid.layers.fc(hidden, size=128, act='relu') - loss = paddle.nn.functional.cross_entropy( - input=hidden, label=label, reduction='none', use_softmax=False - ) - loss = paddle.mean(loss) - return loss - - -def residual_block(num): - def conv_bn_layer( - input, ch_out, filter_size, stride, padding, act='relu', bias_attr=False - ): - tmp = paddle.static.nn.conv2d( - input=input, - filter_size=filter_size, - num_filters=ch_out, - stride=stride, - padding=padding, - act=None, - bias_attr=bias_attr, - ) - return paddle.static.nn.batch_norm(input=tmp, act=act) - - data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - hidden = data - for _ in range(num): - conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True) - short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None) - hidden = paddle.nn.functional.relu(paddle.add(x=conv, y=short)) - fc = fluid.layers.fc(input=hidden, size=10) - loss = paddle.nn.functional.cross_entropy( - input=fc, label=label, reduction='none', use_softmax=False - ) - loss = paddle.mean(loss) - return loss - - -def conv_net(img, label): - conv_pool_1 = fluid.nets.simple_img_conv_pool( - input=img, - filter_size=5, - num_filters=20, - pool_size=2, - pool_stride=2, - act="relu", - ) - conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) - conv_pool_2 = fluid.nets.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=50, - pool_size=2, - pool_stride=2, - act="relu", - ) - prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') - loss = paddle.nn.functional.cross_entropy( - input=prediction, label=label, reduction='none', use_softmax=False - ) - avg_loss = paddle.mean(loss) - return avg_loss - - -class TestQuantizeTranspiler(unittest.TestCase): - def setUp(self): - # since quant_op and dequant_op is not ready, use cos and sin for test - self.weight_quant_op_type = 'fake_quantize_abs_max' - self.dequant_op_type = 'fake_dequantize_max_abs' - self.quantizable_op_and_inputs = { - 'conv2d': ['Input', 'Filter'], - 'depthwise_conv2d': ['Input', 'Filter'], - 'mul': ['X', 'Y'], - } - self.quantizable_op_grad_and_inputs = { - 'conv2d_grad': ['Input', 'Filter'], - 'depthwise_conv2d_grad': ['Input', 'Filter'], - 'mul_grad': ['X', 'Y'], - } - - def check_program(self, program): - quantized_ops = {} - - persistable_vars = [ - v.name - for v in filter(lambda var: var.persistable, program.list_vars()) - ] - - for block in program.blocks: - for idx, op in enumerate(block.ops): - # check forward - if op.type in self.quantizable_op_and_inputs: - for i, arg_name in enumerate(op.input_arg_names): - quant_op_type = ( - self.weight_quant_op_type - if _original_var_name(arg_name) in persistable_vars - else self.act_quant_op_type - ) - self.assertTrue( - arg_name.endswith('.quantized.dequantized') - ) - if arg_name not in quantized_ops: - self.assertEqual( - block.ops[idx - 2 * i - 1].type, - self.dequant_op_type, - ) - self.assertEqual( - block.ops[idx - 2 * i - 
2].type, quant_op_type - ) - quantized_ops[arg_name] = block.ops[idx - 2 * i - 2] - else: - op_idx = block.ops.index(quantized_ops[arg_name]) - self.assertLess(op_idx, idx) - - # check backward - if op.type in self.quantizable_op_grad_and_inputs: - for pname in self.quantizable_op_grad_and_inputs[op.type]: - arg_name = op.input(pname)[0] - self.assertTrue( - arg_name.endswith('.quantized.dequantized') - ) - self.assertTrue(arg_name in quantized_ops) - - def linear_fc_quant(self, quant_type): - main = fluid.Program() - startup = fluid.Program() - with fluid.program_guard(main, startup): - loss = linear_fc(3) - opt = fluid.optimizer.Adam(learning_rate=0.001) - opt.minimize(loss) - t = QuantizeTranspiler(activation_quantize_type=quant_type) - t.training_transpile(main) - self.check_program(main) - - def test_linear_fc_quant_abs_max(self): - self.act_quant_op_type = 'fake_quantize_abs_max' - self.linear_fc_quant('abs_max') - - def test_linear_fc_quant_range_abs_max(self): - self.act_quant_op_type = 'fake_quantize_range_abs_max' - self.linear_fc_quant('range_abs_max') - - def residual_block_quant(self, quant_type): - main = fluid.Program() - startup = fluid.Program() - with fluid.program_guard(main, startup): - loss = residual_block(2) - opt = fluid.optimizer.Adam(learning_rate=0.001) - opt.minimize(loss) - t = QuantizeTranspiler(activation_quantize_type=quant_type) - t.training_transpile(main) - self.check_program(main) - - def test_residual_block_abs_max(self): - self.act_quant_op_type = 'fake_quantize_abs_max' - self.residual_block_quant('abs_max') - - def test_residual_block_range_abs_max(self): - self.act_quant_op_type = 'fake_quantize_range_abs_max' - self.residual_block_quant('range_abs_max') - - def freeze_program(self, use_cuda, seed): - def build_program(main, startup, is_test): - main.random_seed = seed - startup.random_seed = seed - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - img = fluid.layers.data( - name='image', shape=[1, 28, 28], dtype='float32' - ) - label = fluid.layers.data( - name='label', shape=[1], dtype='int64' - ) - loss = conv_net(img, label) - if not is_test: - opt = fluid.optimizer.Adam(learning_rate=0.001) - opt.minimize(loss) - return [img, label], loss - - main = fluid.Program() - startup = fluid.Program() - test_program = fluid.Program() - - import random - - random.seed(0) - np.random.seed(0) - - feeds, loss = build_program(main, startup, False) - build_program(test_program, startup, True) - test_program = test_program.clone(for_test=True) - - quant_type = 'range_abs_max' # 'range_abs_max' or 'abs_max' - quant_transpiler = QuantizeTranspiler( - activation_quantize_type=quant_type - ) - quant_transpiler.training_transpile(main, startup) - quant_transpiler.training_transpile(test_program, startup) - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - iters = 5 - batch_size = 8 - class_num = 10 - exe.run(startup) - - train_reader = paddle.batch( - paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=500), - batch_size=batch_size, - ) - test_reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=batch_size - ) - feeder = fluid.DataFeeder(feed_list=feeds, place=place) - - with fluid.program_guard(main): - for _ in range(iters): - data = next(train_reader()) - loss_v = exe.run( - program=main, feed=feeder.feed(data), fetch_list=[loss] - ) - - with fluid.program_guard(test_program): - test_data = next(test_reader()) - w_var = fluid.framework._get_var( - 
'conv2d_1.w_0.quantized', test_program - ) - # Testing during training - test_loss1, w_quant = exe.run( - program=test_program, - feed=feeder.feed(test_data), - fetch_list=[loss, w_var], - ) - - # Freeze program for inference, but the weight of fc/conv is still float type. - quant_transpiler.freeze_program(test_program, place) - (test_loss2,) = exe.run( - program=test_program, - feed=feeder.feed(test_data), - fetch_list=[loss], - ) - self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3) - w_freeze = np.array( - fluid.global_scope().find_var('conv2d_1.w_0').get_tensor() - ) - # fail: -432.0 != -433.0, this is due to the calculation precision - # self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant)) - - # Convert parameter to 8-bit. - quant_transpiler.convert_to_int8(test_program, place) - # Save the 8-bit parameter and model file. - fluid.io.save_inference_model( - 'model_8bit', - ['image', 'label'], - [loss], - exe, - test_program, - clip_extra=True, - ) - # Test whether the 8-bit parameter and model file can be loaded successfully. - [infer, feed, fetch] = fluid.io.load_inference_model( - 'model_8bit', exe - ) - # Check the loaded 8-bit weight. - w_8bit = np.array( - fluid.global_scope().find_var('conv2d_1.w_0.int8').get_tensor() - ) - - self.assertEqual(w_8bit.dtype, np.int8) - self.assertEqual(np.sum(w_8bit), np.sum(w_freeze)) - - def not_test_freeze_program_cuda(self): - if fluid.core.is_compiled_with_cuda(): - with fluid.unique_name.guard(): - self.freeze_program(True, seed=1) - - def not_test_freeze_program_cpu(self): - with fluid.unique_name.guard(): - self.freeze_program(False, seed=2) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index 8687b696bbde31..fc1effbd89c7af 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -303,7 +303,7 @@ def __next__(self): ) data = _restore_batch(data, self._structure_infos.pop(0)) else: - # in static mode + # in static graph mode if self._return_list: data = self._reader.read_next_list() for i in range(len(data)): diff --git a/python/paddle/fluid/dygraph/__init__.py b/python/paddle/fluid/dygraph/__init__.py index aebcc09eaa14ba..b98c188ae4f6ab 100644 --- a/python/paddle/fluid/dygraph/__init__.py +++ b/python/paddle/fluid/dygraph/__init__.py @@ -21,9 +21,6 @@ from . import container from .container import * -from . import nn -from .nn import * - from . 
import tracer from .tracer import * @@ -45,7 +42,6 @@ __all__ += layers.__all__ __all__ += base.__all__ __all__ += container.__all__ -__all__ += nn.__all__ __all__ += parallel.__all__ __all__ += checkpoint.__all__ __all__ += learning_rate_scheduler.__all__ diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index c36d77d9a11ae5..6a96c31ead8fd3 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -210,7 +210,7 @@ def enable_dygraph(place=None): print(paddle.in_dynamic_mode()) # True, dynamic mode is turn ON by default since paddle 2.0.0 paddle.enable_static() - print(paddle.in_dynamic_mode()) # False, Now we are in static mode + print(paddle.in_dynamic_mode()) # False, Now we are in static graph mode paddle.disable_static() print(paddle.in_dynamic_mode()) # True, Now we are in dynamic mode @@ -245,7 +245,7 @@ def disable_dygraph(): print(paddle.in_dynamic_mode()) # True, dynamic mode is turn ON by default since paddle 2.0.0 paddle.enable_static() - print(paddle.in_dynamic_mode()) # False, Now we are in static mode + print(paddle.in_dynamic_mode()) # False, Now we are in static graph mode paddle.disable_static() print(paddle.in_dynamic_mode()) # True, Now we are in dynamic mode diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py deleted file mode 100644 index afebf277d0d216..00000000000000 --- a/python/paddle/fluid/dygraph/nn.py +++ /dev/null @@ -1,401 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -from .. import core -from ..layers import utils -from ..layers import nn as F -from .. import dygraph_utils -from . import layers -from ..framework import ( - Variable, - OpProtoHolder, - Parameter, - _dygraph_tracer, - _varbase_creator, - default_main_program, - _global_flags, - in_dygraph_mode, -) - -from ..data_feeder import ( - convert_dtype, - check_variable_and_dtype, - check_type, - check_dtype, -) - -from ..param_attr import ParamAttr -from ..initializer import Normal, Constant, NumpyArrayInitializer -from .. import unique_name -from .layer_object_helper import LayerObjectHelper -from ..data_feeder import check_variable_and_dtype, check_type -import numpy as np -import numbers -import logging -import os -import paddle.utils.deprecated as deprecated -from paddle import _C_ops, _legacy_C_ops - -__all__ = [] - - -class BatchNorm(layers.Layer): - r""" - - This interface is used to construct a callable object of the ``BatchNorm`` class. - For more details, refer to code examples. - It implements the function of the Batch Normalization Layer and can be used - as a normalizer function for conv2d and fully connected operations. - The data is normalized by the mean and variance of the channel based on the current batch data. - Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing - Internal Covariate Shift `_ - for more details. 
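
To make the normalization formulas that follow concrete, here is a small numpy sketch of what this (deleted) BatchNorm layer computes in training mode for NCHW input: per-channel mean and variance over the N, H, W axes, normalization, then the learned scale and shift. It illustrates the math only, not the batch_norm kernel.

import numpy as np

def batch_norm_nchw(x, gamma, beta, eps=1e-5):
    # x: [N, C, H, W]; gamma, beta: [C]
    mu = x.mean(axis=(0, 2, 3), keepdims=True)    # mini-batch mean per channel
    var = x.var(axis=(0, 2, 3), keepdims=True)    # mini-batch variance per channel
    x_hat = (x - mu) / np.sqrt(var + eps)         # normalize
    return gamma.reshape(1, -1, 1, 1) * x_hat + beta.reshape(1, -1, 1, 1)   # scale and shift

x = np.random.randn(3, 10, 3, 7).astype('float32')
out = batch_norm_nchw(x, np.ones(10, 'float32'), np.zeros(10, 'float32'))
print(out.mean(), out.std())   # roughly 0 and 1 after normalization
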
- - When use_global_stats = False, the :math:`\mu_{\beta}` - and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. - Calculated as follows: - - .. math:: - - \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad & - //\ mini-batch\ mean \\ - \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \mu_{\beta})^2 \qquad & - //\ mini-batch\ variance \\ - - - :math:`x` : mini-batch data - - :math:`m` : the size of the mini-batch data - - When use_global_stats = True, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. - They are global or running statistics (moving_mean and moving_variance). It usually got from the - pre-trained model. Calculated as follows: - - .. math:: - moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ - moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ - - The normalization function formula is as follows: - - .. math:: - - \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ - \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ - y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - - - - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero - - :math:`\gamma` : trainable proportional parameter - - :math:`\beta` : trainable deviation parameter - - Parameters: - num_channels(int): Indicate the number of channels of the input ``Tensor``. - act(str, optional): Activation to be applied to the output of batch normalization. Default: None. - is_test (bool, optional): A flag indicating whether it is in test phrase or not. - This flag only has effect on static graph mode. For dygraph mode, please use ``eval()``. - Default: False. - momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. - epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. - param_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` - of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. Default: None. - bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. - If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. - dtype(str, optional): Indicate the data type of the input ``Tensor``, - which can be float32 or float64. Default: float32. - data_layout(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC", where `N` is batch size, `C` is the number of the feature map, `H` is the height of the feature map, `W` is the width of the feature map. Default: NCHW. - in_place(bool, optional): Make the input and output of batch norm reuse memory. Default: False. - moving_mean_name(str, optional): The name of moving_mean which store the global Mean. Default: None. - moving_variance_name(str, optional): The name of the moving_variance which store the global Variance. Default: None. - do_model_average_for_mean_and_var(bool, optional): Whether parameter mean and variance should do model - average when model average is enabled. Default: True. - use_global_stats(bool, optional): Whether to use global mean and - variance. 
In inference or test mode, set use_global_stats to true - or is_test to true, and the behavior is equivalent. - In train mode, when setting use_global_stats True, the global mean - and variance are also used during train period. Default: False. - trainable_statistics(bool, optional): Whether to calculate mean and var in eval mode. In eval mode, when - setting trainable_statistics True, mean and variance will be calculated by current batch statistics. - Default: False. - - Returns: - None - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.fluid.dygraph.base import to_variable - - x = paddle.rand([3, 10, 3, 7], 'float32') - with fluid.dygraph.guard(): - x = to_variable(x) - batch_norm = fluid.BatchNorm(10) - hidden1 = batch_norm(x) - """ - - def __init__( - self, - num_channels, - act=None, - is_test=False, - momentum=0.9, - epsilon=1e-05, - param_attr=None, - bias_attr=None, - dtype='float32', - data_layout='NCHW', - in_place=False, - moving_mean_name=None, - moving_variance_name=None, - do_model_average_for_mean_and_var=True, - use_global_stats=False, - trainable_statistics=False, - ): - super().__init__() - self._param_attr = param_attr - self._bias_attr = bias_attr - self._act = act - self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] - - assert ( - bias_attr is not False - ), "bias_attr should not be False in batch_norm." - - if dtype == "float16": - self._dtype = "float32" - else: - self._dtype = dtype - - param_shape = [num_channels] - - # create parameter - self.weight = self.create_parameter( - attr=self._param_attr, - shape=param_shape, - dtype=self._dtype, - default_initializer=Constant(1.0), - ) - self.weight.stop_gradient = ( - use_global_stats and self._param_attr.learning_rate == 0.0 - ) - - self.bias = self.create_parameter( - attr=self._bias_attr, - shape=param_shape, - dtype=self._dtype, - is_bias=True, - ) - self.bias.stop_gradient = ( - use_global_stats and self._param_attr.learning_rate == 0.0 - ) - - self._mean = self.create_parameter( - attr=ParamAttr( - name=moving_mean_name, - initializer=Constant(0.0), - trainable=False, - do_model_average=do_model_average_for_mean_and_var, - ), - shape=param_shape, - dtype=self._dtype, - ) - self._mean.stop_gradient = True - - self._variance = self.create_parameter( - attr=ParamAttr( - name=moving_variance_name, - initializer=Constant(1.0), - trainable=False, - do_model_average=do_model_average_for_mean_and_var, - ), - shape=param_shape, - dtype=self._dtype, - ) - self._variance.stop_gradient = True - - self._in_place = in_place - self._data_layout = data_layout - self._momentum = momentum - self._epsilon = epsilon - self._is_test = is_test - self._fuse_with_relu = False - self._use_global_stats = use_global_stats - self._trainable_statistics = trainable_statistics - - def forward(self, input): - # create output - # mean and mean_out share the same memory - mean_out = self._mean - # variance and variance out share the same memory - variance_out = self._variance - - if in_dygraph_mode(): - batch_norm_out, t1, t2, t3, t4, _ = _C_ops.batch_norm( - input, - self._mean, - self._variance, - self.weight, - self.bias, - not self.training, - self._momentum, - self._epsilon, - self._data_layout, - self._use_global_stats, - self._trainable_statistics, - ) - return dygraph_utils._append_activation_in_dygraph( - batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn - ) - else: - check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64'], 'BatchNorm' - ) - - attrs = { 
- "momentum": self._momentum, - "epsilon": self._epsilon, - "is_test": self._is_test, - "data_layout": self._data_layout, - "use_mkldnn": False, - "fuse_with_relu": self._fuse_with_relu, - "use_global_stats": self._use_global_stats, - "trainable_statistics": self._trainable_statistics, - } - - inputs = { - "X": [input], - "Scale": [self.weight], - "Bias": [self.bias], - "Mean": [self._mean], - "Variance": [self._variance], - } - - saved_mean = self._helper.create_variable_for_type_inference( - dtype=self._dtype, stop_gradient=True - ) - saved_variance = self._helper.create_variable_for_type_inference( - dtype=self._dtype, stop_gradient=True - ) - reserve_space = self._helper.create_variable_for_type_inference( - dtype=self._helper.input_dtype(input), stop_gradient=True - ) - - batch_norm_out = ( - input - if self._in_place - else self._helper.create_variable_for_type_inference( - self._dtype - ) - ) - - outputs = { - "Y": [batch_norm_out], - "MeanOut": [mean_out], - "VarianceOut": [variance_out], - "SavedMean": [saved_mean], - "SavedVariance": [saved_variance], - } - if reserve_space is not None: - outputs["ReserveSpace"] = [reserve_space] - - self._helper.append_op( - type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs - ) - - # Currently, we don't support inplace in dygraph mode - return self._helper.append_activation(batch_norm_out, self._act) - - -class RowConv(layers.Layer): - """ - ***Row-convolution operator*** - - The row convolution is called lookahead convolution. This operator was introduced in the following paper for DeepSpeech2: - http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf - - The main motivation is that a bidirectional RNN, useful in DeepSpeech like speech models, learns representation for a sequence by performing a - forward and a backward pass through the entire sequence. However, unlike - unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online - and low-latency setting. The lookahead convolution incorporates information - from future subsequences in a computationally efficient manner to improve - unidirectional recurrent neural networks. The row convolution operator is - different from the 1D sequence convolution, and is computed as follows: - - Given an input sequence X of length t and input dimension D, and a filter (W) of size context * D. - - More details about row_conv please refer to the design document https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645 . - - Parameters: - name_scope(str): The name of this class. - future_context_size (int): Future context size. Please note, the shape - of convolution kernel is [future_context_size + 1, D]. - param_attr (ParamAttr): Attributes of parameters, including - name, initializer etc. Default: None. - act (str): Non-linear activation to be applied to output variable. Default: None. - - Attributes: - weight (Parameter): the learnable weights of this layer. - - Returns: - the output(Out) is a LodTensor, which supports variable time-length input sequences. - The underlying tensor in this LodTensor is a matrix with shape T x N, i.e., the same shape as X. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import numpy - - with fluid.dygraph.guard(): - x = numpy.random.random((16)).astype('float32') - rowConv = fluid.dygraph.nn.RowConv( - 'RowConv', future_context_size=2) - ret = rowConv(fluid.dygraph.base.to_variable(x)) - - """ - - def __init__( - self, name_scope, future_context_size, param_attr=None, act=None - ): - assert ( - not in_dygraph_mode() - ), "RowConv is not supported by dynamic graph mode yet!" - super().__init__(name_scope) - self._act = act - self._param_attr = param_attr - self._future_context_size = future_context_size - - def _build_once(self, input): - self._dtype = self._helper.input_dtype(input) - filter_shape = [self._future_context_size + 1, input.shape[1]] - self.weight = self.create_parameter( - attr=self._param_attr, - shape=filter_shape, - dtype=self._dtype, - is_bias=False, - ) - - def forward(self, input): - out = self._helper.create_variable_for_type_inference(self._dtype) - self._helper.append_op( - type='row_conv', - inputs={'X': [input], 'Filter': [self.weight]}, - outputs={'Out': [out]}, - ) - return self._helper.append_activation(out, act=self._act) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 936c6ee7034393..90c71abbaaa8ed 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -570,7 +570,7 @@ def __init__( assert ( in_dygraph_mode() - ), "It's not supported to construct DataParallel in static mode." + ), "It's not supported to construct DataParallel in static graph mode." self._layers = layers self.find_unused_parameters = find_unused_parameters diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py index e13fdac0e734b0..74826c9a6bcccc 100644 --- a/python/paddle/fluid/dygraph/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -306,7 +306,7 @@ def trace_op( stop_gradient=False, inplace_map=None, ): - if not framework._in_legacy_dygraph(): + if framework.in_dygraph_mode(): # inputs : {"sum": [tensor], ...} # outputs : {"sum": [tensor], ...} if type in name_mapping.keys(): diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index d56dbde378abf0..428cf3dbbe81d3 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -98,11 +98,10 @@ # 2. dygraph_mode(): # This flags inidicates we are now running in dygraph mode which called eager mode before. # 3. _in_legacy_dygraph(): -# This flags inidicates we are now running in legacy dygraph mode +# This flag has been deprecated # # They have a relation ship as below: -# Both dygraph_mode and _in_legacy_dygraph are _non_static_mode, but if you are running in -# dygraph mode means you are not in _in_legacy_dygraph. +# Since _in_legacy_dygraph is deprecated, dygraph_mode is equivalent to _non_static_mode # # Why we have to make different of _in_legacy_dygraph and dygraph_mode?
# In some performance issue, we find that python if statement cause server performance problem @@ -228,7 +227,7 @@ def in_dygraph_mode(): print(paddle.in_dynamic_mode()) # True, dynamic mode is turn ON by default since paddle 2.0.0 paddle.enable_static() - print(paddle.in_dynamic_mode()) # False, Now we are in static mode + print(paddle.in_dynamic_mode()) # False, Now we are in static graph mode paddle.disable_static() print(paddle.in_dynamic_mode()) # True, Now we are in dynamic mode @@ -237,10 +236,6 @@ def in_dygraph_mode(): return (_dygraph_tracer_ is not None) and _in_eager_mode_ -def _in_legacy_dygraph(): - return (not _in_eager_mode_) and (_dygraph_tracer_ is not None) - - def _non_static_mode(): return _dygraph_tracer_ is not None @@ -1334,8 +1329,6 @@ def __instancecheck__(cls, instance): if in_dygraph_mode(): return issubclass(t, core.eager.Tensor) else: - if _in_legacy_dygraph(): - return issubclass(t, core.VarBase) return issubclass(t, Variable) @@ -1346,8 +1339,6 @@ def __instancecheck__(cls, instance): if in_dygraph_mode(): return issubclass(t, EagerParamBase) else: - if _in_legacy_dygraph(): - return issubclass(t, ParamBase) return issubclass(t, Parameter) @@ -2842,7 +2833,7 @@ def __init__( op_attrs = dict() del attrs - # attr for static mode cuda graph + # attr for static graph mode cuda graph self._cuda_graph_attr = _current_cuda_graph_mode op_maker = core.op_proto_and_checker_maker @@ -2988,7 +2979,7 @@ def find_name(var_list, name): out_arg_names.append(arg) else: out_arg_names.append(arg.name) - # TODO(minqiyang): could we remove variable's op in static mode? + # TODO(minqiyang): could we remove variable's op in static graph mode? if not _non_static_mode(): if isinstance(arg, str): block.var(arg).op = self @@ -3893,31 +3884,18 @@ def _rename_var(self, name, new_name): error_clip=error_clip, ) else: - if _in_legacy_dygraph(): - var = ParamBase( - d.shape(), - d.dtype(), - type=orig_var_type, - name=new_name, - stop_gradient=stop_gradient, - trainable=trainable, - optimize_attr=optimize_attr, - regularizer=regularizer, - error_clip=error_clip, - ) - else: - var = Parameter( - self, - d.shape(), - d.dtype(), - type=orig_var_type, - name=new_name, - stop_gradient=stop_gradient, - trainable=trainable, - optimize_attr=optimize_attr, - regularizer=regularizer, - error_clip=error_clip, - ) + var = Parameter( + self, + d.shape(), + d.dtype(), + type=orig_var_type, + name=new_name, + stop_gradient=stop_gradient, + trainable=trainable, + optimize_attr=optimize_attr, + regularizer=regularizer, + error_clip=error_clip, + ) elif var_type == "Variable": var = Variable( self, @@ -3946,10 +3924,7 @@ def create_parameter(self, *args, **kwargs): if in_dygraph_mode(): param = EagerParamBase(*args, **kwargs) else: - if _in_legacy_dygraph(): - param = ParamBase(*args, **kwargs) - else: - param = Parameter(global_block, *args, **kwargs) + param = Parameter(global_block, *args, **kwargs) if 'initializer' in kwargs: @@ -4015,7 +3990,7 @@ def append_op(self, *args, **kwargs): # record ops in tracer rather than blocks # - # TODO(minqiyang): add op stop_gradient support in static mode too. + # TODO(minqiyang): add op stop_gradient support in static graph mode too. # currently, we only support stop_gradient in dygraph mode. 
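# A minimal sketch of the relation stated in the comments above (with
# _in_legacy_dygraph gone, "dygraph mode" and "non-static mode" describe the
# same state), using paddle.in_dynamic_mode(), the public counterpart of
# in_dygraph_mode() shown in the docstring above:
import paddle

paddle.disable_static()                 # eager / dygraph mode
assert paddle.in_dynamic_mode()

paddle.enable_static()                  # static graph mode
assert not paddle.in_dynamic_mode()
paddle.disable_static()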
_dygraph_tracer().trace_op( @@ -4262,35 +4237,21 @@ def _copy_param_info_from(self, other): name=v.name, ) else: - if _in_legacy_dygraph(): - new_p = ParamBase( - shape=v.shape, - dtype=v.dtype, - type=v.type, - lod_level=v.lod_level, - stop_gradient=p.stop_gradient, - trainable=p.trainable, - optimize_attr=p.optimize_attr, - regularizer=p.regularizer, - error_clip=p.error_clip, - name=v.name, - ) - else: - new_p = Parameter( - block=self, - shape=v.shape, - dtype=v.dtype, - type=v.type, - lod_level=v.lod_level - if v.type == core.VarDesc.VarType.LOD_TENSOR - else None, - stop_gradient=p.stop_gradient, - trainable=p.trainable, - optimize_attr=p.optimize_attr, - regularizer=p.regularizer, - error_clip=p.error_clip, - name=v.name, - ) + new_p = Parameter( + block=self, + shape=v.shape, + dtype=v.dtype, + type=v.type, + lod_level=v.lod_level + if v.type == core.VarDesc.VarType.LOD_TENSOR + else None, + stop_gradient=p.stop_gradient, + trainable=p.trainable, + optimize_attr=p.optimize_attr, + regularizer=p.regularizer, + error_clip=p.error_clip, + name=v.name, + ) self.vars[new_p.name] = new_p def _clone_variable(self, var, force_persistable=True): @@ -7512,7 +7473,7 @@ def device_guard(device=None): """ Note: - The API only supports static mode. + The API only supports static graph mode. A context manager that specifies the device on which the OP will be placed. @@ -7586,9 +7547,9 @@ def _cuda_graph_guard(cuda_graph_attr=None): """ Note: - The API only supports static mode. + The API only supports static graph mode. - A context manager that specifies the cuda_graph_mode which indicating the cuda graph capture under static mode. + A context manager that specifies the cuda_graph_mode which indicating the cuda graph capture under static graph mode. Args: cuda_graph_attr(str|None): The cuda graph attr with the format of: @@ -7596,7 +7557,7 @@ def _cuda_graph_guard(cuda_graph_attr=None): """ assert ( not _non_static_mode() - ), "cuda_graph_guard only works under static mode" + ), "cuda_graph_guard only works under static graph mode" assert ( core.is_compiled_with_cuda() ), "cuda_graph_guard context can be only used when Paddle is compiled with cuda" diff --git a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py index df198931199f59..4ec3c1d16e077e 100644 --- a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py +++ b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py @@ -185,7 +185,7 @@ def print_global_auc( # below is part of model emb = my_slot_net(slots, label) # emb can be fc layer of size 1 - similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\ + similarity_norm = fluid.layers.sigmoid(paddle.clip(\ emb, min=-15.0, max=15.0), name="similarity_norm")\ binary_predict = fluid.layers.concat(input=[\ paddle.subtract(\ @@ -1374,7 +1374,7 @@ def get_global_metrics( label = fluid.layers.data(name="click", shape=[-1, 1],\ dtype="int64", lod_level=0, append_batch_size=False) emb = my_slot_net(slots, label) # emb can be fc layer of size 1 - similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\ + similarity_norm = fluid.layers.sigmoid(paddle.clip(\ emb, min=-15.0, max=15.0), name="similarity_norm")\ binary_predict = fluid.layers.concat(input=[\ paddle.subtract(\ @@ -1574,7 +1574,7 @@ def print_global_metrics( label = fluid.layers.data(name="click", shape=[-1, 1],\ dtype="int64", lod_level=0, append_batch_size=False) emb = my_slot_net(slots, label) # emb can be fc layer of size 1 - similarity_norm = 
fluid.layers.sigmoid(fluid.layers.clip(\ + similarity_norm = fluid.layers.sigmoid(paddle.clip(\ emb, min=-15.0, max=15.0), name="similarity_norm")\ binary_predict = fluid.layers.concat(input=[\ paddle.subtract(\ diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py index 5756361f89e46f..bf1ad9b107f746 100644 --- a/python/paddle/fluid/install_check.py +++ b/python/paddle/fluid/install_check.py @@ -25,7 +25,7 @@ from .initializer import Constant from . import layers from . import backward -from .dygraph import Layer, nn +from .dygraph import Layer from . import executor from . import optimizer from . import core diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index bb5d06157e1204..6e4b1f836f020c 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -272,7 +272,6 @@ def func(x, name=None): op = getattr(_C_ops, op_type) return op(x) # TODO(dev): Because some ops' yaml has not been migrated. - # Replace it with _in_legacy_dygraph while all yaml work is done. if in_dygraph_mode() and hasattr(_legacy_C_ops, op_type): op = getattr(_legacy_C_ops, op_type) return op(x) diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py index 1cfcc68088806e..126bc1c6eb62c6 100644 --- a/python/paddle/fluid/layers/math_op_patch.py +++ b/python/paddle/fluid/layers/math_op_patch.py @@ -155,12 +155,12 @@ def cuda(self): @static_only def place(self): """ - Variable don't have 'place' interface in static mode + Variable don't have 'place' interface in static graph mode But this interface can greatly facilitate dy2static. So we give a warnning here and return None. """ warnings.warn( - "Variable do not have 'place' interface for static mode, try not to use it. None will be returned." + "Variable do not have 'place' interface for static graph mode, try not to use it. None will be returned." ) return None diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 264c8ce6da94e8..c11a541df53267 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -63,10 +63,6 @@ 'fc', 'embedding', 'autoincreased_step_counter', - 'clip', - 'clip_by_norm', - 'merge_selected_rows', - 'get_tensor_from_selected_rows', ] OP_NAMEMAPPING = { @@ -997,199 +993,3 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): ) return out - - -@templatedoc() -def clip(x, min, max, name=None): - """ - :old_api: paddle.fluid.layers.clip - - ${comment} - - Args: - x(${x_type}): ${x_comment} - min(float): ${min_comment} - max(float): ${max_comment} - name(str, optional): The default value is None. - Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name` - - Returns: - ${out_comment} - - Return Type: - ${out_type} - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - input = fluid.data( - name='data', shape=[1], dtype='float32') - reward = fluid.layers.clip(x=input, min=-1.0, max=1.0) - """ - - helper = LayerHelper("clip", **locals()) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'clip') - - if name is None: - name = unique_name.generate_with_ignorable_key( - ".".join([helper.name, 'tmp']) - ) - - out = helper.create_variable( - type=x.type, name=name, dtype=x.dtype, persistable=False - ) - - helper.append_op( - type="clip", - inputs={"X": x}, - attrs={"min": min, "max": max}, - outputs={"Out": out}, - ) - - return out - - -@templatedoc() -def clip_by_norm(x, max_norm, name=None): - """ - ${comment} - - Args: - x(${x_type}): ${x_comment} - max_norm(${max_norm_type}): ${max_norm_comment} - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - - Returns: - Tensor: - - out(${out_type}): ${out_comment} - - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - - input = paddle.to_tensor([[2.0, 2.0], [2.0, 2.0]], dtype='float32') - reward = fluid.layers.clip_by_norm(x=input, max_norm=1.0) - # [[0.5, 0.5], [0.5, 0.5]] - """ - - if in_dygraph_mode(): - return _C_ops.clip_by_norm(x, max_norm) - else: - helper = LayerHelper("clip_by_norm", **locals()) - check_variable_and_dtype(x, 'X', ['float32', 'float16'], 'clip_by_norm') - check_type(max_norm, 'max_norm', (float), 'clip_by_norm') - - if name is None: - name = unique_name.generate_with_ignorable_key( - ".".join([helper.name, 'tmp']) - ) - - out = helper.create_variable( - type=x.type, name=name, dtype=x.dtype, persistable=False - ) - - helper.append_op( - type="clip_by_norm", - inputs={"X": x}, - attrs={"max_norm": max_norm}, - outputs={"Out": out}, - ) - - return out - - -@templatedoc() -def merge_selected_rows(x, name=None): - """ - ${comment} - - Args: - x(${x_type}): ${x_comment} - name(basestring|None): Name of the output. - - Returns: - out(${out_type}): ${out_comment} - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - b = fluid.default_main_program().global_block() - var = b.create_var( - name="X", dtype="float32", persistable=True, - type=fluid.core.VarDesc.VarType.SELECTED_ROWS) - y = fluid.layers.merge_selected_rows(var) - """ - if in_dygraph_mode(): - return _C_ops.merge_selected_rows(x) - else: - helper = LayerHelper("merge_selected_rows", **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type="merge_selected_rows", - inputs={"X": x}, - attrs={}, - outputs={"Out": out}, - ) - return out - - -@templatedoc() -def get_tensor_from_selected_rows(x, name=None): - """ - This operator gets tensor data from input with SelectedRows type, and outputs a LoDTensor. - - .. code-block:: text - - input x is SelectedRows: - x.rows = [0, 5, 5, 4, 19] - x.height = 20 - x.value = [[1, 1] [2, 2] [2, 2] [3, 3] [6, 6]] - - Output is LoDTensor: - out.shape = [5, 2] - out.data = [[1, 1], - [2, 2], - [2, 2], - [3, 3], - [6, 6]] - - Args: - x(SelectedRows): Input with SelectedRows type. The data type is float32, float64, int32 or int64. - name(str, optional): The default value is None. Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name` . - - Returns: - Variable: LoDTensor transformed from SelectedRows. The data type is same with input. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - b = fluid.default_main_program().global_block() - input = b.create_var(name="X", dtype="float32", persistable=True, type=fluid.core.VarDesc.VarType.SELECTED_ROWS) - out = fluid.layers.get_tensor_from_selected_rows(input) - """ - - check_type(x, 'x', Variable, 'get_tensor_from_selected_rows') - if x.type != core.VarDesc.VarType.SELECTED_ROWS: - raise TypeError( - "The type of 'x' in get_tensor_from_selected_rows must be SELECTED_ROWS." - ) - helper = LayerHelper('get_tensor_from_selected_rows', **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type='get_tensor_from_selected_rows', - inputs={'X': x}, - outputs={'Out': out}, - attrs={}, - ) - return out diff --git a/python/paddle/fluid/layers/utils.py b/python/paddle/fluid/layers/utils.py index 7e3e69fda7c072..7cf049fd05d51b 100644 --- a/python/paddle/fluid/layers/utils.py +++ b/python/paddle/fluid/layers/utils.py @@ -484,7 +484,7 @@ def try_set_static_shape_tensor(tensor, shape): """ if not _non_static_mode(): - # static mode, and shape is not all inferred (contains -1) + # static graph mode, and shape is not all inferred (contains -1) if -1 in tensor.shape: if isinstance(shape, Variable): shape = try_get_constant_shape_from_tensor(shape) diff --git a/python/paddle/fluid/lazy_init.py b/python/paddle/fluid/lazy_init.py index 6242ad2c4eded0..54755c0787947f 100644 --- a/python/paddle/fluid/lazy_init.py +++ b/python/paddle/fluid/lazy_init.py @@ -19,7 +19,7 @@ class LazyInitHelper: """ - A Helper Context to trigger switching mode between dygraph and static mode, + A Helper Context to trigger switching mode between dygraph and static graph mode, and holds the startup program resource. """ @@ -54,7 +54,7 @@ def disable(self): def __enter__(self): """ Switch into lazy mode and set _dygraph_tracer_ with None to convert - dygraph mode into static mode. + dygraph mode into static graph mode. """ self.enable() if self._in_guard: diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 4528ea12771e6c..cbbe8dbadef12f 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -38,13 +38,6 @@ _append_grad_suffix_, _get_no_grad_set_name, ) -from .clip import ( - GradientClipBase, - GradientClipByNorm, - error_clip_callback, - append_gradient_clip_ops, - ClipGradByGlobalNorm, -) from .framework import program_guard from .initializer import Constant from .layer_helper import LayerHelper @@ -160,7 +153,7 @@ def __init__( ) if grad_clip is not None: - if not isinstance(grad_clip, GradientClipBase): + if not isinstance(grad_clip, paddle.nn.clip.GradientClipBase): raise TypeError( "'grad_clip' should be an instance of GradientClipBase's derived class" ) @@ -1030,7 +1023,7 @@ def backward( params_grads.append((param, grad_var)) else: if callbacks is None: - callbacks = [error_clip_callback] + callbacks = [paddle.nn.clip.error_clip_callback] else: assert isinstance(callbacks, list) program = loss.block.program @@ -1260,7 +1253,7 @@ def apply_gradients(self, params_grads): # NOTE(zhiqiu): currently, only support ClipGradByGlobalNorm and without regularization. 
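# A minimal usage sketch of the paddle.nn.ClipGradByGlobalNorm class referenced
# below, shown with the public optimizer API (illustrative only; values are
# arbitrary):
import paddle

linear = paddle.nn.Linear(10, 10)
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
sgd = paddle.optimizer.SGD(
    learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip
)
loss = paddle.mean(linear(paddle.rand([4, 10])) ** 2)
loss.backward()
sgd.step()        # gradients are rescaled so their global norm is at most 1.0
sgd.clear_grad()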
if self._flatten_param_grads and self.regularization is None: if self._grad_clip is None or isinstance( - self._grad_clip, ClipGradByGlobalNorm + self._grad_clip, paddle.nn.ClipGradByGlobalNorm ): params_grads = self.flatten_param_grads(params_grads) @@ -1268,7 +1261,7 @@ def apply_gradients(self, params_grads): if self._grad_clip is not None: params_grads = self._grad_clip(params_grads) else: - params_grads = append_gradient_clip_ops(params_grads) + params_grads = paddle.nn.clip.append_gradient_clip_ops(params_grads) # Add regularization if any params_grads = self.append_regularization_ops( @@ -1414,7 +1407,7 @@ class SGDOptimizer(Optimizer): Can be a float value or a Variable with one float value as data element. parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ @@ -1605,7 +1598,7 @@ class MomentumOptimizer(Optimizer): momentum (float): Momentum factor parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. use_nesterov (bool, optional): Enables Nesterov momentum, default is false. regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ @@ -1752,7 +1745,7 @@ class LarsMomentumOptimizer(Optimizer): lars_weight_decay (float): Weight decay coefficient for decaying using LARS. parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ @@ -2014,7 +2007,7 @@ class AdagradOptimizer(Optimizer): The default value is 1e-06. parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . 
If a parameter has set \ regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ @@ -2160,7 +2153,7 @@ class AdamOptimizer(Optimizer): The default value is 1e-08. parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ @@ -2587,7 +2580,7 @@ class AdamaxOptimizer(Optimizer): The default value is 1e-08. parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ @@ -2793,7 +2786,7 @@ class DpsgdOptimizer(Optimizer): sigma (float): for gaussian noise. parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. Notes: Currently, DpsgdOptimizer doesn't support sparse parameter optimization. """ @@ -2896,7 +2889,7 @@ class DecayedAdagradOptimizer(Optimizer): The default value is 1e-06. parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ @@ -3021,7 +3014,7 @@ class AdadeltaOptimizer(Optimizer): rho (float): a floating point value indicating the decay rate. Default 0.95. parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . 
If a parameter has set \ regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ @@ -3193,7 +3186,7 @@ class RMSPropOptimizer(Optimizer): computation and memory. Defaults to False. parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ @@ -3390,7 +3383,7 @@ class FtrlOptimizer(Optimizer): lr_power (float): Learning Rate Power, default is -0.5. parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ @@ -3571,7 +3564,7 @@ class LambOptimizer(AdamOptimizer): epsilon (float, optional): A small float value for numerical stability. Default 1e-6. parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . 
If a parameter has set \ regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 90e5f71b4f0730..82dee029f523ed 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -1347,7 +1347,7 @@ def __init__( self._iterable = iterable self._return_list = return_list if not self._feed_list: - raise Exception("Feed list must be given under static mode.") + raise Exception("Feed list must be given under static graph mode.") self._use_double_buffer = use_double_buffer self._capacity = capacity if not self._iterable: diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py index 599edf09b7f1b1..8d8046f19aa79f 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py @@ -58,7 +58,7 @@ def custom_relu_static( exe = static.Executor() exe.run(static.default_startup_program()) - # in static mode, x data has been covered by out + # in static graph mode, x data has been covered by out out_v = exe.run( static.default_main_program(), feed={'X': np_x}, @@ -84,7 +84,7 @@ def custom_relu_static_pe(func, device, dtype, np_x, use_func=True): exe = static.Executor() exe.run(static.default_startup_program()) - # in static mode, x data has been covered by out + # in static graph mode, x data has been covered by out compiled_prog = static.CompiledProgram( static.default_main_program() ).with_data_parallel(loss_name=out.name, places=places) diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_xpu_setup.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_xpu_setup.py index 655bdeca022a1d..7b251e8063a05e 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_xpu_setup.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_xpu_setup.py @@ -57,7 +57,7 @@ def custom_relu_static( exe = static.Executor() exe.run(static.default_startup_program()) - # in static mode, x data has been covered by out + # in static graph mode, x data has been covered by out out_v = exe.run( static.default_main_program(), feed={'X': np_x}, @@ -83,7 +83,7 @@ def custom_relu_static_pe(func, device, dtype, np_x, use_func=True): exe = static.Executor() exe.run(static.default_startup_program()) - # in static mode, x data has been covered by out + # in static graph mode, x data has been covered by out compiled_prog = static.CompiledProgram( static.default_main_program() ).with_data_parallel(loss_name=out.name, places=places) @@ -150,26 +150,29 @@ def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True): t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False) out = func(t) if use_func else paddle.nn.functional.relu(t) - out.stop_gradient = False - dx = paddle.grad( - outputs=[out], inputs=[t], create_graph=True, retain_graph=True + outputs=out, + inputs=t, + grad_outputs=paddle.ones_like(t), + create_graph=True, + retain_graph=True, ) - dx[0].backward() - - assert dx[0].grad is not None - return dx[0].numpy(), dx[0].grad.numpy() + ddout = paddle.grad( + outputs=dx[0], + inputs=out.grad, + grad_outputs=paddle.ones_like(t), + create_graph=False, + ) fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) + assert ddout[0].numpy() is not None + return dx[0].numpy(), ddout[0].numpy() -class TestNewCustomOpSetUpInstall(unittest.TestCase): + +class 
TestNewCustomOpXpuSetUpInstall(unittest.TestCase): def setUp(self): cur_dir = os.path.dirname(os.path.abspath(__file__)) - # compile, install the custom op egg into site-packages under background - # Currently custom XPU op does not support Windows - if os.name == 'nt': - return cmd = 'cd {} && {} custom_relu_xpu_setup.py install'.format( cur_dir, sys.executable ) @@ -192,7 +195,7 @@ def setUp(self): self.custom_op = custom_relu_xpu_module_setup.custom_relu self.dtypes = ['float32', 'float64'] - self.devices = ['xpu'] + self.device = 'xpu' # config seed SEED = 2021 @@ -200,91 +203,90 @@ def setUp(self): paddle.framework.random._manual_program_seed(SEED) def test_static(self): - for device in self.devices: - for dtype in self.dtypes: - x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) - out = custom_relu_static(self.custom_op, device, dtype, x) - pd_out = custom_relu_static( - self.custom_op, device, dtype, x, False - ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + out = custom_relu_static(self.custom_op, self.device, dtype, x) + pd_out = custom_relu_static( + self.custom_op, self.device, dtype, x, False + ) + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op out: {},\n paddle api out: {}'.format( + out, pd_out + ), + ) def test_static_pe(self): - for device in self.devices: - for dtype in self.dtypes: - x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) - out = custom_relu_static_pe(self.custom_op, device, dtype, x) - pd_out = custom_relu_static_pe( - self.custom_op, device, dtype, x, False - ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + out = custom_relu_static_pe(self.custom_op, self.device, dtype, x) + pd_out = custom_relu_static_pe( + self.custom_op, self.device, dtype, x, False + ) + np.testing.assert_allclose( + out, + pd_out, + atol=1e-2, + err_msg='custom op out: {},\n paddle api out: {}'.format( + out, pd_out + ), + ) def test_dynamic(self): - for device in self.devices: - for dtype in self.dtypes: - x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) - out, x_grad = custom_relu_dynamic( - self.custom_op, device, dtype, x - ) - pd_out, pd_x_grad = custom_relu_dynamic( - self.custom_op, device, dtype, x, False - ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) - np.testing.assert_array_equal( - x_grad, - pd_x_grad, - err_msg='custom op x grad: {},\n paddle api x grad: {}'.format( - x_grad, pd_x_grad - ), - ) + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + out, x_grad = custom_relu_dynamic( + self.custom_op, self.device, dtype, x + ) + pd_out, pd_x_grad = custom_relu_dynamic( + self.custom_op, self.device, dtype, x, False + ) + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op out: {},\n paddle api out: {}'.format( + out, pd_out + ), + ) + np.testing.assert_array_equal( + x_grad, + pd_x_grad, + err_msg='custom op x grad: {},\n paddle api x grad: {}'.format( + x_grad, pd_x_grad + ), + ) def test_static_save_and_load_inference_model(self): paddle.enable_static() np_data = np.random.random((1, 1, 28, 28)).astype("float32") np_label = np.random.random((1, 1)).astype("int64") path_prefix = 
"self.custom_op_inference/custom_relu" - for device in self.devices: - predict = custom_relu_static_inference( - self.custom_op, device, np_data, np_label, path_prefix + + predict = custom_relu_static_inference( + self.custom_op, self.device, np_data, np_label, path_prefix + ) + # load inference model + with static.scope_guard(static.Scope()): + exe = static.Executor() + [ + inference_program, + feed_target_names, + fetch_targets, + ] = static.load_inference_model(path_prefix, exe) + predict_infer = exe.run( + inference_program, + feed={feed_target_names[0]: np_data}, + fetch_list=fetch_targets, + ) + np.testing.assert_allclose( + predict, + predict_infer, + atol=1e-2, + err_msg='custom op predict: {},\n custom op infer predict: {}'.format( + predict, predict_infer + ), ) - # load inference model - with static.scope_guard(static.Scope()): - exe = static.Executor() - [ - inference_program, - feed_target_names, - fetch_targets, - ] = static.load_inference_model(path_prefix, exe) - predict_infer = exe.run( - inference_program, - feed={feed_target_names[0]: np_data}, - fetch_list=fetch_targets, - ) - np.testing.assert_array_equal( - predict, - predict_infer, - err_msg='custom op predict: {},\n custom op infer predict: {}'.format( - predict, predict_infer - ), - ) paddle.disable_static() def test_static_save_and_run_inference_predictor(self): @@ -294,92 +296,97 @@ def test_static_save_and_run_inference_predictor(self): path_prefix = "self.custom_op_inference/custom_relu" from paddle.inference import Config, create_predictor - for device in self.devices: - predict = custom_relu_static_inference( - self.custom_op, device, np_data, np_label, path_prefix - ) - # load inference model - config = Config( - path_prefix + ".pdmodel", path_prefix + ".pdiparams" + predict = custom_relu_static_inference( + self.custom_op, self.device, np_data, np_label, path_prefix + ) + # load inference model + config = Config(path_prefix + ".pdmodel", path_prefix + ".pdiparams") + predictor = create_predictor(config) + input_tensor = predictor.get_input_handle( + predictor.get_input_names()[0] + ) + input_tensor.reshape(np_data.shape) + input_tensor.copy_from_cpu(np_data.copy()) + predictor.run() + output_tensor = predictor.get_output_handle( + predictor.get_output_names()[0] + ) + predict_infer = output_tensor.copy_to_cpu() + predict = np.array(predict).flatten() + predict_infer = np.array(predict_infer).flatten() + np.testing.assert_allclose( + predict, + predict_infer, + rtol=5e-5, + atol=1e-2, + err_msg="custom op predict: {},\n custom op infer predict: {}".format( + predict, predict_infer + ), + ) + paddle.disable_static() + + def test_func_double_grad_dynamic(self): + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + out, dx_grad = custom_relu_double_grad_dynamic( + self.custom_op, self.device, dtype, x ) - predictor = create_predictor(config) - input_tensor = predictor.get_input_handle( - predictor.get_input_names()[0] + pd_out, pd_dx_grad = custom_relu_double_grad_dynamic( + self.custom_op, self.device, dtype, x, False ) - input_tensor.reshape(np_data.shape) - input_tensor.copy_from_cpu(np_data.copy()) - predictor.run() - output_tensor = predictor.get_output_handle( - predictor.get_output_names()[0] + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op out: {},\n paddle api out: {}'.format( + out, pd_out + ), ) - predict_infer = output_tensor.copy_to_cpu() - self.assertTrue( - np.isclose(predict, predict_infer, rtol=5e-5).any(), - "custom op predict: {},\n custom op 
infer predict: {}".format( - predict, predict_infer + np.testing.assert_array_equal( + dx_grad, + pd_dx_grad, + err_msg='custom op dx grad: {},\n paddle api dx grad: {}'.format( + dx_grad, pd_dx_grad ), ) - paddle.disable_static() - - def test_func_double_grad_dynamic(self): - for device in self.devices: - for dtype in self.dtypes: - x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) - out, dx_grad = custom_relu_double_grad_dynamic( - self.custom_op, device, dtype, x - ) - pd_out, pd_dx_grad = custom_relu_double_grad_dynamic( - self.custom_op, device, dtype, x, False - ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) - np.testing.assert_array_equal( - dx_grad, - pd_dx_grad, - err_msg='custom op dx grad: {},\n paddle api dx grad: {}'.format( - dx_grad, pd_dx_grad - ), - ) def test_with_dataloader(self): paddle.disable_static() - for device in self.devices: - paddle.set_device(device) - # data loader - transform = Compose( - [Normalize(mean=[127.5], std=[127.5], data_format='CHW')] - ) - train_dataset = paddle.vision.datasets.MNIST( - mode='train', transform=transform - ) - train_loader = paddle.io.DataLoader( - train_dataset, - batch_size=64, - shuffle=True, - drop_last=True, - num_workers=0, - ) + paddle.set_device(self.device) + # data loader + transform = Compose( + [Normalize(mean=[127.5], std=[127.5], data_format='CHW')] + ) + train_dataset = paddle.vision.datasets.MNIST( + mode='train', transform=transform + ) + train_loader = paddle.io.DataLoader( + train_dataset, + batch_size=64, + shuffle=True, + drop_last=True, + num_workers=0, + ) - for batch_id, (image, _) in enumerate(train_loader()): - out = self.custom_op(image) - pd_out = paddle.nn.functional.relu(image) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) + for batch_id, (image, _) in enumerate(train_loader()): + out = self.custom_op(image) + pd_out = paddle.nn.functional.relu(image) + np.testing.assert_allclose( + out, + pd_out, + atol=1e-2, + err_msg='custom op out: {},\n paddle api out: {}'.format( + out, pd_out + ), + ) - if batch_id == 5: - break + if batch_id == 5: + break paddle.enable_static() if __name__ == '__main__': + # compile, install the custom op egg into site-packages under background + # Currently custom XPU op does not support Windows + if os.name == 'nt': + exit() unittest.main() diff --git a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt index 1dd6ef6776750c..00eef2d5a77316 100644 --- a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt +++ b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt @@ -28,4 +28,5 @@ if(WITH_CUSTOM_DEVICE AND NOT WITH_GPU) set_tests_properties(test_custom_cpu_profiler_plugin PROPERTIES TIMEOUT 120) set_tests_properties(test_fleet_launch_custom_device PROPERTIES TIMEOUT 120) set_tests_properties(test_custom_cpu_to_static PROPERTIES TIMEOUT 120) + set_tests_properties(test_custom_device_relu_setup PROPERTIES TIMEOUT 120) endif() diff --git a/python/paddle/fluid/tests/custom_runtime/custom_relu_op.cc b/python/paddle/fluid/tests/custom_runtime/custom_relu_op.cc new file mode 100644 index 00000000000000..da0563ffeb10e3 --- /dev/null +++ b/python/paddle/fluid/tests/custom_runtime/custom_relu_op.cc @@ -0,0 +1,193 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/extension.h" + +#define CHECK_CPU_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") +#define CHECK_CUSTOM_INPUT(x) \ + PD_CHECK(x.is_custom_device(), #x " must be a custom Tensor.") + +template +void relu_cpu_forward_kernel(const data_t* x_data, + data_t* out_data, + int64_t x_numel) { + PD_CHECK(x_data != nullptr, "x_data is nullptr."); + PD_CHECK(out_data != nullptr, "out_data is nullptr."); + for (int64_t i = 0; i < x_numel; ++i) { + out_data[i] = std::max(static_cast(0.), x_data[i]); + } +} + +template +void relu_cpu_backward_kernel(const data_t* grad_out_data, + const data_t* out_data, + data_t* grad_x_data, + int64_t out_numel) { + for (int64_t i = 0; i < out_numel; ++i) { + grad_x_data[i] = + grad_out_data[i] * (out_data[i] > static_cast(0) ? 1. : 0.); + } +} + +template +void relu_cpu_double_backward_kernel(const data_t* out_data, + const data_t* ddx_data, + data_t* ddout_data, + int64_t ddout_numel) { + for (int64_t i = 0; i < ddout_numel; ++i) { + ddout_data[i] = + ddx_data[i] * (out_data[i] > static_cast(0) ? 1. : 0.); + } +} + +std::vector relu_cpu_forward(const paddle::Tensor& x) { + CHECK_CPU_INPUT(x); + auto out = paddle::empty_like(x); + + PD_DISPATCH_FLOATING_TYPES( + x.type(), "relu_cpu_forward", ([&] { + relu_cpu_forward_kernel( + x.data(), out.data(), x.numel()); + })); + + return {out}; +} + +std::vector relu_cpu_backward(const paddle::Tensor& x, + const paddle::Tensor& out, + const paddle::Tensor& grad_out) { + auto grad_x = paddle::empty_like(x); + + PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] { + relu_cpu_backward_kernel( + grad_out.data(), + out.data(), + grad_x.data(), + out.size()); + })); + + return {grad_x}; +} + +std::vector relu_cpu_double_backward( + const paddle::Tensor& out, const paddle::Tensor& ddx) { + CHECK_CPU_INPUT(out); + CHECK_CPU_INPUT(ddx); + auto ddout = paddle::empty(out.shape(), out.dtype(), out.place()); + + PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_double_backward", ([&] { + relu_cpu_double_backward_kernel( + out.data(), + ddx.data(), + ddout.mutable_data(out.place()), + ddout.size()); + })); + + return {ddout}; +} + +std::vector relu_custom_forward(const paddle::Tensor& x) { + CHECK_CUSTOM_INPUT(x); + auto out = paddle::relu(x); + return {out}; +} + +std::vector relu_custom_backward( + const paddle::Tensor& x, + const paddle::Tensor& out, + const paddle::Tensor& grad_out) { + CHECK_CUSTOM_INPUT(x); + CHECK_CUSTOM_INPUT(out); + auto grad_x = paddle::empty_like(x, x.dtype(), x.place()); + auto ones = paddle::experimental::full_like(x, 1.0, x.dtype(), x.place()); + auto zeros = paddle::experimental::full_like(x, 0.0, x.dtype(), x.place()); + auto condition = paddle::experimental::greater_than(x, zeros); + + grad_x = paddle::multiply(grad_out, paddle::where(condition, ones, zeros)); + + return {grad_x}; +} + +std::vector relu_custom_double_backward( + const paddle::Tensor& out, const paddle::Tensor& ddx) { + 
CHECK_CUSTOM_INPUT(out); + auto ddout = paddle::empty(out.shape(), out.dtype(), out.place()); + auto ones = + paddle::experimental::full_like(out, 1.0, out.dtype(), out.place()); + auto zeros = + paddle::experimental::full_like(out, 0.0, out.dtype(), out.place()); + auto condition = paddle::experimental::greater_than(out, zeros); + + ddout = paddle::multiply(ddx, paddle::where(condition, ones, zeros)); + + return {ddout}; +} + +std::vector ReluForward(const paddle::Tensor& x) { + if (x.is_cpu()) { + return relu_cpu_forward(x); + } else if (x.is_custom_device()) { + return relu_custom_forward(x); + } else { + PD_THROW("Not implemented."); + } +} + +std::vector ReluBackward(const paddle::Tensor& x, + const paddle::Tensor& out, + const paddle::Tensor& grad_out) { + if (x.is_cpu()) { + return relu_cpu_backward(x, out, grad_out); + } else if (x.is_custom_device()) { + return relu_custom_backward(x, out, grad_out); + } else { + PD_THROW("Not implemented."); + } +} + +std::vector ReluDoubleBackward(const paddle::Tensor& out, + const paddle::Tensor& ddx) { + if (out.is_cpu()) { + return relu_cpu_double_backward(out, ddx); + } else if (out.is_custom_device()) { + return relu_custom_double_backward(out, ddx); + } else { + PD_THROW("Not implemented."); + } +} + +std::vector> ReluDoubleBackwardInferShape( + const std::vector& out_shape, + const std::vector& ddx_shape) { + return {out_shape}; +} + +PD_BUILD_OP(custom_relu) + .Inputs({"X"}) + .Outputs({"Out"}) + .SetKernelFn(PD_KERNEL(ReluForward)); + +PD_BUILD_GRAD_OP(custom_relu) + .Inputs({"X", "Out", paddle::Grad("Out")}) + .Outputs({paddle::Grad("X")}) + .SetKernelFn(PD_KERNEL(ReluBackward)); + +PD_BUILD_DOUBLE_GRAD_OP(custom_relu) + .Inputs({"Out", paddle::Grad(paddle::Grad("X"))}) + .Outputs({paddle::Grad(paddle::Grad("Out"))}) + .SetKernelFn(PD_KERNEL(ReluDoubleBackward)) + .SetInferShapeFn(PD_INFER_SHAPE(ReluDoubleBackwardInferShape)); diff --git a/python/paddle/fluid/tests/custom_runtime/test_custom_device_relu_setup.py b/python/paddle/fluid/tests/custom_runtime/test_custom_device_relu_setup.py new file mode 100644 index 00000000000000..760ad56cc3380e --- /dev/null +++ b/python/paddle/fluid/tests/custom_runtime/test_custom_device_relu_setup.py @@ -0,0 +1,325 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
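# A small reference sketch of the relation implemented by the double-backward
# kernels above (ddout = ddx * (out > 0)), written with plain eager-mode ops;
# tensor values here are arbitrary:
import paddle

out = paddle.nn.functional.relu(paddle.to_tensor([[-1.0, 0.5], [2.0, -3.0]]))
ddx = paddle.ones_like(out)                        # incoming grad-of-grad
ddout_ref = ddx * paddle.cast(out > 0, out.dtype)
print(ddout_ref)                                   # zero wherever relu was inactive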
+ +import os +import sys +import tempfile +import unittest +from site import getsitepackages + +import numpy as np + + +def custom_relu_dynamic(func, device, dtype, np_x, use_func=True): + import paddle + + paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + paddle.set_device(device) + + t = paddle.to_tensor(np_x, dtype=dtype) + t.stop_gradient = False + sys.stdout.flush() + + out = func(t) if use_func else paddle.nn.functional.relu(t) + out.stop_gradient = False + + out.backward() + + paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) + if t.grad is None: + return out.numpy(), t.grad + else: + return out.numpy(), t.grad.numpy() + + +def custom_relu_static(func, device, dtype, np_x, use_func=True): + import paddle + import paddle.static as static + + paddle.enable_static() + paddle.set_device(device) + + with static.scope_guard(static.Scope()): + with static.program_guard(static.Program()): + x = static.data(name="X", shape=[None, 8], dtype=dtype) + x.stop_gradient = False + out = func(x) if use_func else paddle.nn.functional.relu(x) + static.append_backward(out) + + exe = static.Executor() + exe.run(static.default_startup_program()) + # in static mode, x data has been covered by out + out_v = exe.run( + static.default_main_program(), + feed={"X": np_x}, + fetch_list=[out.name], + ) + + paddle.disable_static() + return out_v + + +def custom_relu_static_pe(func, device, dtype, np_x, use_func=True): + import paddle + import paddle.static as static + + paddle.enable_static() + paddle.set_device(device) + + places = paddle.CustomPlace("custom_cpu", 0) + + with static.scope_guard(static.Scope()): + with static.program_guard(static.Program()): + x = static.data(name="X", shape=[None, 8], dtype=dtype) + x.stop_gradient = False + out = func(x) if use_func else paddle.nn.functional.relu(x) + static.append_backward(out) + + exe = static.Executor() + exe.run(static.default_startup_program()) + + # in static mode, x data has been covered by out + compiled_prog = static.CompiledProgram( + static.default_main_program() + ).with_data_parallel(loss_name=out.name, places=places) + out_v = exe.run( + compiled_prog, feed={"X": np_x}, fetch_list=[out.name] + ) + + paddle.disable_static() + return out_v + + +def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True): + import paddle + + paddle.set_device(device) + paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + + t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False) + + out = func(t) if use_func else paddle.nn.functional.relu(t) + dx = paddle.grad( + outputs=out, + inputs=t, + grad_outputs=paddle.ones_like(t), + create_graph=True, + retain_graph=True, + ) + + ddout = paddle.grad( + outputs=dx[0], + inputs=out.grad, + grad_outputs=paddle.ones_like(t), + create_graph=False, + ) + + paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) + assert ddout[0].numpy() is not None + return dx[0].numpy(), ddout[0].numpy() + + +class TestNewCustomOpSetUpInstall(unittest.TestCase): + def setUp(self): + # compile so and set to current path + self.cur_dir = os.path.dirname(os.path.abspath(__file__)) + self.temp_dir = tempfile.TemporaryDirectory() + cmd = 'cd {} \ + && git clone {} \ + && cd PaddleCustomDevice \ + && git fetch origin \ + && git checkout {} -b dev \ + && cd backends/custom_cpu \ + && mkdir build && cd build && cmake .. 
&& make -j8 \ + && cd {}'.format( + self.temp_dir.name, + os.getenv('PLUGIN_URL'), + os.getenv('PLUGIN_TAG'), + self.cur_dir, + ) + os.system(cmd) + + # set environment for loading and registering compiled custom kernels + # only valid in current process + os.environ['CUSTOM_DEVICE_ROOT'] = os.path.join( + self.cur_dir, + '{}/PaddleCustomDevice/backends/custom_cpu/build'.format( + self.temp_dir.name + ), + ) + + # `import paddle` loads custom_cpu.so, hence we must import paddle after finishing build PaddleCustomDevice + import paddle + + # [Why specific paddle_includes directory?] + # Add paddle_includes to pass CI, for more details, + # please refer to the comments in `paddle/fluid/tests/custom_op/utils.py`` + paddle_includes = [] + for site_packages_path in getsitepackages(): + paddle_includes.append( + os.path.join(site_packages_path, 'paddle', 'include') + ) + paddle_includes.append( + os.path.join( + site_packages_path, 'paddle', 'include', 'third_party' + ) + ) + + custom_module = paddle.utils.cpp_extension.load( + name='custom_device_relu', + sources=['custom_relu_op.cc'], + extra_include_paths=paddle_includes, # add for Coverage CI + extra_cxx_cflags=["-w", "-g"], # test for cc flags + # build_directory=self.cur_dir, + verbose=True, + ) + self.custom_op = custom_module.custom_relu + + self.dtypes = ["float32", "float64"] + self.device = "custom_cpu" + + # config seed + SEED = 2021 + paddle.seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + + def tearDown(self): + self.temp_dir.cleanup() + del os.environ['CUSTOM_DEVICE_ROOT'] + + def test_custom_device(self): + self._test_static() + self._test_static_pe() + self._test_dynamic() + self._test_double_grad_dynamic() + self._test_with_dataloader() + + def _test_static(self): + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + out = custom_relu_static(self.custom_op, self.device, dtype, x) + pd_out = custom_relu_static( + self.custom_op, self.device, dtype, x, False + ) + np.testing.assert_array_equal( + out, + pd_out, + err_msg="custom op out: {},\n paddle api out: {}".format( + out, pd_out + ), + ) + + def _test_static_pe(self): + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + out = custom_relu_static_pe(self.custom_op, self.device, dtype, x) + pd_out = custom_relu_static_pe( + self.custom_op, self.device, dtype, x, False + ) + np.testing.assert_array_equal( + out, + pd_out, + err_msg="custom op out: {},\n paddle api out: {}".format( + out, pd_out + ), + ) + + def _test_dynamic(self): + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + out, x_grad = custom_relu_dynamic( + self.custom_op, self.device, dtype, x + ) + pd_out, pd_x_grad = custom_relu_dynamic( + self.custom_op, self.device, dtype, x, False + ) + np.testing.assert_array_equal( + out, + pd_out, + err_msg="custom op out: {},\n paddle api out: {}".format( + out, pd_out + ), + ) + np.testing.assert_array_equal( + x_grad, + pd_x_grad, + err_msg="custom op x grad: {},\n paddle api x grad: {}".format( + x_grad, pd_x_grad + ), + ) + + def _test_double_grad_dynamic(self): + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + out, dx_grad = custom_relu_double_grad_dynamic( + self.custom_op, self.device, dtype, x + ) + pd_out, pd_dx_grad = custom_relu_double_grad_dynamic( + self.custom_op, self.device, dtype, x, False + ) + np.testing.assert_array_equal( + out, + pd_out, + err_msg="custom op out: {},\n paddle api out: {}".format( + out, pd_out + ), 
+ ) + np.testing.assert_array_equal( + dx_grad, + pd_dx_grad, + err_msg="custom op dx grad: {},\n paddle api dx grad: {}".format( + dx_grad, pd_dx_grad + ), + ) + + def _test_with_dataloader(self): + import paddle + from paddle.vision.transforms import Compose, Normalize + + paddle.set_device(self.device) + # data loader + transform = Compose( + [Normalize(mean=[127.5], std=[127.5], data_format="CHW")] + ) + train_dataset = paddle.vision.datasets.MNIST( + mode="train", transform=transform + ) + train_loader = paddle.io.DataLoader( + train_dataset, + batch_size=64, + shuffle=True, + drop_last=True, + num_workers=0, + ) + + for batch_id, (image, _) in enumerate(train_loader()): + out = self.custom_op(image) + pd_out = paddle.nn.functional.relu(image) + np.testing.assert_array_equal( + out, + pd_out, + err_msg="custom op out: {},\n paddle api out: {}".format( + out, pd_out + ), + ) + + if batch_id == 5: + break + + +if __name__ == "__main__": + if os.name == 'nt' or sys.platform.startswith('darwin'): + # only support Linux now + exit() + unittest.main() diff --git a/python/paddle/fluid/tests/test_error_clip.py b/python/paddle/fluid/tests/test_error_clip.py index dcfe477a76b3e2..65483d1c6adf68 100644 --- a/python/paddle/fluid/tests/test_error_clip.py +++ b/python/paddle/fluid/tests/test_error_clip.py @@ -38,13 +38,13 @@ prog_clip = prog.clone() prog_clip.block(0).var(hidden1.name)._set_error_clip( - fluid.clip.ErrorClipByValue(max=CLIP_MAX, min=CLIP_MIN) + paddle.nn.clip.ErrorClipByValue(max=CLIP_MAX, min=CLIP_MIN) ) avg_cost_clip = prog_clip.block(0).var(avg_cost.name) fluid.backward.append_backward(loss=avg_cost) fluid.backward.append_backward( - loss=avg_cost_clip, callbacks=[fluid.clip.error_clip_callback] + loss=avg_cost_clip, callbacks=[paddle.nn.clip.error_clip_callback] ) hidden1_grad = prog.block(0).var(hidden1.name + "@GRAD") diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 90394ce24d011c..249387a0781d1f 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -128,5 +128,6 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_cluster_partition MODULES test_cluster_partition) py_test_modules(test_convert_to_process_meshes MODULES test_convert_to_process_meshes) + py_test_modules(test_pass_bf16 MODULES test_pass_bf16) py_test_modules(test_dist_saver MODULES test_dist_saver) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_bf16.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_bf16.py new file mode 100644 index 00000000000000..f26908df2cf029 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_bf16.py @@ -0,0 +1,211 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
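# A minimal sketch of the bfloat16 dtype that the bf16 pass exercised below
# inserts casts for (assumes bfloat16 casting is available in this build):
import paddle

x = paddle.rand([2, 3], dtype='float32')
x_bf16 = paddle.cast(x, 'bfloat16')        # same cast op the pass inserts
print(x_bf16.dtype)                        # paddle.bfloat16
print(paddle.cast(x_bf16, 'float32'))      # values rounded to bf16 precision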
+ +import random +import unittest + +import numpy as np + +import paddle +import paddle.fluid.core as core +import paddle.nn as nn +from paddle.distributed.fleet import auto +from paddle.fluid.contrib.mixed_precision.bf16.amp_utils import _valid_types +from paddle.fluid.contrib.mixed_precision.fp16_utils import find_true_prev_op +from paddle.fluid.dygraph.parallel import ParallelEnv +from paddle.static import InputSpec +from paddle.vision.datasets import MNIST + +paddle.enable_static() + + +def apply_pass(use_bf16=False): + strategy = auto.Strategy() + strategy.auto_mode = "semi" + strategy.reinit = True + if use_bf16: + amp = strategy.amp + amp.enable = True + amp.enable_bf16 = True + return strategy + + +class MnistDataset(MNIST): + def __init__(self, mode, return_label=True): + super().__init__(mode=mode) + self.return_label = return_label + + def __getitem__(self, idx): + img = np.reshape(self.images[idx], [1, 28, 28]) + if self.return_label: + return img, np.array(self.labels[idx]).astype('int64') + return (img,) + + def __len__(self): + return len(self.images) + + +def reset_prog(): + paddle.fluid.framework.switch_main_program(paddle.static.Program()) + paddle.fluid.framework.switch_startup_program(paddle.static.Program()) + + +class Model(nn.Layer): + def __init__(self): + super().__init__() + self.flatten = nn.Flatten() + self.fc1 = nn.Linear(784, 120) + self.relu1 = nn.ReLU() + self.fc2 = nn.Linear(120, 10) + + def forward(self, input): + input.stop_gradient = True + x = self.flatten(input) + x = self.relu1(self.fc1(x)) + x = self.fc2(x) + return x + + +class TestBF16Pass(unittest.TestCase): + def setUp(self): + self.rtol = 1e-5 + self.atol = 1e-8 + self.batch_size = 256 + self.batch_num = 10 + self.dataset = MnistDataset("train") + self.eval_dataset = MnistDataset("test") + + def init(self, engine): + paddle.seed(2021) + np.random.seed(2021) + random.seed(2021) + place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) + engine._executor = paddle.static.Executor(place) + + def get_engine(self, use_bf16=False): + reset_prog() + + strategy = apply_pass(use_bf16) + model = Model() + opt = paddle.optimizer.SGD(0.001, parameters=model.parameters()) + loss = nn.CrossEntropyLoss() + engine = auto.Engine(model, loss, opt, strategy=strategy) + self.init(engine) + return engine + + def check_program(self, program): + bf16_op_list = { + "matmul_v2", + "elementwise_add", + "relu", + "elementwise_add_grad", + "matmul_v2_grad", + "relu_grad", + } + + fp32_op_list = { + "flatten_contiguous_range", + "reduce_mean", + "softmax_with_cross_entropy", + "fill_constant", + "reduce_mean_grad", + "softmax_with_cross_entropy_grad", + } + + for block in program.blocks: + for op in block.ops: + if op not in bf16_op_list and op not in fp32_op_list: + continue + + for in_name in op.input_names: + for in_var_name in op.input(in_name): + var = None + try: + var = block.var(in_var_name) + except ValueError as e: + var = block._var_recursive(in_var_name) + if var is None or var.type not in _valid_types: + break + + if op.type in bf16_op_list: + assert var.dtype == core.VarDesc.VarType.BF16 + if "cast_bf16" in in_var_name: + if "@GRAD" in in_var_name: + tmp_in_var_name = in_var_name[ + : in_var_name.find("@GRAD") + ] + else: + tmp_in_var_name = in_var_name + prev_op = find_true_prev_op( + block.ops, op, tmp_in_var_name + ) + assert prev_op is not None + assert prev_op.type == "cast" + for in_name in prev_op.input_names: + for in_var_name in prev_op.input(in_name): + var = block.var(in_var_name) + assert ( + var.dtype 
+ == core.VarDesc.VarType.FP32 + ) + + elif op.type in fp32_op_list: + if ( + op.type == "softmax_with_cross_entropy" + or op.type == "softmax_with_cross_entropy_grad" + ) and in_var_name == "label0": + continue + assert var.dtype == core.VarDesc.VarType.FP32 + if "cast_fp32" in in_var_name: + prev_op = find_true_prev_op( + block.ops, op, tmp_in_var_name + ) + assert prev_op is not None + assert prev_op.type == "cast" + for in_name in prev_op.input_names: + for in_var_name in prev_op.input(in_name): + var = block.var(in_var_name) + assert ( + var.dtype + == core.VarDesc.VarType.BF16 + ) + + for out_name in op.output_names: + for out_var_name in op.output(out_name): + var = None + try: + var = block.var(out_var_name) + except ValueError as e: + var = block._var_recursive(out_var_name) + + if var is None or var.type not in _valid_types: + break + if op.type in bf16_op_list: + assert var.dtype == core.VarDesc.VarType.BF16 + elif op.type in fp32_op_list: + assert var.dtype == core.VarDesc.VarType.FP32 + + def test_bf16_pass(self): + bf16_o1_engine = self.get_engine(True) + inputs_spec = [InputSpec([None, 1, 28, 28], 'float32', 'input0')] + labels_spec = [InputSpec([None, 1], 'int64', 'label0')] + bf16_o1_engine.prepare( + inputs_spec=inputs_spec, labels_spec=labels_spec, mode="train" + ) + self.check_program(bf16_o1_engine._dist_main_progs["train"][0]) + print("BF16!check program successfully!") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_strategy.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_strategy.py index 529d1d5f6255d6..0b41e323ffd7a7 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_strategy.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_strategy.py @@ -41,6 +41,13 @@ def test_default_config(self): self.assertEqual(amp.use_fp16_guard, True) self.assertEqual(amp.use_optimizer_fp16, False) + self.assertEqual(amp.enable_bf16, False) + self.assertEqual(amp.custom_bf16_list, []) + self.assertEqual(amp.custom_fp32_list, []) + self.assertEqual(amp.custom_fp32_varnames, []) + self.assertEqual(amp.use_pure_bf16, False) + self.assertEqual(amp.use_bf16_guard, False) + sharding = strategy.sharding self.assertEqual(sharding.enable, False) self.assertEqual(sharding.stage, 1) diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api.py index 3c6b1ad84eca04..72e28c21f250c1 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api.py @@ -60,7 +60,7 @@ def run_trainer(self, args): ) assert ( args['static_mode'] == 1 - ), "collective_allgather_api only support static mode" + ), "collective_allgather_api only support static graph mode" result = self.get_model( train_prog, startup_prog, rank, dtype=args["dtype"] ) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py index ef85aab80f6c95..0c4ec9418d71cd 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py @@ -254,7 +254,7 @@ def train_mlp_static(args, model, loss, opt_state=None, save_model=False): model.fit(dataset, epochs=1) model.save(os.path.join(args.output_dir, 
"static_save")) paddle.device.cuda.synchronize() - print("=============== predict in static mode =================") + print("=============== predict in static graph mode =================") out = model.predict(dataset, verbose=1000) if save_model: diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_qat.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_qat.py index 3514e0aedef76a..9db26a3f497df5 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_qat.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_qat.py @@ -23,7 +23,7 @@ import paddle.fluid as fluid import paddle.nn as nn from paddle.distributed.utils.launch_utils import find_free_ports, get_cluster -from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware +from paddle.quantization import ImperativeQuantAware def set_random_seed(seed, dp_id, rank_id): diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py index f974709ce87abe..f97faed1d584fc 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py @@ -122,7 +122,7 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None): opt = paddle.optimizer.AdamW( learning_rate=lr_val, - grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0), + grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0), ) acc_steps = 2 # accumulated steps for pipeline diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py index ecc71abe6252cd..170243fc962839 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py @@ -122,7 +122,7 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None): opt = fluid.optimizer.Momentum( learning_rate=lr_val, momentum=0.9, - grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0), + grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0), ) acc_steps = 2 # accumulated steps for pipeline diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py index c7b44fe305d25a..0d499393f12155 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py @@ -15,10 +15,10 @@ import unittest import paddle -import paddle.fluid.clip as clip import paddle.fluid.framework as framework import paddle.fluid.optimizer as optimizer import paddle.fluid.regularizer as regularizer +import paddle.nn.clip as clip paddle.enable_static() @@ -76,7 +76,7 @@ def check_dgc_momentum_optimizer( rampup_begin_step=0, num_trainers=2, regularization=regularization, - grad_clip=clip.GradientClipByNorm(1.0), + grad_clip=clip.ClipGradByNorm(1.0), ) if use_recompute: @@ -144,14 +144,14 @@ def check_dgc_momentum_optimizer( print("dgc regular_coeff=" + str(coeff)) def test_tpyeError(self): - # the type of DGCMomentumOptimizer(grad_clip=) must be 'GradientClipByNorm' + # the type of DGCMomentumOptimizer(grad_clip=) must be 'ClipGradByNorm' with self.assertRaises(TypeError): dgc_momentum_optimizer = 
self.MockDGCMomentum( learning_rate=0.01, momentum=0.2, rampup_begin_step=0, num_trainers=2, - grad_clip=clip.GradientClipByGlobalNorm(1.0), + grad_clip=clip.ClipGradByGlobalNorm(1.0), ) def test_momentum_without_dgc(self): diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_hybrid_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_hybrid_meta_optimizer.py index eee1235670805f..0982ab86117c9f 100755 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_hybrid_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_hybrid_meta_optimizer.py @@ -354,7 +354,7 @@ def test_opt_sharding_with_pp_amp_gclip(self): } strategy.fuse_all_reduce_ops = True strategy.fuse_grad_size_in_MB = 32 - clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0) + clip = paddle.nn.ClipGradByGlobalNorm(1.0) self.optimizer( avg_cost, strategy, train_prog, startup_prog, grad_clip=clip @@ -552,7 +552,7 @@ def test_opt_sharding_with_pp_amp_gclip_fuse_gm(self): strategy.fuse_all_reduce_ops = True strategy.fuse_grad_size_in_MB = 32 strategy.fuse_grad_merge = True - clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0) + clip = paddle.nn.ClipGradByGlobalNorm(1.0) self.optimizer( avg_cost, strategy, train_prog, startup_prog, grad_clip=clip @@ -940,7 +940,7 @@ def test_opt_sharding_with_pp_amp_gclip_boundary(self): } strategy.fuse_all_reduce_ops = True strategy.fuse_grad_size_in_MB = 32 - clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0) + clip = paddle.nn.ClipGradByGlobalNorm(1.0) self.optimizer( avg_cost, strategy, train_prog, startup_prog, grad_clip=clip @@ -1044,7 +1044,7 @@ def test_opt_sharding_with_pp_amp_gclip_boundary_card1(self): } strategy.fuse_all_reduce_ops = True strategy.fuse_grad_size_in_MB = 32 - clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0) + clip = paddle.nn.ClipGradByGlobalNorm(1.0) self.optimizer( avg_cost, strategy, train_prog, startup_prog, grad_clip=clip diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py index d59c074c03f11d..46b5fe9ed4b6a6 100755 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py @@ -640,7 +640,7 @@ def test_sharding_gradient_clip(self): ) avg_cost, strategy = self.net(train_prog, startup_prog) self.set_strategy(strategy, 'sharding') - clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) self.optimizer( avg_cost, strategy, train_prog, startup_prog, grad_clip=clip ) @@ -1309,7 +1309,7 @@ def test_hybrid_with_mp_pp_amp_gclip(self): "micro_batch_size": 2, "accumulate_steps": 4, } - clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) self.optimizer( avg_cost, strategy, train_prog, startup_prog, grad_clip=clip ) @@ -1547,7 +1547,7 @@ def test_hybrid_with_mp_pp_amp_gclip_for_optimizer(self): "micro_batch_size": 2, "accumulate_steps": 4, } - clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) self.optimizer( avg_cost, strategy, diff --git a/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py b/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py index 
130510d90cb045..3be3cfecf16d6e 100644 --- a/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py +++ b/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py @@ -488,6 +488,9 @@ def test_create_process_group_nccl(self): task.wait() print("test reduce prod api ok") + + test_reduce_with_zero_dim([], self.dtype, pg) + # test Scatter # rank 0 in_shape = list(self.shape) @@ -601,5 +604,88 @@ def config(self): self.shape = (4, 20, 20) +def test_reduce_with_zero_dim(shape, dtype, pg): + # test Reduce With Zero Dim + # rank 0 + x = np.random.random(shape).astype(dtype) + y = np.random.random(shape).astype(dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + sum_result = tensor_x + tensor_y + if pg.rank() == 0: + task = dist.reduce(tensor_x, 0, sync_op=True) + paddle.device.cuda.synchronize() + # rank 1 + else: + task = dist.reduce(tensor_y, 0, sync_op=False) + task.wait() + paddle.device.cuda.synchronize() + if pg.rank() == 0: + assert np.array_equal(tensor_x, sum_result) and len(tensor_x.shape) == 0 + print("test reduce with zero dim sum api ok\n") + + # test reduce with zero dim max + # rank 0 + x = np.random.random(shape).astype(dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(shape).astype(dtype) + tensor_y = paddle.to_tensor(y) + + max_result = paddle.maximum(tensor_x, tensor_y) + + if pg.rank() == 0: + task = dist.reduce(tensor_x, 0, dist.ReduceOp.MAX, sync_op=False) + task.wait() + assert np.array_equal(tensor_x, max_result) and len(tensor_x.shape) == 0 + else: + task = dist.reduce(tensor_y, 0, dist.ReduceOp.MAX, sync_op=False) + task.wait() + + print("test reduce with zero dim max api ok") + + # test reduce with zero dim min + # rank 0 + x = np.random.random(shape).astype(dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(shape).astype(dtype) + tensor_y = paddle.to_tensor(y) + + min_result = paddle.minimum(tensor_x, tensor_y) + + if pg.rank() == 0: + task = dist.reduce(tensor_x, 0, dist.ReduceOp.MIN, sync_op=False) + task.wait() + assert np.array_equal(tensor_x, min_result) and len(tensor_x.shape) == 0 + else: + task = dist.reduce(tensor_y, 0, dist.ReduceOp.MIN, sync_op=False) + task.wait() + + print("test reduce with zero dim min api ok") + + # test reduce with zero dim product + # rank 0 + x = np.random.random(shape).astype(dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(shape).astype(dtype) + tensor_y = paddle.to_tensor(y) + + prod_result = np.multiply(x, y) + + if pg.rank() == 0: + task = dist.reduce(tensor_x, 0, dist.ReduceOp.PROD, sync_op=False) + task.wait() + assert ( + np.array_equal(tensor_x, prod_result) and len(tensor_x.shape) == 0 + ) + else: + task = dist.reduce(tensor_y, 0, dist.ReduceOp.PROD, sync_op=False) + task.wait() + + print("test reduce with zero dim prod api ok") + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py index ff9122b1191b64..3fa9c12529272c 100644 --- a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py @@ -22,8 +22,8 @@ import paddle.distributed.fleet as fleet import paddle.fluid.core as core from paddle.distributed.fleet.meta_optimizers.common import CollectiveHelper -from paddle.fluid.clip import ClipGradBase, _clip_by_global_norm_using_mp_type from paddle.incubate import 
DistributedFusedLamb +from paddle.nn.clip import ClipGradBase, _clip_by_global_norm_using_mp_type from paddle.vision.models import resnet18 as resnet diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_laplace_static.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_laplace_static.py index 6ec508f92077a5..748c869323382e 100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_laplace_static.py +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_laplace_static.py @@ -273,7 +273,7 @@ def _np_kl(self): """ -# Note: Zero dimension of a Tensor is not supported by static mode of paddle; +# Note: Zero dimension of a Tensor is not supported by static graph mode of paddle; # therefore, ks test below cannot be conducted temporarily. @parameterize.place(config.DEVICES) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py index 743fff189cb9f8..ad08aa1e3ccae5 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py @@ -68,7 +68,7 @@ class A: def add(a, b): """ dygraph mode, return a numpy object. - static mode, return a variable object. + static graph mode, return a variable object. """ return paddle.to_tensor(a.numpy() + b.numpy()) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py index de3508afcbe2bc..218e3ed4326ad5 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py @@ -19,6 +19,7 @@ import paddle import paddle.fluid as fluid from paddle.jit.dy2static import Call +from paddle.nn import clip SEED = 2020 np.random.seed(SEED) @@ -89,11 +90,11 @@ def len_with_selected_rows(place): type=fluid.core.VarDesc.VarType.SELECTED_ROWS, ) # y is Variable(SelectedRows) - y = fluid.layers.merge_selected_rows(var) + y = clip.merge_selected_rows(var) y_len = Call(len)(y) # z is inner tensor with shape [4, 2] - z = fluid.layers.get_tensor_from_selected_rows(y) + z = clip.get_tensor_from_selected_rows(y) z_len = Call(len)(z) # set data for selected_rows diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py index fabfa8edc3c83a..5c84da8e621be9 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py @@ -441,5 +441,39 @@ def _init_dyfunc(self): self.dyfunc = for_loop_dyfunc_not_support +class Net(paddle.nn.Layer): + def __init__(self): + super().__init__() + + self.layer_dict = paddle.nn.LayerDict( + { + "conv1": paddle.nn.Conv2D(3, 3, 1), + "conv2": paddle.nn.Conv2D(3, 3, 1), + "conv3": paddle.nn.Conv2D(3, 3, 1), + } + ) + + def forward(self, x): + out = 0 + for layer_name in self.layer_dict: + out += self.layer_dict[layer_name](x) + return out + + +class TestForLoopMeetDict(unittest.TestCase): + def test_start(self): + + net = Net() + model = paddle.jit.to_static( + net, + input_spec=[ + paddle.static.InputSpec( + shape=[None, 3, 224, 224], dtype='float32' + ) + ], + ) + paddle.jit.save(model, "./inference/inference") + + if __name__ == '__main__': unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py index 42c54129342218..4e5ac62949757b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py @@ -259,7 +259,7 @@ def check_jit_save_load(self, model, inputs, input_spec, to_static, gt_out): input_spec=input_spec, output_spec=[gt_out], ) - # load in static mode + # load in static graph mode static_infer_out = self.jit_load_and_run_inference_static( model_save_dir, model_filename, params_filename, inputs ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py index 04cc9ce7ebe531..bee41124080f48 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py @@ -237,7 +237,7 @@ def __del__(self): def train(self, to_static, build_strategy=None): """ - Tests model decorated by `dygraph_to_static_output` in static mode. For users, the model is defined in dygraph mode and trained in static mode. + Tests model decorated by `dygraph_to_static_output` in static graph mode. For users, the model is defined in dygraph mode and trained in static graph mode. """ with fluid.dygraph.guard(place): np.random.seed(SEED) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_amp.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_amp.py index 59f5ec7435a2f3..22c045dc05c503 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_amp.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_amp.py @@ -37,7 +37,7 @@ def train(to_static, build_strategy=None): """ - Tests model decorated by `dygraph_to_static_output` in static mode. For users, the model is defined in dygraph mode and trained in static mode. + Tests model decorated by `dygraph_to_static_output` in static graph mode. For users, the model is defined in dygraph mode and trained in static graph mode. """ with fluid.dygraph.guard(place): np.random.seed(SEED) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py index 50739957c17390..6cbde55c974fb8 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py @@ -34,7 +34,7 @@ def train(to_static, build_strategy=None): """ - Tests model decorated by `dygraph_to_static_output` in static mode. For users, the model is defined in dygraph mode and trained in static mode. + Tests model decorated by `dygraph_to_static_output` in static graph mode. For users, the model is defined in dygraph mode and trained in static graph mode. """ np.random.seed(SEED) paddle.seed(SEED) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py index 134274a9ed4a6b..87ed10ef460d76 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py @@ -243,7 +243,7 @@ def tearDown(self): def do_train(self, to_static): """ - Tests model decorated by `dygraph_to_static_output` in static mode. 
For users, the model is defined in dygraph mode and trained in static mode. + Tests model decorated by `dygraph_to_static_output` in static graph mode. For users, the model is defined in dygraph mode and trained in static graph mode. """ paddle.disable_static(place) np.random.seed(SEED) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py index a0a45ddbde2bea..64d0d816ba0a5b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py @@ -22,8 +22,8 @@ from seq2seq_utils import Seq2SeqModelHyperParams, get_data_iter import paddle.fluid as fluid -from paddle.fluid.clip import GradientClipByGlobalNorm from paddle.jit import ProgramTranslator +from paddle.nn import ClipGradByGlobalNorm place = ( fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace() @@ -71,7 +71,7 @@ def train(args, attn_model=False): dropout=args.dropout, ) - gloabl_norm_clip = GradientClipByGlobalNorm(args.max_grad_norm) + gloabl_norm_clip = ClipGradByGlobalNorm(args.max_grad_norm) optimizer = fluid.optimizer.SGD( args.learning_rate, parameter_list=model.parameters(), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/program_config.py b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py index f3dd18a9a026b4..992f7810a123e0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/program_config.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py @@ -20,10 +20,6 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.fluid.contrib.slim.quantization import ( - QuantizationFreezePass, - QuantizationTransformPass, -) from paddle.fluid.executor import global_scope from paddle.fluid.framework import ( IrGraph, @@ -32,6 +28,10 @@ convert_np_dtype_to_dtype_, ) from paddle.fluid.initializer import NumpyArrayInitializer +from paddle.static.quantization import ( + QuantizationFreezePass, + QuantizationTransformPass, +) class TensorConfig: diff --git a/python/paddle/fluid/tests/unittests/ir/inference/quant_dequant_test.py b/python/paddle/fluid/tests/unittests/ir/inference/quant_dequant_test.py index bb50410ff01cb0..a73af6b49422a7 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/quant_dequant_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/quant_dequant_test.py @@ -21,16 +21,16 @@ import paddle import paddle.fluid as fluid from paddle.fluid import Program, Variable, core -from paddle.fluid.contrib.slim.quantization import ( +from paddle.fluid.core import AnalysisConfig, create_paddle_predictor +from paddle.fluid.framework import IrGraph +from paddle.fluid.io import append_fetch_ops, prepend_feed_ops +from paddle.static.quantization import ( AddQuantDequantPass, OutScaleForInferencePass, OutScaleForTrainingPass, QuantizationFreezePass, QuantizationTransformPass, ) -from paddle.fluid.core import AnalysisConfig, create_paddle_predictor -from paddle.fluid.framework import IrGraph -from paddle.fluid.io import append_fetch_ops, prepend_feed_ops class QuantDequantTest(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py index b1890ea95ab978..24a63751cfec43 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py +++ 
b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py @@ -127,7 +127,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_Clip( ): def set_params(self): self.operand = paddle.add - self.act = fluid.layers.clip + self.act = paddle.clip self.act_alpha = 0.0 self.act_beta = 10.0 @@ -219,7 +219,7 @@ class ElementwiseActivationMkldnnFusePassTest_Sub_Clip( ): def set_params(self): self.operand = paddle.subtract - self.act = fluid.layers.clip + self.act = paddle.clip self.act_alpha = 0.0 self.act_beta = 10.0 @@ -319,7 +319,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_Clip( ): def set_params(self): self.operand = paddle.multiply - self.act = fluid.layers.clip + self.act = paddle.clip self.act_alpha = 0.0 self.act_beta = 10.0 diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py index d696fa44f5aaa0..5c1a11625611ba 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py @@ -37,74 +37,55 @@ def sample_program_config(self, draw): input_dim = draw(st.sampled_from([1, 32, 64])) def generate_input(attrs, type): - if attrs[1]['transpose_X'] and attrs[1]['transpose_Y']: - shape_x = [ - attrs[2]['batch_size'], - attrs[2]['channel'], - attrs[2]['input_dim'], - 32, - ] - shape_y = [ - attrs[2]['batch_size'], - attrs[2]['channel'], - 64, - attrs[2]['input_dim'], - ] - elif attrs[1]['transpose_X']: - shape_x = [ - attrs[2]['batch_size'], - attrs[2]['channel'], - attrs[2]['input_dim'], - 32, - ] - shape_y = [ - attrs[2]['batch_size'], - attrs[2]['channel'], - attrs[2]['input_dim'], - 64, - ] - elif attrs[1]['transpose_Y']: - shape_x = [ - attrs[2]['batch_size'], - attrs[2]['channel'], - 32, - attrs[2]['input_dim'], - ] - shape_y = [ - attrs[2]['batch_size'], - attrs[2]['channel'], - 8, - attrs[2]['input_dim'], - ] + is_transpose_X = attrs[1]['transpose_X'] + is_transpose_Y = attrs[1]['transpose_Y'] + + if is_transpose_X: + shape_x_3 = attrs[2]['input_dim'] + shape_x_4 = 32 else: - shape_x = [ - attrs[2]['batch_size'], - attrs[2]['channel'], - 32, - attrs[2]['input_dim'], - ] - shape_y = [ - attrs[2]['batch_size'], - attrs[2]['channel'], - attrs[2]['input_dim'], - 16, - ] - - if type == "x": - return np.random.random(shape_x).astype(np.float32) + shape_x_3 = 32 + shape_x_4 = attrs[2]['input_dim'] + + if is_transpose_X and is_transpose_Y: + shape_y_3 = 64 + shape_y_4 = attrs[2]['input_dim'] + elif is_transpose_X: + shape_y_3 = attrs[2]['input_dim'] + shape_y_4 = 64 + elif is_transpose_Y: + shape_y_3 = 8 + shape_y_4 = attrs[2]['input_dim'] else: - return np.random.random(shape_y).astype(np.float32) + shape_y_3 = attrs[2]['input_dim'] + shape_y_4 = 16 + + shape_x = [ + attrs[2]['batch_size'], + attrs[2]['channel'], + shape_x_3, + shape_x_4, + ] + shape_y = [ + attrs[2]['batch_size'], + attrs[2]['channel'], + shape_y_3, + shape_y_4, + ] + + shape = shape_x if type == 'x' else shape_y + return np.random.random(shape).astype(np.float32) attrs = [ { - "scale": scale, - "bias": bias, - "bias_after_scale": bias_after_scale, + 'scale': scale, + 'bias': bias, + 'bias_after_scale': bias_after_scale, }, { - "transpose_X": transpose_X, - "transpose_Y": transpose_Y, - "alpha": alpha, + 'transpose_X': transpose_X, + 'transpose_Y': transpose_Y, + 'alpha': alpha, }, { 'batch_size': batch_size, @@ -115,29 +96,29 @@ def 
generate_input(attrs, type): ops_config = [ { - "op_type": "scale", - "op_inputs": {"X": ["input_data1"]}, - "op_outputs": {"Out": ["scale_output"]}, - "op_attrs": { - "scale": attrs[0]['scale'], - "bias": attrs[0]['bias'], - "bias_after_scale": attrs[0]['bias_after_scale'], + 'op_type': 'scale', + 'op_inputs': {'X': ['input_data1']}, + 'op_outputs': {'Out': ['scale_output']}, + 'op_attrs': { + 'scale': attrs[0]['scale'], + 'bias': attrs[0]['bias'], + 'bias_after_scale': attrs[0]['bias_after_scale'], }, }, { - "op_type": "matmul", - "op_inputs": {"X": ["scale_output"], "Y": ["input_data2"]}, - "op_outputs": {"Out": ["matmul_output"]}, - "op_attrs": { + 'op_type': 'matmul', + 'op_inputs': {'X': ['scale_output'], 'Y': ['input_data2']}, + 'op_outputs': {'Out': ['matmul_output']}, + 'op_attrs': { 'transpose_X': attrs[1]['transpose_X'], 'transpose_Y': attrs[1]['transpose_Y'], 'alpha': attrs[1]['alpha'], - "fused_reshape_X": [], - "fused_reshape_Y": [], - "fused_transpose_X": [], - "fused_transpose_Y": [], - "fused_reshape_Out": [], - "fused_transpose_Out": [], + 'fused_reshape_X': [], + 'fused_reshape_Y': [], + 'fused_transpose_X': [], + 'fused_transpose_Y': [], + 'fused_reshape_Out': [], + 'fused_transpose_Out': [], }, }, ] @@ -148,25 +129,27 @@ def generate_input(attrs, type): ops=ops, weights={}, inputs={ - "input_data1": TensorConfig( - data_gen=partial(generate_input, attrs, "x") + 'input_data1': TensorConfig( + data_gen=partial(generate_input, attrs, 'x') ), - "input_data2": TensorConfig( - data_gen=partial(generate_input, attrs, "y") + 'input_data2': TensorConfig( + data_gen=partial(generate_input, attrs, 'y') ), }, - outputs=["matmul_output"], + outputs=['matmul_output'], ) return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config( + use_mkldnn=True, passes=['scale_matmul_fuse_pass'] + ) yield config, ['matmul'], (1e-5, 1e-5) def test(self): - self.run_and_statis(quant=False, passes=["scale_matmul_fuse_pass"]) + self.run_and_statis(quant=False, passes=['scale_matmul_fuse_pass']) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py index 3a39c84141ced2..0c205fbee7c870 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py @@ -106,7 +106,7 @@ def append_act(self, x): class TensorRTSubgraphPassClipTest(TensorRTSubgraphPassActivationTest): def append_act(self, x): - return fluid.layers.clip(x, 0, 1) + return paddle.clip(x, 0, 1) class TensorRTSubgraphPassTanhTest(TensorRTSubgraphPassActivationTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_equal.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_equal.py index 35780b491cc763..ce46c79cbbd3dd 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_equal.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_equal.py @@ -39,45 +39,46 @@ def sample_program_configs(self): def generate_input(shape): return np.random.random(shape).astype(np.float32) - for batch in [1, 2, 4]: - for shape in [[batch, 1], [batch, 1, 32], [batch, 1, 16, 32]]: - for axis in [-1 if len(shape) == 1 else 1]: - self.dims = len(shape) - dics = [{"axis": axis}, 
{"in_dtype": 0, "out_dtype": 5}] - ops_config = [ - { - "op_type": "equal", - "op_inputs": { - "X": ["input_data1"], - "Y": ["input_data2"], + for op_type in ["equal", "not_equal"]: + for batch in [1, 2, 4]: + for shape in [[batch, 1], [batch, 1, 32], [batch, 1, 16, 32]]: + for axis in [-1 if len(shape) == 1 else 1]: + self.dims = len(shape) + dics = [{"axis": axis}, {"in_dtype": 0, "out_dtype": 5}] + ops_config = [ + { + "op_type": op_type, + "op_inputs": { + "X": ["input_data1"], + "Y": ["input_data2"], + }, + "op_outputs": {"Out": ["compare_output_data"]}, + "op_attrs": dics[0], }, - "op_outputs": {"Out": ["compare_output_data"]}, - "op_attrs": dics[0], - }, - { - "op_type": "cast", - "op_inputs": {"X": ["compare_output_data"]}, - "op_outputs": {"Out": ["output_data"]}, - "op_attrs": dics[1], - }, - ] - ops = self.generate_op_config(ops_config) - - program_config = ProgramConfig( - ops=ops, - weights={}, - inputs={ - "input_data1": TensorConfig( - data_gen=partial(generate_input, shape) - ), - "input_data2": TensorConfig( - data_gen=partial(generate_input, shape) - ), - }, - outputs=["output_data"], - ) - - yield program_config + { + "op_type": "cast", + "op_inputs": {"X": ["compare_output_data"]}, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[1], + }, + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data1": TensorConfig( + data_gen=partial(generate_input, shape) + ), + "input_data2": TensorConfig( + data_gen=partial(generate_input, shape) + ), + }, + outputs=["output_data"], + ) + + yield program_config def sample_predictor_configs( self, program_config diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py b/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py index 4cfc7a1faaa4f9..5ccfa2340f9265 100644 --- a/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py +++ b/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py @@ -18,9 +18,9 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers from paddle.fluid import core -from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass from paddle.fluid.framework import IrGraph, Program, program_guard from paddle.fluid.tests.unittests.op_test import OpTestTool +from paddle.static.quantization import QuantizationTransformPass paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_onnx_format_quantization_mobilenetv1.py b/python/paddle/fluid/tests/unittests/mkldnn/test_onnx_format_quantization_mobilenetv1.py index 7c7ab360107171..86bdc767d7cb48 100755 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_onnx_format_quantization_mobilenetv1.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_onnx_format_quantization_mobilenetv1.py @@ -24,7 +24,7 @@ import paddle import paddle.fluid as fluid from paddle.dataset.common import download -from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization +from paddle.static.quantization import PostTrainingQuantization paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py index 8b13546d9a2852..122429a7f8454c 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py @@ -117,13 +117,13 @@ def test_errors(self): input_data = np.random.random((2, 
4)).astype("float32") def test_Variable(): - fluid.layers.clip(x=input_data, min=-1.0, max=1.0) + paddle.clip(x=input_data, min=-1.0, max=1.0) self.assertRaises(TypeError, test_Variable) def test_dtype(): x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32') - fluid.layers.clip(x=x2, min=-1.0, max=1.0) + paddle.clip(x=x2, min=-1.0, max=1.0) self.assertRaises(TypeError, test_dtype) paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py index 899a3fdf03cd7e..c45d58d598c2c1 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py @@ -58,14 +58,14 @@ def build_model(self): def check_output(self): places = [fluid.NPUPlace(0)] for place in places: - # TODO: RunProgramOp is not recommended for use in static mode now + # TODO: RunProgramOp is not recommended for use in static graph mode now self.expect_outs = self.run_static_model(place, is_test=True) self.check_output_with_place(place) def check_grad(self): places = [fluid.NPUPlace(0)] for place in places: - # TODO: RunProgramOp is not recommended for use in static mode now + # TODO: RunProgramOp is not recommended for use in static graph mode now self.expect_grads = self.run_static_model(place, is_test=False) self.check_grad_with_place(place) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 9728edf5d1c04f..cf1e78630097b7 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -38,8 +38,8 @@ _dygraph_tracer, _enable_legacy_dygraph, _in_eager_without_dygraph_check, - _in_legacy_dygraph, _test_eager_guard, + in_dygraph_mode, ) from paddle.fluid.op import Operator from paddle.jit.dy2static.utils import parse_arg_and_kwargs @@ -716,7 +716,7 @@ def create_var(np_value, name, is_input, if_return_inputs_grad_dict): if if_return_inputs_grad_dict: v.stop_gradient = False - if not _in_legacy_dygraph(): + if hasattr(v, "retain_grads"): v.retain_grads() if has_lod: @@ -2515,7 +2515,7 @@ def _get_dygraph_grad( for no_grad_val in no_grad_set: del inputs[no_grad_val] - if not _in_legacy_dygraph(): + if in_dygraph_mode(): core.eager.run_backward( fluid.layers.utils.flatten(outputs), grad_outputs, False ) diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 56057513b6b984..d0e6c98e25a422 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -686,7 +686,7 @@ def test_adam_with_grad_clip(self): value = np.arange(26).reshape(2, 13).astype("float32") a = fluid.dygraph.to_variable(value) linear = paddle.nn.Linear(13, 5) - clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) adam = paddle.optimizer.Adam( 0.1, parameters=linear.parameters(), grad_clip=clip ) @@ -1212,7 +1212,7 @@ def _check_with_place_amp(self, place, use_amp): np.testing.assert_allclose( params_dygraph1[idx], params_dygraph2[idx], rtol=1e-05 ) - # test static mode + # test static graph mode output_static1 = self._adam_optimize_static( place=place, use_amp=use_amp, use_multi_tensor=True ) diff --git a/python/paddle/fluid/tests/unittests/test_cholesky_solve_op.py b/python/paddle/fluid/tests/unittests/test_cholesky_solve_op.py index 49c50e2280c717..a3c7cf8f1505f8 
100644 --- a/python/paddle/fluid/tests/unittests/test_cholesky_solve_op.py +++ b/python/paddle/fluid/tests/unittests/test_cholesky_solve_op.py @@ -192,7 +192,7 @@ def check_static_result(self, place): ) np.testing.assert_allclose(fetches[0], z_np, rtol=1e-05) - # test in static mode + # test in static graph mode def test_static(self): for place in self.place: self.check_static_result(place=place) diff --git a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py index a9d79f81bf310b..ce3dd7509ce1d8 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py @@ -20,12 +20,13 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +from paddle.nn import clip class TestClipByNormOp(OpTest): def setUp(self): self.max_relative_error = 0.006 - self.python_api = fluid.layers.clip_by_norm + self.python_api = clip.clip_by_norm self.init_dtype() self.initTestCase() input = np.random.random(self.shape).astype(self.dtype) diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py index a25edccb97a4ed..359220a7a601f1 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_op.py @@ -128,15 +128,9 @@ def test_errors(self): input_data = np.random.random((2, 4)).astype("float32") def test_Variable(): - fluid.layers.clip(x=input_data, min=-1.0, max=1.0) + paddle.clip(x=input_data, min=-1.0, max=1.0) self.assertRaises(TypeError, test_Variable) - - def test_dtype(): - x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32') - fluid.layers.clip(x=x2, min=-1.0, max=1.0) - - self.assertRaises(TypeError, test_dtype) paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/test_digamma_op.py b/python/paddle/fluid/tests/unittests/test_digamma_op.py index 10f0a420ed3b0d..e420e74c66697f 100644 --- a/python/paddle/fluid/tests/unittests/test_digamma_op.py +++ b/python/paddle/fluid/tests/unittests/test_digamma_op.py @@ -101,7 +101,7 @@ def test_name_argument(self): self.assertTrue("digamma_res" in out.name) def test_dtype_error(self): - # in static mode + # in static graph mode with self.assertRaises(TypeError): with static.program_guard(static.Program()): x = static.data(name="x", shape=self._shape, dtype="int32") diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index d5ad18fc434cbe..c6bdd59d496634 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -584,7 +584,7 @@ def net_conf(self): def filter(param): return param.name == "fc_w" - clip = fluid.clip.GradientClipByValue(0.1, need_clip=filter) + clip = paddle.nn.ClipGradByValue(0.1, need_clip=filter) sgd_optimizer.minimize(avg_cost, grad_clip=clip) def transpiler_test_impl(self): diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_mode_of_unittest.py b/python/paddle/fluid/tests/unittests/test_dygraph_mode_of_unittest.py index 82eb7256b7cef2..c6e926c51a1f46 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_mode_of_unittest.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_mode_of_unittest.py @@ -21,7 +21,7 @@ class TestDygraphModeOfUnittest(unittest.TestCase): def test_dygraph_mode(self): self.assertTrue( paddle.in_dynamic_mode(), - 'Default Mode of Unittest 
should be dygraph mode, but get static mode.', + 'Default Mode of Unittest should be dygraph mode, but get static graph mode.', ) diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py index ea1401d3fbe97f..80bc977f091bac 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py @@ -504,8 +504,8 @@ def _prepare_program(self, config, parallel=True): self.feed_order, ) = res_vars - fluid.clip.set_gradient_clip( - clip=fluid.clip.GradientClipByGlobalNorm( + paddle.nn.clip.set_gradient_clip( + clip=paddle.nn.ClipGradByGlobalNorm( clip_norm=config.max_grad_norm ) ) @@ -644,7 +644,7 @@ def compare_padding_static_mode( self, parallel=True, use_program_cache=True ): ''' - Test that train ppl of padding mode is same to that of static mode + Test that train ppl of padding mode is same to that of static graph mode ''' config = RNNConfig('test', 'padding') with fluid.scope_guard(fluid.Scope()): @@ -658,7 +658,7 @@ def compare_padding_static_mode( class EagerDeletionPaddingRNNTest(PaddingRNNTestBase): def test_padding_mode_no_eager_deletion(self): ''' - Test that train ppl of padding mode is same to that of static mode without eager deletion + Test that train ppl of padding mode is same to that of static graph mode without eager deletion ''' fluid.core._set_eager_deletion_mode(-1.0, 1.0, True) # When parallel is True, use_program_cache does not make a difference. @@ -666,7 +666,7 @@ def test_padding_mode_no_eager_deletion(self): def test_padding_mode_eager_deletion(self): ''' - Test that train ppl of padding mode is same to that of static mode under eager deletion + Test that train ppl of padding mode is same to that of static graph mode under eager deletion ''' fluid.core._set_eager_deletion_mode(0.0, 1.0, True) # When parallel is True, use_program_cache does not make a difference. 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor.py b/python/paddle/fluid/tests/unittests/test_fleet_executor.py index e81fc34ea2ca0f..400009f820de3c 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_executor.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_executor.py @@ -64,7 +64,7 @@ def run_fleet_executor(self, place, x_data, y_data): ) opt = paddle.optimizer.AdamW( learning_rate=lr_val, - grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0), + grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0), ) opt.minimize(loss) # TODO: section_program will be removed in the future diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py b/python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py index 0de28e9839efa3..d24348b7d77b58 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py @@ -64,7 +64,7 @@ def run_fleet_executor(self, place, x_data, y_data): ) opt = paddle.optimizer.AdamW( learning_rate=lr_val, - grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0), + grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0), ) opt.minimize(loss) # TODO: section_program will be removed in the future diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py b/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py index 36a85e2d74fc7a..46eb0dc6f0bf84 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py @@ -47,7 +47,7 @@ def run_fleet_executor(self, place, x_data, y_data): ) opt = paddle.optimizer.AdamW( learning_rate=lr_val, - grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0), + grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0), ) opt.minimize(loss) # TODO: section_program will be removed in the future diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py index a1e144da146a57..1d4e079f9f84ad 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py @@ -70,5 +70,17 @@ def setUp(self): self.data_format = "NCL" +class TestFunctionalConv1DErrorCase2(TestFunctionalConv1DError): + def setUp(self): + self.input = np.random.randn(1, 3, 3) + self.filter = np.random.randn(3) + self.bias = None + self.padding = 0 + self.stride = 1 + self.dilation = 1 + self.groups = 1 + self.data_format = "NCL" + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py index ab5b9096dcc8ad..d1e3e6df335b00 100644 --- a/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py +++ b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py @@ -20,6 +20,7 @@ import paddle.fluid.core as core from paddle.fluid import Program, program_guard from paddle.fluid.op import Operator +from paddle.nn import clip class TestGetTensorFromSelectedRowsError(unittest.TestCase): @@ -31,12 +32,12 @@ def test_errors(self): x_data = np.random.random((2, 4)).astype("float32") def test_Variable(): - fluid.layers.get_tensor_from_selected_rows(x=x_data) + 
clip.get_tensor_from_selected_rows(x=x_data) self.assertRaises(TypeError, test_Variable) def test_SELECTED_ROWS(): - fluid.layers.get_tensor_from_selected_rows(x=x_var) + clip.get_tensor_from_selected_rows(x=x_var) self.assertRaises(TypeError, test_SELECTED_ROWS) diff --git a/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py b/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py index db34123d3bdd8f..4cb4b5d773b48d 100644 --- a/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py +++ b/python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py @@ -17,12 +17,8 @@ import numpy as np import paddle.fluid as fluid -from paddle.fluid.clip import ( - GradientClipByGlobalNorm, - GradientClipByNorm, - GradientClipByValue, -) from paddle.fluid.dygraph.base import to_variable +from paddle.nn import ClipGradByGlobalNorm, ClipGradByNorm, ClipGradByValue class TestGradClipByGlobalNorm(unittest.TestCase): @@ -67,7 +63,7 @@ def get_numpy_global_norm_result(self): def get_dygrap_global_norm_result(self): with fluid.dygraph.guard(): - gloabl_norm_clip = GradientClipByGlobalNorm(self.max_global_norm) + gloabl_norm_clip = ClipGradByGlobalNorm(self.max_global_norm) p_g_var = [] for p, g in self.para_and_grad: new_p = to_variable(p) @@ -142,7 +138,7 @@ def get_numpy_norm_result(self): def get_dygrap_norm_result(self): with fluid.dygraph.guard(): - norm_clip = GradientClipByNorm(self.max_norm) + norm_clip = ClipGradByNorm(self.max_norm) p_g_var = [] for p, g in self.para_and_grad: new_p = to_variable(p) @@ -212,9 +208,7 @@ def get_numpy_clip_result(self): def get_dygrap_clip_result(self): with fluid.dygraph.guard(): - value_clip = GradientClipByValue( - max=self.max_value, min=self.min_value - ) + value_clip = ClipGradByValue(max=self.max_value, min=self.min_value) p_g_var = [] for p, g in self.para_and_grad: new_p = to_variable(p) diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index 2243ae8c45602a..b5b0b20c6f48bc 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -20,7 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.fluid.clip import _allow_pure_fp16_global_norm_clip +from paddle.nn.clip import _allow_pure_fp16_global_norm_clip paddle.enable_static() @@ -173,9 +173,9 @@ def check_clip_result(self, out, out_clip): # test whether the output is right when use 'set_gradient_clip' def test_old_gradient_clip(self): def func(params_grads): - clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm) - fluid.clip.set_gradient_clip(clip) - return fluid.clip.append_gradient_clip_ops(params_grads) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm) + paddle.nn.clip.set_gradient_clip(clip) + return paddle.nn.clip.append_gradient_clip_ops(params_grads) self.clip_gradient = func self.check_gradient_clip(fluid.CPUPlace()) @@ -183,7 +183,7 @@ def func(params_grads): # test whether the output is right when use grad_clip def test_new_gradient_clip(self): def func(params_grads): - clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm) return clip(params_grads) self.clip_gradient = func @@ -192,7 +192,7 @@ def func(params_grads): # test whether the output is right when use grad_clip under float64 def test_new_gradient_clip_fp64(self): def func(params_grads): - clip = 
fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm) return clip(params_grads) self.clip_gradient = func @@ -201,15 +201,15 @@ def func(params_grads): # invoke 'set_gradient_clip' in a wrong order def test_wrong_API_order(self): def backward_func(cost): - clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0) - fluid.clip.set_gradient_clip(clip) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=5.0) + paddle.nn.clip.set_gradient_clip(clip) sgd_optimizer = fluid.optimizer.SGD( learning_rate=0.01, grad_clip=clip ) # if 'set_gradient_clip' and 'optimize(grad_clip)' together, 'set_gradient_clip' will be ineffective sgd_optimizer.minimize(cost) # 'set_gradient_clip' must before 'minimize', otherwise, 'set_gradient_clip' will be ineffective - fluid.clip.set_gradient_clip(clip) + paddle.nn.clip.set_gradient_clip(clip) self.backward_and_optimize = backward_func for place in self.get_places(): @@ -269,7 +269,7 @@ def _test_none_grad_helper(self, dtype): with fluid.program_guard( main_program=prog, startup_program=startup_program ): - clip = fluid.clip.GradientClipByGlobalNorm(self.clip_norm) + clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm) x = ( fluid.default_main_program() .global_block() @@ -313,7 +313,7 @@ def check_clip_result(self, out, out_clip): # test whether the output is right when use grad_clip def test_gradient_clip(self): def func(params_grads): - clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm) + clip = paddle.nn.ClipGradByNorm(clip_norm=self.clip_norm) return clip(params_grads) self.clip_gradient = func @@ -321,7 +321,7 @@ def func(params_grads): # if grad is None or not need clip def test_none_grad(self): - clip = fluid.clip.GradientClipByNorm(self.clip_norm) + clip = paddle.nn.ClipGradByNorm(self.clip_norm) x = ( fluid.default_main_program() .global_block() @@ -371,7 +371,7 @@ def check_clip_result(self, out, out_clip): # test whether the output is right when use grad_clip def test_gradient_clip(self): def func(params_grads): - clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min) + clip = paddle.nn.ClipGradByValue(max=self.max, min=self.min) return clip(params_grads) self.clip_gradient = func @@ -379,7 +379,7 @@ def func(params_grads): # if grad is None or not need clip def test_none_grad(self): - clip = fluid.clip.GradientClipByValue(self.max, self.min) + clip = paddle.nn.ClipGradByValue(self.max, self.min) x = ( fluid.default_main_program() .global_block() @@ -419,7 +419,7 @@ def test_gradient_clip(self): sgd_optimizer = fluid.optimizer.SGD( learning_rate=0.0, parameter_list=linear.parameters(), - grad_clip=fluid.clip.GradientClipByGlobalNorm(0.1), + grad_clip=paddle.nn.ClipGradByGlobalNorm(0.1), ) self.check_clip_result(loss, sgd_optimizer) @@ -430,12 +430,8 @@ def check_clip_result(self, loss, optimizer): class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip): def setUp(self): self.clip_norm = 0.8 - self.clip1 = fluid.clip.GradientClipByGlobalNorm( - clip_norm=self.clip_norm - ) - self.clip2 = fluid.clip.GradientClipByGlobalNorm( - clip_norm=self.clip_norm - ) + self.clip1 = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm) + self.clip2 = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm) def check_clip_result(self, loss, optimizer): # if grad is None @@ -476,7 +472,7 @@ def check_clip_result(self, loss, optimizer): class TestDygraphGradientClipByNorm(TestDygraphGradientClip): def setUp(self): self.clip_norm = 0.8 - self.clip = 
fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm) + self.clip = paddle.nn.ClipGradByNorm(clip_norm=self.clip_norm) def check_clip_result(self, loss, optimizer): # if grad is None @@ -506,7 +502,7 @@ class TestDygraphGradientClipByValue(TestDygraphGradientClip): def setUp(self): self.max = 0.2 self.min = 0.1 - self.clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min) + self.clip = paddle.nn.ClipGradByValue(max=self.max, min=self.min) def check_clip_result(self, loss, optimizer): # if grad is None @@ -572,7 +568,7 @@ def test_gradient_clip(self): params_grads.append((param, param._grad_ivar())) _, grads = zip(*params_grads) # clip grads - clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=0.8) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=0.8) params_grads = clip(params_grads) _, grads_clip = zip(*params_grads) # param update @@ -616,7 +612,7 @@ def test_gradient_clip(self): params_grads.append((param, param._grad_ivar())) _, grads = zip(*params_grads) # clip grads - clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=0.1) + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=0.1) params_grads = clip(params_grads) _, grads_clip = zip(*params_grads) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py index ecb35e8eaf950c..54cba6eb800295 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py @@ -361,7 +361,7 @@ def test_auto_prune_with_optimizer(self): place = fluid.CPUPlace() with fluid.dygraph.guard(place): model = MyLayer(size, vocab_size, size) - grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001) + grad_clip = paddle.nn.ClipGradByGlobalNorm(0.001) optimizer = fluid.optimizer.AdamOptimizer( 0.001, parameter_list=model.parameters(), grad_clip=grad_clip ) @@ -380,7 +380,7 @@ def test_auto_prune_with_optimizer(self): with fluid.dygraph.guard(place): model = MyLayer2(size, vocab_size, size) - grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001) + grad_clip = paddle.nn.ClipGradByGlobalNorm(0.001) optimizer = fluid.optimizer.AdamOptimizer( 0.001, parameter_list=model.parameters(), grad_clip=grad_clip ) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py index a6b378275a6163..0e1974474d86ca 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py @@ -28,7 +28,7 @@ class TestDygraphLoadStatic(unittest.TestCase): def testLoadStaticModel(self): - # static mode + # static graph mode temp_dir = tempfile.TemporaryDirectory() a = fluid.data(name="a", shape=[10, 10]) conv_in = fluid.data(name="conv_in", shape=[None, 10, 10, 10]) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py index cea97398d17159..5cc7f63eb7883b 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py @@ -52,7 +52,7 @@ def test_selectedrows_gradient1(self): fluid.set_flags( {'FLAGS_sort_sum_gradient': sort_sum_gradient} ) - # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0) + # grad_clip = paddle.nn.ClipGradByGlobalNorm(5.0) input_word = np.array([[1, 2], [2, 1]]).astype('int64') input = 
paddle.to_tensor(input_word) @@ -91,7 +91,7 @@ def test_selectedrows_gradient2(self): fluid.set_flags( {'FLAGS_sort_sum_gradient': sort_sum_gradient} ) - grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0) + grad_clip = paddle.nn.ClipGradByGlobalNorm(5.0) input_word = np.array([[1, 2], [2, 1]]).astype('int64') input = to_variable(input_word) diff --git a/python/paddle/fluid/tests/unittests/test_inplace_auto_generated_apis.py b/python/paddle/fluid/tests/unittests/test_inplace_auto_generated_apis.py index bbf06f74c24d0f..75bcee3e49a8a6 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace_auto_generated_apis.py +++ b/python/paddle/fluid/tests/unittests/test_inplace_auto_generated_apis.py @@ -20,7 +20,7 @@ from paddle.static import Program, program_guard -# In static mode, inplace strategy will not be used in Inplace APIs. +# In static graph mode, inplace strategy will not be used in Inplace APIs. class TestStaticAutoGeneratedAPI(unittest.TestCase): def setUp(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_linalg_cond.py b/python/paddle/fluid/tests/unittests/test_linalg_cond.py index 68b4287f2f6ec5..274e04d5031f71 100644 --- a/python/paddle/fluid/tests/unittests/test_linalg_cond.py +++ b/python/paddle/fluid/tests/unittests/test_linalg_cond.py @@ -84,7 +84,7 @@ def gen_empty_input(): class API_TestStaticCond(unittest.TestCase): def test_out(self): paddle.enable_static() - # test calling results of 'cond' in static mode + # test calling results of 'cond' in static graph mode x_list_n_n, x_list_m_n = gen_input() test_static_assert_true(self, x_list_n_n, p_list_n_n + p_list_m_n) test_static_assert_true(self, x_list_m_n, p_list_m_n) @@ -117,7 +117,7 @@ def test_dygraph_api_error(self): def test_static_api_error(self): paddle.enable_static() - # test raising errors when 'cond' is called in static mode + # test raising errors when 'cond' is called in static graph mode p_list_error = ('f ro', 'fre', 'NUC', -1.6, 0, 5) x_list_n_n, x_list_m_n = gen_input() for p in p_list_error: @@ -132,7 +132,7 @@ def test_static_api_error(self): x_data = static.data("X", shape=x.shape, dtype=x.dtype) self.assertRaises(ValueError, paddle.linalg.cond, x_data, p) - # it's not supported when input is an empty tensor in static mode + # it's not supported when input is an empty tensor in static graph mode def test_static_empty_input_error(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py index 16ba749c9b4ae9..f8c5751c8c2904 100644 --- a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py +++ b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py @@ -67,7 +67,7 @@ def setUp(self): self.epoch_num = 1 self.batch_size = 128 self.batch_num = 10 - # enable static mode + # enable static graph mode paddle.enable_static() def tearDown(self): diff --git a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py index 72efd20c6d1167..2d398e9f44b69e 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py @@ -87,7 +87,7 @@ def func(self, place): x = paddle.create_parameter(dtype=dtype, shape=shape, name='x') z = paddle.nn.functional.instance_norm(x) x_arr = np.random.uniform(-1, 1, shape).astype(dtype) - # check for static mode + # check for static graph mode 
gradient_checker.double_grad_check( [x], z, x_init=x_arr, atol=atol, place=place, eps=eps ) @@ -129,7 +129,7 @@ def func(self, place): x = paddle.create_parameter(dtype=dtype, shape=shape, name='x') z = paddle.nn.InstanceNorm2D(3)(x) x_arr = np.random.uniform(-1, 1, shape).astype(dtype) - # check for static mode + # check for static graph mode gradient_checker.double_grad_check( [x], z, x_init=x_arr, atol=atol, place=place, eps=eps ) diff --git a/python/paddle/fluid/tests/unittests/test_pad_op.py b/python/paddle/fluid/tests/unittests/test_pad_op.py index 04617274356702..735f62c646b16a 100644 --- a/python/paddle/fluid/tests/unittests/test_pad_op.py +++ b/python/paddle/fluid/tests/unittests/test_pad_op.py @@ -178,6 +178,22 @@ def call_func(self, x): return out +class TestPaddingValueTensor3(unittest.TestCase): + def test_static(self): + np_x = np.random.random((16, 16)).astype('float32') + main_prog = Program() + starup_prog = Program() + with program_guard(main_prog, starup_prog): + x = paddle.assign(np_x).astype('float32') + pad_value = paddle.assign([0.0]).astype('float64') + y = paddle.nn.functional.pad(x, [0, 1, 2, 3], value=pad_value) + + exe = paddle.static.Executor(paddle.CPUPlace()) + [pd_out] = exe.run(main_prog, fetch_list=[y]) + np_out = np.pad(np_x, [(0, 1), (2, 3)], constant_values=0.0) + np.testing.assert_allclose(pd_out, np_out) + + if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py index 193e1047642614..aee128bd99ed5e 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py @@ -374,7 +374,7 @@ def test_single_pickle_var_dygraph(self): np.testing.assert_array_equal(tensor.numpy(), np.array(lod_static)) def test_single_pickle_var_static(self): - # enable static mode + # enable static graph mode paddle.enable_static() with new_program_scope(): # create network @@ -547,7 +547,7 @@ def test_save_load_complex_object_dygraph_save(self): np.testing.assert_array_equal(load_array4[0], obj4[0]) - # static mode + # static graph mode paddle.enable_static() load_tensor1 = paddle.load(path1, return_numpy=False) @@ -1012,7 +1012,7 @@ def test_save_load(self): self.check_load_state_dict(layer_state_dict, load_layer_state_dict) self.check_load_state_dict(opt_state_dict, load_opt_state_dict) - # test save load in static mode + # test save load in static graph mode paddle.enable_static() static_save_path = os.path.join( self.temp_dir.name, diff --git a/python/paddle/fluid/tests/unittests/test_pow.py b/python/paddle/fluid/tests/unittests/test_pow.py index 77a5a8a7d25ba4..20e39ddf5eaac1 100755 --- a/python/paddle/fluid/tests/unittests/test_pow.py +++ b/python/paddle/fluid/tests/unittests/test_pow.py @@ -42,7 +42,7 @@ def _run_power(mode, x, y, device='cpu'): y_ = paddle.to_tensor(y) res = paddle.pow(x_, y_) return res.numpy() - # static mode + # static graph mode elif mode == STATIC: paddle.enable_static() # y is scalar diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py index c1ff35222d8ae6..e39648285daba7 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler.py +++ b/python/paddle/fluid/tests/unittests/test_profiler.py @@ -293,9 +293,6 @@ def test_flops(self): ) == 3 * 12 * 12 * 12 * 2 * 8 ) - self.assertTrue( - flops('relu', {'X': [[12, 12, 12]]}, {}) == 12 * 12 * 12 - ) 
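The blanket relu check removed here is replaced, a few hunks below, by per-activation assertions for elu, leaky_relu, prelu, relu6 and silu, each expecting flops() to equal the element count of X (12 * 12 = 144 for the [12, 12] input). A minimal sketch of that counting rule; _activation_flops is a hypothetical helper used only for illustration, not the profiler's real internals:

def _activation_flops(input_shapes):
    # One FLOP per input element, matching the 144 expected by the new asserts.
    x_shape = input_shapes['X'][0]   # e.g. [12, 12]
    numel = 1
    for d in x_shape:
        numel *= d
    return numel                     # 12 * 12 == 144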
self.assertTrue( flops('softmax', {'X': [[12, 12, 12]]}, {}) == 3 * 12 * 12 * 12 ) @@ -303,6 +300,56 @@ def test_flops(self): flops('c_embedding', {'Ids': [[12, 12]], 'W': [[12, 12, 3]]}, {}) == 0 ) + self.assertTrue( + flops( + 'elu', + { + 'X': [[12, 12]], + }, + {}, + ) + == 144 + ) + self.assertTrue( + flops( + 'leaky_relu', + { + 'X': [[12, 12]], + }, + {}, + ) + == 144 + ) + self.assertTrue( + flops( + 'prelu', + { + 'X': [[12, 12]], + }, + {}, + ) + == 144 + ) + self.assertTrue( + flops( + 'relu6', + { + 'X': [[12, 12]], + }, + {}, + ) + == 144 + ) + self.assertTrue( + flops( + 'silu', + { + 'X': [[12, 12]], + }, + {}, + ) + == 144 + ) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_quantile_and_nanquantile.py b/python/paddle/fluid/tests/unittests/test_quantile_and_nanquantile.py index 60e3ae08d36ec8..1f5aaa6b0952b9 100644 --- a/python/paddle/fluid/tests/unittests/test_quantile_and_nanquantile.py +++ b/python/paddle/fluid/tests/unittests/test_quantile_and_nanquantile.py @@ -219,7 +219,7 @@ def test_axis_value_error_3(): class TestQuantileRuntime(unittest.TestCase): """ This class is used to test the API could run correctly with - different devices, different data types, and dygraph/static mode. + different devices, different data types, and dygraph/static graph mode. """ def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_real_imag_op.py b/python/paddle/fluid/tests/unittests/test_real_imag_op.py index 6f186063df3163..4c20e736455af6 100644 --- a/python/paddle/fluid/tests/unittests/test_real_imag_op.py +++ b/python/paddle/fluid/tests/unittests/test_real_imag_op.py @@ -145,7 +145,7 @@ def test_name_argument(self): self.assertTrue("real_res" in out.name) def test_dtype_error(self): - # in static mode + # in static graph mode with self.assertRaises(TypeError): with static.program_guard(static.Program()): x = static.data(name="x", shape=self._shape, dtype="float32") diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index a31749d744aead..887ce9ff3f7411 100755 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -49,20 +49,20 @@ def test_check_grad(self): class TestReshapeOp_ZeroDim1(OpTest): def init_data(self): self.ori_shape = () - self.new_shape = 1 - self.infered_shape = 1 + self.new_shape = (1,) + self.infered_shape = (1,) class TestReshapeOp_ZeroDim2(OpTest): def init_data(self): self.ori_shape = () - self.new_shape = -1 - self.infered_shape = 1 + self.new_shape = (-1,) + self.infered_shape = (1,) class TestReshapeOp_ZeroDim3(OpTest): def init_data(self): - self.ori_shape = 1 + self.ori_shape = (1,) self.new_shape = () self.infered_shape = () diff --git a/python/paddle/fluid/tests/unittests/test_run_program_op.py b/python/paddle/fluid/tests/unittests/test_run_program_op.py index afa1fe2321944a..bf0b89ef1eb275 100644 --- a/python/paddle/fluid/tests/unittests/test_run_program_op.py +++ b/python/paddle/fluid/tests/unittests/test_run_program_op.py @@ -86,7 +86,7 @@ def check_output(self): if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for place in places: - # TODO: RunProgramOp is not recommended for use in static mode now + # TODO: RunProgramOp is not recommended for use in static graph mode now self.expect_outs = self.run_static_model(place, is_test=True) self.check_output_with_place(place) @@ -95,7 +95,7 @@ def check_grad(self): if core.is_compiled_with_cuda(): 
places.append(fluid.CUDAPlace(0)) for place in places: - # TODO: RunProgramOp is not recommended for use in static mode now + # TODO: RunProgramOp is not recommended for use in static graph mode now self.expect_grads = self.run_static_model(place, is_test=False) self.check_grad_with_place(place) @@ -437,7 +437,7 @@ def test_check_grad(self): if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for place in places: - # TODO: RunProgramOp is not recommended for use in static mode now + # TODO: RunProgramOp is not recommended for use in static graph mode now self.calc_dygraph_grad(place) def build_model(self): diff --git a/python/paddle/fluid/tests/unittests/test_set_value_op.py b/python/paddle/fluid/tests/unittests/test_set_value_op.py index 7ad35100592b94..ec05ae92f96b80 100644 --- a/python/paddle/fluid/tests/unittests/test_set_value_op.py +++ b/python/paddle/fluid/tests/unittests/test_set_value_op.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Test set_value op in static mode +# Test set_value op in static graph mode import unittest from functools import reduce diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py index 29fb869efee0ad..801866c9023d0b 100644 --- a/python/paddle/fluid/tests/unittests/test_sgd_op.py +++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py @@ -401,7 +401,7 @@ def test_main(self): rtol=1e-05, atol=0.1, ) - "Test static mode" + "Test static graph mode" output1_st = self.static_sgd_mp(mp=True) output2_st = self.static_sgd_mp(mp=False) for idx in range(len(output1_st)): @@ -511,7 +511,7 @@ def test_main(self): rtol=1e-05, atol=0.1, ) - "Test static mode" + "Test static graph mode" output1_st = self.static_sgd_mp(mp=True) output2_st = self.static_sgd_mp(mp=False) for idx in range(len(output1_st)): diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load.py b/python/paddle/fluid/tests/unittests/test_static_save_load.py index a2007e6c144eb4..852975b975e087 100644 --- a/python/paddle/fluid/tests/unittests/test_static_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_static_save_load.py @@ -1759,7 +1759,7 @@ def test_ptb_rnn_cpu_float32(self): class TestStaticSaveLoadPickle(unittest.TestCase): def test_pickle_protocol(self): - # enable static mode + # enable static graph mode paddle.enable_static() with new_program_scope(): diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load_large.py b/python/paddle/fluid/tests/unittests/test_static_save_load_large.py index 0231c133845bbf..85c263e9791bf1 100644 --- a/python/paddle/fluid/tests/unittests/test_static_save_load_large.py +++ b/python/paddle/fluid/tests/unittests/test_static_save_load_large.py @@ -28,7 +28,7 @@ class TestStaticSaveLoadLargeParameters(unittest.TestCase): def test_large_parameters_static_save(self): - # enable static mode + # enable static graph mode paddle.enable_static() with new_program_scope(): # create network diff --git a/python/paddle/fluid/tests/unittests/test_tensordot.py b/python/paddle/fluid/tests/unittests/test_tensordot.py index dd8529e50eef8d..a8c4dbaed47305 100644 --- a/python/paddle/fluid/tests/unittests/test_tensordot.py +++ b/python/paddle/fluid/tests/unittests/test_tensordot.py @@ -282,7 +282,7 @@ def set_test_axes(self): ] def test_tensor_axes(self): - # The 'axes' with type 'Tensor' in tensordot is not available in static mode + # The 'axes' with type 'Tensor' in tensordot is not available in 
static graph mode paddle.disable_static() tensor_axes = [ paddle.to_tensor([1]), diff --git a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py index 9fb6017d446d07..8523fb44b982fe 100644 --- a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py @@ -454,6 +454,15 @@ def setUp(self): paddle.disable_static() self.x = paddle.rand([]) + def test_flip(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.flip(x, axis=[]) + out.backward() + self.assertEqual(x.shape, []) + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + def test_linear(self): x = paddle.randn([3, 2]) w = paddle.full(shape=[2, 4], fill_value=0.5) @@ -712,12 +721,158 @@ def test_scatter_nd(self): self.assertEqual(out.numpy()[3], 2) self.assertEqual(out.grad.shape, [5]) + def test_scale(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.scale(x, scale=2.0, bias=1.0) + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, []) + + def test_floor_divide(self): + # 1-d // 0-d + x = paddle.to_tensor([1, -2, 3], dtype="int64") + y = paddle.full([], 2, dtype='int64') + out1_1 = paddle.floor_divide(x, y) + out1_2 = paddle.Tensor.__floordiv__(x, y) + + np.testing.assert_array_equal(out1_1.numpy(), out1_2.numpy()) + np.testing.assert_array_equal(out1_1.numpy(), np.asarray([0, -1, 1])) + + # 0-d // 1-d + out2_1 = paddle.floor_divide(y, x) + out2_2 = paddle.Tensor.__floordiv__(y, x) + + np.testing.assert_array_equal(out2_1.numpy(), out2_2.numpy()) + np.testing.assert_array_equal(out2_2.numpy(), np.asarray([2, -1, 0])) + + # 0-d // 0-d + x = paddle.full([], 3, dtype='int64') + out3_1 = paddle.floor_divide(x, y) + out3_2 = paddle.Tensor.__floordiv__(x, y) + + np.testing.assert_array_equal(out3_1.numpy(), out3_2.numpy()) + np.testing.assert_array_equal(out3_2.numpy(), np.asarray(1)) + + def test_reshape_list(self): + x = paddle.rand([]) + x.stop_gradient = False + + out = paddle.reshape(x, []) + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + + out = paddle.reshape(x, [1]) + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, [1]) + self.assertEqual(out.grad.shape, [1]) + + out = paddle.reshape(x, [-1]) + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, [1]) + self.assertEqual(out.grad.shape, [1]) + + out = paddle.reshape(x, [-1, 1]) + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, [1, 1]) + self.assertEqual(out.grad.shape, [1, 1]) + + def test_reshape_tensor(self): + x = paddle.rand([1, 1]) + x.stop_gradient = False + + out = paddle.reshape(x, []) + out.backward() + self.assertEqual(x.grad.shape, [1, 1]) + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + + new_shape = paddle.full([1], 1, "int32") + out = paddle.reshape(x, new_shape) + out.backward() + self.assertEqual(x.grad.shape, [1, 1]) + self.assertEqual(out.shape, [1]) + self.assertEqual(out.grad.shape, [1]) + + new_shape = paddle.full([1], -1, "int32") + out = paddle.reshape(x, new_shape) + out.backward() + self.assertEqual(x.grad.shape, [1, 1]) + self.assertEqual(out.shape, [1]) + self.assertEqual(out.grad.shape, [1]) + + new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")] + out = paddle.reshape(x, 
new_shape) + out.backward() + self.assertEqual(x.grad.shape, [1, 1]) + self.assertEqual(out.shape, [1, 1]) + self.assertEqual(out.grad.shape, [1, 1]) + + def test_reshape__list(self): + x = paddle.rand([]) + out = paddle.reshape_(x, []) + self.assertEqual(out.shape, []) + + out = paddle.reshape_(x, [1]) + self.assertEqual(out.shape, [1]) + + out = paddle.reshape_(x, [-1]) + self.assertEqual(out.shape, [1]) + + out = paddle.reshape_(x, [-1, 1]) + self.assertEqual(out.shape, [1, 1]) + + def test_reshape__tensor(self): + x = paddle.rand([1, 1]) + out = paddle.reshape_(x, []) + self.assertEqual(out.shape, []) + + new_shape = paddle.full([1], 1, "int32") + out = paddle.reshape_(x, new_shape) + self.assertEqual(out.shape, [1]) + + new_shape = paddle.full([1], -1, "int32") + out = paddle.reshape_(x, new_shape) + self.assertEqual(out.shape, [1]) + + new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")] + out = paddle.reshape_(x, new_shape) + self.assertEqual(out.shape, [1, 1]) + + def test_reverse(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.reverse(x, axis=[]) + out.backward() + self.assertEqual(x.shape, []) + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + class TestSundryAPIStatic(unittest.TestCase): def setUp(self): paddle.enable_static() self.exe = paddle.static.Executor() + @prog_scope() + def test_flip(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.flip(x, axis=[]) + paddle.static.append_backward(out) + + program = paddle.static.default_main_program() + res1, res2 = self.exe.run(program, fetch_list=[x, out]) + self.assertEqual(res1.shape, ()) + self.assertEqual(res2.shape, ()) + @prog_scope() def test_pow_factor(self): x = paddle.rand([]) @@ -914,6 +1069,119 @@ def test_scatter_nd(self): self.assertEqual(res[0].shape, (5,)) self.assertEqual(res[0][3], 2) + @prog_scope() + def test_scale(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.scale(x, scale=2.0, bias=1.0) + paddle.static.append_backward(out) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out]) + self.assertEqual(res[0].shape, ()) + + @prog_scope() + def test_floor_divide(self): + # 1-d // 0-d + x = paddle.to_tensor([1, -2, 3], dtype="int64") + y = paddle.full([], 2, dtype='int64') + out1_1 = paddle.floor_divide(x, y) + out1_2 = x // y + + # 0-d // 1-d + out2_1 = paddle.floor_divide(y, x) + out2_2 = y // x + + # 0-d // 0-d + x = paddle.full([], 3, dtype='int64') + out3_1 = paddle.floor_divide(x, y) + out3_2 = x // y + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, fetch_list=[out1_1, out1_2, out2_1, out2_2, out3_1, out3_2] + ) + out1_1, out1_2, out2_1, out2_2, out3_1, out3_2 = res + + np.testing.assert_array_equal(out1_1, out1_2) + np.testing.assert_array_equal(out1_1, np.asarray([0, -1, 1])) + np.testing.assert_array_equal(out2_1, out2_2) + np.testing.assert_array_equal(out2_2, np.asarray([2, -1, 0])) + np.testing.assert_array_equal(out3_1, out3_2) + np.testing.assert_array_equal(out3_2, np.asarray(1)) + + @prog_scope() + def test_reshape_list(self): + x1 = paddle.rand([]) + x2 = paddle.rand([]) + x3 = paddle.rand([]) + x4 = paddle.rand([]) + x1.stop_gradient = False + x2.stop_gradient = False + x3.stop_gradient = False + x4.stop_gradient = False + + out1 = paddle.reshape(x1, []) + paddle.static.append_backward(out1) + + out2 = paddle.reshape(x2, [1]) + paddle.static.append_backward(out2) + + out3 = paddle.reshape(x3, [-1]) + 
paddle.static.append_backward(out3) + + out4 = paddle.reshape(x4, [-1, 1]) + paddle.static.append_backward(out4) + + program = paddle.static.default_main_program() + res1, res2, res3, res4 = self.exe.run( + program, fetch_list=[out1, out2, out3, out4] + ) + self.assertEqual(res1.shape, ()) + self.assertEqual(res2.shape, (1,)) + self.assertEqual(res3.shape, (1,)) + self.assertEqual(res4.shape, (1, 1)) + + @prog_scope() + def test_reshape_tensor(self): + x1 = paddle.rand([]) + x2 = paddle.rand([]) + x3 = paddle.rand([]) + x1.stop_gradient = False + x2.stop_gradient = False + x3.stop_gradient = False + + new_shape = paddle.full([1], 1, "int32") + out1 = paddle.reshape(x1, new_shape) + paddle.static.append_backward(out1) + + new_shape = paddle.full([1], -1, "int32") + out2 = paddle.reshape(x2, new_shape) + paddle.static.append_backward(out2) + + new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")] + out3 = paddle.reshape(x3, new_shape) + paddle.static.append_backward(out3) + + program = paddle.static.default_main_program() + res1, res2, res3 = self.exe.run(program, fetch_list=[out1, out2, out3]) + self.assertEqual(res1.shape, (1,)) + self.assertEqual(res2.shape, (1,)) + self.assertEqual(res3.shape, (1, 1)) + + @prog_scope() + def test_reverse(self): + x = paddle.rand([]) + x.stop_gradient = False + + out = paddle.reverse(x, axis=[]) + paddle.static.append_backward(out) + + program = paddle.static.default_main_program() + res1, res2 = self.exe.run(program, fetch_list=[x, out]) + self.assertEqual(res1.shape, ()) + self.assertEqual(res2.shape, ()) + # Use to test API whose zero-dim input tensors don't have grad and not need to test backward in OpTest. class TestNoBackwardAPI(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py index 9efb334ac7dd5e..e4982c42e4e100 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py @@ -131,13 +131,13 @@ def test_errors(self): input_data = np.random.random((2, 4)).astype("float32") def test_Variable(): - fluid.layers.clip(x=input_data, min=-1.0, max=1.0) + paddle.clip(x=input_data, min=-1.0, max=1.0) self.assertRaises(TypeError, test_Variable) def test_dtype(): x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32') - fluid.layers.clip(x=x2, min=-1.0, max=1.0) + paddle.clip(x=x2, min=-1.0, max=1.0) self.assertRaises(TypeError, test_dtype) paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_set_value_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_set_value_op_xpu.py index b3e8c8b58f8f55..f09d2b8df9b4d1 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_set_value_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_set_value_op_xpu.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
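The zero-dim (0-D) tensor cases added above pin down the semantics these APIs must keep: reshaping a 0-D tensor to [] leaves it 0-D, while [1], [-1] and [-1, 1] produce single-element 1-D and 2-D tensors, and flip/reverse over an empty axis list is a shape-preserving no-op. A short dygraph sketch of exactly the behavior being asserted (public paddle APIs only):

import paddle

x = paddle.rand([])                        # 0-D (scalar) tensor
print(paddle.reshape(x, []).shape)         # []
print(paddle.reshape(x, [-1]).shape)       # [1]
print(paddle.reshape(x, [-1, 1]).shape)    # [1, 1]
print(paddle.flip(x, axis=[]).shape)       # [] -- flipping over no axes keeps the shape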
-# Test set_value op in static mode +# Test set_value op in static graph mode import sys import unittest diff --git a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py index 018ecc20e7dafe..8ceee04c206b10 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py @@ -521,6 +521,131 @@ def test_scatter__XD(self): for i in range(3): self.assertEqual(out.numpy()[1][i], updates.numpy()[i]) + def test_scale(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.scale(x, scale=2.0, bias=1.0) + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, []) + + def test_floor_divide(self): + # 1-d // 0-d + x = paddle.to_tensor([1, -2, 3], dtype="int64") + y = paddle.full([], 2, dtype='int64') + out1_1 = paddle.floor_divide(x, y) + out1_2 = paddle.Tensor.__floordiv__(x, y) + + np.testing.assert_array_equal(out1_1.numpy(), out1_2.numpy()) + np.testing.assert_array_equal(out1_1.numpy(), np.asarray([0, -1, 1])) + + # 0-d // 1-d + out2_1 = paddle.floor_divide(y, x) + out2_2 = paddle.Tensor.__floordiv__(y, x) + + np.testing.assert_array_equal(out2_1.numpy(), out2_2.numpy()) + np.testing.assert_array_equal(out2_2.numpy(), np.asarray([2, -1, 0])) + + # 0-d // 0-d + x = paddle.full([], 3, dtype='int64') + out3_1 = paddle.floor_divide(x, y) + out3_2 = paddle.Tensor.__floordiv__(x, y) + + np.testing.assert_array_equal(out3_1.numpy(), out3_2.numpy()) + np.testing.assert_array_equal(out3_2.numpy(), np.asarray(1)) + + def test_reshape_list(self): + x = paddle.rand([]) + x.stop_gradient = False + + out = paddle.reshape(x, []) + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + + out = paddle.reshape(x, [1]) + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, [1]) + self.assertEqual(out.grad.shape, [1]) + + out = paddle.reshape(x, [-1]) + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, [1]) + self.assertEqual(out.grad.shape, [1]) + + out = paddle.reshape(x, [-1, 1]) + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, [1, 1]) + self.assertEqual(out.grad.shape, [1, 1]) + + def test_reshape_tensor(self): + x = paddle.rand([1, 1]) + x.stop_gradient = False + + out = paddle.reshape(x, []) + out.backward() + self.assertEqual(x.grad.shape, [1, 1]) + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + + new_shape = paddle.full([], 1, "int32") + out = paddle.reshape(x, new_shape) + out.backward() + self.assertEqual(x.grad.shape, [1, 1]) + self.assertEqual(out.shape, [1]) + self.assertEqual(out.grad.shape, [1]) + + new_shape = paddle.full([], -1, "int32") + out = paddle.reshape(x, new_shape) + out.backward() + self.assertEqual(x.grad.shape, [1, 1]) + self.assertEqual(out.shape, [1]) + self.assertEqual(out.grad.shape, [1]) + + new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")] + out = paddle.reshape(x, new_shape) + out.backward() + self.assertEqual(x.grad.shape, [1, 1]) + self.assertEqual(out.shape, [1, 1]) + self.assertEqual(out.grad.shape, [1, 1]) + + def test_reshape__list(self): + x = paddle.rand([]) + out = paddle.reshape_(x, []) + self.assertEqual(out.shape, []) + + out = paddle.reshape_(x, [1]) + self.assertEqual(out.shape, [1]) + + out = 
paddle.reshape_(x, [-1]) + self.assertEqual(out.shape, [1]) + + out = paddle.reshape_(x, [-1, 1]) + self.assertEqual(out.shape, [1, 1]) + + def test_reshape__tensor(self): + x = paddle.rand([1, 1]) + out = paddle.reshape_(x, []) + self.assertEqual(out.shape, []) + + new_shape = paddle.full([1], 1, "int32") + out = paddle.reshape_(x, new_shape) + self.assertEqual(out.shape, [1]) + + new_shape = paddle.full([1], -1, "int32") + out = paddle.reshape_(x, new_shape) + self.assertEqual(out.shape, [1]) + + new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")] + out = paddle.reshape_(x, new_shape) + self.assertEqual(out.shape, [1, 1]) + # Use to test API whose zero-dim input tensors don't have grad and not need to test backward in OpTest. class TestNoBackwardAPI(unittest.TestCase): diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index 99d9cffed1fa33..986b8e93ae6828 100755 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -64,7 +64,6 @@ from ..fluid.layer_helper import LayerHelper # noqa: F401 from ..fluid.framework import in_dygraph_mode # noqa: F401 -from ..fluid.framework import _in_legacy_dygraph # noqa: F401 from ..fluid.framework import _global_flags # noqa: F401 from ..fluid.framework import _apply_pass # noqa: F401 from ..fluid.framework import switch_main_program diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index d1ebcc28f465e2..ccc5310853ba47 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -902,7 +902,7 @@ def load(path, **configs): directory, such as ``model`` and model is a directory. Note: - If you load ``state_dict`` from the saved result of static mode API such as + If you load ``state_dict`` from the saved result of static graph mode API such as ``paddle.static.save`` or ``paddle.static.save_inference_model`` , the structured variable name in dynamic mode will cannot be restored. You need to set the argument ``use_structured_name=False`` when using diff --git a/python/paddle/geometric/message_passing/utils.py b/python/paddle/geometric/message_passing/utils.py index 09a051feb93402..cd8e7e4acb1041 100644 --- a/python/paddle/geometric/message_passing/utils.py +++ b/python/paddle/geometric/message_passing/utils.py @@ -37,7 +37,7 @@ def convert_out_size_to_list(out_size): def get_out_size_tensor_inputs(inputs, attrs, out_size, op_type): """ Convert out_size(int, np.int32, np.int64, Variable) to inputs - and attrs in static mode. + and attrs in static graph mode. """ if out_size is None: attrs['out_size'] = [0] diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 14943cfc019daa..52a0f8b4b3c4f4 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -305,7 +305,7 @@ def train_batch(self, inputs, labels=None, update=True): self.mode = 'train' assert ( update is True - ), "Does not support `update == False` in static mode by now." + ), "Does not support `update == False` in static graph mode by now." return self._run(inputs, labels) def eval_batch(self, inputs, labels=None): @@ -1012,7 +1012,7 @@ class Model: must be required for static graph. When training on GPU, auto mixed precision (AMP O1) and pure float16 - (AMP O2) training are both supported in static mode and dynamic mode. + (AMP O2) training are both supported in static graph mode and dynamic mode. 
In static graph mode, before training with pure float16 (AMP O2), `multi_precision` could be set to True when creating optimizer, which can avoid poor accuracy or slow convergence in a way, and inputs of dtype float @@ -1535,7 +1535,7 @@ def _check_pure_fp16_configs(): assert isinstance( self._optimizer._grad_clip, (paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm), - ), "Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently." + ), "Only ClipGradByNorm and ClipGradByGlobalNorm are supported in amp training with level=O2 currently." self._adapter._amp_custom_lists = {} self._adapter._amp_configs = {} @@ -1605,7 +1605,7 @@ def _check_amp_configs(amp_config_key_set): if 'use_fp16_guard' in amp_config_key_set: if _non_static_mode(): raise ValueError( - "'use_fp16_guard' is supported in static mode only." + "'use_fp16_guard' is supported in static graph mode only." ) self._adapter._use_fp16_guard = amp_configs['use_fp16_guard'] amp_config_key_set.remove('use_fp16_guard') @@ -1643,7 +1643,7 @@ def prepare( 'incr_every_n_steps', 'decr_every_n_nan_or_inf', 'use_dynamic_loss_scaling', 'custom_white_list', 'custom_black_list', and 'custom_black_varnames'or - 'use_fp16_guard' is only supported in static mode. Mixed + 'use_fp16_guard' is only supported in static graph mode. Mixed precision API documentations :ref:`api_paddle_amp_auto_cast` and :ref:`api_paddle_amp_GradScaler` could be referenced for details. For convenience, 'amp_configs' could be set to diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py index 259262a106def8..d6234eacd15aff 100644 --- a/python/paddle/hapi/model_summary.py +++ b/python/paddle/hapi/model_summary.py @@ -180,7 +180,7 @@ def forward(self, inputs): if not paddle.in_dynamic_mode(): warnings.warn( - "Your model was created in static mode, this may not get correct summary information!" + "Your model was created in static graph mode, this may not get correct summary information!" ) in_train_mode = False else: diff --git a/python/paddle/incubate/autograd/primapi.py b/python/paddle/incubate/autograd/primapi.py index 18a06af5dca7fc..0cd68983800b82 100644 --- a/python/paddle/incubate/autograd/primapi.py +++ b/python/paddle/incubate/autograd/primapi.py @@ -23,7 +23,7 @@ def forward_grad(outputs, inputs, grad_inputs=None): """Forward mode of automatic differentiation. Note: - **ONLY available in the static mode and primitive operators.** + **ONLY available in the static graph mode and primitive operators.** Args: outputs(Tensor|Sequence[Tensor]): The output tensor or tensors. @@ -106,7 +106,7 @@ def grad(outputs, inputs, grad_outputs=None): """Reverse mode of automatic differentiation. Note: - **ONLY available in the static mode and primitive operators** + **ONLY available in the static graph mode and primitive operators** Args: outputs(Tensor|Sequence[Tensor]): The output Tensor or Tensors. diff --git a/python/paddle/incubate/autograd/primx.py b/python/paddle/incubate/autograd/primx.py index 601b486d354c79..08489068de0ae7 100644 --- a/python/paddle/incubate/autograd/primx.py +++ b/python/paddle/incubate/autograd/primx.py @@ -547,7 +547,7 @@ def expand_nested_list(xs): def orig2prim(block=None): """ Note: - **This API is ONLY available in the static mode.** + **This API is ONLY available in the static graph mode.** **Args block must be None or current block of main program.** All operators in the target block are processed as follows. 
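All of the primitive-operator utilities touched in this file and in utils.py below (orig2prim, prim2orig, prim_enabled, enable_prim, disable_prim) are documented as static-graph-only. A small sketch of toggling the switch, assuming these helpers are importable from paddle.incubate.autograd as their docstrings indicate:

import paddle
from paddle.incubate.autograd import disable_prim, enable_prim, prim_enabled

paddle.enable_static()      # the prim switches only take effect in static graph mode
enable_prim()
print(prim_enabled())       # True: eligible ops are lowered to primitive operators
disable_prim()
print(prim_enabled())       # False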
@@ -572,7 +572,7 @@ def orig2prim(block=None): def prim2orig(block=None, blacklist=None): """ Note: - **ONLY available in the static mode.** + **ONLY available in the static graph mode.** **Args block must be None or current block of main program.** All operators in the target block are processed as follows. diff --git a/python/paddle/incubate/autograd/utils.py b/python/paddle/incubate/autograd/utils.py index 5437401aecaab9..b5f93ebe9703cd 100644 --- a/python/paddle/incubate/autograd/utils.py +++ b/python/paddle/incubate/autograd/utils.py @@ -35,7 +35,7 @@ def set_status(self, flag): def prim_enabled(): """ Note: - **ONLY available in the static mode.** + **ONLY available in the static graph mode.** Shows whether the automatic differentiation mechanism based on automatic differential basic operators is ON. Defaults to OFF. @@ -66,7 +66,7 @@ def prim_enabled(): def enable_prim(): """ Note: - **ONLY available in the static mode.** + **ONLY available in the static graph mode.** Turns ON automatic differentiation mechanism based on automatic differential basic operators. @@ -90,7 +90,7 @@ def enable_prim(): def disable_prim(): """ Note: - **ONLY available in the static mode.** + **ONLY available in the static graph mode.** Turns OFF automatic differentiation mechanism based on automatic differential basic operators. diff --git a/python/paddle/incubate/distributed/models/moe/grad_clip.py b/python/paddle/incubate/distributed/models/moe/grad_clip.py index ca4922700b8f49..6bee79b871cd5e 100644 --- a/python/paddle/incubate/distributed/models/moe/grad_clip.py +++ b/python/paddle/incubate/distributed/models/moe/grad_clip.py @@ -15,13 +15,14 @@ import paddle import paddle.distributed as dist from paddle.fluid import core, layers -from paddle.fluid.clip import ClipGradBase, _squared_l2_norm from paddle.fluid.dygraph import base as imperative_base +from paddle.nn import clip +from paddle.nn.clip import ClipGradBase, _squared_l2_norm class ClipGradForMOEByGlobalNorm(ClipGradBase): r""" - The Algrithm is the same as paddle.fluid.clip.ClipGradByGlobalNorm + The Algrithm is the same as paddle.nn.ClipGradByGlobalNorm Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in :math:`t\_list` , and limit it to ``clip_norm`` . @@ -113,8 +114,8 @@ def get_l2_norm_pow(params_grads, sum_dtype=None): continue merge_grad = g if g.type == core.VarDesc.VarType.SELECTED_ROWS: - merge_grad = layers.merge_selected_rows(g) - merge_grad = layers.get_tensor_from_selected_rows(merge_grad) + merge_grad = clip.merge_selected_rows(g) + merge_grad = clip.get_tensor_from_selected_rows(merge_grad) sum_square = _squared_l2_norm(merge_grad) if sum_square.dtype == core.VarDesc.VarType.FP16: sum_square_list_fp16.append(sum_square) diff --git a/python/paddle/incubate/operators/graph_send_recv.py b/python/paddle/incubate/operators/graph_send_recv.py index e433d145bf9732..007f6b3c5519d6 100644 --- a/python/paddle/incubate/operators/graph_send_recv.py +++ b/python/paddle/incubate/operators/graph_send_recv.py @@ -189,7 +189,7 @@ def convert_out_size_to_list(out_size): def get_out_size_tensor_inputs(inputs, attrs, out_size, op_type): """ Convert out_size(int, np.int32, np.int64, Variable) to inputs - and attrs in static mode. + and attrs in static graph mode. 
+ and attrs in static graph mode.
""" if out_size is None: attrs['out_size'] = [0] diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py index bc2837fa2fe58f..9aa51cd8122e68 100644 --- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py +++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py @@ -16,11 +16,11 @@ import paddle from paddle.fluid import core, framework, unique_name -from paddle.fluid.clip import ClipGradByGlobalNorm from paddle.fluid.executor import global_scope from paddle.fluid.framework import Variable, name_scope from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.optimizer import Optimizer +from paddle.nn import ClipGradByGlobalNorm def init_communicator(block, rank, ranks, ring_id): diff --git a/python/paddle/incubate/optimizer/functional/lbfgs.py b/python/paddle/incubate/optimizer/functional/lbfgs.py index 9001c2812b7685..a7221f0925e76d 100644 --- a/python/paddle/incubate/optimizer/functional/lbfgs.py +++ b/python/paddle/incubate/optimizer/functional/lbfgs.py @@ -132,7 +132,7 @@ def func(x): tail = paddle.full(shape=[1], fill_value=0, dtype='int64') shape = initial_position.shape[0] - # Use tensor as array of fixed length, rather than flexible tensor array. Because in static mode, + # Use tensor as array of fixed length, rather than flexible tensor array. Because in static graph mode, # tensor array will produce tensor of shape[-1], which will cause error when calling jacobian. In this way, can not use append # or pop, so we need head and tail to record where is the newest data and where is the oldest. # Totally speaking, realized a stack by array. diff --git a/python/paddle/incubate/optimizer/modelaverage.py b/python/paddle/incubate/optimizer/modelaverage.py index 52bf1ac4f34e12..9f10c47ef60163 100644 --- a/python/paddle/incubate/optimizer/modelaverage.py +++ b/python/paddle/incubate/optimizer/modelaverage.py @@ -59,7 +59,7 @@ class ModelAverage(Optimizer): average_window_rate (float): The calculate ratio of the window length relative to ``Parameter`` update times. parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. min_average_window (int, optional): the minimum size of average window length. The default value is 10000. max_average_window (int, optional): The maximum size of average window length. The default value is 10000. name (str, optional): Normally there is no need for user to set this property. diff --git a/python/paddle/jit/api.py b/python/paddle/jit/api.py index 30dc5fd1d6f2e8..199667d3cb1923 100644 --- a/python/paddle/jit/api.py +++ b/python/paddle/jit/api.py @@ -195,7 +195,7 @@ def to_static( ): """ Converts imperative dygraph APIs into declarative function APIs. Decorator - @to_static handles the Program and Executor of static mode and returns + @to_static handles the Program and Executor of static graph mode and returns the result as dygraph Tensor(s). Users could use the returned dygraph Tensor(s) to do imperative training, inference, or other operations. 
If the decorated function calls other imperative function, the called one will be diff --git a/python/paddle/jit/dy2static/basic_api_transformer.py b/python/paddle/jit/dy2static/basic_api_transformer.py index 6fadfa81911288..24825e8e937c70 100644 --- a/python/paddle/jit/dy2static/basic_api_transformer.py +++ b/python/paddle/jit/dy2static/basic_api_transformer.py @@ -133,7 +133,7 @@ class AttributeJstTransformer(BaseTransformer): for example: a.size --> __jst.attr(a, "size") - because `size` have different behavier when in dygraph / static mode + because `size` have different behavier when in dygraph / static graph mode NOTE: we only deal with ctx=Load() case. """ diff --git a/python/paddle/jit/dy2static/convert_operators.py b/python/paddle/jit/dy2static/convert_operators.py index 3ec3dba88df2bc..328b879c5aab62 100644 --- a/python/paddle/jit/dy2static/convert_operators.py +++ b/python/paddle/jit/dy2static/convert_operators.py @@ -42,10 +42,12 @@ def convert_attr(x, attr): def indexable(x, code=None): if isinstance(x, Variable): return x - if hasattr(x, '__len__') and hasattr(x, '__getitem__'): - return x - if hasattr(x, '__iter__'): + elif hasattr(x, '__iter__'): return [i for i in x] + elif hasattr(x, '__len__') and hasattr( + x, '__getitem__' + ): # used for customed type and non-iterable type. + return x else: raise RuntimeError("X can't be convert into indexable.") diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index 40f276d269f96a..ca678f6a6a67fd 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from copy import deepcopy + import numpy as np import paddle @@ -148,7 +150,7 @@ class PartialProgramLayer: parameters(list[VarBase]|None): All trainable parameters included in the program. Default None. Returns: - Layer: A Layer object that run all ops internally in static mode. + Layer: A Layer object that run all ops internally in static graph mode. """ def __init__( @@ -699,19 +701,32 @@ def infer_program(self): def _get_forward_backward_program_form( self, whole_program, forward_end_op_index ): - forward_builded_program = add_build_strategy_for( - whole_program, 0, forward_end_op_index, self._build_strategy - ) + # NOTE(dev): We apply build_strategy for backward firstly to + # avoid skipping more gc variables. 
backward_start_op_index = forward_end_op_index + 2 * len( self._outputs.var_ids ) backward_end_op_index = whole_program.desc.block(0).op_size() + backward_skip_vars = self._parse_skip_gc_vars(whole_program) backward_builded_program = add_build_strategy_for( whole_program, backward_start_op_index, backward_end_op_index, self._build_strategy, + backward_skip_vars, + ) + + forward_skip_vars = self._parse_skip_gc_vars( + whole_program, backward_builded_program + ) + forward_builded_program = add_build_strategy_for( + whole_program, + 0, + forward_end_op_index, + self._build_strategy, + forward_skip_vars, ) + self._apply_inplace_pass( forward_builded_program, backward_builded_program ) @@ -726,26 +741,10 @@ def _apply_inplace_pass(self, forward_program, backward_program): empty_startup_program = paddle.static.Program() use_cuda = True if core.is_compiled_with_cuda() else False # skip data var - forward_mem_opt_skip_vars = [] - backward_mem_opt_skip_vars = [] - for var_name, var in forward_program.global_block().vars.items(): - if var.is_data: - forward_mem_opt_skip_vars.append(var_name) - for var_name, var in backward_program.global_block().vars.items(): - if var.is_data: - backward_mem_opt_skip_vars.append(var_name) - for var in self._inputs: - if isinstance(var, paddle.fluid.framework.Variable): - forward_mem_opt_skip_vars.append(var.desc.name()) - backward_mem_opt_skip_vars.append(var.desc.name()) - for var in self._outputs: - if isinstance(var, paddle.fluid.framework.Variable): - forward_mem_opt_skip_vars.append(var.desc.name()) - backward_mem_opt_skip_vars.append(var.desc.name()) - for var_name in core.parse_safe_eager_deletion_skip_vars( - backward_program.desc - ): - forward_mem_opt_skip_vars.append(var_name) + forward_mem_opt_skip_vars = self._parse_skip_gc_vars( + forward_program, backward_program + ) + backward_mem_opt_skip_vars = self._parse_skip_gc_vars(forward_program) attrs = { "use_cuda": use_cuda, "mem_opt_skip_vars": forward_mem_opt_skip_vars, @@ -771,6 +770,38 @@ def _apply_inplace_pass(self, forward_program, backward_program): attr_types, ) + @LazyInitialized + def _inout_var_names(self): + """ + Returns Variable Names from self._inputs and self.outputs + """ + var_names = [] + for var in self._inputs: + if isinstance(var, paddle.fluid.framework.Variable): + var_names.append(var.desc.name()) + for var in self._outputs: + if isinstance(var, paddle.fluid.framework.Variable): + var_names.append(var.desc.name()) + return var_names + + def _parse_skip_gc_vars(self, program, backward_program=None): + """ + Parse variables that need to skip GC after execute it. + If specify backward_program, it will keep the variables used in backward. + """ + # skip data var, DO NOT ignore this deepcopy + skip_vars = deepcopy(self._inout_var_names) + for var_name, var in program.global_block().vars.items(): + if var.is_data: + skip_vars.append(var_name) + + if backward_program: + for var_name in core.parse_safe_eager_deletion_skip_vars( + backward_program.desc + ): + skip_vars.append(var_name) + return skip_vars + def _prepare(self, inputs): """ Prepare inputs, outputs, attrs. 
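The skip-GC refactor above is internal to dy2static and is exercised whenever a decorated layer is run with gradients enabled. A compact, hedged end-to-end sketch of that user-level path (it mirrors the to_static docstring examples elsewhere in this patch; the forward/backward partial programs, including the _parse_skip_gc_vars bookkeeping, are built under the hood on the first call):

import paddle

class Net(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.fc = paddle.nn.Linear(10, 1)

    def forward(self, x):
        return self.fc(x)

net = paddle.jit.to_static(Net())       # convert into static graph mode
out = net(paddle.randn([4, 10]))        # forward partial program runs here
out.mean().backward()                   # backward partial program runs here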
@@ -1055,13 +1086,16 @@ def partial_program_from(concrete_program): @switch_to_static_graph def add_build_strategy_for( - program, start_op_index, end_op_index, build_strategy=None + program, start_op_index, end_op_index, build_strategy=None, skip_vars=None ): if start_op_index < end_op_index: compiled_program = paddle.static.CompiledProgram( core.Graph(program.desc, start_op_index, end_op_index), build_strategy=build_strategy, ) + if skip_vars: + # TODO(Aurelius84): Need to unify name with C++, such as kSkipVarNames. + compiled_program._graph.set("skip_gc_vars", set(skip_vars)) compiled_program._compile( core.Scope(), framework._current_expected_place() ) diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py index d21a59c96e218e..a01fb286c78226 100644 --- a/python/paddle/jit/dy2static/program_translator.py +++ b/python/paddle/jit/dy2static/program_translator.py @@ -415,7 +415,7 @@ def __call__(self, *args, **kwargs): if not _non_static_mode(): raise RuntimeError( "Failed to run the callable object {} decorated by '@paddle.jit.to_static', " - "because it is NOT in dynamic mode. Please disable the static mode to enter dynamic mode with the " + "because it is NOT in dynamic mode. Please disable the static graph mode to enter dynamic mode with the " "following API: paddle.disable_static().".format( self.dygraph_function ) @@ -691,7 +691,7 @@ def forward(self, x, flag=True): return out x = paddle.randn([10, 1], 'float32') - net = paddle.jit.to_static(Net()) # convert into static mode + net = paddle.jit.to_static(Net()) # convert into static graph mode out = net(x) net.forward.rollback() # rollback into dygraph mode @@ -751,7 +751,7 @@ def forward(self, x, flag=True): return out x = paddle.randn([10, 1], 'float32') - net = paddle.jit.to_static(Net()) # convert into static mode + net = paddle.jit.to_static(Net()) # convert into static graph mode copy_net = copy.deepcopy(net) # deepcopy a new net without @to_static diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py index 0fa5c84f07f7b9..bba9082308d97b 100644 --- a/python/paddle/metric/metrics.py +++ b/python/paddle/metric/metrics.py @@ -269,7 +269,7 @@ def compute(self, pred, label, *args): if (len(label.shape) == 1) or ( len(label.shape) == 2 and label.shape[-1] == 1 ): - # In static mode, the real label data shape may be different + # In static graph mode, the real label data shape may be different # from shape defined by paddle.static.InputSpec in model # building, reshape to the right shape. label = paddle.reshape(label, (-1, 1)) diff --git a/python/paddle/nn/clip.py b/python/paddle/nn/clip.py index 61143175fd4af5..10eeb6319063c1 100644 --- a/python/paddle/nn/clip.py +++ b/python/paddle/nn/clip.py @@ -12,9 +12,1074 @@ # See the License for the specific language governing permissions and # limitations under the License. 
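The remainder of this file is the gradient-clipping implementation relocated from paddle/fluid/clip.py into paddle/nn/clip.py, which is why the unittests earlier in this patch switch from fluid.clip.GradientClipBy* to paddle.nn.ClipGradBy*. A minimal dygraph usage sketch of the relocated class through the standard 2.x optimizer interface (nothing here is new to this patch):

import paddle

linear = paddle.nn.Linear(10, 10)
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
sgd = paddle.optimizer.SGD(learning_rate=0.1,
                           parameters=linear.parameters(),
                           grad_clip=clip)

loss = linear(paddle.rand([4, 10])).sum()
loss.backward()
sgd.step()          # gradients are clipped to global norm 1.0 before the update
sgd.clear_grad()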
-# TODO: define the functions to clip gradient of parameter -from ..fluid.clip import ClipGradByGlobalNorm # noqa: F401 -from ..fluid.clip import ClipGradByNorm # noqa: F401 -from ..fluid.clip import ClipGradByValue # noqa: F401 +import copy +import warnings + +import paddle +import paddle.autograd as imperative_base +from paddle import _C_ops, _legacy_C_ops +from paddle.common_ops_import import Variable, check_type, default_main_program +from paddle.fluid import core, framework, layers, unique_name +from paddle.fluid.data_feeder import check_variable_and_dtype +from paddle.framework import LayerHelper, _non_static_mode, in_dygraph_mode +from paddle.tensor.layer_function_generator import templatedoc __all__ = [] + + +@templatedoc() +def clip_by_norm(x, max_norm, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + max_norm(${max_norm_type}): ${max_norm_comment} + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: + + out(${out_type}): ${out_comment} + + + Examples: + .. code-block:: python + + import paddle + from paddle.nn import clip + + input = paddle.to_tensor([[2.0, 2.0], [2.0, 2.0]], dtype='float32') + reward = clip.clip_by_norm(x=input, max_norm=1.0) + # [[0.5, 0.5], [0.5, 0.5]] + """ + + if in_dygraph_mode(): + return _C_ops.clip_by_norm(x, max_norm) + if _non_static_mode(): + return _legacy_C_ops.clip_by_norm(x, 'max_norm', max_norm) + + helper = LayerHelper("clip_by_norm", **locals()) + check_variable_and_dtype(x, 'X', ['float32', 'float16'], 'clip_by_norm') + check_type(max_norm, 'max_norm', (float), 'clip_by_norm') + + if name is None: + name = unique_name.generate_with_ignorable_key( + ".".join([helper.name, 'tmp']) + ) + + out = helper.create_variable( + type=x.type, name=name, dtype=x.dtype, persistable=False + ) + + helper.append_op( + type="clip_by_norm", + inputs={"X": x}, + attrs={"max_norm": max_norm}, + outputs={"Out": out}, + ) + + return out + + +@templatedoc() +def merge_selected_rows(x, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + b = fluid.default_main_program().global_block() + var = b.create_var( + name="X", dtype="float32", persistable=True, + type=fluid.core.VarDesc.VarType.SELECTED_ROWS) + y = nn.merge_selected_rows(var) + """ + if in_dygraph_mode(): + return _C_ops.merge_selected_rows(x) + + if _non_static_mode(): + return _legacy_C_ops.merge_selected_rows(x) + + helper = LayerHelper("merge_selected_rows", **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type="merge_selected_rows", + inputs={"X": x}, + attrs={}, + outputs={"Out": out}, + ) + return out + + +@templatedoc() +def get_tensor_from_selected_rows(x, name=None): + """ + Get tensor data from input with SelectedRows type, and outputs a Tensor. + + .. code-block:: text + + input x is SelectedRows: + x.rows = [0, 5, 5, 4, 19] + x.height = 20 + x.value = [[1, 1] [2, 2] [2, 2] [3, 3] [6, 6]] + + Output is LoDTensor: + out.shape = [5, 2] + out.data = [[1, 1], + [2, 2], + [2, 2], + [3, 3], + [6, 6]] + + Args: + x(SelectedRows): Input with SelectedRows type. The data type is float32, float64, int32 or int64. + name(str, optional): The default value is None. Normally there is no need for user to set this property. 
+ For more information, please refer to :ref:`api_guide_Name` . + + Returns: + Variable: LoDTensor transformed from SelectedRows. The data type is same with input. + + Examples: + .. code-block:: python + + from paddle import nnp.py + b = fluid.default_main_program().global_block() + input = b.create_var(name="X", dtype="float32", persistable=True, type=fluid.core.VarDesc.VarType.SELECTED_ROWS) + out = nn.get_tensor_from_selected_rows(input) + """ + + check_type(x, 'x', Variable, 'get_tensor_from_selected_rows') + if x.type != core.VarDesc.VarType.SELECTED_ROWS: + raise TypeError( + "The type of 'x' in get_tensor_from_selected_rows must be SELECTED_ROWS." + ) + helper = LayerHelper('get_tensor_from_selected_rows', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type='get_tensor_from_selected_rows', + inputs={'X': x}, + outputs={'Out': out}, + attrs={}, + ) + return out + + +_clip_by_global_norm_using_mp_type_flag = False + + +def _clip_by_global_norm_using_mp_type(*args): + global _clip_by_global_norm_using_mp_type_flag + assert len(args) <= 1 + if len(args) == 1: + assert isinstance(args[0], bool) + old_value = _clip_by_global_norm_using_mp_type_flag + _clip_by_global_norm_using_mp_type_flag = args[0] + return old_value + else: + return _clip_by_global_norm_using_mp_type_flag + + +def _cast_to_mp_type_if_enabled(x): + if ( + x.dtype == core.VarDesc.VarType.FP16 + or x.dtype == core.VarDesc.VarType.BF16 + ) and _clip_by_global_norm_using_mp_type(): + return x.astype(core.VarDesc.VarType.FP32) + else: + return x + + +def _squared_l2_norm(x): + r""" + Return the squared L2 norm of a tensor. + """ + + x = _cast_to_mp_type_if_enabled(x) + if ( + core.is_compiled_with_xpu() + or x.dtype == core.VarDesc.VarType.FP16 + or x.dtype == core.VarDesc.VarType.BF16 + ): + square = paddle.square(x) + sum_square = paddle.sum(square) + return sum_square + + if in_dygraph_mode(): + return _C_ops.squared_l2_norm(x) + + op_type = 'squared_l2_norm' + check_variable_and_dtype(x, 'x', ['float32', 'float64'], op_type) + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + + inputs = {"X": x} + outputs = {'Out': out} + helper.append_op(type=op_type, inputs=inputs, outputs=outputs) + return out + + +class BaseErrorClipAttr: + def __str__(self): + raise NotImplementedError() + + def _append_clip_op(self, block, grad_name): + raise NotImplementedError() + + +class ErrorClipByValue(BaseErrorClipAttr): + r""" + Clip tensor values to the range [min, max]. + + Given a tensor ``t`` (see Examples below), this operation clips its value \ + to ``min`` and ``max`` inplace. + + - Any values less than min are set to min. + - Any values greater than max are set to max. + + Args: + max (float): The maximum value to clip by. + min (float, optional): The minimum value to clip by. if not set by user, \ + will be set to ``-max`` by framework. + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + import paddle + paddle.enable_static() + BATCH_SIZE = 128 + CLIP_MAX = 2e-6 + CLIP_MIN = -1e-6 + prog = fluid.framework.Program() + with fluid.program_guard(main_program=prog): + image = fluid.layers.data( + name='x', shape=[784], dtype='float32') + hidden1 = fluid.layers.fc(input=image, size=128, act='relu') + hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu') + predict = fluid.layers.fc( + input=hidden2, size=10, act='softmax') + label = fluid.layers.data(name='y', shape=[1], dtype='int64') + cost = paddle.nn.functional.cross_entropy(input=predict, label=label) + avg_cost = paddle.mean(cost) + prog_clip = prog.clone() + prog_clip.block(0).var(hidden1.name)._set_error_clip( + paddle.nn.clip.ErrorClipByValue( + max=CLIP_MAX, min=CLIP_MIN) + ) + """ + + def __init__(self, max, min=None): + max = float(max) + if min is None: + min = -max + else: + min = float(min) + self.max = max + self.min = min + + def __str__(self): + return "ByValue, min=%f, max=%f" % (self.min, self.max) + + def _append_clip_op(self, block, grad_name): + clip_op_desc = block.desc.append_op() + clip_op_desc.set_type("clip") + clip_op_desc.set_input("X", [grad_name]) + clip_op_desc.set_output("Out", [grad_name]) + clip_op_desc._set_attr("min", self.min) + clip_op_desc._set_attr("max", self.max) + + +def error_clip_callback(block, context): + # the context is a grad_to_var map + grad_to_var = context + op_desc = block.desc.op(block.desc.op_size() - 1) + for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]: + fwd_var = block._var_recursive(grad_to_var[grad_n]) + error_clip = getattr(fwd_var, "error_clip", None) + if not ( + error_clip is None or isinstance(error_clip, BaseErrorClipAttr) + ): + raise TypeError( + "Variable's error_clip should be an instance of BaseErrorClipAttr or None." + ) + if error_clip is not None: + error_clip._append_clip_op(block, grad_n) + + +class ClipGradBase: + def __init__(self): + super().__init__() + + def __str__(self): + raise NotImplementedError() + + @imperative_base.no_grad() + def _dygraph_clip(self, params_grads): + raise NotImplementedError + + def _static_clip(self, params_grads): + raise NotImplementedError + + def __call__(self, params_grads): + if _non_static_mode(): + return self._dygraph_clip(params_grads) + else: + for p, g in params_grads: + if getattr(p, 'gradient_clip_attr', None) is not None: + warnings.warn( + "'set_gradient_clip' will be ineffective, because you have " + "set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' " + "is redundant and you can remove it." + ) + break + return self._static_clip(params_grads) + + def _process_context(self, context, param, grad): + raise NotImplementedError() + + def _create_operators(self, param, grad): + raise NotImplementedError() + + +class ClipGradByValue(ClipGradBase): + """ + Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max]. + + - Any values less than min are set to ``min``. + + - Any values greater than max are set to ``max``. + + The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``. + If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. + + Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` + (for example: :ref:`api_paddle_optimizer_SGD`). + + Note: + ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0. 
+ Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. + + Args: + max (float): The maximum value to clip by. + min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max`` + automatically. In this case, ``max`` must be greater than 0. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') + linear = paddle.nn.Linear(in_features=10, out_features=10, + weight_attr=paddle.ParamAttr(need_clip=True), + bias_attr=paddle.ParamAttr(need_clip=False)) + out = linear(x) + loss = paddle.mean(out) + loss.backward() + + clip = paddle.nn.ClipGradByValue(min=-1, max=1) + sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) + sdg.step() + """ + + def __init__(self, max, min=None): + super().__init__() + if min is None: + assert max > 0.0 + min = -max + self.max = float(max) + self.min = float(min) + + def __str__(self): + return "Clip Gradient By Value, min = %f, max=%f" % (self.min, self.max) + + @imperative_base.no_grad() + def _dygraph_clip(self, params_grads): + params_and_grads = [] + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + params_and_grads.append((p, g)) + continue + new_grad = paddle.clip(x=g, min=self.min, max=self.max) + params_and_grads.append((p, new_grad)) + return params_and_grads + + def _static_clip(self, params_grads): + params_and_grads = [] + param_new_grad_name_dict = dict() + with framework.name_scope('gradient_clip'): + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + params_and_grads.append((p, g)) + continue + + with p.block.program._optimized_guard([p, g]): + new_grad = paddle.clip(x=g, min=self.min, max=self.max) + params_and_grads.append((p, new_grad)) + param_new_grad_name_dict[p.name] = new_grad.name + _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict) + return params_and_grads + + def _process_context(self, context, param, grad): + pass + + def _create_operators(self, param, grad): + new_grad = paddle.clip(x=grad, min=self.min, max=self.max) + return param, new_grad + + +class ClipGradByNorm(ClipGradBase): + r""" + Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` . + + - If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio. + + - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done. + + The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``. + If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. + + Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` + (for example: :ref:`api_paddle_optimizer_SGD`). + + The clipping formula is: + + .. math:: + Out = + \left\{ + \begin{array}{ccl} + X & & if (norm(X) \leq clip\_norm) \\ + \frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\ + \end{array} + \right. + + + where :math:`norm(X)` represents the L2 norm of :math:`X`. + + .. math:: + norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}} + + Note: + ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0. + Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. + + Args: + clip_norm(float): The maximum norm value. + + Examples: + .. 
code-block:: python + + import paddle + + x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') + linear = paddle.nn.Linear(in_features=10, out_features=10, + weight_attr=paddle.ParamAttr(need_clip=True), + bias_attr=paddle.ParamAttr(need_clip=False)) + out = linear(x) + loss = paddle.mean(out) + loss.backward() + + clip = paddle.nn.ClipGradByNorm(clip_norm=1.0) + sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) + sdg.step() + """ + + def __init__(self, clip_norm): + super().__init__() + self.clip_norm = float(clip_norm) + + def __str__(self): + return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm + + @imperative_base.no_grad() + def _dygraph_clip(self, params_grads): + params_and_grads = [] + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + params_and_grads.append((p, g)) + continue + new_grad = clip_by_norm(x=g, max_norm=self.clip_norm) + params_and_grads.append((p, new_grad)) + return params_and_grads + + def _static_clip(self, params_grads): + params_and_grads = [] + with framework.name_scope('gradient_clip'): + param_new_grad_name_dict = dict() + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + params_and_grads.append((p, g)) + continue + + with p.block.program._optimized_guard([p, g]): + new_grad = clip_by_norm(x=g, max_norm=self.clip_norm) + param_new_grad_name_dict[p.name] = new_grad.name + params_and_grads.append((p, new_grad)) + _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict) + return params_and_grads + + def _process_context(self, context, param, grad): + pass + + def _create_operators(self, param, grad): + new_grad = clip_by_norm(x=grad, max_norm=self.clip_norm) + return param, new_grad + + +_allow_pure_fp16_global_norm_clip_flag = False + + +def _allow_pure_fp16_global_norm_clip(*args): + global _allow_pure_fp16_global_norm_clip_flag + if len(args) == 0: + return _allow_pure_fp16_global_norm_clip_flag + else: + assert len(args) == 1 and isinstance(args[0], bool) + old_value = _allow_pure_fp16_global_norm_clip_flag + _allow_pure_fp16_global_norm_clip_flag = args[0] + return old_value + + +class ClipGradByGlobalNorm(ClipGradBase): + r""" + Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in + :math:`t\_list` , and limit it to ``clip_norm`` . + + - If the global norm is greater than ``clip_norm`` , all elements of :math:`t\_list` will be compressed by a ratio. + + - If the global norm is less than or equal to ``clip_norm`` , nothing will be done. + + The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``. + If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. + + Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` + (for example: :ref:`api_paddle_optimizer_SGD`). + + The clipping formula is: + + .. math:: + + t\_list[i] = t\_list[i] * \frac{clip\_norm}{\max(global\_norm, clip\_norm)} + + where: + + .. math:: + + global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2} + + Note: + ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0. + Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. + + Args: + clip_norm (float): The maximum norm value. + group_name (str, optional): The group name for this clip. 
Default value is ``default_group``. + auto_skip_clip (bool, optional): skip clipping gradient. Default value is ``False``. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') + linear = paddle.nn.Linear(in_features=10, out_features=10, + weight_attr=paddle.ParamAttr(need_clip=True), + bias_attr=paddle.ParamAttr(need_clip=False)) + out = linear(x) + loss = paddle.mean(out) + loss.backward() + + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) + sdg.step() + """ + + def __init__( + self, clip_norm, group_name="default_group", auto_skip_clip=False + ): + super().__init__() + self.clip_norm = float(clip_norm) + self.group_name = group_name + assert isinstance(auto_skip_clip, bool) + self.auto_skip_clip = auto_skip_clip + + def __str__(self): + return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm) + + @imperative_base.no_grad() + def _dygraph_clip(self, params_grads): + params_and_grads = [] + sum_square_list = [] + sum_square_list_fp16 = [] + sum_square_list_fp32 = [] + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + continue + merge_grad = g + + if in_dygraph_mode() and g.is_selected_rows(): + merge_grad = merge_selected_rows(g) + merge_grad = merge_grad._get_tensor_from_selected_rows() + + elif g.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = merge_selected_rows(g) + merge_grad = get_tensor_from_selected_rows(merge_grad) + + sum_square = _squared_l2_norm(merge_grad) + if ( + sum_square.dtype == core.VarDesc.VarType.FP16 + or sum_square.dtype == core.VarDesc.VarType.BF16 + ): + sum_square_list_fp16.append(sum_square) + elif sum_square.dtype == core.VarDesc.VarType.FP32: + sum_square_list_fp32.append(sum_square) + else: + sum_square_list.append(sum_square) + + # all parameters have been filterd out + if ( + len(sum_square_list) + + len(sum_square_list_fp16) + + len(sum_square_list_fp32) + == 0 + ): + return params_grads + + sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32" + global_norm_var = [] + if len(sum_square_list_fp16) > 0: + global_norm_var_fp16 = paddle.add_n(sum_square_list_fp16) + global_norm_var.append(global_norm_var_fp16.astype(sum_dtype)) + if len(sum_square_list_fp32) > 0: + global_norm_var_fp32 = paddle.add_n(sum_square_list_fp32) + if sum_dtype == 'float32': + global_norm_var.append(global_norm_var_fp32) + else: + global_norm_var.append(global_norm_var_fp32.astype(sum_dtype)) + if len(sum_square_list) > 0: + global_norm_var_fp64 = paddle.add_n(sum_square_list) + global_norm_var.append(global_norm_var_fp64) + global_norm_var = paddle.add_n(global_norm_var) + global_norm_var = paddle.sqrt(global_norm_var) + max_global_norm = paddle.full( + shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm + ) + + need_clip = False + if not self.auto_skip_clip: # always apply clip + need_clip = True + clip_var = paddle.divide( + x=max_global_norm, + y=paddle.maximum(x=global_norm_var, y=max_global_norm), + ) + elif global_norm_var > max_global_norm: + # only when global_norm_var > max_global_norm, grad need clip + need_clip = True + clip_var = paddle.divide(x=max_global_norm, y=global_norm_var) + + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + params_and_grads.append((p, g)) + continue + # TODO(wangxi): use inplace elementwise_mul + if need_clip: + clip_input = 
( + clip_var.astype(g.dtype) + if clip_var.dtype != g.dtype + else clip_var + ) + new_grad = paddle.multiply(g, clip_input) + params_and_grads.append((p, new_grad)) + else: + params_and_grads.append((p, g)) + + return params_and_grads + + def _static_clip(self, params_grads): + params_and_grads = [] + sum_square_list = [] + sum_square_list_fp16 = [] + sum_square_list_fp32 = [] + with framework.name_scope('gradient_clip'): + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + continue + merge_grad = g + with p.block.program._optimized_guard([p, g]): + if g.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = merge_selected_rows(g) + merge_grad = get_tensor_from_selected_rows(merge_grad) + sum_square = _squared_l2_norm(merge_grad) + if sum_square.dtype == core.VarDesc.VarType.FP16: + sum_square_list_fp16.append(sum_square) + elif sum_square.dtype == core.VarDesc.VarType.FP32: + sum_square_list_fp32.append(sum_square) + else: + sum_square_list.append(sum_square) + + # all parameters have been filterd out + if ( + len(sum_square_list) + + len(sum_square_list_fp16) + + len(sum_square_list_fp32) + == 0 + ): + return params_grads + + with p.block.program._optimized_guard([p, g]): + sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32" + + global_norm_var = [] + if len(sum_square_list_fp16) > 0: + global_norm_var_fp16 = layers.sums(sum_square_list_fp16) + if ( + sum_square_list_fp32 + or sum_square_list + or not _allow_pure_fp16_global_norm_clip() + ): + global_norm_var.append( + global_norm_var_fp16.astype(sum_dtype) + ) + else: + global_norm_var.append(global_norm_var_fp16) + if len(sum_square_list_fp32) > 0: + global_norm_var_fp32 = layers.sums(sum_square_list_fp32) + if sum_dtype == 'float32': + global_norm_var.append(global_norm_var_fp32) + else: + global_norm_var.append( + global_norm_var_fp32.astype(sum_dtype) + ) + if len(sum_square_list) > 0: + # fp64 + global_norm_var_other_dtype = layers.sums(sum_square_list) + global_norm_var.append(global_norm_var_other_dtype) + + global_norm_var = ( + layers.sums(global_norm_var) + if len(global_norm_var) > 1 + else global_norm_var[0] + ) + global_norm_var = paddle.sqrt(x=global_norm_var) + max_global_norm = paddle.full( + shape=[1], + dtype=global_norm_var.dtype, + fill_value=self.clip_norm, + ) + scale_var = paddle.divide( + x=max_global_norm, + y=paddle.maximum(x=max_global_norm, y=global_norm_var), + ) + param_new_grad_name_dict = dict() + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + params_and_grads.append((p, g)) + continue + + with p.block.program._optimized_guard([p, g]): + new_g = _cast_to_mp_type_if_enabled(g) + # inplace + scale_input = ( + scale_var.astype('float16') + if new_g.dtype == core.VarDesc.VarType.FP16 + and scale_var.dtype != core.VarDesc.VarType.FP16 + else scale_var + ) + # NOTE(Yuang Liu): For pure dp with gradient merge, the p and g + # will be in different blocks with the gradient clip related ops. + # We need to handle the correct block, otherwise will encounter + # a 'NotFoundError' during compile time. 
+ block = default_main_program().current_block() + block.append_op( + type='elementwise_mul', + inputs={'X': new_g, 'Y': scale_input}, + outputs={'Out': new_g}, + ) + if new_g is not g: + block.append_op( + type='cast', + inputs={'X': new_g}, + outputs={'Out': g}, + attrs={ + 'in_dtype': new_g.dtype, + 'out_dtype': g.dtype, + }, + ) + + param_new_grad_name_dict[p.name] = g.name + params_and_grads.append((p, g)) + + _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict) + return params_and_grads + + def _process_context(self, context, param, grad): + if self.group_name not in context: + context[self.group_name] = [] + context[self.group_name + "_clip_value"] = self.clip_norm + context[self.group_name + "_clip"] = paddle.full( + shape=[1], dtype=grad.dtype, fill_value=self.clip_norm + ) + else: + if not self.clip_norm == context[self.group_name + "_clip_value"]: + raise ValueError( + "All parameters' 'clip_norm' of a same group should be the same" + ) + + merge_grad = grad + if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = merge_selected_rows(grad) + merge_grad = get_tensor_from_selected_rows(merge_grad) + + local_norm_var = _squared_l2_norm(merge_grad) + context[self.group_name].append(local_norm_var) + + self.context = context + + def _create_operators(self, param, grad): + group_scale_name = self.group_name + "_scale" + if group_scale_name not in self.context: + group_norm_var = layers.sums(input=self.context[self.group_name]) + group_norm_var = paddle.sqrt(x=group_norm_var) + clip_var = self.context[self.group_name + "_clip"] + group_scale_var = paddle.divide( + x=clip_var, + y=paddle.maximum(x=clip_var, y=group_norm_var), + ) + assert group_scale_var.shape == (1,) + self.context[group_scale_name] = group_scale_var + + # inplace + param.block.append_op( + type='elementwise_mul', + inputs={'X': grad, 'Y': self.context[group_scale_name]}, + outputs={'Out': grad}, + ) + + return param, grad + + +@framework.dygraph_not_support +def set_gradient_clip(clip, param_list=None, program=None): + """ + Warning: + + This API must be used after building network, and before ``minimize`` , + and it may be removed in future releases, so it is not recommended. + It is recommended to set ``grad_clip`` when initializing the ``optimizer`` , + this is a better method to clip gradient. There are three clipping strategies: + :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` . + + To specify parameters that require gradient clip. + + Args: + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default value: None, and there is no + gradient clipping. + param_list (list(Variable), optional): Parameters that require gradient clip. + It can be a list of parameter or a list of parameter's name. + Default None, meaning that all parameters in the program will be included. + program (Program, optional): The program where parameters are located. + Default None, meaning that using :ref:`api_fluid_default_main_program` . + + Returns: + None + + Examples: + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + + paddle.enable_static() + + def network(): + image = fluid.data(name='image', shape=[ + None, 28], dtype='float32') + param_attr1 = fluid.ParamAttr("fc1_param") + fc1 = fluid.layers.fc(image, size=10, param_attr=param_attr1) + param_attr2 = fluid.ParamAttr("fc2_param") + fc2 = fluid.layers.fc(fc1, size=10, param_attr=param_attr2) + loss = paddle.mean(fc2) + return loss + + + # network 1: clip all parameter gradient + with fluid.program_guard(fluid.Program(), fluid.Program()): + loss = network() + paddle.nn.clip.set_gradient_clip( + paddle.nn.ClipGradByGlobalNorm(clip_norm=2.0)) + sgd = fluid.optimizer.SGD(learning_rate=1e-3) + sgd.minimize(loss) + + # network 2: clip parameter gradient by name + with fluid.program_guard(fluid.Program(), fluid.Program()): + loss = network() + paddle.nn.clip.set_gradient_clip( + paddle.nn.ClipGradByValue(min=-1.0, max=1.0), + param_list=["fc1_param", "fc2_param"]) + sgd = fluid.optimizer.SGD(learning_rate=1e-3) + sgd.minimize(loss) + + # network 3: clip parameter gradient by value + with fluid.program_guard(fluid.Program(), fluid.Program()): + loss = network() + param_var1 = fluid.default_main_program().global_block().var("fc1_param") + param_var2 = fluid.default_main_program().global_block().var("fc2_param") + paddle.nn.clip.set_gradient_clip( + paddle.nn.ClipGradByValue(min=-1.0, max=1.0), + param_list=[param_var1, param_var2]) + sgd = fluid.optimizer.SGD(learning_rate=1e-3) + sgd.minimize(loss) + + # network 4: use 'set_gradient_clip' and 'optimize(grad_clip=clip)' together + with fluid.program_guard(fluid.Program(), fluid.Program()): + loss = network() + clip1 = paddle.nn.ClipGradByValue(min=-1.0, max=1.0) + clip2 = paddle.nn.ClipGradByNorm(clip_norm=1.0) + # Set the gradient clipping strategy: clip1 + paddle.nn.clip.set_gradient_clip(clip1) + # Set the gradient clipping strategy: clip2 + sgd = fluid.optimizer.SGD(learning_rate=1e-3, grad_clip=clip2) + sgd.minimize(loss) + # 'set_gradient_clip' will not take effect when setting has a conflict, + # and the gradient clipping strategy will be 'clip2' + + + """ + warnings.warn( + "Caution! 'set_gradient_clip' is not recommended " + "and may be deprecated in future! " + "We recommend a new strategy: set 'grad_clip' " + "when initializing the 'optimizer'. " + "This method can reduce the mistakes, please " + "refer to documention of 'optimizer'." + ) + + if not isinstance(clip, ClipGradBase): + raise TypeError( + "'clip' should be an instance of ClipGradBase's derived class" + ) + if program is None: + program = framework.default_main_program() + + for op in program.block(0).ops: + if 'op_namescope' in op.all_attrs() and "optimizer" in op.attr( + "op_namescope" + ): + warnings.warn( + "'minimize' has been invoked before, this will make 'set_gradient_clip' " + "be ineffective! Please invoke 'set_gradient_clip' before 'minimize'." + ) + break + + if param_list is None: + param_list = program.block(0).all_parameters() + if all(isinstance(elem, str) for elem in param_list): + param_list = [program.block(0).var(elem) for elem in param_list] + if not all(isinstance(elem, framework.Parameter) for elem in param_list): + raise TypeError( + "'param_list' should be a list of Parameter or basestring(parameter's name)." 
+ ) + + for param in param_list: + param.gradient_clip_attr = copy.deepcopy(clip) + + +def append_gradient_clip_ops(param_grads): + context = dict() + for p, g in param_grads: + if g is None: + continue + with p.block.program._optimized_guard([p, g]), framework.name_scope( + 'gradient_clip' + ): + clip_attr = getattr(p, 'gradient_clip_attr', None) + if clip_attr is None: + return param_grads + if not isinstance(clip_attr, ClipGradBase): + raise TypeError( + "clip attribute should be an instance of GradientClipBase" + ) + + clip_attr._process_context(context=context, param=p, grad=g) + + res = [] + param_new_grad_name_dict = dict() + for p, g in param_grads: + if g is None: + continue + with p.block.program._optimized_guard([p, g]), framework.name_scope( + 'gradient_clip' + ): + param, new_grad = clip_attr._create_operators(param=p, grad=g) + param_new_grad_name_dict[param.name] = new_grad.name + res.append([param, new_grad]) + + _correct_clip_op_role_var(res, param_new_grad_name_dict) + return res + + +# change wrong mapping relation between param & grad in clip op +# Note: This function is sensitive to the time cost of the network with gradient clipping +# and should not be changed easily. If you must change, please test the time cost. +def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict): + block_id_list = [] + if len(param_new_grad_name_dict) == 0: + return + for param, grad in params_grads: + if grad is None: + continue + block_id = param.block.idx + if block_id in block_id_list: + continue + block_id_list.append(block_id) + for op in param.block.program.global_block().ops: + if ( + op.has_attr("op_namescope") + and "gradient_clip" in op.attr("op_namescope") + and op.attr('op_role_var') + ): + param_name = op.attr('op_role_var')[0] + if param_name in param_new_grad_name_dict: + correct_p_g = [ + param_name, + param_new_grad_name_dict[param_name], + ] + op._set_attr('op_role_var', correct_p_g) + + +GradientClipBase = ClipGradBase +GradientClipByValue = ClipGradByValue +GradientClipByNorm = ClipGradByNorm +GradientClipByGlobalNorm = ClipGradByGlobalNorm diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index ceaa6e5e4a8dbc..74a97e25938ed3 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -984,6 +984,13 @@ def conv1d_transpose( ) ) + if len(weight.shape) != 3: + raise ValueError( + 'Input weight should be 3D tensor, but received weight with the shape of {}'.format( + weight.shape + ) + ) + op_type = 'conv2d_transpose' num_filters = weight.shape[1] if ( diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 4d5bac573c5271..42f2ff170786ba 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -17,8 +17,8 @@ # TODO: define normalization api import paddle import paddle.fluid as fluid -from paddle import _C_ops, _legacy_C_ops, in_dynamic_mode -from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode +from paddle import _C_ops, in_dynamic_mode +from paddle.fluid.framework import in_dygraph_mode from ...fluid import dygraph_utils from ...fluid.data_feeder import check_type, check_variable_and_dtype @@ -336,54 +336,43 @@ def layer_norm( out, _, _ = _C_ops.layer_norm(x, weight, bias, epsilon, begin_norm_axis) return out - if _in_legacy_dygraph(): - out, _, _ = _legacy_C_ops.layer_norm( - x, - weight, - bias, - 'epsilon', - epsilon, - 'begin_norm_axis', - begin_norm_axis, + else: + check_variable_and_dtype( + x, 
'input', ['float16', 'float32', 'float64'], 'LayerNorm' + ) + + inputs = dict() + inputs['X'] = [x] + if weight: + inputs['Scale'] = [weight] + if bias: + inputs['Bias'] = [bias] + attrs = {"epsilon": epsilon, "begin_norm_axis": begin_norm_axis} + + # create output + helper = LayerHelper('layer_norm', **locals()) + + dtype = x.dtype + mean_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) + variance_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) + layer_norm_out = helper.create_variable_for_type_inference(dtype) + + helper.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": layer_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={"epsilon": epsilon, "begin_norm_axis": begin_norm_axis}, ) - return out - check_variable_and_dtype( - x, 'input', ['float16', 'float32', 'float64'], 'LayerNorm' - ) - - inputs = dict() - inputs['X'] = [x] - if weight: - inputs['Scale'] = [weight] - if bias: - inputs['Bias'] = [bias] - attrs = {"epsilon": epsilon, "begin_norm_axis": begin_norm_axis} - - # create output - helper = LayerHelper('layer_norm', **locals()) - - dtype = x.dtype - mean_out = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True - ) - variance_out = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True - ) - layer_norm_out = helper.create_variable_for_type_inference(dtype) - - helper.append_op( - type="layer_norm", - inputs=inputs, - outputs={ - "Y": layer_norm_out, - "Mean": mean_out, - "Variance": variance_out, - }, - attrs={"epsilon": epsilon, "begin_norm_axis": begin_norm_axis}, - ) - - return helper.append_activation(layer_norm_out) + return helper.append_activation(layer_norm_out) def instance_norm( diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 1a3eb6761850a4..6041f8a07e2a82 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -1593,7 +1593,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): item.numpy().item(0) if isinstance(item, Variable) else item for item in output_size ] - # output_size support Variable in static mode + # output_size support Variable in static graph mode elif utils._contain_var(output_size): output_size = utils._convert_to_tensor_list(output_size) diff --git a/python/paddle/optimizer/adadelta.py b/python/paddle/optimizer/adadelta.py index d3c16048c90799..ff0f0a13feddca 100644 --- a/python/paddle/optimizer/adadelta.py +++ b/python/paddle/optimizer/adadelta.py @@ -49,7 +49,7 @@ class Adadelta(Optimizer): different parameter groups such as the learning rate, weight decay, etc, \ then the parameters are list of dict. Note that the learning_rate in paramter groups \ represents the scale of base learning_rate. \ - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ It canbe a float value as coeff of L2 regularization or \ :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. 
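Returning to the layer_norm refactor in python/paddle/nn/functional/norm.py above: the legacy-dygraph branch is removed, so dygraph always takes the _C_ops fast path and static graph mode falls through to the LayerHelper branch. A minimal usage sketch (shapes are illustrative):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    x = paddle.rand([2, 3, 8])
    weight = paddle.ones([8])   # scale over the normalized (last) dimension
    bias = paddle.zeros([8])

    # In dygraph this exercises the _C_ops.layer_norm path kept above; under
    # paddle.enable_static() the same call builds a layer_norm op through the
    # LayerHelper branch instead.
    y = F.layer_norm(x, normalized_shape=8, weight=weight, bias=bias, epsilon=1e-5)
    print(y.shape)  # [2, 3, 8]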
diff --git a/python/paddle/optimizer/adagrad.py b/python/paddle/optimizer/adagrad.py index a562bf77d8f6ae..6bea5773270bb5 100644 --- a/python/paddle/optimizer/adagrad.py +++ b/python/paddle/optimizer/adagrad.py @@ -48,7 +48,7 @@ class Adagrad(Optimizer): different parameter groups such as the learning rate, weight decay, etc, then the parameters are list of dict. Note that the learning_rate in paramter groups represents the scale of base learning_rate. - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. It canbe a float value as coeff of L2 regularization or :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`. diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index c5bc56769188e2..070efdff2d126f 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -70,7 +70,7 @@ class Adam(Optimizer): different parameter groups such as the learning rate, weight decay, etc, then the parameters are list of dict. Note that the learning_rate in paramter groups represents the scale of base learning_rate. - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. It canbe a float value as coeff of L2 regularization or :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py index f3990f62aff9da..c460ab6be032dd 100644 --- a/python/paddle/optimizer/adamax.py +++ b/python/paddle/optimizer/adamax.py @@ -62,7 +62,7 @@ class Adamax(Optimizer): different parameter groups such as the learning rate, weight decay, etc, then the parameters are list of dict. Note that the learning_rate in paramter groups represents the scale of base learning_rate. - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. It canbe a float value as coeff of L2 regularization or :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index ff0cb9fb841b5e..a4d304b451e7b3 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -20,10 +20,10 @@ from .. import _C_ops from ..fluid import core, framework, unique_name -from ..fluid.clip import GradientClipBase from ..fluid.dygraph import base as imperative_base from ..fluid.framework import Parameter, Variable from ..fluid.layer_helper import LayerHelper +from ..nn.clip import GradientClipBase from .lr import LRScheduler from .optimizer import Optimizer @@ -58,7 +58,7 @@ class AdamW(Optimizer): different parameter groups such as the learning rate, weight decay, etc, then the parameters are list of dict. Note that the learning_rate in paramter groups represents the scale of base learning_rate. - The default value is None in static mode, at this time all parameters will be updated. 
+ The default value is None in static graph mode, at this time all parameters will be updated. beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates. It should be a float number or a Tensor with shape [1] and data type as float32. The default value is 0.9. diff --git a/python/paddle/optimizer/lamb.py b/python/paddle/optimizer/lamb.py index 1e959a9ce471c6..e531e785e319fb 100644 --- a/python/paddle/optimizer/lamb.py +++ b/python/paddle/optimizer/lamb.py @@ -67,7 +67,7 @@ class Lamb(Optimizer): different parameter groups such as the learning rate, weight decay, etc, \ then the parameters are list of dict. Note that the learning_rate in paramter groups \ represents the scale of base learning_rate. \ - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of some derived class of ``GradientClipBase`` . There are three cliping strategies ( :ref:`api_paddle_fluid_clip_ClipGradByGlobalNorm` , :ref:`api_paddle_fluid_clip_ClipGradByNorm` , diff --git a/python/paddle/optimizer/momentum.py b/python/paddle/optimizer/momentum.py index 1c5327b7d7841d..3b20777599fb01 100644 --- a/python/paddle/optimizer/momentum.py +++ b/python/paddle/optimizer/momentum.py @@ -57,7 +57,7 @@ class Momentum(Optimizer): different parameter groups such as the learning rate, weight decay, etc, \ then the parameters are list of dict. Note that the learning_rate in paramter groups \ represents the scale of base learning_rate. \ - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ It canbe a float value as coeff of L2 regularization or \ :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 5d53593c2e039c..1799461254ced5 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -18,6 +18,7 @@ import numpy as np import paddle +import paddle.autograd as imperative_base from paddle import _C_ops from paddle.fluid import core from paddle.fluid.framework import ( @@ -32,12 +33,6 @@ from ..fluid import framework, unique_name from ..fluid.backward import _get_no_grad_set_name, append_backward -from ..fluid.clip import ( - GradientClipBase, - append_gradient_clip_ops, - error_clip_callback, -) -from ..fluid.dygraph import base as imperative_base from ..fluid.framework import Parameter, program_guard from ..fluid.initializer import Constant from ..fluid.layer_helper import LayerHelper @@ -109,7 +104,7 @@ class Optimizer: different parameter groups such as the learning rate, weight decay, etc, \ then the parameters are list of dict. Note that the learning_rate in paramter groups \ represents the scale of base learning_rate. \ - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ It canbe a float value as coeff of L2 regularization or \ :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. 
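The ``parameters`` docstrings repeated across these optimizers all mention the "list of dict" parameter-group form; a short hedged sketch of what that looks like in practice, where a per-group ``learning_rate`` scales the base rate as described above:

.. code-block:: python

    import paddle

    linear1 = paddle.nn.Linear(10, 10)
    linear2 = paddle.nn.Linear(10, 1)

    # Two parameter groups: the second trains at 0.1x the base learning rate
    # and carries its own weight_decay.
    opt = paddle.optimizer.Momentum(
        learning_rate=0.01,
        momentum=0.9,
        parameters=[
            {'params': linear1.parameters()},
            {'params': linear2.parameters(),
             'learning_rate': 0.1,
             'weight_decay': 0.001},
        ],
    )

    out = linear2(paddle.nn.functional.relu(linear1(paddle.rand([4, 10]))))
    loss = paddle.mean(out)
    loss.backward()
    opt.step()
    opt.clear_grad()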
@@ -168,7 +163,7 @@ class Optimizer: """ - @imperative_base.no_grad + @imperative_base.no_grad() def __init__( self, learning_rate, @@ -225,7 +220,7 @@ def __init__( % type(learning_rate) ) if grad_clip is not None: - if not isinstance(grad_clip, GradientClipBase): + if not isinstance(grad_clip, paddle.nn.clip.GradientClipBase): raise TypeError( "'grad_clip' should be an instance of GradientClipBase's derived class" ) @@ -1042,7 +1037,7 @@ def backward( params_grads.append((parameter_list[index], grad)) else: if callbacks is None: - callbacks = [error_clip_callback] + callbacks = [paddle.nn.clip.error_clip_callback] else: assert isinstance(callbacks, list) program = loss.block.program @@ -1103,7 +1098,7 @@ def apply_gradients(self, params_grads): params_grads = self._grad_clip(params_grads) else: - params_grads = append_gradient_clip_ops(params_grads) + params_grads = paddle.nn.clip.append_gradient_clip_ops(params_grads) # Add regularization if any params_grads = self.append_regularization_ops( @@ -1317,7 +1312,7 @@ def clear_grad(self, set_to_zero=True): else: core.clear_gradients(param_list, set_to_zero) - @imperative_base.no_grad + @imperative_base.no_grad() def minimize( self, loss, startup_program=None, parameters=None, no_grad_set=None ): @@ -1380,7 +1375,7 @@ def minimize( return optimize_ops, params_grads - @imperative_base.no_grad + @imperative_base.no_grad() @framework.dygraph_only def step(self): """ diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index 460c5e00ed227e..855082eae5f8f8 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -86,7 +86,7 @@ class RMSProp(Optimizer): different parameter groups such as the learning rate, weight decay, etc, then the parameters are list of dict. Note that the learning_rate in paramter groups represents the scale of base learning_rate. - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. It canbe a float value as coeff of L2 regularization or \ :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py index db85080834ccd6..c188cd15a8c3af 100644 --- a/python/paddle/optimizer/sgd.py +++ b/python/paddle/optimizer/sgd.py @@ -39,7 +39,7 @@ class SGD(Optimizer): It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001. parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. + The default value is None in static graph mode, at this time all parameters will be updated. weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ It canbe a float value as coeff of L2 regularization or \ :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. diff --git a/python/paddle/quantization/__init__.py b/python/paddle/quantization/__init__.py index d5c5806e80da22..8b7f9769e81ba2 100644 --- a/python/paddle/quantization/__init__.py +++ b/python/paddle/quantization/__init__.py @@ -12,40 +12,41 @@ # See the License for the specific language governing permissions and # limitations under the License. 
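The hunks that follow re-point paddle.quantization at the relocated imperative package (moved out of paddle/fluid/contrib/slim/quantization). With that layout, the names re-exported by the rewritten __init__.py below can be imported directly from the top-level package; a hedged sketch of the new-style imports:

.. code-block:: python

    # New-style imports after the package move; the names match those
    # re-exported by python/paddle/quantization/__init__.py below.
    from paddle.quantization import (
        ImperativePTQ,
        ImperativeQuantAware,
        PTQConfig,
        default_ptq_config,
        KLQuantizer,
        PerChannelAbsmaxQuantizer,
    )

    # The old spelling went through the fluid contrib path, e.g.
    #   from paddle.fluid.contrib.slim.quantization.imperative.ptq import ImperativePTQ
    # which is what the import rewrites below retire.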
-from ..fluid.contrib.slim.quantization.imperative.ptq_config import ( +from .imperative.ptq_config import ( PTQConfig, default_ptq_config, ) -from ..fluid.contrib.slim.quantization.imperative.ptq_quantizer import ( +from .imperative.ptq_quantizer import ( BaseQuantizer, ) -from ..fluid.contrib.slim.quantization.imperative.ptq_quantizer import ( +from .imperative.ptq_quantizer import ( AbsmaxQuantizer, ) -from ..fluid.contrib.slim.quantization.imperative.ptq_quantizer import ( +from .imperative.ptq_quantizer import ( PerChannelAbsmaxQuantizer, ) -from ..fluid.contrib.slim.quantization.imperative.ptq_quantizer import ( +from .imperative.ptq_quantizer import ( KLQuantizer, ) -from ..fluid.contrib.slim.quantization.imperative.ptq_quantizer import ( +from .imperative.ptq_quantizer import ( HistQuantizer, ) -from ..fluid.contrib.slim.quantization.imperative.ptq_quantizer import ( +from .imperative.ptq_quantizer import ( SUPPORT_ACT_QUANTIZERS, ) -from ..fluid.contrib.slim.quantization.imperative.ptq_quantizer import ( +from .imperative.ptq_quantizer import ( SUPPORT_WT_QUANTIZERS, ) -from ..fluid.contrib.slim.quantization.imperative.ptq_registry import ( +from .imperative.ptq_registry import ( PTQRegistry, ) -from ..fluid.contrib.slim.quantization.imperative.ptq import ImperativePTQ -from ..fluid.contrib.slim.quantization.imperative.qat import ( +from .imperative.ptq import ( + ImperativePTQ, +) +from .imperative.qat import ( ImperativeQuantAware, ) - from .config import QuantConfig from .base_quanter import BaseQuanter from .factory import quanter diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/__init__.py b/python/paddle/quantization/imperative/__init__.py similarity index 61% rename from python/paddle/fluid/contrib/slim/quantization/imperative/__init__.py rename to python/paddle/quantization/imperative/__init__.py index 2f0d3480ff1c89..98a0a0b4f8c146 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/__init__.py +++ b/python/paddle/quantization/imperative/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,23 +13,24 @@ # limitations under the License. from . import qat -from .qat import * +from .qat import ImperativeQuantAware from . import ptq -from .ptq import * +from .ptq import ImperativePTQ from . import ptq_config -from .ptq_config import * +from .ptq_config import PTQConfig, default_ptq_config from . import ptq_quantizer -from .ptq_quantizer import * +from .ptq_quantizer import ( + BaseQuantizer, + AbsmaxQuantizer, + PerChannelAbsmaxQuantizer, + KLQuantizer, + HistQuantizer, + SUPPORT_ACT_QUANTIZERS, + SUPPORT_WT_QUANTIZERS, +) from . 
import ptq_registry -from .ptq_registry import * - -__all__ = [] -__all__ += qat.__all__ -__all__ += ptq.__all__ -__all__ += ptq_config.__all__ -__all__ += ptq_quantizer.__all__ -__all__ += ptq_registry.__all__ +from .ptq_registry import PTQRegistry diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/fuse_utils.py b/python/paddle/quantization/imperative/fuse_utils.py similarity index 98% rename from python/paddle/fluid/contrib/slim/quantization/imperative/fuse_utils.py rename to python/paddle/quantization/imperative/fuse_utils.py index 0c86c6711d773d..1276e68e56a327 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/fuse_utils.py +++ b/python/paddle/quantization/imperative/fuse_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,8 +13,10 @@ # limitations under the License. import copy + import paddle import paddle.nn as nn + from . import utils @@ -66,7 +68,7 @@ def fuse_layers(model, layers_to_fuse, inplace=False): Return fused_model(paddle.nn.Layer): The fused model. ''' - if inplace == False: + if inplace is False: model = copy.deepcopy(model) for layers in layers_to_fuse: _fuse_layers(model, layers) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py b/python/paddle/quantization/imperative/ptq.py similarity index 91% rename from python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py rename to python/paddle/quantization/imperative/ptq.py index 4b9af49d064225..e5ea882c523e90 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py +++ b/python/paddle/quantization/imperative/ptq.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,24 +12,27 @@ # See the License for the specific language governing permissions and # limitations under the License. -import logging import copy +import logging import os + import numpy as np import paddle import paddle.nn.quant.quant_layers as quant_layers -from paddle.fluid.log_helper import get_logger -from paddle.jit.translated_layer import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX - -from . import fuse_utils -from . import utils -from . import ptq_hooks -from . import ptq_config -from . import ptq_quantizer + +from ...static.log_helper import get_logger +from ...static.quantization.utils import ( + _get_input_name_index, + _get_op_input_var_names, + _get_op_output_var_names, + _get_output_name_index, +) +from . 
import fuse_utils, ptq_config, ptq_hooks, ptq_quantizer, utils from .ptq_registry import PTQRegistry -__all__ = ['ImperativePTQ'] +INFER_MODEL_SUFFIX = ".pdmodel" +INFER_PARAMS_SUFFIX = ".pdiparams" _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' @@ -165,8 +168,8 @@ def save_quantized_model(self, model, path, input_spec=None, **config): infer_program, feed_target_names, fetch_targets, - ] = paddle.fluid.io.load_inference_model( - dirname=dirname, + ] = paddle.static.load_inference_model( + path_prefix=dirname, executor=exe, model_filename=model_filename, params_filename=params_filename, @@ -178,14 +181,23 @@ def save_quantized_model(self, model, path, input_spec=None, **config): self._remove_scale_op(infer_program) # Save final program - paddle.fluid.io.save_inference_model( - dirname=dirname, - feeded_var_names=feed_target_names, - target_vars=fetch_targets, + model_name = None + if model_filename is None: + model_name = "model" + elif model_filename.endswith(".pdmodel"): + model_name = model_filename.rsplit(".", 1)[0] + else: + model_name = model_filename + path_prefix = os.path.join(dirname, model_name) + feed_vars = [ + infer_program.global_block().var(name) for name in feed_target_names + ] + paddle.static.save_inference_model( + path_prefix, + feed_vars, + fetch_targets, executor=exe, - main_program=infer_program.clone(), - model_filename=model_filename, - params_filename=params_filename, + program=infer_program.clone(), ) if is_dynamic_mode: @@ -302,7 +314,7 @@ def _wrap_simulated_layers(self, model): ) and PTQRegistry.is_simulated_quant_layer(sub_layer): quant_config = sub_layer._quant_config - assert quant_config.enable_in_act_quantizer == True + assert quant_config.enable_in_act_quantizer is True wt_quantizer = quant_config.wt_quantizer in_act_quantizer = quant_config.in_act_quantizer @@ -376,7 +388,7 @@ def _gather_input_thresholds(self, program, scope): None """ for op in utils.program_all_ops(program): - for in_var_name in utils._get_op_input_var_names(op): + for in_var_name in _get_op_input_var_names(op): previous_op = utils.find_previous_op(op.block, in_var_name) if previous_op is None: continue @@ -388,20 +400,16 @@ def _gather_input_thresholds(self, program, scope): attr_name = previous_op.output('OutScale')[0] in_threshold = utils.load_variable_data(scope, attr_name) in_threshold = utils.fp_numpy_to_naive(in_threshold) - argname, index = utils._get_input_name_index( - op, in_var_name - ) + argname, index = _get_input_name_index(op, in_var_name) op._set_attr( argname + str(index) + "_threshold", in_threshold ) op._set_attr("with_quant_attr", True) else: - for out_var_name in utils._get_op_output_var_names( - previous_op - ): + for out_var_name in _get_op_output_var_names(previous_op): if out_var_name != in_var_name: continue - argname, index = utils._get_output_name_index( + argname, index = _get_output_name_index( previous_op, out_var_name ) attr_name = argname + str(index) + "_threshold" @@ -409,9 +417,7 @@ def _gather_input_thresholds(self, program, scope): continue threshold = previous_op.attr(attr_name) - argname, index = utils._get_input_name_index( - op, in_var_name - ) + argname, index = _get_input_name_index(op, in_var_name) attr_name = argname + str(index) + "_threshold" op._set_attr(attr_name, threshold) op._set_attr("with_quant_attr", True) @@ -453,10 +459,10 @@ def _helper(op, next_op, old_attr_name, new_attr_name): continue next_op = next_ops[0] - argname, index = utils._get_output_name_index(op, out_var_name) + 
argname, index = _get_output_name_index(op, out_var_name) old_attr_name = argname + str(index) + "_threshold" - argname, index = utils._get_output_name_index( + argname, index = _get_output_name_index( next_op, next_op.output("Out")[0] ) new_attr_name = argname + str(index) + "_threshold" @@ -478,7 +484,7 @@ def _remove_scale_op(self, program): @staticmethod def _is_skip_layer(layer): - return hasattr(layer, "skip_quant") and layer.skip_quant == True + return hasattr(layer, "skip_quant") and layer.skip_quant is True @staticmethod def _is_quant_layer(layer): diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_config.py b/python/paddle/quantization/imperative/ptq_config.py similarity index 89% rename from python/paddle/fluid/contrib/slim/quantization/imperative/ptq_config.py rename to python/paddle/quantization/imperative/ptq_config.py index 88eb998c0e071a..986b813324fc13 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_config.py +++ b/python/paddle/quantization/imperative/ptq_config.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,14 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -import abc import copy -import paddle - -from .ptq_quantizer import * - -__all__ = ['PTQConfig', 'default_ptq_config'] +from .ptq_quantizer import ( + SUPPORT_ACT_QUANTIZERS, + SUPPORT_WT_QUANTIZERS, + KLQuantizer, + PerChannelAbsmaxQuantizer, +) class PTQConfig: diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_hooks.py b/python/paddle/quantization/imperative/ptq_hooks.py similarity index 84% rename from python/paddle/fluid/contrib/slim/quantization/imperative/ptq_hooks.py rename to python/paddle/quantization/imperative/ptq_hooks.py index 319beee0ed73b5..1917320412973c 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_hooks.py +++ b/python/paddle/quantization/imperative/ptq_hooks.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,12 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle -import math -import numpy as np -from . import ptq_config -from .ptq_registry import PTQRegistry - def quant_forward_post_hook(layer, inputs, outputs): """ diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_quantizer.py b/python/paddle/quantization/imperative/ptq_quantizer.py similarity index 96% rename from python/paddle/fluid/contrib/slim/quantization/imperative/ptq_quantizer.py rename to python/paddle/quantization/imperative/ptq_quantizer.py index a18e6a306cc945..41b0be44a75302 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_quantizer.py +++ b/python/paddle/quantization/imperative/ptq_quantizer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
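As rewritten above, ptq_config.py now imports only the quantizers it actually uses. A hedged sketch of building a custom configuration next to default_ptq_config; the parameter names activation_quantizer and weight_quantizer are assumed from the existing PTQConfig interface and are not shown in this hunk:

.. code-block:: python

    from paddle.quantization import (
        PTQConfig,
        HistQuantizer,
        PerChannelAbsmaxQuantizer,
        default_ptq_config,  # the stock config exported next to PTQConfig
    )

    # Assumed constructor shape: one activation quantizer plus one weight
    # quantizer, validated against SUPPORT_ACT_QUANTIZERS / SUPPORT_WT_QUANTIZERS.
    my_config = PTQConfig(
        activation_quantizer=HistQuantizer(),
        weight_quantizer=PerChannelAbsmaxQuantizer(),
    )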
@@ -13,24 +13,14 @@ # limitations under the License. import abc -import copy import math + import numpy as np import paddle +from ...static.quantization.cal_kl_threshold import cal_kl_threshold from . import utils -from ..cal_kl_threshold import cal_kl_threshold - -__all__ = [ - 'BaseQuantizer', - 'AbsmaxQuantizer', - 'PerChannelAbsmaxQuantizer', - 'KLQuantizer', - 'HistQuantizer', - 'SUPPORT_ACT_QUANTIZERS', - 'SUPPORT_WT_QUANTIZERS', -] def abs_max_value(tensor): diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_registry.py b/python/paddle/quantization/imperative/ptq_registry.py similarity index 98% rename from python/paddle/fluid/contrib/slim/quantization/imperative/ptq_registry.py rename to python/paddle/quantization/imperative/ptq_registry.py index d8df91f78fb8ba..d7a4a882f080df 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_registry.py +++ b/python/paddle/quantization/imperative/ptq_registry.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,8 +14,6 @@ import paddle -__all__ = ['PTQRegistry'] - class LayerInfo: """ diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/quantization/imperative/qat.py similarity index 91% rename from python/paddle/fluid/contrib/slim/quantization/imperative/qat.py rename to python/paddle/quantization/imperative/qat.py index 1c34af18c69147..06c66ae07315b7 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/quantization/imperative/qat.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,35 +12,27 @@ # See the License for the specific language governing permissions and # limitations under the License. -import collections -import logging -import numpy as np -import sys import os -import warnings import paddle -import paddle.nn as nn import paddle.nn.quant.quant_layers as quant_layers -from paddle.fluid import dygraph, core, framework, unique_name from paddle.fluid.framework import IrGraph -from paddle.fluid.executor import Executor, global_scope -from paddle.fluid.param_attr import ParamAttr -from paddle.fluid.initializer import Constant -from paddle.jit.translated_layer import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX -from paddle.fluid.io import load_inference_model, save_inference_model -from ..quantization_pass import ReplaceFakeQuantDequantPass, QuantWeightPass -from paddle.fluid.log_helper import get_logger -from .. import quantization_pass -from ..utils import move_persistable_var_to_global_block -from . import utils -from . import fuse_utils - -__all__ = ['ImperativeQuantAware'] - -_logger = get_logger( - __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' +from paddle.framework import core + +from ...static.quantization.quantization_pass import ( + QuantWeightPass, + ReplaceFakeQuantDequantPass, +) +from ...static.quantization.utils import ( + _get_input_name_index, + _get_op_input_var_names, + _get_output_name_index, + move_persistable_var_to_global_block, ) +from . 
import fuse_utils, utils + +INFER_MODEL_SUFFIX = ".pdmodel" +INFER_PARAMS_SUFFIX = ".pdiparams" def lazy_import_fleet(layer_name_map, fake_quant_input_layers): @@ -147,7 +139,7 @@ def __init__( .. code-block:: python import paddle - from paddle.fluid.contrib.slim.quantization \ + from paddle.static.quantization \ import ImperativeQuantAware from paddle.vision.models \ import resnet @@ -178,7 +170,7 @@ def __init__( .. code-block:: python import paddle - from paddle.fluid.contrib.slim.quantization \ + from paddle.static.quantization \ import ImperativeQuantAware class ImperativeModel(paddle.nn.Layer): @@ -256,7 +248,7 @@ def quantize(self, model): .. code-block:: python import paddle - from paddle.fluid.contrib.slim.quantization \ + from paddle.static.quantization \ import ImperativeQuantAware class ImperativeModel(paddle.nn.Layer): @@ -288,8 +280,8 @@ def forward(self, inputs): imperative_qat.quantize(model) """ assert isinstance( - model, dygraph.Layer - ), "The model must be the instance of dygraph.Layer." + model, paddle.nn.Layer + ), "The model must be the instance of paddle.nn.Layer." if self.fuse_conv_bn: fuse_utils.fuse_conv_bn(model) @@ -376,7 +368,7 @@ def __init__( ), "activation_bits should be 1, 2,... or 16." layer_check = lambda method: method is None or issubclass( - method, dygraph.layers.Layer + method, paddle.nn.Layer ) assert layer_check( weight_preprocess_layer @@ -417,13 +409,13 @@ def apply(self, model): """ assert isinstance( - model, dygraph.Layer - ), "The model must be the instance of dygraph.Layer." + model, paddle.nn.Layer + ), "The model must be the instance of paddle.nn.Layer." for name, cur_layer in model.named_sublayers(): if not isinstance(cur_layer, self._quantizable_layer_type) or ( hasattr(cur_layer, "skip_quant") - and cur_layer.skip_quant == True + and cur_layer.skip_quant is True ): continue @@ -480,8 +472,8 @@ def apply(self, model): None """ assert isinstance( - model, dygraph.Layer - ), "The model must be the instance of dygraph.Layer." + model, paddle.nn.Layer + ), "The model must be the instance of paddle.nn.Layer." for cur_name, cur_layer in model.named_sublayers(): if '_act_preprocess' in cur_name: @@ -535,8 +527,8 @@ def save_quantized_model(self, model, path, input_spec=None, **config): None """ assert isinstance( - model, dygraph.Layer - ), "The model must be the instance of dygraph.Layer." + model, paddle.nn.Layer + ), "The model must be the instance of paddle.nn.Layer." 
paddle.jit.save(layer=model, path=path, input_spec=input_spec, **config) @@ -546,8 +538,8 @@ def save_quantized_model(self, model, path, input_spec=None, **config): paddle.enable_static() place = core.CPUPlace() - scope = global_scope() - exe = Executor(place) + scope = paddle.static.global_scope() + exe = paddle.static.Executor(place) dirname = os.path.dirname(path) basename = os.path.basename(path) @@ -558,8 +550,8 @@ def save_quantized_model(self, model, path, input_spec=None, **config): infer_program, feed_target_names, fetch_targets, - ] = load_inference_model( - dirname=dirname, + ] = paddle.static.load_inference_model( + dirname, executor=exe, model_filename=model_filename, params_filename=params_filename, @@ -600,14 +592,23 @@ def save_quantized_model(self, model, path, input_spec=None, **config): move_persistable_var_to_global_block(infer_program) - save_inference_model( - dirname=dirname, - feeded_var_names=feed_target_names, - target_vars=fetch_targets, + model_name = None + if model_filename is None: + model_name = "model" + elif model_filename.endswith(".pdmodel"): + model_name = model_filename.rsplit(".", 1)[0] + else: + model_name = model_filename + path_prefix = os.path.join(dirname, model_name) + feed_vars = [ + infer_program.global_block().var(name) for name in feed_target_names + ] + paddle.static.save_inference_model( + path_prefix, + feed_vars, + fetch_targets, executor=exe, - main_program=infer_program.clone(), - model_filename=model_filename, - params_filename=params_filename, + program=infer_program.clone(), clip_extra=clip_extra, ) @@ -619,7 +620,7 @@ def _is_target_layer(self, layer): Whether the layer needs to calculate output scales. """ # exclude fake_quant ops in quant_layers file - if not isinstance(layer, dygraph.Layer): + if not isinstance(layer, paddle.nn.Layer): return False if self._onnx_format: @@ -660,7 +661,7 @@ def _gather_input_scale(): target_ops.append(op) for op in target_ops: - for in_var_name in utils._get_op_input_var_names(op): + for in_var_name in _get_op_input_var_names(op): previous_op = utils.find_previous_op(op.block, in_var_name) if previous_op is not None and ( @@ -670,9 +671,7 @@ def _gather_input_scale(): scale_name = previous_op.output('OutScale')[0] in_scale = utils.load_variable_data(scope, scale_name) in_scale = utils.fp_numpy_to_naive(in_scale) - argname, index = utils._get_input_name_index( - op, in_var_name - ) + argname, index = _get_input_name_index(op, in_var_name) op._set_attr( argname + str(index) + "_threshold", in_scale ) @@ -697,7 +696,7 @@ def _gather_output_scale(): out_scale = utils.fp_numpy_to_naive(out_scale) if previous_op.type != "feed": - res = utils._get_output_name_index(previous_op, in_var_name) + res = _get_output_name_index(previous_op, in_var_name) if res is not None: argname, index = res previous_op._set_attr( diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/quantization/imperative/utils.py similarity index 95% rename from python/paddle/fluid/contrib/slim/quantization/imperative/utils.py rename to python/paddle/quantization/imperative/utils.py index e5ed14cb9f1e17..b076b896fdeb66 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py +++ b/python/paddle/quantization/imperative/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,19 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math import numpy as np import paddle import paddle.nn.quant.quant_layers as quant_layers -from ..utils import ( - _get_op_input_var_names, - _get_op_output_var_names, - _get_output_name_index, - _get_input_name_index, -) - layer_name_map = { 'Conv2DTranspose': paddle.nn.Conv2DTranspose, 'Conv2D': paddle.nn.Conv2D, @@ -42,7 +34,6 @@ 'Softmax': paddle.nn.Softmax, 'Swish': paddle.nn.Swish, 'Tanh': paddle.nn.Tanh, - 'Hardswish': paddle.nn.Hardswish, 'BatchNorm': paddle.nn.BatchNorm, 'GroupNorm': paddle.nn.GroupNorm, 'LayerNorm': paddle.nn.LayerNorm, diff --git a/python/paddle/static/log_helper.py b/python/paddle/static/log_helper.py new file mode 100644 index 00000000000000..3bb0e8477fd3ca --- /dev/null +++ b/python/paddle/static/log_helper.py @@ -0,0 +1,53 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + + +def get_logger(name, level, fmt=None): + """ + Get logger from logging with given name, level and format without + setting logging basicConfig. For setting basicConfig in paddle + will disable basicConfig setting after import paddle. + + Args: + name (str): The logger name. + level (logging.LEVEL): The base level of the logger + fmt (str): Format of logger output + + Returns: + logging.Logger: logging logger with given settings + + Examples: + .. code-block:: python + import paddle + import logging + logger = paddle.static.log_helper.get_logger(__name__, logging.INFO, + fmt='%(asctime)s-%(levelname)s: %(message)s') + """ + + logger = logging.getLogger(name) + logger.setLevel(level) + handler = logging.StreamHandler() + + if fmt: + formatter = logging.Formatter(fmt=fmt, datefmt='%a %b %d %H:%M:%S') + handler.setFormatter(formatter) + + logger.addHandler(handler) + + # stop propagate for propagating may print + # log multiple times + logger.propagate = False + return logger diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index c8e8e5455c7b3c..f68783cbb5d34b 100644 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -1593,7 +1593,7 @@ def is_list_or_tuple(ele): output_size ): raise ValueError( - "filter_size should not be None when output_size is Tensor or contain Tensor in static mode." + "filter_size should not be None when output_size is Tensor or contain Tensor in static graph mode." 
) else: output_size = utils.convert_shape_to_list(output_size) diff --git a/python/paddle/static/nn/control_flow.py b/python/paddle/static/nn/control_flow.py index 2c9b2dd2937381..d21d95b097e3b6 100644 --- a/python/paddle/static/nn/control_flow.py +++ b/python/paddle/static/nn/control_flow.py @@ -888,10 +888,10 @@ def cond(pred, true_fn=None, false_fn=None, name=None, return_names=None): the same shape because of dataflow model of PaddlePaddle while the tensors in the tuples or the lists can have different shapes. - 2. This API could be used under both static mode or dygraph mode. If it + 2. This API could be used under both static graph mode or dygraph mode. If it is in dygraph mode, the API only runs one branch based on condition. - 3. If it is in static mode, any tensors or operations created outside + 3. If it is in static graph mode, any tensors or operations created outside or inside of ``true_fn`` and ``false_fn`` will be in net building regardless of which branch is selected at runtime. This has frequently surprised users who expected a lazy semantics. For example: diff --git a/python/paddle/static/quantization/__init__.py b/python/paddle/static/quantization/__init__.py index 3a1a7549f83027..28d76d8cd67723 100644 --- a/python/paddle/static/quantization/__init__.py +++ b/python/paddle/static/quantization/__init__.py @@ -12,50 +12,55 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ...fluid.contrib.slim.quantization.quantization_pass import ( +from .quantization_pass import ( QuantizationTransformPass, ) -from ...fluid.contrib.slim.quantization.quantization_pass import ( +from .quantization_pass import ( QuantizationFreezePass, ) -from ...fluid.contrib.slim.quantization.quantization_pass import ( +from .quantization_pass import ( ConvertToInt8Pass, ) -from ...fluid.contrib.slim.quantization.quantization_pass import ( +from .quantization_pass import ( TransformForMobilePass, ) -from ...fluid.contrib.slim.quantization.quantization_pass import ( +from .quantization_pass import ( OutScaleForTrainingPass, ) -from ...fluid.contrib.slim.quantization.quantization_pass import ( +from .quantization_pass import ( OutScaleForInferencePass, ) -from ...fluid.contrib.slim.quantization.quantization_pass import ( +from .quantization_pass import ( AddQuantDequantPass, ) -from ...fluid.contrib.slim.quantization.quantization_pass import ( +from .quantization_pass import ( ReplaceFakeQuantDequantPass, ) -from ...fluid.contrib.slim.quantization.quantization_pass import QuantWeightPass -from ...fluid.contrib.slim.quantization.quantization_pass import ( +from .quantization_pass import ( + QuantWeightPass, +) +from .quantization_pass import ( QuantizationTransformPassV2, ) -from ...fluid.contrib.slim.quantization.quantization_pass import ( +from .quantization_pass import ( AddQuantDequantPassV2, ) -from ...fluid.contrib.slim.quantization.quant_int8_mkldnn_pass import ( +from .quantization_pass import ( + AddQuantDequantForInferencePass, +) +from .quant_int8_mkldnn_pass import ( QuantInt8MkldnnPass, ) -from ...fluid.contrib.slim.quantization.quant2_int8_mkldnn_pass import ( +from .quant2_int8_mkldnn_pass import ( Quant2Int8MkldnnPass, ) -from ...fluid.contrib.slim.quantization.post_training_quantization import ( +from .post_training_quantization import ( PostTrainingQuantization, ) -from ...fluid.contrib.slim.quantization.post_training_quantization import ( +from .post_training_quantization import ( PostTrainingQuantizationProgram, ) -from 
...fluid.contrib.slim.quantization.post_training_quantization import ( +from .post_training_quantization import ( WeightQuantization, ) diff --git a/python/paddle/fluid/contrib/slim/quantization/adaround.py b/python/paddle/static/quantization/adaround.py similarity index 91% rename from python/paddle/fluid/contrib/slim/quantization/adaround.py rename to python/paddle/static/quantization/adaround.py index f4bbccd7f1d86d..98934d2e8e4e33 100644 --- a/python/paddle/fluid/contrib/slim/quantization/adaround.py +++ b/python/paddle/static/quantization/adaround.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,25 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np -import time -import sys import logging -import paddle +import sys +import time + +import numpy as np import paddle -import paddle.fluid as fluid +import paddle.static as static -from ....log_helper import get_logger +from ..log_helper import get_logger from .utils import ( + _channelwise_quant_axis1_ops, + bias_correction_w, + calculate_quant_cos_error, + dequant_tensor, load_variable_data, + quant_tensor, set_variable_data, stable_sigmoid, - quant_tensor, - dequant_tensor, - _channelwise_quant_axis1_ops, - calculate_quant_cos_error, - bias_correction_w, ) _logger = get_logger( @@ -42,7 +42,7 @@ def compute_soft_rounding(alpha_v): - return fluid.layers.clip( + return paddle.clip( paddle.nn.functional.sigmoid(alpha_v) * (ZETA - GAMMA) + GAMMA, min=0, max=1, @@ -83,11 +83,9 @@ def round_loss_fn(): return round_loss - round_loss = paddle.static.nn.cond( + round_loss = static.nn.cond( warm_start, - lambda: fluid.layers.fill_constant( - shape=[1], dtype='float32', value=0.0 - ), + lambda: paddle.full(shape=[1], dtype='float32', fill_value=0.0), round_loss_fn, ) @@ -151,7 +149,7 @@ def initialize_alpha(self, tensor, scale, var_name): shape=alpha.shape, dtype="float32", name=var_name + ".alpha", - default_initializer=fluid.initializer.NumpyArrayInitializer(alpha), + default_initializer=paddle.nn.initializer.Assign(alpha), ) def _calculate_output_with_adarounded_weights( @@ -258,12 +256,12 @@ def run_adaround( fetch_op_name = quant_op_out_name # build adaround program - exec_strategy = fluid.ExecutionStrategy() + exec_strategy = static.ExecutionStrategy() exec_strategy.num_iteration_per_drop_scope = 1 - startup_program = fluid.Program() - train_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - with fluid.unique_name.guard(): + startup_program = static.Program() + train_program = static.Program() + with static.program_guard(train_program, startup_program): + with paddle.utils.unique_name.guard(): # initialize adaround adaround = AdaRound( scale, @@ -273,21 +271,21 @@ def run_adaround( weight_op_type=weight_op_type, num_iterations=num_iterations, ) - orig_out_tensor = fluid.data( + orig_out_tensor = static.data( name='orig_out_tensor', - shape=fp32_fetch_list.shape, + shape=(-1,) + fp32_fetch_list.shape, dtype='float32', ) - adaround_out_tensor = fluid.data( + adaround_out_tensor = static.data( name='adaround_out_tensor', - shape=fp32_fetch_list.shape, + shape=(-1,) + fp32_fetch_list.shape, dtype='float32', ) - beta_tensor = fluid.data( - name='beta', shape=[1], dtype='float32' + beta_tensor = 
static.data( + name='beta', shape=[-1, 1], dtype='float32' ) - warm_start_tensor = fluid.data( - name='warm_start', shape=[1], dtype='bool' + warm_start_tensor = static.data( + name='warm_start', shape=[-1, 1], dtype='bool' ) train_fetches_loss = adaround.get_loss( @@ -296,7 +294,7 @@ def run_adaround( adaround_out_tensor, orig_out_tensor, ) - optimizer = fluid.optimizer.Adam(learning_rate=lr) + optimizer = paddle.optimizer.Adam(learning_rate=lr) loss = train_fetches_loss['loss'] optimizer.minimize(loss) exe.run(startup_program) diff --git a/python/paddle/fluid/contrib/slim/quantization/cal_kl_threshold.py b/python/paddle/static/quantization/cal_kl_threshold.py similarity index 97% rename from python/paddle/fluid/contrib/slim/quantization/cal_kl_threshold.py rename to python/paddle/static/quantization/cal_kl_threshold.py index ea3b1876f23462..c8101a892cfdec 100644 --- a/python/paddle/fluid/contrib/slim/quantization/cal_kl_threshold.py +++ b/python/paddle/static/quantization/cal_kl_threshold.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,15 +14,15 @@ import logging import math + import numpy as np -from ....log_helper import get_logger + +from ..log_helper import get_logger _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' ) -__all__ = ['cal_kl_threshold'] - def expand_quantized_bins(quantized_bins, reference_bins): ''' diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/static/quantization/post_training_quantization.py similarity index 96% rename from python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py rename to python/paddle/static/quantization/post_training_quantization.py index 5ed3be2622ae50..0ab859998dcb99 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/static/quantization/post_training_quantization.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,43 +12,37 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging import os -import re -import math import shutil -import logging + import numpy as np try: from tqdm import tqdm except: from .utils import tqdm + from inspect import isgeneratorfunction -from .... import io -from .... import core -from .... import reader -from .... import framework -from .... import unique_name -from ....executor import global_scope, Executor -from ....framework import IrGraph -from ....log_helper import get_logger + +from paddle.fluid.framework import IrGraph, _get_var + +from ... import io, static +from ...fluid import reader +from ...framework import core +from ...utils import unique_name +from ..log_helper import get_logger +from . 
import utils +from .adaround import run_adaround +from .cal_kl_threshold import cal_kl_threshold from .quantization_pass import ( + AddQuantDequantPass, + AddQuantDequantPassV2, + QuantizationFreezePass, QuantizationTransformPass, QuantizationTransformPassV2, - QuantizationFreezePass, QuantWeightPass, - AddQuantDequantPass, - AddQuantDequantPassV2, ) -from .cal_kl_threshold import cal_kl_threshold -from .adaround import run_adaround -from . import utils - -__all__ = [ - 'PostTrainingQuantization', - 'WeightQuantization', - 'PostTrainingQuantizationProgram', -] _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' @@ -156,10 +150,10 @@ def __init__( Constructor. Args: - executor(fluid.Executor): The executor to load, run and save the + executor(static.Executor): The executor to load, run and save the quantized model. - scope(fluid.Scope, optional): The scope of the program, use it to load - and save variables. If scope=None, get scope by global_scope(). + scope(static.Scope, optional): The scope of the program, use it to load + and save variables. If scope=None, get scope by static.global_scope(). model_dir(str): The path of the fp32 model that will be quantized, and the model and params files are under the path. model_filename(str, optional): The name of file to load the inference @@ -245,10 +239,10 @@ def __init__( Examples: .. code-block:: python - import paddle.fluid as fluid - from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization + import paddle.static as static + from paddle.static.quantization import PostTrainingQuantization - exe = fluid.Executor(fluid.CPUPlace()) + exe = static.Executor(paddle.CPUPlace()) model_dir = path/to/fp32_model_params # set model_filename as None when the filename is __model__, # otherwise set it as the real filename @@ -344,7 +338,7 @@ def __init__( # Save input params self._bias_correction = bias_correction self._executor = executor - self._scope = global_scope() if scope is None else scope + self._scope = static.global_scope() if scope is None else scope self._model_dir = model_dir self._model_filename = model_filename self._params_filename = params_filename @@ -537,22 +531,29 @@ def save_quantized_model( Args: save_model_path(str): The path to save the quantized model. model_filename(str, optional): If the model_filename is None, - save the model to '__model__'. Otherwise, save the model - to the specified filename. Default: None. - params_filename(str, optional): If the params_filename is None, - save params to separted files. Otherwise, save all params - to the specified filename. + save the model to 'model.pdmodel' and 'model.pdiparams'. Otherwise, save the model to 'model_name.pdmodel' and + 'model_name.pdiparams". Default: None. 
Returns: None ''' - io.save_inference_model( - dirname=save_model_path, - model_filename=model_filename, - params_filename=params_filename, - feeded_var_names=self._feed_list, - target_vars=self._fetch_list, + model_name = None + if model_filename is None: + model_name = "model" + elif model_filename.endswith(".pdmodel"): + model_name = model_filename.rsplit(".", 1)[0] + else: + model_name = model_filename + + path_prefix = os.path.join(save_model_path, model_name) + feed_vars = [ + self._program.global_block().var(name) for name in self._feed_list + ] + static.save_inference_model( + path_prefix, + feed_vars, + self._fetch_list, executor=self._executor, - main_program=self._program, + program=self._program, clip_extra=self._clip_extra, ) _logger.info("The quantized model is saved in " + save_model_path) @@ -567,8 +568,8 @@ def _load_model_data(self): self._program, self._feed_list, self._fetch_list, - ] = io.load_inference_model( - dirname=self._model_dir, + ] = static.load_inference_model( + self._model_dir, executor=self._executor, model_filename=self._model_filename, params_filename=self._params_filename, @@ -578,7 +579,7 @@ def _load_model_data(self): self._optimize_fp32_model() feed_vars = [ - framework._get_var(str(var_name), self._program) + _get_var(str(var_name), self._program) for var_name in self._feed_list ] @@ -1632,17 +1633,17 @@ def convert_weight_to_fp16(self, save_model_dir): # Load model place = core.CPUPlace() - exe = Executor(place) - scope = global_scope() - [infer_program, feed_list, fetch_list] = io.load_inference_model( - dirname=self._model_dir, + exe = static.Executor(place) + scope = static.global_scope() + [infer_program, feed_list, fetch_list] = static.load_inference_model( + self._model_dir, executor=exe, model_filename=self._model_filename, params_filename=self._params_filename, ) # Clone and save fp16 weights - save_program = framework.Program() + save_program = static.Program() save_block = save_program.global_block() save_var_map = {} @@ -1723,10 +1724,10 @@ def _quantize_weight_to_int( """ # Load model place = core.CPUPlace() - exe = Executor(place) - scope = global_scope() - [program, feed_list, fetch_list] = io.load_inference_model( - dirname=self._model_dir, + exe = static.Executor(place) + scope = static.global_scope() + [program, feed_list, fetch_list] = static.load_inference_model( + self._model_dir, executor=exe, model_filename=self._model_filename, params_filename=self._params_filename, @@ -1758,15 +1759,22 @@ def _quantize_weight_to_int( self._weight_channel_wise_abs_max_quantization( scope, place, weight_bits, op, var_name, for_test ) - - io.save_inference_model( - dirname=save_model_dir, - feeded_var_names=feed_list, - target_vars=fetch_list, + model_name = None + if save_model_filename is None: + model_name = "model" + elif save_model_filename.endswith(".pdmodel"): + model_name = save_model_filename.rsplit(".", 1)[0] + else: + model_name = save_model_filename + + path_prefix = os.path.join(save_model_dir, model_name) + feed_vars = [program.global_block().var(name) for name in feed_list] + static.save_inference_model( + path_prefix, + feed_vars, + fetch_list, executor=exe, - main_program=program, - model_filename=save_model_filename, - params_filename=save_params_filename, + program=program, ) def _weight_abs_max_quantization( diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/static/quantization/quant2_int8_mkldnn_pass.py similarity index 99% rename from 
python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py rename to python/paddle/static/quantization/quant2_int8_mkldnn_pass.py index 89540eeedbd3ae..dd0855f0268d59 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/static/quantization/quant2_int8_mkldnn_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,11 +13,9 @@ # limitations under the License. import numpy as np -from .... import core -from ....framework import IrGraph -from ....framework import _get_paddle_place -__all__ = ['Quant2Int8MkldnnPass'] +from ...fluid.framework import IrGraph +from ...framework import _get_paddle_place, core OpRole = core.op_proto_and_checker_maker.OpRole diff --git a/python/paddle/fluid/contrib/slim/quantization/quant_int8_mkldnn_pass.py b/python/paddle/static/quantization/quant_int8_mkldnn_pass.py similarity index 93% rename from python/paddle/fluid/contrib/slim/quantization/quant_int8_mkldnn_pass.py rename to python/paddle/static/quantization/quant_int8_mkldnn_pass.py index 25278fc6913ee9..de04f66b3b8856 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant_int8_mkldnn_pass.py +++ b/python/paddle/static/quantization/quant_int8_mkldnn_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,12 +13,9 @@ # limitations under the License. import numpy as np -from .... import core -from ....framework import IrGraph -from ....framework import IrNode -from ....framework import _get_paddle_place -__all__ = ['QuantInt8MkldnnPass'] +from ...fluid.framework import IrGraph +from ...framework import _get_paddle_place class QuantInt8MkldnnPass: @@ -40,23 +37,23 @@ class QuantInt8MkldnnPass: def __init__(self, _scope=None, _place=None): r""" Args: - scope(fluid.Scope): scope is used to initialize the new parameters. - place(fluid.CPUPlace|str): place is used to initialize the new parameters. + scope(static.Scope): scope is used to initialize the new parameters. + place(static.CPUPlace|str): place is used to initialize the new parameters. When it is string, it can be only 'cpu'. Examples: .. code-block:: python # The original graph will be rewrite. 
- import paddle.fluid as fluid - from paddle.fluid.contrib.slim.quantization \ + import paddle.static as static + from paddle.static.quantization \ import QuantInt8MkldnnPass from paddle.fluid.framework import IrGraph - from paddle.fluid import core + from paddle.framework import core - graph = IrGraph(core.Graph(fluid.Program().desc), for_test=False) - place = fluid.CPUPlace() - mkldnn_pass = QuantInt8MkldnnPass(fluid.global_scope(), + graph = IrGraph(core.Graph(static.Program().desc), for_test=False) + place = static.CPUPlace() + mkldnn_pass = QuantInt8MkldnnPass(static.global_scope(), place) mkldnn_pass.apply(graph) """ diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/static/quantization/quantization_pass.py similarity index 97% rename from python/paddle/fluid/contrib/slim/quantization/quantization_pass.py rename to python/paddle/static/quantization/quantization_pass.py index 0b1e3cd96d7d60..1198d1c2cf17f5 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/static/quantization/quantization_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,39 +13,21 @@ # limitations under the License. import collections + import numpy as np try: from tqdm import tqdm except: from .utils import tqdm -from .... import core -from ....framework import IrGraph -from ....framework import IrNode -from ....framework import Operator -from .... import unique_name - -from ....framework import Program, program_guard, default_startup_program -from ....data import data -from ....executor import scope_guard -from ....framework import _get_paddle_place -from . import utils + import paddle -__all__ = [ - 'QuantizationTransformPass', - 'QuantizationFreezePass', - 'ConvertToInt8Pass', - 'TransformForMobilePass', - 'OutScaleForTrainingPass', - 'OutScaleForInferencePass', - 'AddQuantDequantPass', - 'QuantizationTransformPassV2', - 'AddQuantDequantPassV2', - 'ReplaceFakeQuantDequantPass', - 'QuantWeightPass', - 'AddQuantDequantForInferencePass', -] +from ...fluid.framework import IrGraph, IrNode +from ...framework import _get_paddle_place, core +from ...static import Program, data, program_guard, scope_guard +from ...utils import unique_name +from . import utils _fake_quant_op_list = [ 'fake_quantize_abs_max', @@ -137,10 +119,10 @@ def __init__( Constructor. Args: - scope(fluid.Scope): When activation use 'range_abs_max' as the quantize + scope(static.Scope): When activation use 'range_abs_max' as the quantize type, this pass will create some new parameters. The scope is used to initialize these new parameters. - place(fluid.CPUPlace|fluid.CUDAPlace|str): place is used to initialize new + place(static.CPUPlace|static.CUDAPlace|str): place is used to initialize new parameters described above. If it's string, It can be ``cpu``, and ``gpu:x``, where ``x`` is the index of the GPUs. weight_bits(int): quantization bit number for weights, @@ -197,15 +179,15 @@ def __init__( Examples: .. code-block:: python # The original graph will be rewrite. 
- import paddle.fluid as fluid - from paddle.fluid.contrib.slim.quantization \ + import paddle.static as static + from paddle.static.quantization \ import QuantizationTransformPass - from paddle.fluid.contrib.slim.graph import IrGraph - from paddle.fluid import core + from paddle.fluid.framework import IrGraph + from paddle.framework import core - graph = IrGraph(core.Graph(program.desc), for_test=False) - place = fluid.CPUPlace() - transform_pass = QuantizationTransformPass(fluid.global_scope(), + graph = IrGraph(core.Graph(static.Program().desc), for_test=False) + place = paddle.CPUPlace() + transform_pass = QuantizationTransformPass(static.global_scope(), place) transform_pass.apply(graph) """ @@ -1094,8 +1076,8 @@ def __init__( and weight will be scaled offline. Args: - scope(fluid.Scope): scope is used to get the weight tensor values. - place(fluid.CPUPlace|fluid.CUDAPlace|str): place is used to restore the weight tensors. + scope(static.Scope): scope is used to get the weight tensor values. + place(static.CPUPlace|static.CUDAPlace|str): place is used to restore the weight tensors. If it's string, It can be ``cpu``, and ``gpu:x``, where ``x`` is the index of the GPUs. bias_correction(bool): whether use bias correction for post-training quantization. https://arxiv.org/abs/1810.05723. @@ -1190,7 +1172,7 @@ def apply(self, graph): ) quantized_param_v = np.round(quantized_param_v) # Weight bias correction - if self._bias_correction == True: + if self._bias_correction is True: quantized_param_v = utils.bias_correction_w( param_v, quantized_param_v, @@ -1459,8 +1441,8 @@ def __init__(self, scope, place, quantizable_op_type=None): Convert the weights into int8_t type. Args: - scope(fluid.Scope): scope is used to get the weight tensor values. - place(fluid.CPUPlace|fluid.CUDAPlace|str): place is used to restore the + scope(static.Scope): scope is used to get the weight tensor values. + place(static.CPUPlace|static.CUDAPlace|str): place is used to restore the 8bits weight tensors. If it's string, It can be ``cpu``, and ``gpu:x``, where ``x`` is the index of the GPUs. quantizable_op_type(list[str]): This input param will be removed latter. The pass @@ -1602,8 +1584,8 @@ def __init__( These output scales may be used by tensorRT or some other inference engines. Args: - scope(fluid.Scope): The scope is used to initialize these new parameters. - place(fluid.CPUPlace|fluid.CUDAPlace|str): The place is used to initialize new parameters. + scope(static.Scope): The scope is used to initialize these new parameters. + place(static.CPUPlace|static.CUDAPlace|str): The place is used to initialize new parameters. If it's string, It can be ``cpu``, and ``gpu:x``, where ``x`` is the index of the GPUs. moving_rate(float): The decay coefficient of moving average. The default value is 0.9. @@ -1764,7 +1746,7 @@ def __init__(self, scope=None): These output scales may be used by tensorRT or some other inference engines. Args: - scope(fluid.Scope): The scope is used to initialize these new parameters. + scope(static.Scope): The scope is used to initialize these new parameters. """ self._scope = scope self._teller_set = utils.QUANT_SUPPORTED_OP_TYPE_LIST @@ -1856,8 +1838,8 @@ def __init__( Constructor. Args: - scope(fluid.Scope): The scope is used to initialize these new parameters. - place(fluid.CPUPlace|fluid.CUDAPlace|str): place is used to initialize new + scope(static.Scope): The scope is used to initialize these new parameters. 
+ place(static.CPUPlace|static.CUDAPlace|str): place is used to initialize new parameters described above. If ``place`` is string, it can be It can be ``cpu`` or ``gpu:x``, where ``x`` is the index of the GPUs. moving_rate(float, optional): the param for 'quant_dequant_moving_average_abs_max' @@ -2452,12 +2434,12 @@ def __init__( .. code-block:: python # The original graph will be rewrite. import paddle - from paddle.fluid.contrib.slim.quantization \ + from paddle.static.quantization \ import QuantizationTransformPassV2 - from paddle.fluid.contrib.slim.graph import IrGraph - from paddle.fluid import core + from paddle.fluid.framework import IrGraph + from paddle.framework import core - graph = IrGraph(core.Graph(program.desc), for_test=False) + graph = IrGraph(core.Graph(static.Program().desc), for_test=False) place = paddle.CPUPlace() scope = paddle.static.global_scope() transform_pass = QuantizationTransformPassV2(scope, place) @@ -2810,12 +2792,12 @@ def __init__( .. code-block:: python # The original graph will be rewrite. import paddle - from paddle.fluid.contrib.slim.quantization \ + from paddle.static.quantization \ import AddQuantDequantPassV2 - from paddle.fluid.contrib.slim.graph import IrGraph - from paddle.fluid import core + from paddle.fluid.framework import IrGraph + from paddle.framework import core - graph = IrGraph(core.Graph(program.desc), for_test=False) + graph = IrGraph(core.Graph(static.Program().desc), for_test=False) place = paddle.CPUPlace() scope = paddle.static.global_scope() add_quant_dequant_pass = AddQuantDequantPassV2(scope, place) @@ -2977,12 +2959,12 @@ def __init__(self, scope, place, quant_bits=8): .. code-block:: python # The original graph will be rewrite. import paddle - from paddle.fluid.contrib.slim.quantization \ + from paddle.static.quantization \ import ReplaceFakeQuantDequantPass - from paddle.fluid.contrib.slim.graph import IrGraph - from paddle.fluid import core + from paddle.fluid.framework import IrGraph + from paddle.framework import core - graph = IrGraph(core.Graph(program.desc), for_test=False) + graph = IrGraph(core.Graph(static.Program().desc), for_test=False) place = paddle.CPUPlace() scope = paddle.static.global_scope() replace_pass = ReplaceFakeQuantDequantPass(scope, place) @@ -3133,12 +3115,12 @@ class QuantWeightPass: .. code-block:: python # The original graph will be rewrite. import paddle - from paddle.fluid.contrib.slim.quantization \ + from paddle.static.quantization \ import QuantWeightPass - from paddle.fluid.contrib.slim.graph import IrGraph - from paddle.fluid import core + from paddle.fluid.framework import IrGraph + from paddle.framework import core - graph = IrGraph(core.Graph(program.desc), for_test=False) + graph = IrGraph(core.Graph(paddle.static.Program().desc), for_test=False) place = paddle.CPUPlace() scope = paddle.static.global_scope() quant_weight_pass = QuantWeightPass(scope, place) @@ -3207,7 +3189,7 @@ def apply(self, graph): bits_length, onnx_format=True, ) - if self._bias_correction == True: + if self._bias_correction is True: quantized_param_v = utils.bias_correction_w( param_v, quantized_param_v, @@ -3264,7 +3246,7 @@ class AddQuantDequantForInferencePass: def __init__(self, scope, place, quant_bits=8): """ Args: - scope(fluid.Scope): The scope is used to initialize these new parameters. + scope(static.Scope): The scope is used to initialize these new parameters. place(paddle.CPUPlace|paddle.CUDAPlace|str): place is used to restore the weight tensors. 
If it's string, it can be ``cpu``, and ``gpu:x``, where ``x`` is the index of the GPUs. quant_bits(int, optional): quantization bit number for weight. Default is 8. diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/static/quantization/tests/CMakeLists.txt similarity index 99% rename from python/paddle/fluid/contrib/slim/tests/CMakeLists.txt rename to python/paddle/static/quantization/tests/CMakeLists.txt index f544154a22073d..af424826024443 100755 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/static/quantization/tests/CMakeLists.txt @@ -250,7 +250,6 @@ if(WIN32) list(REMOVE_ITEM TEST_OPS test_post_training_quantization_lstm_model) list(REMOVE_ITEM TEST_OPS test_imperative_ptq) list(REMOVE_ITEM TEST_OPS test_weight_quantization_mobilenetv1) - list(REMOVE_ITEM TEST_OPS test_quantize_transpiler_v2) list(REMOVE_ITEM TEST_OPS test_imperative_qat_amp) list(REMOVE_ITEM TEST_OPS test_imperative_qat_lsq) list(REMOVE_ITEM TEST_OPS test_imperative_qat_matmul) diff --git a/python/paddle/fluid/contrib/slim/tests/README.md b/python/paddle/static/quantization/tests/README.md similarity index 95% rename from python/paddle/fluid/contrib/slim/tests/README.md rename to python/paddle/static/quantization/tests/README.md index e052cdfea84be8..ce049b45f27f35 100644 --- a/python/paddle/fluid/contrib/slim/tests/README.md +++ b/python/paddle/static/quantization/tests/README.md @@ -91,17 +91,18 @@ Having gathered all the data needed for quantization we apply the `cpu_quantize_ The code snipped shows how the `Quant2Int8MkldnnPass` can be applied to a model graph: ```python - import paddle.fluid as fluid - from paddle.fluid.contrib.slim.quantization import Quant2Int8MkldnnPass + import paddle + import paddle.static as static + from paddle.static.quantization import Quant2Int8MkldnnPass from paddle.fluid.framework import IrGraph - from paddle.fluid import core + from paddle.framework import core # Create the IrGraph by Program - graph = IrGraph(core.Graph(fluid.Program().desc), for_test=False) - place = fluid.CPUPlace() + graph = IrGraph(core.Graph(static.Program().desc), for_test=False) + place = paddle.CPUPlace() # Convert the IrGraph to MKL-DNN supported INT8 IrGraph using the # Quant2Int8MkldnnPass. 
It requires a list of operators to be quantized - mkldnn_pass = Quant2Int8MkldnnPass({'conv2d', 'pool2d'}, fluid.global_scope(), place, fluid.core, False) + mkldnn_pass = Quant2Int8MkldnnPass({'conv2d', 'pool2d'}, static.global_scope(), place, core, False) # Apply Quant2Int8MkldnnPass to IrGraph mkldnn_pass.apply(graph) @@ -263,7 +264,7 @@ The following options are also accepted: ```bash cd /PATH/TO/PADDLE -OMP_NUM_THREADS=28 FLAGS_use_mkldnn=true python python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py --quant_model=/PATH/TO/DOWNLOADED/QUANT/MODEL --fp32_model=/PATH/TO/DOWNLOADED/FP32/MODEL --infer_data=$HOME/.cache/paddle/dataset/int8/download/int8_full_val.bin --batch_size=50 --batch_num=1000 --acc_diff_threshold=0.01 --ops_to_quantize="conv2d,pool2d" +OMP_NUM_THREADS=28 FLAGS_use_mkldnn=true python python/paddle/static/quantization/slim/tests/quant2_int8_image_classification_comparison.py --quant_model=/PATH/TO/DOWNLOADED/QUANT/MODEL --fp32_model=/PATH/TO/DOWNLOADED/FP32/MODEL --infer_data=$HOME/.cache/paddle/dataset/int8/download/int8_full_val.bin --batch_size=50 --batch_num=1000 --acc_diff_threshold=0.01 --ops_to_quantize="conv2d,pool2d" ``` > Notes: Due to a large amount of images in the `int8_full_val.bin` dataset (50 000), the accuracy benchmark may last long. To accelerate accuracy measuring, it is recommended to set `OMP_NUM_THREADS` to the maximum number of physical cores available on the server. @@ -276,7 +277,7 @@ To reproduce the performance results, the environment variable `OMP_NUM_THREADS= ```bash cd /PATH/TO/PADDLE/build - python ../python/paddle/fluid/contrib/slim/tests/save_quant_model.py --quant_model_path=/PATH/TO/DOWNLOADED/QUANT/MODEL --int8_model_save_path=/PATH/TO/SAVE/QUANT/INT8/MODEL --ops_to_quantize="conv2d,pool2d" + python ../python/paddle/static/quantization/slim/tests/save_quant_model.py --quant_model_path=/PATH/TO/DOWNLOADED/QUANT/MODEL --int8_model_save_path=/PATH/TO/SAVE/QUANT/INT8/MODEL --ops_to_quantize="conv2d,pool2d" ``` 2. Run the C-API test for performance benchmark. diff --git a/python/paddle/fluid/contrib/slim/tests/__init__.py b/python/paddle/static/quantization/tests/__init__.py similarity index 89% rename from python/paddle/fluid/contrib/slim/tests/__init__.py rename to python/paddle/static/quantization/tests/__init__.py index 6d41233e227dc7..97043fd7ba6885 100644 --- a/python/paddle/fluid/contrib/slim/tests/__init__.py +++ b/python/paddle/static/quantization/tests/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/paddle/fluid/contrib/slim/tests/convert_model2dot.py b/python/paddle/static/quantization/tests/convert_model2dot.py similarity index 81% rename from python/paddle/fluid/contrib/slim/tests/convert_model2dot.py rename to python/paddle/static/quantization/tests/convert_model2dot.py index 7bb7de706bcaca..4eb5463b511e10 100644 --- a/python/paddle/fluid/contrib/slim/tests/convert_model2dot.py +++ b/python/paddle/static/quantization/tests/convert_model2dot.py @@ -1,4 +1,4 @@ -# copyright (c) 2020 paddlepaddle authors. all rights reserved. +# copyright (c) 2022 paddlepaddle authors. all rights reserved. # # licensed under the apache license, version 2.0 (the "license"); # you may not use this file except in compliance with the license. 
@@ -12,14 +12,14 @@ # see the license for the specific language governing permissions and # limitations under the license. -import unittest +import argparse import os import sys -import argparse -import paddle.fluid as fluid -from paddle.fluid.framework import IrGraph -from paddle.fluid import core +import unittest + import paddle +from paddle.fluid.framework import IrGraph +from paddle.framework import core paddle.enable_static() @@ -47,29 +47,32 @@ def parse_args(): def generate_dot_for_model(model_path, save_graph_dir, save_graph_name): - place = fluid.CPUPlace() - exe = fluid.Executor(place) - inference_scope = fluid.executor.global_scope() - with fluid.scope_guard(inference_scope): + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + inference_scope = paddle.static.global_scope() + with paddle.static.scope_guard(inference_scope): if os.path.exists(os.path.join(model_path, '__model__')): [ inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model(model_path, exe) + ] = paddle.fluid.io.load_inference_model(model_path, exe) else: [ inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model( - model_path, exe, 'model', 'params' + ] = paddle.static.load_inference_model( + model_path, + exe, + model_filename='model', + params_filename='params', ) graph = IrGraph(core.Graph(inference_program.desc), for_test=True) if not os.path.exists(save_graph_dir): os.makedirs(save_graph_dir) model_name = os.path.basename(os.path.normpath(save_graph_dir)) - if save_graph_name is '': + if save_graph_name == '': save_graph_name = model_name graph.draw(save_graph_dir, save_graph_name, graph.all_op_nodes()) print( diff --git a/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py b/python/paddle/static/quantization/tests/imperative_test_utils.py similarity index 82% rename from python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py rename to python/paddle/static/quantization/tests/imperative_test_utils.py index 744bd3690309b2..3ba7b9ffef676c 100644 --- a/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py +++ b/python/paddle/static/quantization/tests/imperative_test_utils.py @@ -11,18 +11,27 @@ # without warranties or conditions of any kind, either express or implied. # see the license for the specific language governing permissions and # limitations under the license. 
-import numpy as np import logging -import paddle -import paddle.fluid as fluid -from paddle.fluid import core -from paddle.nn import Sequential -from paddle.nn import ReLU, ReLU6, LeakyReLU, Sigmoid, Softmax, PReLU -from paddle.nn import Linear, Conv2D, Softmax, BatchNorm2D, MaxPool2D -from paddle.nn import BatchNorm1D +import numpy as np -from paddle.fluid.log_helper import get_logger +import paddle +from paddle.framework import ParamAttr +from paddle.nn import ( + BatchNorm1D, + BatchNorm2D, + Conv2D, + LeakyReLU, + Linear, + MaxPool2D, + PReLU, + ReLU, + ReLU6, + Sequential, + Sigmoid, + Softmax, +) +from paddle.static.log_helper import get_logger _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' @@ -86,18 +95,18 @@ def train_lenet(lenet, reader, optimizer): return loss_list -class ImperativeLenet(fluid.dygraph.Layer): +class ImperativeLenet(paddle.nn.Layer): def __init__(self, num_classes=10): super().__init__() - conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") - conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") - fc_w1_attr = fluid.ParamAttr(name="fc_w_1") - fc_w2_attr = fluid.ParamAttr(name="fc_w_2") - fc_w3_attr = fluid.ParamAttr(name="fc_w_3") - conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") - fc_b1_attr = fluid.ParamAttr(name="fc_b_1") - fc_b2_attr = fluid.ParamAttr(name="fc_b_2") - fc_b3_attr = fluid.ParamAttr(name="fc_b_3") + conv2d_w1_attr = ParamAttr(name="conv2d_w_1") + conv2d_w2_attr = ParamAttr(name="conv2d_w_2") + fc_w1_attr = ParamAttr(name="fc_w_1") + fc_w2_attr = ParamAttr(name="fc_w_2") + fc_w3_attr = ParamAttr(name="fc_w_3") + conv2d_b2_attr = ParamAttr(name="conv2d_b_2") + fc_b1_attr = ParamAttr(name="fc_b_1") + fc_b2_attr = ParamAttr(name="fc_b_2") + fc_b3_attr = ParamAttr(name="fc_b_3") self.features = Sequential( Conv2D( in_channels=1, @@ -155,26 +164,26 @@ def forward(self, inputs): x = self.quant_stub(inputs) x = self.features(x) - x = paddle.flatten(x, 1, -1) + x = paddle.flatten(x, 1) x = self.add(x, paddle.to_tensor(0.0)) # For CI x = self.fc(x) return x -class ImperativeLenetWithSkipQuant(fluid.dygraph.Layer): +class ImperativeLenetWithSkipQuant(paddle.nn.Layer): def __init__(self, num_classes=10): super().__init__() - conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") - conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") - fc_w1_attr = fluid.ParamAttr(name="fc_w_1") - fc_w2_attr = fluid.ParamAttr(name="fc_w_2") - fc_w3_attr = fluid.ParamAttr(name="fc_w_3") - conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1") - conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") - fc_b1_attr = fluid.ParamAttr(name="fc_b_1") - fc_b2_attr = fluid.ParamAttr(name="fc_b_2") - fc_b3_attr = fluid.ParamAttr(name="fc_b_3") + conv2d_w1_attr = ParamAttr(name="conv2d_w_1") + conv2d_w2_attr = ParamAttr(name="conv2d_w_2") + fc_w1_attr = ParamAttr(name="fc_w_1") + fc_w2_attr = ParamAttr(name="fc_w_2") + fc_w3_attr = ParamAttr(name="fc_w_3") + conv2d_b1_attr = ParamAttr(name="conv2d_b_1") + conv2d_b2_attr = ParamAttr(name="conv2d_b_2") + fc_b1_attr = ParamAttr(name="fc_b_1") + fc_b2_attr = ParamAttr(name="fc_b_2") + fc_b3_attr = ParamAttr(name="fc_b_3") self.conv2d_0 = Conv2D( in_channels=1, out_channels=6, @@ -240,8 +249,7 @@ def forward(self, inputs): x = self.relu6_0(x) x = self.pool2d_1(x) - x = paddle.flatten(x, 1, -1) - + x = paddle.flatten(x, 1) x = self.linear_0(x) x = self.leaky_relu_0(x) x = self.linear_1(x) @@ -252,7 +260,7 @@ def forward(self, inputs): return x -class ImperativeLinearBn(fluid.dygraph.Layer): +class 
ImperativeLinearBn(paddle.nn.Layer): def __init__(self): super().__init__() @@ -284,7 +292,7 @@ def forward(self, inputs): return x -class ImperativeLinearBn_hook(fluid.dygraph.Layer): +class ImperativeLinearBn_hook(paddle.nn.Layer): def __init__(self): super().__init__() diff --git a/python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py b/python/paddle/static/quantization/tests/quant2_int8_image_classification_comparison.py similarity index 96% rename from python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py rename to python/paddle/static/quantization/tests/quant2_int8_image_classification_comparison.py index dcd5d6de313316..6c5158db22379f 100644 --- a/python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py +++ b/python/paddle/static/quantization/tests/quant2_int8_image_classification_comparison.py @@ -12,19 +12,20 @@ # see the license for the specific language governing permissions and # limitations under the license. -import unittest -import os -import sys import argparse import logging +import os import struct -import numpy as np +import sys import time +import unittest + +import numpy as np + import paddle -import paddle.fluid as fluid from paddle.fluid.framework import IrGraph -from paddle.fluid.contrib.slim.quantization import Quant2Int8MkldnnPass -from paddle.fluid import core +from paddle.framework import core +from paddle.static.quantization import Quant2Int8MkldnnPass paddle.enable_static() @@ -185,23 +186,26 @@ def _predict( target='quant', ): assert target in ['quant', 'int8', 'fp32'] - place = fluid.CPUPlace() - exe = fluid.Executor(place) - inference_scope = fluid.executor.global_scope() - with fluid.scope_guard(inference_scope): + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + inference_scope = paddle.static.global_scope() + with paddle.static.scope_guard(inference_scope): if os.path.exists(os.path.join(model_path, '__model__')): [ inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model(model_path, exe) + ] = paddle.fluid.io.load_inference_model(model_path, exe) else: [ inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model( - model_path, exe, 'model', 'params' + ] = paddle.static.load_inference_model( + model_path, + exe, + model_filename='model', + params_filename='params', ) graph = IrGraph(core.Graph(inference_program.desc), for_test=True) @@ -359,7 +363,7 @@ def _ints_from_csv(self, string): return set(map(int, string.split(','))) def test_graph_transformation(self): - if not fluid.core.is_compiled_with_mkldnn(): + if not core.is_compiled_with_mkldnn(): return quant_model_path = test_case_args.quant_model diff --git a/python/paddle/fluid/contrib/slim/tests/quant2_int8_lstm_model.py b/python/paddle/static/quantization/tests/quant2_int8_lstm_model.py similarity index 92% rename from python/paddle/fluid/contrib/slim/tests/quant2_int8_lstm_model.py rename to python/paddle/static/quantization/tests/quant2_int8_lstm_model.py index 96cb22dc2e5999..aa5f184a0c7da9 100644 --- a/python/paddle/fluid/contrib/slim/tests/quant2_int8_lstm_model.py +++ b/python/paddle/static/quantization/tests/quant2_int8_lstm_model.py @@ -13,15 +13,17 @@ # limitations under the License. 
import argparse -import numpy as np import struct import sys import time import unittest -from paddle import fluid -from paddle.fluid.core import AnalysisConfig, create_paddle_predictor + +import numpy as np from save_quant_model import transform_and_save_int8_model +import paddle +from paddle.framework import core + def parse_args(): parser = argparse.ArgumentParser() @@ -80,17 +82,19 @@ def get_warmup_tensor(self, data_path, place): [len(feat) // 4 // 8, 8] ) lod_feat = [feat.shape[0]] - minputs = fluid.create_lod_tensor(feat, [lod_feat], place) + minputs = paddle.fluid.create_lod_tensor( + feat, [lod_feat], place + ) - infer_data = fluid.core.PaddleTensor() + infer_data = core.PaddleTensor() infer_data.lod = minputs.lod() - infer_data.data = fluid.core.PaddleBuf(np.array(minputs)) + infer_data.data = core.PaddleBuf(np.array(minputs)) infer_data.shape = minputs.shape() - infer_data.dtype = fluid.core.PaddleDType.FLOAT32 - infer_label = fluid.core.PaddleTensor() - infer_label.data = fluid.core.PaddleBuf(np.array(label)) + infer_data.dtype = core.PaddleDType.FLOAT32 + infer_label = core.PaddleTensor() + infer_label.data = core.PaddleBuf(np.array(label)) infer_label.shape = label.shape - infer_label.dtype = fluid.core.PaddleDType.INT32 + infer_label.dtype = core.PaddleDType.INT32 data.append([infer_data, infer_label]) warmup_data = data[:1] inputs = data[1:] @@ -105,7 +109,7 @@ def set_config( use_analysis=False, enable_ptq=False, ): - config = AnalysisConfig(model_path) + config = core.AnalysisConfig(model_path) config.set_cpu_math_library_num_threads(num_threads) if use_analysis: config.disable_gpu() @@ -132,7 +136,7 @@ def run_program( use_analysis=False, enable_ptq=False, ): - place = fluid.CPUPlace() + place = paddle.CPUPlace() warmup_data, inputs = self.get_warmup_tensor(data_path, place) warmup_data = [item[0] for item in warmup_data] config = self.set_config( @@ -144,7 +148,7 @@ def run_program( enable_ptq, ) - predictor = create_paddle_predictor(config) + predictor = core.create_paddle_predictor(config) data = [item[0] for item in inputs] label = np.array([item[1] for item in inputs]) @@ -197,7 +201,7 @@ def run_program( return hx_acc, ctc_acc, fps def test_lstm_model(self): - if not fluid.core.is_compiled_with_mkldnn(): + if not core.is_compiled_with_mkldnn(): return fp32_model = test_case_args.fp32_model diff --git a/python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py b/python/paddle/static/quantization/tests/quant2_int8_nlp_comparison.py similarity index 96% rename from python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py rename to python/paddle/static/quantization/tests/quant2_int8_nlp_comparison.py index 3b997fa0d50001..e4098e0fbe6f52 100644 --- a/python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py +++ b/python/paddle/static/quantization/tests/quant2_int8_nlp_comparison.py @@ -12,18 +12,19 @@ # see the license for the specific language governing permissions and # limitations under the license. 
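The LSTM test above now reaches the native predictor through paddle.framework.core instead of fluid.core. A rough sketch of that predictor path under the same assumptions the test makes; the model directory and input shape below are illustrative:

import numpy as np
import paddle
from paddle.framework import core

config = core.AnalysisConfig("./lstm_int8_model")  # illustrative path
config.disable_gpu()
config.set_cpu_math_library_num_threads(1)
predictor = core.create_paddle_predictor(config)

feat = np.zeros([4, 8], dtype=np.float32)          # placeholder input features
tensor = core.PaddleTensor()
tensor.shape = [4, 8]
tensor.data = core.PaddleBuf(feat)
tensor.dtype = core.PaddleDType.FLOAT32
outputs = predictor.run([tensor])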
-import unittest -import os -import sys import argparse import logging -import numpy as np +import os +import sys import time +import unittest + +import numpy as np + import paddle -import paddle.fluid as fluid from paddle.fluid.framework import IrGraph -from paddle.fluid.contrib.slim.quantization import Quant2Int8MkldnnPass -from paddle.fluid import core +from paddle.framework import core +from paddle.static.quantization import Quant2Int8MkldnnPass paddle.enable_static() @@ -158,23 +159,26 @@ def _predict( target='quant', ): assert target in ['quant', 'int8', 'fp32'] - place = fluid.CPUPlace() - exe = fluid.Executor(place) - inference_scope = fluid.executor.global_scope() - with fluid.scope_guard(inference_scope): + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + inference_scope = paddle.static.global_scope() + with paddle.static.scope_guard(inference_scope): if os.path.exists(os.path.join(model_path, '__model__')): [ inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model(model_path, exe) + ] = paddle.fluid.io.load_inference_model(model_path, exe) else: [ inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model( - model_path, exe, 'model', 'params' + ] = paddle.static.load_inference_model( + model_path, + exe, + model_filename='model', + params_filename='params', ) graph = IrGraph(core.Graph(inference_program.desc), for_test=True) @@ -296,7 +300,7 @@ def _ints_from_csv(self, string): return set(map(int, string.split(','))) def test_graph_transformation(self): - if not fluid.core.is_compiled_with_mkldnn(): + if not core.is_compiled_with_mkldnn(): return quant_model_path = test_case_args.quant_model diff --git a/python/paddle/fluid/contrib/slim/tests/quant_int8_image_classification_comparison.py b/python/paddle/static/quantization/tests/quant_int8_image_classification_comparison.py similarity index 95% rename from python/paddle/fluid/contrib/slim/tests/quant_int8_image_classification_comparison.py rename to python/paddle/static/quantization/tests/quant_int8_image_classification_comparison.py index e3aecd48c34db6..87fd0989374475 100644 --- a/python/paddle/fluid/contrib/slim/tests/quant_int8_image_classification_comparison.py +++ b/python/paddle/static/quantization/tests/quant_int8_image_classification_comparison.py @@ -12,19 +12,20 @@ # see the license for the specific language governing permissions and # limitations under the license. 
-import unittest -import os -import sys import argparse import logging +import os import struct -import numpy as np +import sys import time +import unittest + +import numpy as np + import paddle -import paddle.fluid as fluid from paddle.fluid.framework import IrGraph -from paddle.fluid.contrib.slim.quantization import QuantInt8MkldnnPass -from paddle.fluid import core +from paddle.framework import core +from paddle.static.quantization import QuantInt8MkldnnPass paddle.enable_static() @@ -163,23 +164,26 @@ def _predict( skip_batch_num=0, transform_to_int8=False, ): - place = fluid.CPUPlace() - exe = fluid.Executor(place) - inference_scope = fluid.executor.global_scope() - with fluid.scope_guard(inference_scope): + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + inference_scope = paddle.static.global_scope() + with paddle.static.scope_guard(inference_scope): if os.path.exists(os.path.join(model_path, '__model__')): [ inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model(model_path, exe) + ] = paddle.fluid.io.load_inference_model(model_path, exe) else: [ inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model( - model_path, exe, 'model', 'params' + ] = paddle.static.load_inference_model( + model_path, + exe, + model_filename='model', + params_filename='params', ) graph = IrGraph(core.Graph(inference_program.desc), for_test=True) @@ -298,7 +302,7 @@ def _compare_accuracy( assert fp32_acc1 - int8_acc1 <= threshold def test_graph_transformation(self): - if not fluid.core.is_compiled_with_mkldnn(): + if not core.is_compiled_with_mkldnn(): return quant_model_path = test_case_args.quant_model diff --git a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py b/python/paddle/static/quantization/tests/save_quant_model.py similarity index 80% rename from python/paddle/fluid/contrib/slim/tests/save_quant_model.py rename to python/paddle/static/quantization/tests/save_quant_model.py index b743615f575aa7..9c496bec22a904 100644 --- a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py +++ b/python/paddle/static/quantization/tests/save_quant_model.py @@ -12,15 +12,15 @@ # see the license for the specific language governing permissions and # limitations under the license. 
-import unittest +import argparse import os import sys -import argparse +import unittest + import paddle -import paddle.fluid as fluid from paddle.fluid.framework import IrGraph -from paddle.fluid.contrib.slim.quantization import Quant2Int8MkldnnPass -from paddle.fluid import core +from paddle.framework import core +from paddle.static.quantization import Quant2Int8MkldnnPass paddle.enable_static() @@ -93,35 +93,41 @@ def transform_and_save_int8_model( debug=False, quant_model_filename='', quant_params_filename='', - save_model_filename="__model__", + save_model_filename="model", save_params_filename=None, ): - place = fluid.CPUPlace() - exe = fluid.Executor(place) - inference_scope = fluid.executor.global_scope() - with fluid.scope_guard(inference_scope): + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + inference_scope = paddle.static.global_scope() + with paddle.static.scope_guard(inference_scope): if not quant_model_filename: if os.path.exists(os.path.join(original_path, '__model__')): [ inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model(original_path, exe) + ] = paddle.fluid.io.load_inference_model(original_path, exe) else: [ inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model( - original_path, exe, 'model', 'params' + ] = paddle.static.load_inference_model( + original_path, + exe, + model_filename='model', + params_filename='params', ) else: [ inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model( - original_path, exe, quant_model_filename, quant_params_filename + ] = paddle.static.load_inference_model( + original_path, + exe, + model_filename=quant_model_filename, + params_filename=quant_params_filename, ) ops_to_quantize_set = set() @@ -147,15 +153,18 @@ def transform_and_save_int8_model( ) graph = transform_to_mkldnn_int8_pass.apply(graph) inference_program = graph.to_program() - with fluid.scope_guard(inference_scope): - fluid.io.save_inference_model( - save_path, - feed_target_names, + with paddle.static.scope_guard(inference_scope): + path_prefix = os.path.join(save_path, save_model_filename) + feed_vars = [ + inference_program.global_block().var(name) + for name in feed_target_names + ] + paddle.static.save_inference_model( + path_prefix, + feed_vars, fetch_targets, - exe, - inference_program, - model_filename=save_model_filename, - params_filename=save_params_filename, + executor=exe, + program=inference_program, ) print( "Success! INT8 model obtained from the Quant model can be found at {}\n".format( diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph.py b/python/paddle/static/quantization/tests/test_graph.py similarity index 64% rename from python/paddle/fluid/contrib/slim/tests/test_graph.py rename to python/paddle/static/quantization/tests/test_graph.py index ed4b4f2c437475..64ec55a4e3eb90 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_graph.py +++ b/python/paddle/static/quantization/tests/test_graph.py @@ -13,12 +13,13 @@ # limitations under the license. 
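For the save path rewritten above, paddle.static.save_inference_model takes a path prefix and Variable feed targets rather than a directory plus name strings. A small self-contained sketch of the new call; the prefix and layer sizes are illustrative:

import paddle

paddle.enable_static()
exe = paddle.static.Executor(paddle.CPUPlace())

main = paddle.static.Program()
startup = paddle.static.Program()
with paddle.static.program_guard(main, startup):
    x = paddle.static.data(name="x", shape=[-1, 16], dtype="float32")
    y = paddle.static.nn.fc(x, size=4)
exe.run(startup)

# produces ./int8_model.pdmodel and ./int8_model.pdiparams next to the prefix
paddle.static.save_inference_model(
    "./int8_model", [x], [y], executor=exe, program=main
)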
import os -import numpy as np import unittest + +import numpy as np + import paddle -import paddle.fluid as fluid from paddle.fluid.framework import IrGraph -from paddle.fluid import core +from paddle.framework import core paddle.enable_static() @@ -27,63 +28,68 @@ def conv_block(): - img = fluid.layers.data(name='image', shape=[1, 28, 28], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - conv_pool_1 = fluid.nets.simple_img_conv_pool( + img = paddle.static.data( + name='image', shape=[-1, 1, 28, 28], dtype='float32' + ) + label = paddle.static.data(name='label', shape=[-1, 1], dtype='int64') + conv_out_1 = paddle.static.nn.conv2d( input=img, filter_size=5, num_filters=20, - pool_size=2, - pool_stride=2, - act="relu", + act='relu', + ) + conv_pool_1 = paddle.nn.functional.max_pool2d( + conv_out_1, kernel_size=2, stride=2 ) conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) - conv_pool_2 = fluid.nets.simple_img_conv_pool( + + conv_out_2 = paddle.static.nn.conv2d( input=conv_pool_1, filter_size=5, - num_filters=50, - pool_size=2, - pool_stride=2, - act="relu", + num_filters=20, + act='relu', + ) + conv_pool_2 = paddle.nn.functional.max_pool2d( + conv_out_2, kernel_size=2, stride=2 ) - prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') - loss = paddle.nn.functional.cross_entropy( - input=prediction, label=label, reduction='none', use_softmax=False + prediction = paddle.static.nn.fc( + x=conv_pool_2, size=10, activation='softmax' ) + loss = paddle.nn.functional.cross_entropy(input=prediction, label=label) avg_loss = paddle.mean(loss) return [img, label], avg_loss class TestGraph(unittest.TestCase): def graph_apis(self, use_cuda=False, for_ci=True): - main = fluid.Program() - startup = fluid.Program() - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.utils.unique_name.guard(): + with paddle.static.program_guard(main, startup): feeds, loss = conv_block() - opt = fluid.optimizer.Adam(learning_rate=0.001) + opt = paddle.optimizer.Adam(learning_rate=0.001) opt.minimize(loss) graph = IrGraph(core.Graph(main.desc), for_test=False) backup_graph = graph.clone() self.assertEqual(len(graph.all_nodes()), len(backup_graph.all_nodes())) - build_strategy = fluid.BuildStrategy() + build_strategy = paddle.static.BuildStrategy() build_strategy.memory_optimize = False build_strategy.enable_inplace = False - origin_binary = fluid.CompiledProgram(graph.graph).with_data_parallel( - loss_name=loss.name, build_strategy=build_strategy - ) - backup_binary = fluid.CompiledProgram( + origin_binary = paddle.static.CompiledProgram( + graph.graph + ).with_data_parallel(loss_name=loss.name, build_strategy=build_strategy) + backup_binary = paddle.static.CompiledProgram( backup_graph.graph ).with_data_parallel(loss_name=loss.name, build_strategy=build_strategy) - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + exe = paddle.static.Executor(place) exe.run(startup) iters = 5 batch_size = 8 train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=batch_size ) - feeder = fluid.DataFeeder(feed_list=feeds, place=place) + feeder = paddle.fluid.DataFeeder(feed_list=feeds, place=place) def _train(binary): for _ in range(iters): @@ -105,17 +111,29 @@ def _set_zero(var_name, scope, place): var.set(var_array, place) sum_before = np.sum( - 
np.array(fluid.global_scope().find_var('conv2d_1.w_0').get_tensor()) + np.array( + paddle.static.global_scope() + .find_var('conv2d_1.w_0') + .get_tensor() + ) ) - fluid.io._save_persistable_nodes(exe, checkponit_dir, graph) - _set_zero('conv2d_1.w_0', fluid.global_scope(), place) + paddle.fluid.io._save_persistable_nodes(exe, checkponit_dir, graph) + _set_zero('conv2d_1.w_0', paddle.static.global_scope(), place) set_after = np.sum( - np.array(fluid.global_scope().find_var('conv2d_1.w_0').get_tensor()) + np.array( + paddle.static.global_scope() + .find_var('conv2d_1.w_0') + .get_tensor() + ) ) self.assertEqual(set_after, 0) - fluid.io._load_persistable_nodes(exe, checkponit_dir, graph) + paddle.fluid.io._load_persistable_nodes(exe, checkponit_dir, graph) sum_after = np.sum( - np.array(fluid.global_scope().find_var('conv2d_1.w_0').get_tensor()) + np.array( + paddle.static.global_scope() + .find_var('conv2d_1.w_0') + .get_tensor() + ) ) self.assertEqual(sum_before, sum_after) @@ -144,7 +162,7 @@ def test_graph_apis_cpu(self): self.graph_apis(use_cuda=False, for_ci=True) def test_graph_apis_cuda(self): - if fluid.core.is_compiled_with_cuda(): + if core.is_compiled_with_cuda(): self.graph_apis(use_cuda=True, for_ci=True) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py b/python/paddle/static/quantization/tests/test_imperative_out_scale.py similarity index 75% rename from python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py rename to python/paddle/static/quantization/tests/test_imperative_out_scale.py index 45e96e7cb4ad09..ca1e6c2700cda0 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py +++ b/python/paddle/static/quantization/tests/test_imperative_out_scale.py @@ -13,38 +13,31 @@ # limitations under the license. 
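The scope accesses above keep the _set_zero/save/load logic but read through paddle.static.global_scope(). A minimal sketch of inspecting and zeroing a parameter tensor by name; the parameter name is pinned with ParamAttr purely for illustration:

import numpy as np
import paddle

paddle.enable_static()
main = paddle.static.Program()
startup = paddle.static.Program()
with paddle.static.program_guard(main, startup):
    img = paddle.static.data(name="image", shape=[-1, 1, 28, 28], dtype="float32")
    conv = paddle.static.nn.conv2d(
        input=img,
        num_filters=4,
        filter_size=3,
        param_attr=paddle.ParamAttr(name="conv_w"),  # illustrative name
    )

place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
exe.run(startup)

tensor = paddle.static.global_scope().find_var("conv_w").get_tensor()
print("sum before:", np.sum(np.array(tensor)))
tensor.set(np.zeros_like(np.array(tensor)), place)  # same idiom as _set_zero above
print("sum after:", np.sum(np.array(tensor)))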
import os -import numpy as np -import random -import unittest -import logging -import warnings import tempfile +import unittest + +import numpy as np +from imperative_test_utils import fix_model_dict, train_lenet import paddle import paddle.fluid as fluid -import paddle.fluid.layers as layers -from paddle.fluid import core -from paddle.fluid.optimizer import AdamOptimizer -from paddle.fluid.framework import IrGraph -from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware -from paddle.nn import Sequential -from paddle.jit.translated_layer import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX -from paddle.nn.layer import ReLU, LeakyReLU, Sigmoid, Softmax, PReLU -from paddle.nn import Linear, Conv2D, Softmax, BatchNorm2D, MaxPool2D -from paddle.fluid.log_helper import get_logger -from paddle.fluid.dygraph import nn - -from imperative_test_utils import fix_model_dict, train_lenet +from paddle.framework import core, set_flags +from paddle.nn import ( + BatchNorm2D, + Conv2D, + Linear, + MaxPool2D, + Sequential, + Softmax, +) +from paddle.nn.layer import LeakyReLU, PReLU, ReLU, Sigmoid +from paddle.quantization import ImperativeQuantAware paddle.enable_static() os.environ["CPU_NUM"] = "1" if core.is_compiled_with_cuda(): - fluid.set_flags({"FLAGS_cudnn_deterministic": True}) - -_logger = get_logger( - __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' -) + set_flags({"FLAGS_cudnn_deterministic": True}) def get_vaild_warning_num(warning, w): @@ -55,18 +48,18 @@ def get_vaild_warning_num(warning, w): return num -class ImperativeLenet(fluid.dygraph.Layer): +class ImperativeLenet(paddle.nn.Layer): def __init__(self, num_classes=10): super().__init__() - conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") - conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") - fc_w1_attr = fluid.ParamAttr(name="fc_w_1") - fc_w2_attr = fluid.ParamAttr(name="fc_w_2") - fc_w3_attr = fluid.ParamAttr(name="fc_w_3") - conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") - fc_b1_attr = fluid.ParamAttr(name="fc_b_1") - fc_b2_attr = fluid.ParamAttr(name="fc_b_2") - fc_b3_attr = fluid.ParamAttr(name="fc_b_3") + conv2d_w1_attr = paddle.ParamAttr(name="conv2d_w_1") + conv2d_w2_attr = paddle.ParamAttr(name="conv2d_w_2") + fc_w1_attr = paddle.ParamAttr(name="fc_w_1") + fc_w2_attr = paddle.ParamAttr(name="fc_w_2") + fc_w3_attr = paddle.ParamAttr(name="fc_w_3") + conv2d_b2_attr = paddle.ParamAttr(name="conv2d_b_2") + fc_b1_attr = paddle.ParamAttr(name="fc_b_1") + fc_b2_attr = paddle.ParamAttr(name="fc_b_2") + fc_b3_attr = paddle.ParamAttr(name="fc_b_3") self.features = Sequential( Conv2D( in_channels=1, @@ -121,7 +114,7 @@ def __init__(self, num_classes=10): def forward(self, inputs): x = self.features(inputs) - x = paddle.flatten(x, 1, -1) + x = paddle.flatten(x, 1) x = self.fc(x) return x @@ -152,8 +145,8 @@ def test_out_scale_acc(self): with fluid.dygraph.guard(): np.random.seed(seed) - fluid.default_main_program().random_seed = seed - fluid.default_startup_program().random_seed = seed + paddle.static.default_main_program().random_seed = seed + paddle.static.default_startup_program().random_seed = seed lenet = ImperativeLenet() lenet = fix_model_dict(lenet) @@ -162,8 +155,8 @@ def test_out_scale_acc(self): reader = paddle.batch( paddle.dataset.mnist.test(), batch_size=32, drop_last=True ) - adam = AdamOptimizer( - learning_rate=lr, parameter_list=lenet.parameters() + adam = paddle.optimizer.Adam( + learning_rate=lr, parameters=lenet.parameters() ) loss_list = train_lenet(lenet, reader, adam) lenet.eval() 
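Two substitutions recur through this file: fluid.set_flags becomes paddle.framework.set_flags, and AdamOptimizer(parameter_list=...) becomes paddle.optimizer.Adam(parameters=...). A compact sketch of both, with a Linear layer standing in for ImperativeLenet:

import paddle
from paddle.framework import core, set_flags

if core.is_compiled_with_cuda():
    set_flags({"FLAGS_cudnn_deterministic": True})  # deterministic cuDNN, as in the test setup

net = paddle.nn.Linear(4, 2)  # stand-in model
adam = paddle.optimizer.Adam(learning_rate=0.001, parameters=net.parameters())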
@@ -186,8 +179,8 @@ def test_out_scale_acc(self): reader = paddle.batch( paddle.dataset.mnist.test(), batch_size=32, drop_last=True ) - adam = AdamOptimizer( - learning_rate=lr, parameter_list=lenet.parameters() + adam = paddle.optimizer.Adam( + learning_rate=lr, parameters=lenet.parameters() ) loss_list = train_lenet(lenet, reader, adam) lenet.eval() diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py b/python/paddle/static/quantization/tests/test_imperative_ptq.py similarity index 95% rename from python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py rename to python/paddle/static/quantization/tests/test_imperative_ptq.py index 161700cb2f0853..32967bf9c5ce8a 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py +++ b/python/paddle/static/quantization/tests/test_imperative_ptq.py @@ -1,4 +1,4 @@ -# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# copyright (c) 2022 paddlepaddle authors. all rights reserved. # # licensed under the apache license, version 2.0 (the "license"); # you may not use this file except in compliance with the license. @@ -12,29 +12,32 @@ # see the license for the specific language governing permissions and # limitations under the license. +import logging import os -import numpy as np -import random -import shutil +import tempfile import time import unittest -import copy -import logging -import tempfile - -import paddle.nn as nn -import paddle -import paddle.fluid as fluid -from paddle.fluid.contrib.slim.quantization import * -from paddle.fluid.log_helper import get_logger -from paddle.dataset.common import download +import numpy as np from imperative_test_utils import ( - fix_model_dict, ImperativeLenet, ImperativeLinearBn, + ImperativeLinearBn_hook, ) -from imperative_test_utils import ImperativeLinearBn_hook + +import paddle +import paddle.nn as nn +from paddle.dataset.common import download +from paddle.fluid.framework import _test_eager_guard +from paddle.quantization import ( + AbsmaxQuantizer, + HistQuantizer, + ImperativePTQ, + KLQuantizer, + PerChannelAbsmaxQuantizer, + PTQConfig, +) +from paddle.static.log_helper import get_logger _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' @@ -149,8 +152,8 @@ def model_test(self, model, batch_num=-1, batch_size=8): label = paddle.to_tensor(y_data) out = model(img) - acc_top1 = paddle.static.accuracy(input=out, label=label, k=1) - acc_top5 = paddle.static.accuracy(input=out, label=label, k=5) + acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1) + acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5) eval_acc_top1_list.append(float(acc_top1.numpy())) if batch_id % 50 == 0: @@ -207,7 +210,7 @@ def program_test(self, program_path, batch_num=-1, batch_size=8): break return top1_correct_num / total_num - def test_ptq(self): + def func_ptq(self): start_time = time.time() self.set_vars() @@ -265,9 +268,14 @@ def test_ptq(self): end_time = time.time() print("total time: %ss \n" % (end_time - start_time)) + def test_ptq(self): + with _test_eager_guard(): + self.func_ptq() + self.func_ptq() + class TestImperativePTQfuse(TestImperativePTQ): - def test_ptq(self): + def func_ptq(self): start_time = time.time() self.set_vars() @@ -336,6 +344,11 @@ def test_ptq(self): end_time = time.time() print("total time: %ss \n" % (end_time - start_time)) + def test_ptq(self): + with _test_eager_guard(): + self.func_ptq() + self.func_ptq() + class TestImperativePTQHist(TestImperativePTQ): def set_vars(self): diff --git 
a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/static/quantization/tests/test_imperative_qat.py similarity index 82% rename from python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py rename to python/paddle/static/quantization/tests/test_imperative_qat.py index 43e4f0686346bb..b77c081c06e39b 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py +++ b/python/paddle/static/quantization/tests/test_imperative_qat.py @@ -12,34 +12,34 @@ # see the license for the specific language governing permissions and # limitations under the license. +import logging import os -import numpy as np -import random -import time import tempfile import unittest -import logging + +import numpy as np +from imperative_test_utils import ImperativeLenet, fix_model_dict import paddle import paddle.fluid as fluid -from paddle.fluid import core -from paddle.fluid.optimizer import AdamOptimizer -from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware -from paddle.nn import Sequential -from paddle.nn import Linear, Conv2D, Softmax, Conv2DTranspose -from paddle.fluid.log_helper import get_logger -from paddle.jit.translated_layer import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX +from paddle.framework import core, set_flags +from paddle.nn import Conv2D, Conv2DTranspose from paddle.nn.quant.quant_layers import ( QuantizedConv2D, QuantizedConv2DTranspose, ) -from imperative_test_utils import fix_model_dict, ImperativeLenet +from paddle.optimizer import Adam +from paddle.quantization import ImperativeQuantAware +from paddle.static.log_helper import get_logger + +INFER_MODEL_SUFFIX = ".pdmodel" +INFER_PARAMS_SUFFIX = ".pdiparams" paddle.enable_static() os.environ["CPU_NUM"] = "1" if core.is_compiled_with_cuda(): - fluid.set_flags({"FLAGS_cudnn_deterministic": True}) + set_flags({"FLAGS_cudnn_deterministic": True}) _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' @@ -84,7 +84,7 @@ def test_qat(self): ) quant_conv1 = QuantizedConv2D(conv1) data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') - quant_conv1(fluid.dygraph.to_variable(data)) + quant_conv1(paddle.to_tensor(data)) conv_transpose = Conv2DTranspose(4, 6, (3, 3)) quant_conv_transpose = QuantizedConv2DTranspose(conv_transpose) @@ -95,15 +95,13 @@ def test_qat(self): seed = 1 np.random.seed(seed) - fluid.default_main_program().random_seed = seed - fluid.default_startup_program().random_seed = seed + paddle.static.default_main_program().random_seed = seed + paddle.static.default_startup_program().random_seed = seed lenet = ImperativeLenet() lenet = fix_model_dict(lenet) imperative_qat.quantize(lenet) - adam = AdamOptimizer( - learning_rate=0.001, parameter_list=lenet.parameters() - ) + adam = Adam(learning_rate=0.001, parameters=lenet.parameters()) train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=32, drop_last=True @@ -125,10 +123,10 @@ def test_qat(self): .reshape(-1, 1) ) - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) + img = paddle.to_tensor(x_data) + label = paddle.to_tensor(y_data) out = lenet(img) - acc = paddle.static.accuracy(out, label) + acc = paddle.metric.accuracy(out, label) loss = paddle.nn.functional.cross_entropy( out, label, reduction='none', use_softmax=False ) @@ -157,14 +155,14 @@ def test_qat(self): .reshape(-1, 1) ) - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) + img = paddle.to_tensor(x_data) + label = paddle.to_tensor(y_data) 
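test_qat above exercises the fake-quant wrappers directly with eager tensors, with paddle.to_tensor replacing fluid.dygraph.to_variable. A minimal sketch of that smoke test, using arbitrary channel sizes:

import numpy as np
import paddle
from paddle.nn.quant.quant_layers import QuantizedConv2D

conv = paddle.nn.Conv2D(in_channels=3, out_channels=2, kernel_size=3)
quant_conv = QuantizedConv2D(conv)  # fake-quant wrapper around an ordinary Conv2D
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype("float32")
out = quant_conv(paddle.to_tensor(data))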
out = lenet(img) - acc_top1 = paddle.static.accuracy( + acc_top1 = paddle.metric.accuracy( input=out, label=label, k=1 ) - acc_top5 = paddle.static.accuracy( + acc_top5 = paddle.metric.accuracy( input=out, label=label, k=5 ) @@ -197,11 +195,11 @@ def test_qat(self): y_data = ( np.array([x[1] for x in data]).astype('int64').reshape(-1, 1) ) - test_img = fluid.dygraph.to_variable(test_data) - label = fluid.dygraph.to_variable(y_data) + test_img = paddle.to_tensor(test_data) + label = paddle.to_tensor(y_data) lenet.eval() fp32_out = lenet(test_img) - fp32_acc = paddle.static.accuracy(fp32_out, label).numpy() + fp32_acc = paddle.metric.accuracy(fp32_out, label).numpy() with tempfile.TemporaryDirectory(prefix="qat_save_path_") as tmpdir: # save inference quantized model @@ -220,13 +218,13 @@ def test_qat(self): place = core.CUDAPlace(0) else: place = core.CPUPlace() - exe = fluid.Executor(place) + exe = paddle.static.Executor(place) [ inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model( - dirname=tmpdir, + ] = paddle.static.load_inference_model( + tmpdir, executor=exe, model_filename="lenet" + INFER_MODEL_SUFFIX, params_filename="lenet" + INFER_PARAMS_SUFFIX, @@ -237,8 +235,8 @@ def test_qat(self): fetch_list=fetch_targets, ) paddle.disable_static() - quant_out = fluid.dygraph.to_variable(quant_out) - quant_acc = paddle.static.accuracy(quant_out, label).numpy() + quant_out = paddle.to_tensor(quant_out) + quant_acc = paddle.metric.accuracy(quant_out, label).numpy() paddle.enable_static() delta_value = fp32_acc - quant_acc self.assertLessEqual(delta_value, self.diff_threshold) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py b/python/paddle/static/quantization/tests/test_imperative_qat_amp.py similarity index 92% rename from python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py rename to python/paddle/static/quantization/tests/test_imperative_qat_amp.py index 0f7608927007ad..1a3701615b87c3 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py +++ b/python/paddle/static/quantization/tests/test_imperative_qat_amp.py @@ -1,4 +1,4 @@ -# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# copyright (c) 2022 paddlepaddle authors. all rights reserved. # # licensed under the apache license, version 2.0 (the "license"); # you may not use this file except in compliance with the license. @@ -12,25 +12,25 @@ # see the license for the specific language governing permissions and # limitations under the license. 
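The accuracy calls above move from paddle.static.accuracy to paddle.metric.accuracy; the call shape is otherwise unchanged. A tiny dygraph sketch with fake logits and labels:

import paddle

logits = paddle.rand([8, 10])                 # fake predictions
labels = paddle.randint(0, 10, shape=[8, 1])  # int64 labels of shape [N, 1]
top1 = paddle.metric.accuracy(input=logits, label=labels, k=1)
top5 = paddle.metric.accuracy(input=logits, label=labels, k=5)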
+import logging import os -import numpy as np -import random -import shutil +import tempfile import time import unittest -import logging -import tempfile + +import numpy as np +from imperative_test_utils import ImperativeLenet import paddle import paddle.fluid as fluid -from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware -from paddle.fluid.log_helper import get_logger from paddle.dataset.common import download -from imperative_test_utils import fix_model_dict, ImperativeLenet +from paddle.framework import set_flags +from paddle.quantization import ImperativeQuantAware +from paddle.static.log_helper import get_logger os.environ["CPU_NUM"] = "1" if paddle.is_compiled_with_cuda(): - fluid.set_flags({"FLAGS_cudnn_deterministic": True}) + set_flags({"FLAGS_cudnn_deterministic": True}) _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' @@ -117,7 +117,7 @@ def model_train(self, model, batch_num=-1, batch_size=32, use_amp=False): if use_amp: with paddle.amp.auto_cast(): out = model(img) - acc = paddle.static.accuracy(out, label) + acc = paddle.metric.accuracy(out, label) loss = paddle.nn.functional.cross_entropy( out, label, reduction='none', use_softmax=False ) @@ -129,7 +129,7 @@ def model_train(self, model, batch_num=-1, batch_size=32, use_amp=False): adam.clear_gradients() else: out = model(img) - acc = paddle.static.accuracy(out, label) + acc = paddle.metric.accuracy(out, label) loss = paddle.nn.functional.cross_entropy( out, label, reduction='none', use_softmax=False ) @@ -170,8 +170,8 @@ def model_test(self, model, batch_num=-1, batch_size=32, use_amp=False): with paddle.amp.auto_cast(use_amp): out = model(img) - acc_top1 = paddle.static.accuracy(input=out, label=label, k=1) - acc_top5 = paddle.static.accuracy(input=out, label=label, k=5) + acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1) + acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5) acc_top1_list.append(float(acc_top1.numpy())) if batch_id % 100 == 0: diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py b/python/paddle/static/quantization/tests/test_imperative_qat_channelwise.py similarity index 80% rename from python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py rename to python/paddle/static/quantization/tests/test_imperative_qat_channelwise.py index 45ea756ce27abf..68dfbf8c8bcdd8 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py +++ b/python/paddle/static/quantization/tests/test_imperative_qat_channelwise.py @@ -1,4 +1,4 @@ -# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# copyright (c) 2022 paddlepaddle authors. all rights reserved. # # licensed under the apache license, version 2.0 (the "license"); # you may not use this file except in compliance with the license. @@ -13,27 +13,18 @@ # limitations under the license. 
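The AMP path above only swaps the accuracy helper; the paddle.amp.auto_cast region itself is unchanged. A compact sketch of a mixed-precision training step in the same style, enabled only when CUDA is available (the toy Linear model and shapes are illustrative):

import paddle

net = paddle.nn.Linear(16, 4)
opt = paddle.optimizer.Adam(parameters=net.parameters())
x = paddle.rand([8, 16])
label = paddle.randint(0, 4, shape=[8, 1])

with paddle.amp.auto_cast(enable=paddle.is_compiled_with_cuda()):
    out = net(x)
    acc = paddle.metric.accuracy(out, label)
    loss = paddle.nn.functional.cross_entropy(out, label)
loss.backward()
opt.step()
opt.clear_grad()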
import os -import numpy as np -import random import unittest -import logging - -import paddle -import paddle.fluid as fluid -from paddle.fluid import core -from paddle.fluid.log_helper import get_logger from test_imperative_qat import TestImperativeQat +import paddle +from paddle.framework import core, set_flags + paddle.enable_static() os.environ["CPU_NUM"] = "1" if core.is_compiled_with_cuda(): - fluid.set_flags({"FLAGS_cudnn_deterministic": True}) - -_logger = get_logger( - __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' -) + set_flags({"FLAGS_cudnn_deterministic": True}) class TestImperativeQatChannelWise(TestImperativeQat): diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_fuse.py b/python/paddle/static/quantization/tests/test_imperative_qat_fuse.py similarity index 78% rename from python/paddle/fluid/contrib/slim/tests/test_imperative_qat_fuse.py rename to python/paddle/static/quantization/tests/test_imperative_qat_fuse.py index dc1dee13cf81f1..1057a9f765cad7 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_fuse.py +++ b/python/paddle/static/quantization/tests/test_imperative_qat_fuse.py @@ -13,27 +13,18 @@ # limitations under the license. import os -import numpy as np -import random import unittest -import logging - -import paddle -import paddle.fluid as fluid -from paddle.fluid import core -from paddle.fluid.log_helper import get_logger from test_imperative_qat import TestImperativeQat +import paddle +from paddle.framework import core, set_flags + paddle.enable_static() os.environ["CPU_NUM"] = "1" if core.is_compiled_with_cuda(): - fluid.set_flags({"FLAGS_cudnn_deterministic": True}) - -_logger = get_logger( - __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' -) + set_flags({"FLAGS_cudnn_deterministic": True}) class TestImperativeQatfuseBN(TestImperativeQat): diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_lsq.py b/python/paddle/static/quantization/tests/test_imperative_qat_lsq.py similarity index 75% rename from python/paddle/fluid/contrib/slim/tests/test_imperative_qat_lsq.py rename to python/paddle/static/quantization/tests/test_imperative_qat_lsq.py index 7cf3e9ad2b0b3e..609802612fd982 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_lsq.py +++ b/python/paddle/static/quantization/tests/test_imperative_qat_lsq.py @@ -12,57 +12,53 @@ # see the license for the specific language governing permissions and # limitations under the license. 
+import logging import os -import numpy as np -import random -import time -import tempfile import unittest -import logging + +import numpy as np +from imperative_test_utils import fix_model_dict import paddle -import paddle.fluid as fluid -from paddle.fluid import core -from paddle.fluid.optimizer import ( - SGDOptimizer, - AdamOptimizer, - MomentumOptimizer, +from paddle.framework import core, set_flags +from paddle.nn import ( + BatchNorm2D, + Conv2D, + LeakyReLU, + Linear, + MaxPool2D, + PReLU, + ReLU, + Sequential, + Sigmoid, + Softmax, ) -from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware -from paddle.nn import Sequential -from paddle.nn import ReLU, ReLU6, LeakyReLU, Sigmoid, Softmax, PReLU -from paddle.nn import Linear, Conv2D, Softmax, BatchNorm2D, MaxPool2D -from paddle.fluid.log_helper import get_logger -from paddle.jit.translated_layer import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX -from paddle.nn.quant.quant_layers import ( - QuantizedConv2D, - QuantizedConv2DTranspose, -) -from imperative_test_utils import fix_model_dict +from paddle.quantization import ImperativeQuantAware +from paddle.static.log_helper import get_logger paddle.enable_static() os.environ["CPU_NUM"] = "1" if core.is_compiled_with_cuda(): - fluid.set_flags({"FLAGS_cudnn_deterministic": True}) + set_flags({"FLAGS_cudnn_deterministic": True}) _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' ) -class ImperativeLenet(fluid.dygraph.Layer): +class ImperativeLenet(paddle.nn.Layer): def __init__(self, num_classes=10): super().__init__() - conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") - conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") - fc_w1_attr = fluid.ParamAttr(name="fc_w_1") - fc_w2_attr = fluid.ParamAttr(name="fc_w_2") - fc_w3_attr = fluid.ParamAttr(name="fc_w_3") - conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") - fc_b1_attr = fluid.ParamAttr(name="fc_b_1") - fc_b2_attr = fluid.ParamAttr(name="fc_b_2") - fc_b3_attr = fluid.ParamAttr(name="fc_b_3") + conv2d_w1_attr = paddle.ParamAttr(name="conv2d_w_1") + conv2d_w2_attr = paddle.ParamAttr(name="conv2d_w_2") + fc_w1_attr = paddle.ParamAttr(name="fc_w_1") + fc_w2_attr = paddle.ParamAttr(name="fc_w_2") + fc_w3_attr = paddle.ParamAttr(name="fc_w_3") + conv2d_b2_attr = paddle.ParamAttr(name="conv2d_b_2") + fc_b1_attr = paddle.ParamAttr(name="fc_b_1") + fc_b2_attr = paddle.ParamAttr(name="fc_b_2") + fc_b3_attr = paddle.ParamAttr(name="fc_b_3") self.features = Sequential( Conv2D( in_channels=1, @@ -116,7 +112,7 @@ def __init__(self, num_classes=10): def forward(self, inputs): x = self.features(inputs) - x = paddle.flatten(x, 1, -1) + x = paddle.flatten(x, 1) x = self.fc(x) return x @@ -139,14 +135,14 @@ def func_qat(self): seed = 100 np.random.seed(seed) - fluid.default_main_program().random_seed = seed - fluid.default_startup_program().random_seed = seed + paddle.static.default_main_program().random_seed = seed + paddle.static.default_startup_program().random_seed = seed paddle.disable_static() lenet = ImperativeLenet() lenet = fix_model_dict(lenet) imperative_qat.quantize(lenet) - optimizer = MomentumOptimizer( - learning_rate=0.1, parameter_list=lenet.parameters(), momentum=0.9 + optimizer = paddle.optimizer.Momentum( + learning_rate=0.1, parameters=lenet.parameters(), momentum=0.9 ) train_reader = paddle.batch( @@ -166,10 +162,10 @@ def func_qat(self): .reshape(-1, 1) ) - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) + img = paddle.to_tensor(x_data) + 
label = paddle.to_tensor(y_data) out = lenet(img) - acc = paddle.static.accuracy(out, label) + acc = paddle.metric.accuracy(out, label) loss = paddle.nn.functional.cross_entropy( out, label, reduction='none', use_softmax=False ) @@ -199,14 +195,14 @@ def func_qat(self): .astype('int64') .reshape(-1, 1) ) - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) + img = paddle.to_tensor(x_data) + label = paddle.to_tensor(y_data) out = lenet(img) - acc_top1 = paddle.static.accuracy( + acc_top1 = paddle.metric.accuracy( input=out, label=label, k=1 ) - acc_top5 = paddle.static.accuracy( + acc_top5 = paddle.metric.accuracy( input=out, label=label, k=5 ) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_matmul.py b/python/paddle/static/quantization/tests/test_imperative_qat_matmul.py similarity index 75% rename from python/paddle/fluid/contrib/slim/tests/test_imperative_qat_matmul.py rename to python/paddle/static/quantization/tests/test_imperative_qat_matmul.py index 7caad9ad18576f..2cf75eb0ceda57 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_matmul.py +++ b/python/paddle/static/quantization/tests/test_imperative_qat_matmul.py @@ -12,57 +12,55 @@ # see the license for the specific language governing permissions and # limitations under the license. +import logging import os -import numpy as np -import random -import time -import tempfile import unittest -import logging + +import numpy as np +from imperative_test_utils import fix_model_dict import paddle -import paddle.fluid as fluid -from paddle.fluid import core -from paddle.fluid.optimizer import ( - SGDOptimizer, - AdamOptimizer, - MomentumOptimizer, +from paddle.framework import core, set_flags +from paddle.nn import ( + BatchNorm2D, + Conv2D, + LeakyReLU, + Linear, + MaxPool2D, + PReLU, + ReLU, + Sequential, + Sigmoid, + Softmax, ) -from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware -from paddle.nn import Sequential -from paddle.nn import ReLU, ReLU6, LeakyReLU, Sigmoid, Softmax, PReLU -from paddle.nn import Linear, Conv2D, Softmax, BatchNorm2D, MaxPool2D -from paddle.fluid.log_helper import get_logger -from paddle.jit.translated_layer import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX -from paddle.nn.quant.quant_layers import ( - QuantizedConv2D, - QuantizedMatmul, -) -from imperative_test_utils import fix_model_dict +from paddle.nn.quant.quant_layers import QuantizedMatmul +from paddle.optimizer import Momentum +from paddle.quantization import ImperativeQuantAware +from paddle.static.log_helper import get_logger paddle.enable_static() os.environ["CPU_NUM"] = "1" if core.is_compiled_with_cuda(): - fluid.set_flags({"FLAGS_cudnn_deterministic": True}) + set_flags({"FLAGS_cudnn_deterministic": True}) _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' ) -class ImperativeLenet(fluid.dygraph.Layer): +class ImperativeLenet(paddle.nn.Layer): def __init__(self, num_classes=10): super().__init__() - conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") - conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") - fc_w1_attr = fluid.ParamAttr(name="fc_w_1") - fc_w2_attr = fluid.ParamAttr(name="fc_w_2") - fc_w3_attr = fluid.ParamAttr(name="fc_w_3") - conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") - fc_b1_attr = fluid.ParamAttr(name="fc_b_1") - fc_b2_attr = fluid.ParamAttr(name="fc_b_2") - fc_b3_attr = fluid.ParamAttr(name="fc_b_3") + conv2d_w1_attr = paddle.ParamAttr(name="conv2d_w_1") + conv2d_w2_attr = 
paddle.ParamAttr(name="conv2d_w_2") + fc_w1_attr = paddle.ParamAttr(name="fc_w_1") + fc_w2_attr = paddle.ParamAttr(name="fc_w_2") + fc_w3_attr = paddle.ParamAttr(name="fc_w_3") + conv2d_b2_attr = paddle.ParamAttr(name="conv2d_b_2") + fc_b1_attr = paddle.ParamAttr(name="fc_b_1") + fc_b2_attr = paddle.ParamAttr(name="fc_b_2") + fc_b3_attr = paddle.ParamAttr(name="fc_b_3") self.features = Sequential( Conv2D( in_channels=1, @@ -140,15 +138,15 @@ def func_qat(self): seed = 100 np.random.seed(seed) - fluid.default_main_program().random_seed = seed - fluid.default_startup_program().random_seed = seed + paddle.static.default_main_program().random_seed = seed + paddle.static.default_startup_program().random_seed = seed paddle.disable_static() lenet = ImperativeLenet() lenet = fix_model_dict(lenet) imperative_qat.quantize(lenet) - optimizer = MomentumOptimizer( - learning_rate=0.1, parameter_list=lenet.parameters(), momentum=0.9 + optimizer = Momentum( + learning_rate=0.1, parameters=lenet.parameters(), momentum=0.9 ) train_reader = paddle.batch( @@ -168,18 +166,18 @@ def func_qat(self): .reshape(-1, 1) ) - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) + img = paddle.to_tensor(x_data) + label = paddle.to_tensor(y_data) out = lenet(img) - acc = paddle.static.accuracy(out, label) + acc = paddle.metric.accuracy(out, label) loss = paddle.nn.functional.cross_entropy( out, label, reduction='none', use_softmax=False ) avg_loss = paddle.mean(loss) avg_loss.backward() - optimizer.minimize(avg_loss) - lenet.clear_gradients() + optimizer.step() + optimizer.clear_grad() if batch_id % 100 == 0: _logger.info( @@ -201,14 +199,14 @@ def func_qat(self): .astype('int64') .reshape(-1, 1) ) - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) + img = paddle.to_tensor(x_data) + label = paddle.to_tensor(y_data) out = lenet(img) - acc_top1 = paddle.static.accuracy( + acc_top1 = paddle.metric.accuracy( input=out, label=label, k=1 ) - acc_top5 = paddle.static.accuracy( + acc_top5 = paddle.metric.accuracy( input=out, label=label, k=5 ) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py b/python/paddle/static/quantization/tests/test_imperative_qat_user_defined.py similarity index 91% rename from python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py rename to python/paddle/static/quantization/tests/test_imperative_qat_user_defined.py index ae18f4a4f24bb5..eb30afe61ef180 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py +++ b/python/paddle/static/quantization/tests/test_imperative_qat_user_defined.py @@ -12,20 +12,19 @@ # see the license for the specific language governing permissions and # limitations under the license. 
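Besides the import moves, the training loop in this file switches from optimizer.minimize(avg_loss) / lenet.clear_gradients() to optimizer.step() / optimizer.clear_grad(). A minimal dygraph step showing that pattern with the same Momentum settings; the Linear model is a stand-in:

import paddle

net = paddle.nn.Linear(16, 4)
opt = paddle.optimizer.Momentum(
    learning_rate=0.1, parameters=net.parameters(), momentum=0.9
)
x = paddle.rand([8, 16])
label = paddle.randint(0, 4, shape=[8, 1])

out = net(x)
loss = paddle.nn.functional.cross_entropy(out, label)
loss.backward()
opt.step()        # replaces optimizer.minimize(avg_loss)
opt.clear_grad()  # replaces lenet.clear_gradients()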
+import logging import os -import numpy as np -import random import unittest -import logging + +import numpy as np + import paddle import paddle.nn as nn -from paddle.optimizer import Adam -from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware -from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass +from paddle.fluid.framework import _test_eager_guard from paddle.nn import Sequential -from paddle.nn import Linear -from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose -from paddle.fluid.log_helper import get_logger +from paddle.optimizer import Adam +from paddle.quantization import ImperativeQuantAware +from paddle.static.log_helper import get_logger os.environ["CPU_NUM"] = "1" @@ -110,7 +109,7 @@ class ModelForConv2dT(nn.Layer): def __init__(self, num_classes=10): super().__init__() self.features = nn.Conv2DTranspose(4, 6, (3, 3)) - self.fc = Linear(600, num_classes) + self.fc = nn.Linear(in_features=600, out_features=num_classes) def forward(self, inputs): x = self.features(inputs) @@ -123,28 +122,28 @@ class ImperativeLenet(paddle.nn.Layer): def __init__(self, num_classes=10, classifier_activation='softmax'): super().__init__() self.features = Sequential( - paddle.nn.Conv2D( + nn.Conv2D( in_channels=1, out_channels=6, kernel_size=3, stride=1, padding=1, ), - paddle.nn.MaxPool2D(kernel_size=2, stride=2), - paddle.nn.Conv2D( + nn.MaxPool2D(kernel_size=2, stride=2), + nn.Conv2D( in_channels=6, out_channels=16, kernel_size=5, stride=1, padding=0, ), - paddle.nn.MaxPool2D(kernel_size=2, stride=2), + nn.MaxPool2D(kernel_size=2, stride=2), ) self.fc = Sequential( - Linear(400, 120), - Linear(120, 84), - Linear(84, num_classes), + nn.Linear(in_features=400, out_features=120), + nn.Linear(in_features=120, out_features=84), + nn.Linear(in_features=84, out_features=num_classes), ) def forward(self, inputs): @@ -160,7 +159,7 @@ def setUp(self): _logger.info("test act_preprocess") self.imperative_qat = ImperativeQuantAware(act_preprocess_layer=PACT) - def test_quant_aware_training(self): + def func_quant_aware_training(self): imperative_qat = self.imperative_qat seed = 1 np.random.seed(seed) @@ -170,8 +169,8 @@ def test_quant_aware_training(self): fixed_state = {} param_init_map = {} for name, param in lenet.named_parameters(): - p_shape = param.numpy().shape - p_value = param.numpy() + p_shape = np.array(param).shape + p_value = np.array(param) if name.endswith("bias"): value = np.zeros_like(p_value).astype('float32') else: @@ -217,8 +216,8 @@ def train(model): loss = nn.functional.loss.cross_entropy(out, label) avg_loss = paddle.mean(loss) avg_loss.backward() - adam.minimize(avg_loss) - model.clear_gradients() + adam.step() + adam.clear_grad() if batch_id % 50 == 0: _logger.info( "Train | At epoch {} step {}: loss = {:}, acc= {:}".format( @@ -262,6 +261,11 @@ def test(model): train(lenet) test(lenet) + def test_quant_aware_training(self): + with _test_eager_guard(): + self.func_quant_aware_training() + self.func_quant_aware_training() + class TestUserDefinedWeightPreprocess(TestUserDefinedActPreprocess): def setUp(self): diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py b/python/paddle/static/quantization/tests/test_imperative_skip_op.py similarity index 79% rename from python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py rename to python/paddle/static/quantization/tests/test_imperative_skip_op.py index d38cbd88fe02ac..d3dab28a022b70 100644 --- 
a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py +++ b/python/paddle/static/quantization/tests/test_imperative_skip_op.py @@ -1,4 +1,4 @@ -# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# copyright (c) 2022 paddlepaddle authors. all rights reserved. # # licensed under the apache license, version 2.0 (the "license"); # you may not use this file except in compliance with the license. @@ -13,34 +13,25 @@ # limitations under the license. import os -import numpy as np -import random import unittest -import logging -import paddle -import paddle.fluid as fluid -import paddle.fluid.layers as layers -from paddle.fluid import core -from paddle.fluid.optimizer import AdamOptimizer -from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware -from paddle.jit.translated_layer import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX -from paddle.nn.layer import ReLU, LeakyReLU, Sigmoid, Softmax, ReLU6 -from paddle.nn import Linear, Conv2D, Softmax, BatchNorm -from paddle.fluid.log_helper import get_logger +import numpy as np from imperative_test_utils import ( + ImperativeLenetWithSkipQuant, fix_model_dict, train_lenet, - ImperativeLenetWithSkipQuant, ) +import paddle +from paddle.framework import core, set_flags +from paddle.optimizer import Adam +from paddle.quantization import ImperativeQuantAware + +INFER_MODEL_SUFFIX = ".pdmodel" +INFER_PARAMS_SUFFIX = ".pdiparams" os.environ["CPU_NUM"] = "1" if core.is_compiled_with_cuda(): - fluid.set_flags({"FLAGS_cudnn_deterministic": True}) - -_logger = get_logger( - __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' -) + set_flags({"FLAGS_cudnn_deterministic": True}) class TestImperativeOutSclae(unittest.TestCase): @@ -60,9 +51,7 @@ def test_out_scale_acc(self): lenet = fix_model_dict(lenet) qat.quantize(lenet) - adam = AdamOptimizer( - learning_rate=lr, parameter_list=lenet.parameters() - ) + adam = Adam(learning_rate=lr, parameters=lenet.parameters()) dynamic_loss_rec = [] lenet.train() loss_list = train_lenet(lenet, reader, adam) @@ -88,14 +77,14 @@ def test_out_scale_acc(self): place = core.CUDAPlace(0) else: place = core.CPUPlace() - exe = fluid.Executor(place) + exe = paddle.static.Executor(place) [ inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model( - dirname=save_dir, + ] = paddle.static.load_inference_model( + save_dir, executor=exe, model_filename="lenet" + INFER_MODEL_SUFFIX, params_filename="lenet" + INFER_PARAMS_SUFFIX, diff --git a/python/paddle/fluid/contrib/slim/tests/test_moving_average_abs_max_scale_op.py b/python/paddle/static/quantization/tests/test_moving_average_abs_max_scale_op.py similarity index 72% rename from python/paddle/fluid/contrib/slim/tests/test_moving_average_abs_max_scale_op.py rename to python/paddle/static/quantization/tests/test_moving_average_abs_max_scale_op.py index 4184166806d4fc..8b95cae34c1537 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_moving_average_abs_max_scale_op.py +++ b/python/paddle/static/quantization/tests/test_moving_average_abs_max_scale_op.py @@ -13,12 +13,12 @@ # limitations under the License. 
import unittest + import numpy as np + import paddle -import paddle -import paddle.fluid as fluid -from paddle.fluid import core import paddle.nn.quant.quant_layers as quant_layers +from paddle.framework import core paddle.enable_static() @@ -38,23 +38,23 @@ def init_data(batch_size=32, img_shape=[784], label_range=9): class TestMovingAverageAbsMaxScaleOp(unittest.TestCase): def check_backward(self, use_cuda): - main_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(main_program, startup_program): - image = fluid.layers.data( - name='image', shape=[784], dtype='float32' + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + image = paddle.static.data( + name='image', shape=[-1, 784], dtype='float32' ) - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - fc_tmp = fluid.layers.fc(image, size=10, act='softmax') + label = paddle.static.data( + name='label', shape=[-1, 1], dtype='int64' + ) + fc_tmp = paddle.static.nn.fc(image, size=10, activation='softmax') out_scale = quant_layers.MovingAverageAbsMaxScale( name=fc_tmp.name, dtype=fc_tmp.dtype ) fc_tmp_1 = out_scale(fc_tmp) - cross_entropy = paddle.nn.functional.softmax_with_cross_entropy( - fc_tmp, label - ) + cross_entropy = paddle.nn.functional.cross_entropy(fc_tmp, label) loss = paddle.mean(cross_entropy) - sgd = fluid.optimizer.SGD(learning_rate=1e-3) + sgd = paddle.optimizer.SGD(learning_rate=1e-3) sgd.minimize(loss) moving_average_abs_max_scale_ops = [ @@ -66,13 +66,13 @@ def check_backward(self, use_cuda): len(moving_average_abs_max_scale_ops) == 1 ), "The number of moving_average_abs_max_scale_ops should be 1." - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_program) - binary = fluid.compiler.CompiledProgram( - main_program - ).with_data_parallel(loss_name=loss.name) + binary = paddle.static.CompiledProgram(main_program).with_data_parallel( + loss_name=loss.name + ) img, label = init_data() feed_dict = {"image": img, "label": label} diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py b/python/paddle/static/quantization/tests/test_post_training_quantization_lstm_model.py similarity index 88% rename from python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py rename to python/paddle/static/quantization/tests/test_post_training_quantization_lstm_model.py index 7eb7f4d479e262..fb9ebbae550af7 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py +++ b/python/paddle/static/quantization/tests/test_post_training_quantization_lstm_model.py @@ -11,21 +11,20 @@ # without warranties or conditions of any kind, either express or implied. # see the license for the specific language governing permissions and # limitations under the license. 
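In the MovingAverageAbsMaxScale test above, graph construction moves to paddle.static.data / paddle.static.nn.fc and the optimizer to paddle.optimizer.SGD, while the program-guard structure stays the same. A self-contained sketch of that static-graph skeleton without the scale op, fed with random data:

import numpy as np
import paddle

paddle.enable_static()
main = paddle.static.Program()
startup = paddle.static.Program()
with paddle.static.program_guard(main, startup):
    image = paddle.static.data(name="image", shape=[-1, 784], dtype="float32")
    label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64")
    fc = paddle.static.nn.fc(image, size=10)
    loss = paddle.mean(paddle.nn.functional.cross_entropy(fc, label))
    paddle.optimizer.SGD(learning_rate=1e-3).minimize(loss)

exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(startup)
feed = {
    "image": np.random.random([32, 784]).astype("float32"),
    "label": np.random.randint(0, 10, [32, 1]).astype("int64"),
}
(loss_val,) = exe.run(main, feed=feed, fetch_list=[loss])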
-import unittest import os -import time -import sys import random -import math -import functools -import contextlib import struct +import sys import tempfile +import time +import unittest + import numpy as np + import paddle import paddle.fluid as fluid from paddle.dataset.common import download -from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization +from paddle.static.quantization import PostTrainingQuantization paddle.enable_static() @@ -133,15 +132,27 @@ def reader(): return reader - def run_program(self, model_path, data_path, infer_iterations): + def run_program( + self, + model_path, + model_filename, + params_filename, + data_path, + infer_iterations, + ): print("test model path:" + model_path) - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) [ infer_program, feed_dict, fetch_targets, - ] = fluid.io.load_inference_model(model_path, exe) + ] = paddle.static.load_inference_model( + model_path, + exe, + model_filename=model_filename, + params_filename=params_filename, + ) val_reader = self.get_simple_reader(data_path, place) @@ -176,6 +187,8 @@ def run_program(self, model_path, data_path, infer_iterations): def generate_quantized_model( self, model_path, + model_filename, + params_filename, data_path, algo="KL", round_type="round", @@ -188,14 +201,16 @@ def generate_quantized_model( onnx_format=False, ): - place = fluid.CPUPlace() - exe = fluid.Executor(place) - scope = fluid.global_scope() + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + scope = paddle.static.global_scope() batch_generator = self.get_batch_reader(data_path, place) ptq = PostTrainingQuantization( executor=exe, model_dir=model_path, + model_filename=model_filename, + params_filename=params_filename, batch_generator=batch_generator, batch_nums=batch_nums, algo=algo, @@ -214,6 +229,8 @@ def generate_quantized_model( def run_test( self, model_name, + model_filename, + params_filename, model_url, model_md5, data_name, @@ -242,7 +259,11 @@ def run_test( ) ) (fp32_latency, fp32_acc) = self.run_program( - fp32_model_path, data_path, infer_iterations + fp32_model_path, + model_filename, + params_filename, + data_path, + infer_iterations, ) print( @@ -252,6 +273,8 @@ def run_test( ) self.generate_quantized_model( fp32_model_path, + model_filename, + params_filename, data_path, algo, round_type, @@ -270,7 +293,11 @@ def run_test( ) ) (int8_latency, int8_acc) = self.run_program( - self.int8_model_path, data_path, infer_iterations + self.int8_model_path, + 'model.pdmodel', + 'model.pdiparams', + data_path, + infer_iterations, ) print("---Post training quantization of {} method---".format(algo)) @@ -293,8 +320,8 @@ def run_test( class TestPostTrainingAvgForLSTM(TestPostTrainingQuantization): def test_post_training_avg(self): model_name = "nlp_lstm_fp32_model" - model_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/nlp_lstm_fp32_model.tar.gz" - model_md5 = "519b8eeac756e7b4b7bcb2868e880452" + model_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/nlp_lstm_fp32_model_combined.tar.gz" + model_md5 = "5b47cd7ba2afcf24120d9727ed3f05a7" data_name = "quant_lstm_input_data" data_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/quant_lstm_input_data.tar.gz" data_md5 = "add84c754e9b792fea1fbd728d134ab7" @@ -309,6 +336,8 @@ def test_post_training_avg(self): quant_iterations = 10 self.run_test( model_name, + 'model.pdmodel', + 'model.pdiparams', 
model_url, model_md5, data_name, @@ -329,8 +358,8 @@ def test_post_training_avg(self): class TestPostTrainingAvgForLSTMONNXFormat(TestPostTrainingQuantization): def not_test_post_training_avg_onnx_format(self): model_name = "nlp_lstm_fp32_model" - model_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/nlp_lstm_fp32_model.tar.gz" - model_md5 = "519b8eeac756e7b4b7bcb2868e880452" + model_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/nlp_lstm_fp32_model_combined.tar.gz" + model_md5 = "5b47cd7ba2afcf24120d9727ed3f05a7" data_name = "quant_lstm_input_data" data_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/quant_lstm_input_data.tar.gz" data_md5 = "add84c754e9b792fea1fbd728d134ab7" @@ -346,6 +375,8 @@ def not_test_post_training_avg_onnx_format(self): onnx_format = True self.run_test( model_name, + 'model.pdmodel', + 'model.pdiparams', model_url, model_md5, data_name, diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py b/python/paddle/static/quantization/tests/test_post_training_quantization_mnist.py similarity index 71% rename from python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py rename to python/paddle/static/quantization/tests/test_post_training_quantization_mnist.py index 6ff54f7c970761..d22997eca6397d 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py +++ b/python/paddle/static/quantization/tests/test_post_training_quantization_mnist.py @@ -1,4 +1,4 @@ -# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# copyright (c) 2022 paddlepaddle authors. all rights reserved. # # licensed under the apache license, version 2.0 (the "license"); # you may not use this file except in compliance with the license. @@ -11,20 +11,18 @@ # without warranties or conditions of any kind, either express or implied. # see the license for the specific language governing permissions and # limitations under the license. 
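The hunks above switch the LSTM test to the combined model format (one model.pdmodel plus one model.pdiparams) and to paddle.static.load_inference_model. A minimal sketch of the new loading pattern; the directory name and input shape below are placeholders, not values from the patch:

import numpy as np
import paddle

paddle.enable_static()
exe = paddle.static.Executor(paddle.CPUPlace())

# fluid.io.load_inference_model(model_path, exe) becomes the form below,
# with the two combined-format filenames passed explicitly.
model_dir = './fp32_model_combined'
[infer_program, feed_names, fetch_targets] = paddle.static.load_inference_model(
    model_dir,
    exe,
    model_filename='model.pdmodel',
    params_filename='model.pdiparams',
)

fake_input = np.random.random([1, 1, 28, 28]).astype('float32')  # placeholder shape
outs = exe.run(
    infer_program,
    feed={feed_names[0]: fake_input},
    fetch_list=fetch_targets,
)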
-import unittest import os -import time -import sys import random -import math -import functools +import sys import tempfile -import contextlib +import time +import unittest + import numpy as np + import paddle -import paddle.fluid as fluid -from paddle.dataset.common import download -from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization +from paddle.dataset.common import md5file +from paddle.static.quantization import PostTrainingQuantization paddle.enable_static() @@ -38,12 +36,13 @@ def setUp(self): self.int8_model_path = os.path.join( self.root_path.name, "post_training_quantization" ) - self.download_path = 'int8/download' - self.cache_folder = os.path.expanduser( - '~/.cache/paddle/dataset/' + self.download_path + self.download_path = f'download_model_{time.time()}' + self.cache_folder = os.path.join( + self.root_path.name, self.download_path ) try: os.system("mkdir -p " + self.int8_model_path) + os.system("mkdir -p " + self.cache_folder) except Exception as e: print( "Failed to create {} due to {}".format( @@ -62,25 +61,110 @@ def cache_unzipping(self, target_folder, zip_path): ) os.system(cmd) + def download(self, url, dirname, md5sum, save_name=None): + import shutil + + import requests + + filename = os.path.join( + dirname, url.split('/')[-1] if save_name is None else save_name + ) + + if os.path.exists(filename) and md5file(filename) == md5sum: + return filename + + retry = 0 + retry_limit = 3 + while not (os.path.exists(filename) and md5file(filename) == md5sum): + if os.path.exists(filename): + sys.stderr.write( + "file %s md5 %s\n" % (md5file(filename), md5sum) + ) + if retry < retry_limit: + retry += 1 + else: + raise RuntimeError( + "Cannot download {0} within retry limit {1}".format( + url, retry_limit + ) + ) + sys.stderr.write( + "Cache file %s not found, downloading %s \n" % (filename, url) + ) + sys.stderr.write("Begin to download\n") + try: + r = requests.get(url, stream=True) + total_length = r.headers.get('content-length') + + if total_length is None: + with open(filename, 'wb') as f: + shutil.copyfileobj(r.raw, f) + else: + with open(filename, 'wb') as f: + chunk_size = 4096 + total_length = int(total_length) + total_iter = total_length / chunk_size + 1 + log_interval = ( + total_iter // 20 if total_iter > 20 else 1 + ) + log_index = 0 + bar = paddle.hapi.progressbar.ProgressBar( + total_iter, name='item' + ) + for data in r.iter_content(chunk_size=chunk_size): + f.write(data) + log_index += 1 + bar.update(log_index, {}) + if log_index % log_interval == 0: + bar.update(log_index) + + except Exception as e: + # re-try + continue + sys.stderr.write("\nDownload finished\n") + sys.stdout.flush() + return filename + def download_model(self, data_url, data_md5, folder_name): - download(data_url, self.download_path, data_md5) + self.download(data_url, self.cache_folder, data_md5) + os.system(f'wget -q {data_url}') file_name = data_url.split('/')[-1] zip_path = os.path.join(self.cache_folder, file_name) - print('Data is downloaded at {0}'.format(zip_path)) + print( + 'Data is downloaded at {0}. 
File exists: {1}'.format( + zip_path, os.path.exists(zip_path) + ) + ) data_cache_folder = os.path.join(self.cache_folder, folder_name) self.cache_unzipping(data_cache_folder, zip_path) return data_cache_folder - def run_program(self, model_path, batch_size, infer_iterations): - print("test model path:" + model_path) - place = fluid.CPUPlace() - exe = fluid.Executor(place) + def run_program( + self, + model_path, + model_filename, + params_filename, + batch_size, + infer_iterations, + ): + print( + "test model path: {}. File exists: {}".format( + model_path, os.path.exists(model_path) + ) + ) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) [ infer_program, feed_dict, fetch_targets, - ] = fluid.io.load_inference_model(model_path, exe) + ] = paddle.static.load_inference_model( + model_path, + exe, + model_filename=model_filename, + params_filename=params_filename, + ) val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size) img_shape = [1, 28, 28] @@ -119,6 +203,8 @@ def run_program(self, model_path, batch_size, infer_iterations): def generate_quantized_model( self, model_path, + model_filename, + params_filename, algo="KL", round_type="round", quantizable_op_type=["conv2d"], @@ -132,13 +218,15 @@ def generate_quantized_model( bias_correction=False, ): - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) val_reader = paddle.dataset.mnist.train() ptq = PostTrainingQuantization( executor=exe, model_dir=model_path, + model_filename=model_filename, + params_filename=params_filename, sample_generator=val_reader, batch_size=batch_size, batch_nums=batch_nums, @@ -158,6 +246,8 @@ def generate_quantized_model( def run_test( self, model_name, + model_filename, + params_filename, data_url, data_md5, algo, @@ -183,8 +273,13 @@ def run_test( model_name, infer_iterations * batch_size ) ) + (fp32_throughput, fp32_latency, fp32_acc1) = self.run_program( - origin_model_path, batch_size, infer_iterations + origin_model_path, + model_filename, + params_filename, + batch_size, + infer_iterations, ) print( @@ -194,6 +289,8 @@ def run_test( ) self.generate_quantized_model( origin_model_path, + model_filename, + params_filename, algo, round_type, quantizable_op_type, @@ -213,7 +310,11 @@ def run_test( ) ) (int8_throughput, int8_latency, int8_acc1) = self.run_program( - self.int8_model_path, batch_size, infer_iterations + self.int8_model_path, + 'model.pdmodel', + 'model.pdiparams', + batch_size, + infer_iterations, ) print("---Post training quantization of {} method---".format(algo)) @@ -236,10 +337,8 @@ def run_test( class TestPostTrainingKLForMnist(TestPostTrainingQuantization): def test_post_training_kl(self): model_name = "mnist_model" - data_url = ( - "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" - ) - data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model_combined.tar.gz" + data_md5 = "a49251d3f555695473941e5a725c6014" algo = "KL" round_type = "round" quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] @@ -252,6 +351,8 @@ def test_post_training_kl(self): quant_iterations = 5 self.run_test( model_name, + 'model.pdmodel', + 'model.pdiparams', data_url, data_md5, algo, @@ -270,10 +371,8 @@ def test_post_training_kl(self): class TestPostTraininghistForMnist(TestPostTrainingQuantization): def test_post_training_hist(self): model_name = "mnist_model" - data_url = ( - 
"http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" - ) - data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model_combined.tar.gz" + data_md5 = "a49251d3f555695473941e5a725c6014" algo = "hist" round_type = "round" quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] @@ -286,6 +385,8 @@ def test_post_training_hist(self): quant_iterations = 5 self.run_test( model_name, + 'model.pdmodel', + 'model.pdiparams', data_url, data_md5, algo, @@ -304,10 +405,8 @@ def test_post_training_hist(self): class TestPostTrainingmseForMnist(TestPostTrainingQuantization): def test_post_training_mse(self): model_name = "mnist_model" - data_url = ( - "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" - ) - data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model_combined.tar.gz" + data_md5 = "a49251d3f555695473941e5a725c6014" algo = "mse" round_type = "round" quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] @@ -320,6 +419,8 @@ def test_post_training_mse(self): quant_iterations = 5 self.run_test( model_name, + 'model.pdmodel', + 'model.pdiparams', data_url, data_md5, algo, @@ -338,10 +439,8 @@ def test_post_training_mse(self): class TestPostTrainingemdForMnist(TestPostTrainingQuantization): def test_post_training_mse(self): model_name = "mnist_model" - data_url = ( - "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" - ) - data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model_combined.tar.gz" + data_md5 = "a49251d3f555695473941e5a725c6014" algo = "emd" round_type = "round" quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] @@ -354,6 +453,8 @@ def test_post_training_mse(self): quant_iterations = 5 self.run_test( model_name, + 'model.pdmodel', + 'model.pdiparams', data_url, data_md5, algo, @@ -372,10 +473,8 @@ def test_post_training_mse(self): class TestPostTrainingavgForMnist(TestPostTrainingQuantization): def test_post_training_avg(self): model_name = "mnist_model" - data_url = ( - "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" - ) - data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model_combined.tar.gz" + data_md5 = "a49251d3f555695473941e5a725c6014" algo = "avg" round_type = "round" quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] @@ -388,6 +487,8 @@ def test_post_training_avg(self): quant_iterations = 5 self.run_test( model_name, + 'model.pdmodel', + 'model.pdiparams', data_url, data_md5, algo, @@ -406,10 +507,8 @@ def test_post_training_avg(self): class TestPostTrainingAbsMaxForMnist(TestPostTrainingQuantization): def test_post_training_abs_max(self): model_name = "mnist_model" - data_url = ( - "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" - ) - data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model_combined.tar.gz" + data_md5 = "a49251d3f555695473941e5a725c6014" algo = "abs_max" round_type = "round" quantizable_op_type = ["conv2d", "mul"] @@ -422,6 +521,8 @@ def test_post_training_abs_max(self): quant_iterations = 10 self.run_test( model_name, + 'model.pdmodel', + 'model.pdiparams', data_url, data_md5, algo, @@ -440,10 +541,8 @@ def test_post_training_abs_max(self): class TestPostTrainingmseAdaroundForMnist(TestPostTrainingQuantization): def 
test_post_training_mse(self): model_name = "mnist_model" - data_url = ( - "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" - ) - data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model_combined.tar.gz" + data_md5 = "a49251d3f555695473941e5a725c6014" algo = "mse" round_type = "adaround" quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] @@ -457,6 +556,8 @@ def test_post_training_mse(self): bias_correction = True self.run_test( model_name, + 'model.pdmodel', + 'model.pdiparams', data_url, data_md5, algo, @@ -476,10 +577,8 @@ def test_post_training_mse(self): class TestPostTrainingKLAdaroundForMnist(TestPostTrainingQuantization): def test_post_training_kl(self): model_name = "mnist_model" - data_url = ( - "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" - ) - data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model_combined.tar.gz" + data_md5 = "a49251d3f555695473941e5a725c6014" algo = "KL" round_type = "adaround" quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] @@ -492,6 +591,8 @@ def test_post_training_kl(self): quant_iterations = 5 self.run_test( model_name, + 'model.pdmodel', + 'model.pdiparams', data_url, data_md5, algo, @@ -510,10 +611,8 @@ def test_post_training_kl(self): class TestPostTrainingmseForMnistONNXFormat(TestPostTrainingQuantization): def test_post_training_mse_onnx_format(self): model_name = "mnist_model" - data_url = ( - "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" - ) - data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model_combined.tar.gz" + data_md5 = "a49251d3f555695473941e5a725c6014" algo = "mse" round_type = "round" quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] @@ -527,6 +626,8 @@ def test_post_training_mse_onnx_format(self): quant_iterations = 5 self.run_test( model_name, + 'model.pdmodel', + 'model.pdiparams', data_url, data_md5, algo, @@ -548,10 +649,8 @@ class TestPostTrainingmseForMnistONNXFormatFullQuant( ): def test_post_training_mse_onnx_format_full_quant(self): model_name = "mnist_model" - data_url = ( - "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" - ) - data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model_combined.tar.gz" + data_md5 = "a49251d3f555695473941e5a725c6014" algo = "mse" round_type = "round" quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] @@ -565,6 +664,8 @@ def test_post_training_mse_onnx_format_full_quant(self): quant_iterations = 5 self.run_test( model_name, + 'model.pdmodel', + 'model.pdiparams', data_url, data_md5, algo, @@ -584,10 +685,8 @@ def test_post_training_mse_onnx_format_full_quant(self): class TestPostTrainingavgForMnistSkipOP(TestPostTrainingQuantization): def test_post_training_avg_skip_op(self): model_name = "mnist_model" - data_url = ( - "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" - ) - data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model_combined.tar.gz" + data_md5 = "a49251d3f555695473941e5a725c6014" algo = "avg" round_type = "round" quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] @@ -601,6 +700,8 @@ def test_post_training_avg_skip_op(self): skip_tensor_list = ["fc_0.w_0"] self.run_test( model_name, + 'model.pdmodel', + 'model.pdiparams', 
data_url, data_md5, algo, diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py b/python/paddle/static/quantization/tests/test_post_training_quantization_mobilenetv1.py similarity index 92% rename from python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py rename to python/paddle/static/quantization/tests/test_post_training_quantization_mobilenetv1.py index 471798dec28c5c..bdb80cd3d37c5f 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py +++ b/python/paddle/static/quantization/tests/test_post_training_quantization_mobilenetv1.py @@ -1,4 +1,4 @@ -# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# copyright (c) 2022 paddlepaddle authors. all rights reserved. # # licensed under the apache license, version 2.0 (the "license"); # you may not use this file except in compliance with the license. @@ -11,21 +11,20 @@ # without warranties or conditions of any kind, either express or implied. # see the license for the specific language governing permissions and # limitations under the license. -import unittest +import functools import os -import time -import sys import random -import math -import functools -import contextlib +import sys import tempfile +import time +import unittest + import numpy as np -from PIL import Image, ImageEnhance +from PIL import Image + import paddle -import paddle.fluid as fluid from paddle.dataset.common import download -from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization +from paddle.static.quantization import PostTrainingQuantization paddle.enable_static() @@ -52,7 +51,7 @@ def resize_short(img, target_size): def crop_image(img, target_size, center): width, height = img.size size = target_size - if center == True: + if center is True: w_start = (width - size) / 2 h_start = (height - size) / 2 else: @@ -201,19 +200,26 @@ def download_data(self, data_urls, data_md5s, folder_name, is_model=True): def download_model(self): pass - def run_program(self, model_path, batch_size, infer_iterations): + def run_program( + self, + model_path, + model_filename, + params_filename, + batch_size, + infer_iterations, + ): image_shape = [3, 224, 224] - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) [ infer_program, feed_dict, fetch_targets, - ] = fluid.io.load_inference_model( + ] = paddle.static.load_inference_model( model_path, exe, - model_filename="inference.pdmodel", - params_filename="inference.pdiparams", + model_filename=model_filename, + params_filename=params_filename, ) val_reader = paddle.batch(val(), batch_size) iterations = infer_iterations @@ -260,6 +266,8 @@ def run_program(self, model_path, batch_size, infer_iterations): def generate_quantized_model( self, model_path, + model_filename, + params_filename, quantizable_op_type, batch_size, algo="KL", @@ -278,17 +286,16 @@ def generate_quantized_model( ) sys.exit(-1) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - scope = fluid.global_scope() + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) val_reader = val() ptq = PostTrainingQuantization( executor=exe, sample_generator=val_reader, model_dir=model_path, - model_filename="inference.pdmodel", - params_filename="inference.pdiparams", + model_filename=model_filename, + params_filename=params_filename, batch_size=batch_size, batch_nums=batch_nums, algo=algo, @@ -309,6 +316,8 @@ def generate_quantized_model( def 
run_test( self, model, + model_filename, + params_filename, algo, round_type, data_urls, @@ -333,17 +342,16 @@ def run_test( ) (fp32_throughput, fp32_latency, fp32_acc1) = self.run_program( os.path.join(model_cache_folder, "MobileNetV1_infer"), + model_filename, + params_filename, batch_size, infer_iterations, ) - print( - "Start INT8 post training quantization for {0} on {1} images ...".format( - model, batch_nums * batch_size - ) - ) self.generate_quantized_model( os.path.join(model_cache_folder, "MobileNetV1_infer"), + model_filename, + params_filename, quantizable_op_type, batch_size, algo, @@ -361,7 +369,11 @@ def run_test( ) ) (int8_throughput, int8_latency, int8_acc1) = self.run_program( - self.int8_model, batch_size, infer_iterations + self.int8_model, + model_filename, + params_filename, + batch_size, + infer_iterations, ) print("---Post training quantization of {} method---".format(algo)) @@ -403,6 +415,8 @@ def test_post_training_kl_mobilenetv1(self): batch_nums = 3 self.run_test( model, + 'inference.pdmodel', + 'inference.pdiparams', algo, round_type, data_urls, @@ -435,6 +449,8 @@ def test_post_training_avg_mobilenetv1(self): diff_threshold = 0.025 self.run_test( model, + 'inference.pdmodel', + 'inference.pdiparams', algo, round_type, data_urls, @@ -468,6 +484,8 @@ def test_post_training_hist_mobilenetv1(self): batch_nums = 3 self.run_test( model, + 'inference.pdmodel', + 'inference.pdiparams', algo, round_type, data_urls, @@ -501,6 +519,8 @@ def test_post_training_abs_max_mobilenetv1(self): diff_threshold = 0.05 self.run_test( model, + 'inference.pdmodel', + 'inference.pdiparams', algo, round_type, data_urls, @@ -535,6 +555,8 @@ def test_post_training_onnx_format_mobilenetv1(self): batch_nums = 3 self.run_test( model, + 'inference.pdmodel', + 'inference.pdiparams', algo, round_type, data_urls, diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_program_resnet50.py b/python/paddle/static/quantization/tests/test_post_training_quantization_program_resnet50.py similarity index 88% rename from python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_program_resnet50.py rename to python/paddle/static/quantization/tests/test_post_training_quantization_program_resnet50.py index 345853636a41ba..26d52cf2011704 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_program_resnet50.py +++ b/python/paddle/static/quantization/tests/test_post_training_quantization_program_resnet50.py @@ -12,24 +12,22 @@ # see the license for the specific language governing permissions and # limitations under the license. 
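A condensed sketch of the generate_quantized_model() flow above once model_filename / params_filename are threaded through PostTrainingQuantization. The model directory, calibration reader, and save path are placeholders; the keyword names mirror the test code, and save_quantized_model's filename arguments are assumed from how the quantized model is reloaded later:

import numpy as np
import paddle
from paddle.static.quantization import PostTrainingQuantization

paddle.enable_static()
exe = paddle.static.Executor(paddle.CPUPlace())

def calib_reader():
    # Placeholder calibration samples with a MobileNet-style input shape.
    for _ in range(32):
        yield np.random.random([3, 224, 224]).astype('float32'), np.array([0])

ptq = PostTrainingQuantization(
    executor=exe,
    model_dir='./MobileNetV1_infer',
    model_filename='inference.pdmodel',
    params_filename='inference.pdiparams',
    sample_generator=calib_reader,
    batch_size=10,
    batch_nums=3,
    algo='avg',
    quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul'],
    round_type='round',
)
ptq.quantize()
ptq.save_quantized_model(
    './int8_model',
    model_filename='model.pdmodel',
    params_filename='model.pdiparams',
)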
+import functools import os +import random import sys import time -import paddle -import random import unittest -import functools -import contextlib + import numpy as np -import paddle.fluid as fluid -from PIL import Image, ImageEnhance -from paddle.fluid.contrib.slim.quantization import ( - PostTrainingQuantizationProgram, -) +from PIL import Image from test_post_training_quantization_mobilenetv1 import ( TestPostTrainingQuantization, ) +import paddle +from paddle.static.quantization import PostTrainingQuantizationProgram + paddle.enable_static() random.seed(0) @@ -55,7 +53,7 @@ def resize_short(img, target_size): def crop_image(img, target_size, center): width, height = img.size size = target_size - if center == True: + if center is True: w_start = (width - size) / 2 h_start = (height - size) / 2 else: @@ -115,15 +113,27 @@ def val(data_dir=DATA_DIR): class TestPostTrainingQuantizationProgram(TestPostTrainingQuantization): - def run_program(self, model_path, batch_size, infer_iterations): + def run_program( + self, + model_path, + model_filename, + params_filename, + batch_size, + infer_iterations, + ): image_shape = [3, 224, 224] - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) [ infer_program, feed_dict, fetch_targets, - ] = fluid.io.load_inference_model(model_path, exe) + ] = paddle.static.load_inference_model( + model_path, + exe, + model_filename=model_filename, + params_filename=params_filename, + ) val_reader = paddle.batch(val(), batch_size) iterations = infer_iterations test_info = [] @@ -162,7 +172,12 @@ def run_program(self, model_path, batch_size, infer_iterations): infer_program, feed_dict, fetch_targets, - ] = fluid.io.load_inference_model(model_path, exe) + ] = paddle.static.load_inference_model( + model_path, + exe, + model_filename=model_filename, + params_filename=params_filename, + ) return ( throughput, latency, @@ -193,9 +208,8 @@ def generate_quantized_model( ) sys.exit(-1) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - scope = fluid.global_scope() + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) val_reader = val() same_scale_tensor_list = [ ['batch_norm_3.tmp_2#/#1', 'batch_norm_4.tmp_2#*#1'], @@ -231,6 +245,8 @@ def generate_quantized_model( def run_test( self, model, + model_filename, + params_filename, algo, round_type, data_urls, @@ -244,7 +260,6 @@ def run_test( ): infer_iterations = self.infer_iterations batch_size = self.batch_size - sample_iterations = self.sample_iterations model_cache_folder = self.download_data(data_urls, data_md5s, model) @@ -262,14 +277,12 @@ def run_test( fetch_targets, ) = self.run_program( os.path.join(model_cache_folder, "model"), + model_filename, + params_filename, batch_size, infer_iterations, ) - print( - "Start INT8 post training quantization for {0} on {1} images ...".format( - model, sample_iterations * batch_size - ) - ) + self.generate_quantized_model( infer_program, quantizable_op_type, @@ -289,7 +302,11 @@ def run_test( ) ) (int8_throughput, int8_latency, int8_acc1, _, _, _) = self.run_program( - self.int8_model, batch_size, infer_iterations + self.int8_model, + model_filename, + params_filename, + batch_size, + infer_iterations, ) print("---Post training quantization of {} method---".format(algo)) @@ -317,9 +334,9 @@ def test_post_training_abs_max_resnet50(self): algo = "abs_max" round_type = "round" data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz' + 
'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model_combined.tar.gz' ] - data_md5s = ['4a5194524823d9b76da6e738e1367881'] + data_md5s = ['db212fd4e9edc83381aef4533107e60c'] quantizable_op_type = ["conv2d", "mul"] is_full_quantize = False is_use_cache_file = False @@ -327,6 +344,8 @@ def test_post_training_abs_max_resnet50(self): diff_threshold = 0.025 self.run_test( model, + 'model.pdmodel', + 'model.pdiparams', algo, round_type, data_urls, diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py b/python/paddle/static/quantization/tests/test_post_training_quantization_resnet50.py similarity index 88% rename from python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py rename to python/paddle/static/quantization/tests/test_post_training_quantization_resnet50.py index 65e1d391399ddb..bd54d450a1a5ed 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py +++ b/python/paddle/static/quantization/tests/test_post_training_quantization_resnet50.py @@ -12,11 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest + from test_post_training_quantization_mobilenetv1 import ( TestPostTrainingQuantization, ) + import paddle paddle.enable_static() @@ -28,9 +29,9 @@ def test_post_training_resnet50(self): algo = "min_max" round_type = "round" data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz' + 'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model_combined.tar.gz' ] - data_md5s = ['4a5194524823d9b76da6e738e1367881'] + data_md5s = ['db212fd4e9edc83381aef4533107e60c'] quantizable_op_type = ["conv2d", "mul"] is_full_quantize = False is_use_cache_file = False @@ -38,6 +39,8 @@ def test_post_training_resnet50(self): diff_threshold = 0.025 self.run_test( model, + 'model.pdmodel', + 'model.pdiparams', algo, round_type, data_urls, @@ -56,9 +59,9 @@ def test_post_training_resnet50(self): algo = "min_max" round_type = "round" data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz' + 'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model_combined.tar.gz' ] - data_md5s = ['4a5194524823d9b76da6e738e1367881'] + data_md5s = ['db212fd4e9edc83381aef4533107e60c'] quantizable_op_type = ["conv2d", "mul"] is_full_quantize = False is_use_cache_file = False @@ -67,6 +70,8 @@ def test_post_training_resnet50(self): onnx_format = True self.run_test( model, + 'model.pdmodel', + 'model.pdiparams', algo, round_type, data_urls, diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py b/python/paddle/static/quantization/tests/test_post_training_quantization_while.py similarity index 97% rename from python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py rename to python/paddle/static/quantization/tests/test_post_training_quantization_while.py index 628d120f45ebd9..71482209b3f058 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py +++ b/python/paddle/static/quantization/tests/test_post_training_quantization_while.py @@ -11,19 +11,17 @@ # without warranties or conditions of any kind, either express or implied. # see the license for the specific language governing permissions and # limitations under the license. 
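The data_urls / data_md5s above now point at *_combined archives with new checksums. A hypothetical condensation of the download_model()/cache_unzipping() helpers these tests use, relying only on md5file (imported in the patch) and shell tools; the function name and layout are illustrative:

import os
from paddle.dataset.common import md5file

def fetch_and_extract(url, md5sum, cache_dir, folder_name):
    # Re-download only when the cached archive is missing or its md5 differs.
    os.makedirs(cache_dir, exist_ok=True)
    zip_path = os.path.join(cache_dir, url.split('/')[-1])
    if not (os.path.exists(zip_path) and md5file(zip_path) == md5sum):
        os.system(f'wget -q -O {zip_path} {url}')
    target = os.path.join(cache_dir, folder_name)
    os.system(f'mkdir -p {target} && tar xf {zip_path} -C {target}')
    return target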
-import unittest import os -import time -import sys import random -import math -import functools -import contextlib +import sys +import time +import unittest + import numpy as np + import paddle -import paddle.fluid as fluid from paddle.dataset.common import download -from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization +from paddle.static.quantization import PostTrainingQuantization paddle.enable_static() @@ -77,13 +75,13 @@ def download_model(self, data_url, data_md5, folder_name): def run_program(self, model_path, batch_size, infer_iterations): print("test model path:" + model_path) - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) [ infer_program, feed_dict, fetch_targets, - ] = fluid.io.load_inference_model( + ] = paddle.static.load_inference_model( model_path, model_filename='model.pdmodel', params_filename='model.pdiparams', @@ -137,9 +135,8 @@ def generate_quantized_model( is_data_loader=False, ): - place = fluid.CPUPlace() - exe = fluid.Executor(place) - scope = fluid.global_scope() + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) val_reader = paddle.dataset.mnist.train() def val_data_generator(): diff --git a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py b/python/paddle/static/quantization/tests/test_quant2_int8_mkldnn_pass.py similarity index 95% rename from python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py rename to python/paddle/static/quantization/tests/test_quant2_int8_mkldnn_pass.py index 0f7a43ebebd683..61c700d23b7f4e 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py +++ b/python/paddle/static/quantization/tests/test_quant2_int8_mkldnn_pass.py @@ -13,12 +13,13 @@ # limitations under the license. 
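For orientation, a rough sketch of the calibration-reader contract the while-model and MNIST tests above hand to PostTrainingQuantization. The shapes and iteration counts are assumptions for a MNIST-style single-input model; real tests use paddle.dataset.mnist readers instead of random data:

import numpy as np

def sample_generator():
    # A sample generator yields one (input, label) pair per step.
    for _ in range(100):
        yield np.random.random([1, 28, 28]).astype('float32'), np.array([0])

def batch_generator(batch_size=10, batch_nums=10):
    # A batch generator typically yields a list with one already-batched
    # array per feed slot of the model.
    for _ in range(batch_nums):
        yield [np.random.random([batch_size, 1, 28, 28]).astype('float32')]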
import unittest + import numpy as np -import paddle.fluid as fluid -import paddle.fluid.core as core -from paddle.fluid.framework import IrGraph -from paddle.fluid.contrib.slim.quantization import Quant2Int8MkldnnPass + import paddle +from paddle.fluid.framework import IrGraph +from paddle.framework import core +from paddle.static.quantization import Quant2Int8MkldnnPass paddle.enable_static() @@ -28,8 +29,8 @@ def op_name(self): return "mul" def setUp(self): - self.scope = fluid.Scope() - self.place = fluid.CPUPlace() + self.scope = paddle.static.global_scope() + self.place = paddle.CPUPlace() self.dtype = np.float32 self.use_mkldnn = True @@ -67,8 +68,8 @@ def prepare_program_mul(self, program): ) def test_dequantize_op_weights(self): - program = fluid.Program() - with fluid.program_guard(program): + program = paddle.static.Program() + with paddle.static.program_guard(program): self.prepare_program_mul(program) graph = IrGraph(core.Graph(program.desc), for_test=True) @@ -131,8 +132,8 @@ def op_name(self): class TestQuant2Int8MkldnnPassConv2D(unittest.TestCase): def setUp(self): - self.scope = fluid.Scope() - self.place = fluid.CPUPlace() + self.scope = paddle.static.global_scope() + self.place = paddle.CPUPlace() self.dtype = np.float32 self.use_cudnn = False self.use_mkldnn = True @@ -218,8 +219,8 @@ def check_graph_after_pass(self, graph): self.assertTrue(op.op().attr("fuse_activation") == "relu") def test_quant_update_activation(self): - program = fluid.Program() - with fluid.program_guard(program): + program = paddle.static.Program() + with paddle.static.program_guard(program): self.prepare_program_conv2d(program) graph = IrGraph(core.Graph(program.desc), for_test=True) graph = self.remove_fuse_activation_attribute(graph) @@ -239,8 +240,8 @@ def op_name(self): return "nearest_interp" def setUp(self): - self.scope = fluid.Scope() - self.place = fluid.CPUPlace() + self.scope = paddle.static.global_scope() + self.place = paddle.CPUPlace() self.dtype = np.float32 self.use_cudnn = False self.use_mkldnn = True @@ -352,8 +353,8 @@ def check_graph_after_pass(self, graph): self.assertTrue(op.op().attr("mkldnn_data_type") == "int8") def test_quant_update_activation(self): - program = fluid.Program() - with fluid.program_guard(program): + program = paddle.static.Program() + with paddle.static.program_guard(program): self.prepare_program(program) graph = IrGraph(core.Graph(program.desc), for_test=True) quant2_int8_mkldnn_pass = Quant2Int8MkldnnPass( diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py b/python/paddle/static/quantization/tests/test_quantization_mkldnn_pass.py similarity index 79% rename from python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py rename to python/paddle/static/quantization/tests/test_quantization_mkldnn_pass.py index fa9f5d97ae5f04..fa33fb1a87f6a6 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py +++ b/python/paddle/static/quantization/tests/test_quantization_mkldnn_pass.py @@ -13,40 +13,46 @@ # limitations under the license. 
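The MKL-DNN pass tests above all rely on the Program -> IrGraph -> Program round trip with the relocated core import. A minimal sketch of that scaffolding; the tiny matmul network is only an illustration:

import paddle
from paddle.fluid.framework import IrGraph
from paddle.framework import core

paddle.enable_static()

program = paddle.static.Program()
with paddle.static.program_guard(program):
    x = paddle.static.data(name='x', shape=[-1, 8, 8], dtype='float32')
    y = paddle.static.data(name='y', shape=[-1, 8, 8], dtype='float32')
    out = paddle.matmul(x, y)

graph = IrGraph(core.Graph(program.desc), for_test=True)
# ... apply graph passes such as Quant2Int8MkldnnPass here ...
round_tripped = graph.to_program()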
import os -import unittest import random +import unittest + import numpy as np -import paddle.fluid as fluid + import paddle from paddle.fluid.framework import IrGraph -from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass -from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass -from paddle.fluid.contrib.slim.quantization import QuantInt8MkldnnPass -from paddle.fluid import core +from paddle.framework import core +from paddle.static.quantization import ( + QuantInt8MkldnnPass, + QuantizationFreezePass, + QuantizationTransformPass, +) paddle.enable_static() os.environ["CPU_NUM"] = "1" def conv_net(img, label): - conv_pool_1 = fluid.nets.simple_img_conv_pool( + conv_out_1 = paddle.static.nn.conv2d( input=img, filter_size=5, num_filters=20, - pool_size=2, - pool_stride=2, - act="relu", + act='relu', + ) + conv_pool_1 = paddle.nn.functional.max_pool2d( + conv_out_1, kernel_size=2, stride=2 ) conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) - conv_pool_2 = fluid.nets.simple_img_conv_pool( + + conv_out_2 = paddle.static.nn.conv2d( input=conv_pool_1, filter_size=5, - num_filters=50, - pool_size=2, - pool_stride=2, - act="relu", + num_filters=20, + act='relu', ) - prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') + conv_pool_2 = paddle.nn.functional.max_pool2d( + conv_out_2, kernel_size=2, stride=2 + ) + prediction = paddle.static.nn.fc(conv_pool_2, size=10, activation='softmax') loss = paddle.nn.functional.cross_entropy( input=prediction, label=label, reduction='none', use_softmax=False ) @@ -77,17 +83,17 @@ def isinteger(self, x): def build_program(self, main, startup, is_test, seed): main.random_seed = seed startup.random_seed = seed - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - img = fluid.layers.data( - name='image', shape=[1, 28, 28], dtype='float32' + with paddle.utils.unique_name.guard(): + with paddle.static.program_guard(main, startup): + img = paddle.static.data( + name='image', shape=[-1, 1, 28, 28], dtype='float32' ) - label = fluid.layers.data( - name='label', shape=[1], dtype='int64' + label = paddle.static.data( + name='label', shape=[-1, 1], dtype='int64' ) loss = conv_net(img, label) if not is_test: - opt = fluid.optimizer.Adam(learning_rate=0.001) + opt = paddle.optimizer.Adam(learning_rate=0.001) opt.minimize(loss) return [img, label], loss @@ -103,19 +109,19 @@ def mkldnn_based_freeze_graph( random.seed(0) np.random.seed(0) - main = fluid.Program() - startup = fluid.Program() - test_program = fluid.Program() + main = paddle.static.Program() + startup = paddle.static.Program() + test_program = paddle.static.Program() feeds, loss = self.build_program(main, startup, False, seed) self.build_program(test_program, startup, True, seed) test_program = test_program.clone(for_test=True) main_graph = IrGraph(core.Graph(main.desc), for_test=False) test_graph = IrGraph(core.Graph(test_program.desc), for_test=True) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - scope = fluid.Scope() - with fluid.scope_guard(scope): + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + scope = paddle.static.global_scope() + with paddle.static.scope_guard(scope): exe.run(startup) # Apply the QuantizationTransformPass transform_pass = QuantizationTransformPass( @@ -133,12 +139,12 @@ def mkldnn_based_freeze_graph( ) transform_pass.apply(test_graph) - build_strategy = fluid.BuildStrategy() + build_strategy = paddle.static.BuildStrategy() build_strategy.memory_optimize = False 
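The conv_net rewrite above replaces fluid.nets.simple_img_conv_pool with an explicit paddle.static.nn.conv2d followed by a functional pooling call. A minimal sketch of that building block, with an illustrative 1x28x28 input:

import paddle

paddle.enable_static()

prog = paddle.static.Program()
with paddle.static.program_guard(prog):
    img = paddle.static.data(name='image', shape=[-1, 1, 28, 28], dtype='float32')
    conv = paddle.static.nn.conv2d(
        input=img, num_filters=20, filter_size=5, act='relu'
    )
    pooled = paddle.nn.functional.max_pool2d(conv, kernel_size=2, stride=2)
    pooled = paddle.static.nn.batch_norm(pooled)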
build_strategy.enable_inplace = False - binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel( - loss_name=loss.name, build_strategy=build_strategy - ) + binary = paddle.static.CompiledProgram( + main_graph.graph + ).with_data_parallel(loss_name=loss.name, build_strategy=build_strategy) quantized_test_program = test_graph.to_program() iters = 5 batch_size = 8 @@ -150,10 +156,10 @@ def mkldnn_based_freeze_graph( test_reader = paddle.batch( paddle.dataset.mnist.test(), batch_size=batch_size ) - feeder = fluid.DataFeeder(feed_list=feeds, place=place) + feeder = paddle.fluid.DataFeeder(feed_list=feeds, place=place) # Training the model to get the weights value - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): for _ in range(iters): data = next(train_reader()) loss_v = exe.run( @@ -204,12 +210,12 @@ def mkldnn_based_freeze_graph( + activation_quant_type + '_' + weight_quant_type, - np.sum(w_mkldnn), + np.sum(mul_w_mkldnn), ) ) def test_mkldnn_graph_cpu_static(self): - with fluid.unique_name.guard(): + with paddle.utils.unique_name.guard(): self.mkldnn_based_freeze_graph( False, seed=2, diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/static/quantization/tests/test_quantization_pass.py similarity index 82% rename from python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py rename to python/paddle/static/quantization/tests/test_quantization_pass.py index db89a4ee1eafca..1bad3ea9e9305b 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py +++ b/python/paddle/static/quantization/tests/test_quantization_pass.py @@ -13,19 +13,23 @@ # limitations under the license. import os -import unittest import random +import unittest + import numpy as np -import paddle.fluid as fluid + import paddle +import paddle.fluid as fluid from paddle.fluid.framework import IrGraph -from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass -from paddle.fluid.contrib.slim.quantization import QuantizationTransformPassV2 -from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass -from paddle.fluid.contrib.slim.quantization import ConvertToInt8Pass -from paddle.fluid.contrib.slim.quantization import TransformForMobilePass -from paddle.fluid.contrib.slim.quantization import AddQuantDequantPass -from paddle.fluid import core +from paddle.framework import core +from paddle.static.quantization import ( + AddQuantDequantPass, + ConvertToInt8Pass, + QuantizationFreezePass, + QuantizationTransformPass, + QuantizationTransformPassV2, + TransformForMobilePass, +) paddle.enable_static() @@ -34,11 +38,13 @@ def linear_fc(num): - data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') + data = paddle.static.data( + name='image', shape=[-1, 1, 32, 32], dtype='float32' + ) + label = paddle.static.data(name='label', shape=[-1, 1], dtype='int64') hidden = data for _ in range(num): - hidden = fluid.layers.fc(hidden, size=128, act='relu') + hidden = paddle.static.nn.fc(hidden, size=128, activation='relu') loss = paddle.nn.functional.cross_entropy( input=hidden, label=label, reduction='none', use_softmax=False ) @@ -61,34 +67,30 @@ def conv_bn_layer( ) return paddle.static.nn.batch_norm(input=tmp, act=act) - data = fluid.layers.data( + data = paddle.static.data( name='image', shape=[1, 1, 32, 32], dtype='float32', - append_batch_size=False, - ) - label = fluid.layers.data( - name='label', shape=[1, 1], 
dtype='int64', append_batch_size=False ) + label = paddle.static.data(name='label', shape=[1, 1], dtype='int64') hidden = data for _ in range(num): conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True) short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None) - hidden = paddle.nn.functional.relu(paddle.add(x=conv, y=short)) - matmul_weight = paddle.create_parameter( + hidden = paddle.add(x=conv, y=short) + hidden = paddle.nn.functional.relu(hidden) + matmul_weight = paddle.static.create_parameter( shape=[1, 16, 32, 32], dtype='float32' ) hidden = paddle.matmul(hidden, matmul_weight, True, True) if quant_skip_pattern: - with fluid.name_scope(quant_skip_pattern): + with paddle.static.name_scope(quant_skip_pattern): pool = paddle.nn.functional.avg_pool2d( - x=hidden, kernel_size=2, stride=2 + hidden, kernel_size=2, stride=2 ) else: - pool = paddle.nn.functional.avg_pool2d( - x=hidden, kernel_size=2, stride=2 - ) - fc = fluid.layers.fc(input=pool, size=10) + pool = paddle.nn.functional.avg_pool2d(hidden, kernel_size=2, stride=2) + fc = paddle.static.nn.fc(pool, size=10) loss = paddle.nn.functional.cross_entropy( input=fc, label=label, reduction='none', use_softmax=False ) @@ -97,28 +99,29 @@ def conv_bn_layer( def conv_net(img, label, quant_skip_pattern): - conv_pool_1 = fluid.nets.simple_img_conv_pool( + conv_out_1 = paddle.static.nn.conv2d( input=img, filter_size=5, num_filters=20, - pool_size=2, - pool_stride=2, - pool_type='max', - act="relu", + act='relu', + ) + conv_pool_1 = paddle.nn.functional.max_pool2d( + conv_out_1, kernel_size=2, stride=2 ) conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) - conv_pool_2 = fluid.nets.simple_img_conv_pool( + + conv_out_2 = paddle.static.nn.conv2d( input=conv_pool_1, filter_size=5, - num_filters=50, - pool_size=2, - pool_stride=2, - pool_type='avg', - act="relu", + num_filters=20, + act='relu', + ) + conv_pool_2 = paddle.nn.functional.avg_pool2d( + conv_out_2, kernel_size=2, stride=2 ) - hidden = fluid.layers.fc(input=conv_pool_2, size=100, act='relu') - with fluid.name_scope(quant_skip_pattern): - prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + hidden = paddle.static.nn.fc(conv_pool_2, size=100, activation='relu') + with paddle.static.name_scope(quant_skip_pattern): + prediction = paddle.static.nn.fc(hidden, size=10, activation='softmax') loss = paddle.nn.functional.cross_entropy( input=prediction, label=label, reduction='none', use_softmax=False ) @@ -164,16 +167,16 @@ def check_program(self, program): def linear_fc_quant( self, activation_quant_type, weight_quantize_type, for_ci=True ): - main = fluid.Program() - startup = fluid.Program() - with fluid.program_guard(main, startup): + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): loss = linear_fc(3) - opt = fluid.optimizer.Adam(learning_rate=0.001) + opt = paddle.optimizer.Adam(learning_rate=0.001) opt.minimize(loss) - place = fluid.CPUPlace() + place = paddle.CPUPlace() graph = IrGraph(core.Graph(main.desc), for_test=False) transform_pass = QuantizationTransformPass( - scope=fluid.global_scope(), + scope=paddle.static.global_scope(), place=place, activation_quantize_type=activation_quant_type, weight_quantize_type=weight_quantize_type, @@ -217,16 +220,16 @@ def residual_block_quant( quantizable_op_type, for_ci=True, ): - main = fluid.Program() - startup = fluid.Program() - with fluid.program_guard(main, startup): + main = paddle.static.Program() + startup = paddle.static.Program() + with 
paddle.static.program_guard(main, startup): loss = residual_block(2) - opt = fluid.optimizer.Adam(learning_rate=0.001) + opt = paddle.optimizer.Adam(learning_rate=0.001) opt.minimize(loss) - place = fluid.CPUPlace() + place = paddle.CPUPlace() graph = IrGraph(core.Graph(main.desc), for_test=False) transform_pass = QuantizationTransformPass( - scope=fluid.global_scope(), + scope=paddle.static.global_scope(), place=place, activation_quantize_type=activation_quant_type, weight_quantize_type=weight_quantize_type, @@ -289,36 +292,36 @@ def freeze_graph( def build_program(main, startup, is_test): main.random_seed = seed startup.random_seed = seed - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - img = fluid.layers.data( - name='image', shape=[1, 28, 28], dtype='float32' + with paddle.utils.unique_name.guard(): + with paddle.static.program_guard(main, startup): + img = paddle.static.data( + name='image', shape=[-1, 1, 28, 28], dtype='float32' ) - label = fluid.layers.data( - name='label', shape=[1], dtype='int64' + label = paddle.static.data( + name='label', shape=[-1, 1], dtype='int64' ) loss = conv_net(img, label, quant_skip_pattern) if not is_test: - opt = fluid.optimizer.Adam(learning_rate=0.001) + opt = paddle.optimizer.Adam(learning_rate=0.001) opt.minimize(loss) return [img, label], loss random.seed(0) np.random.seed(0) - main = fluid.Program() - startup = fluid.Program() - test_program = fluid.Program() + main = paddle.static.Program() + startup = paddle.static.Program() + test_program = paddle.static.Program() feeds, loss = build_program(main, startup, False) build_program(test_program, startup, True) test_program = test_program.clone(for_test=True) main_graph = IrGraph(core.Graph(main.desc), for_test=False) test_graph = IrGraph(core.Graph(test_program.desc), for_test=True) - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - scope = fluid.Scope() - with fluid.scope_guard(scope): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + exe = paddle.static.Executor(place) + scope = paddle.static.global_scope() + with paddle.static.scope_guard(scope): exe.run(startup) transform_pass = QuantizationTransformPass( scope=scope, @@ -365,13 +368,13 @@ def build_program(main, startup, is_test): marked_nodes, ) - build_strategy = fluid.BuildStrategy() + build_strategy = paddle.static.BuildStrategy() build_strategy.memory_optimize = False build_strategy.enable_inplace = False build_strategy.fuse_all_reduce_ops = False - binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel( - loss_name=loss.name, build_strategy=build_strategy - ) + binary = paddle.static.CompiledProgram( + main_graph.graph + ).with_data_parallel(loss_name=loss.name, build_strategy=build_strategy) quantized_test_program = test_graph.to_program() iters = 5 batch_size = 8 @@ -383,8 +386,8 @@ def build_program(main, startup, is_test): test_reader = paddle.batch( paddle.dataset.mnist.test(), batch_size=batch_size ) - feeder = fluid.DataFeeder(feed_list=feeds, place=place) - with fluid.scope_guard(scope): + feeder = paddle.fluid.DataFeeder(feed_list=feeds, place=place) + with paddle.static.scope_guard(scope): for _ in range(iters): data = next(train_reader()) loss_v = exe.run( @@ -403,12 +406,12 @@ def build_program(main, startup, is_test): ) test_data = next(test_reader()) - with fluid.program_guard(quantized_test_program): + with paddle.static.program_guard(quantized_test_program): w_var = fluid.framework._get_var( 'conv2d_1.w_0.quantized', 
quantized_test_program ) # Testing - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): test_loss1, w_quant = exe.run( program=quantized_test_program, feed=feeder.feed(test_data), @@ -439,7 +442,7 @@ def build_program(main, startup, is_test): ) server_program = test_graph.to_program() - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): (test_loss2,) = exe.run( program=server_program, feed=feeder.feed(test_data), @@ -511,25 +514,32 @@ def build_program(main, startup, is_test): ) server_program_int8 = test_graph.to_program() # Save the 8-bit parameter and model file. - with fluid.scope_guard(scope): - fluid.io.save_inference_model( + with paddle.static.scope_guard(scope): + feed_list = ['image', 'label'] + feed_vars = [ + server_program_int8.global_block().var(name) + for name in feed_list + ] + paddle.static.save_inference_model( 'server_int8' + dev_name + activation_quant_type + '_' - + weight_quant_type, - ['image', 'label'], + + weight_quant_type + + '/model', + feed_vars, [loss], exe, - server_program_int8, + program=server_program_int8, ) # Test whether the 8-bit parameter and model file can be loaded successfully. - [infer, feed, fetch] = fluid.io.load_inference_model( + [infer, feed, fetch] = paddle.static.load_inference_model( 'server_int8' + dev_name + activation_quant_type + '_' - + weight_quant_type, + + weight_quant_type + + '/model', exe, ) # Check the loaded 8-bit weight. @@ -576,22 +586,27 @@ def build_program(main, startup, is_test): ) mobile_program = test_graph.to_program() - with fluid.scope_guard(scope): - fluid.io.save_inference_model( + with paddle.static.scope_guard(scope): + feed_list = ['image', 'label'] + feed_vars = [ + mobile_program.global_block().var(name) for name in feed_list + ] + paddle.static.save_inference_model( 'mobile_int8' + dev_name + activation_quant_type + '_' - + weight_quant_type, - ['image', 'label'], + + weight_quant_type + + '/model', + feed_vars, [loss], exe, - mobile_program, + program=mobile_program, ) def test_freeze_graph_cuda_dynamic(self): - if fluid.core.is_compiled_with_cuda(): - with fluid.unique_name.guard(): + if core.is_compiled_with_cuda(): + with paddle.utils.unique_name.guard(): self.freeze_graph( True, seed=1, @@ -599,7 +614,7 @@ def test_freeze_graph_cuda_dynamic(self): weight_quant_type='abs_max', for_ci=True, ) - with fluid.unique_name.guard(): + with paddle.utils.unique_name.guard(): self.freeze_graph( True, seed=1, @@ -609,7 +624,7 @@ def test_freeze_graph_cuda_dynamic(self): ) def test_freeze_graph_cpu_dynamic(self): - with fluid.unique_name.guard(): + with paddle.utils.unique_name.guard(): self.freeze_graph( False, seed=2, @@ -626,8 +641,8 @@ def test_freeze_graph_cpu_dynamic(self): ) def test_freeze_graph_cuda_static(self): - if fluid.core.is_compiled_with_cuda(): - with fluid.unique_name.guard(): + if core.is_compiled_with_cuda(): + with paddle.utils.unique_name.guard(): self.freeze_graph( True, seed=1, @@ -674,7 +689,7 @@ def test_freeze_graph_cuda_static(self): ) def test_freeze_graph_cpu_static(self): - with fluid.unique_name.guard(): + with paddle.utils.unique_name.guard(): self.freeze_graph( False, seed=2, @@ -720,48 +735,50 @@ def conv_bn_layer( ) return paddle.static.nn.batch_norm(input=tmp, act=act) - data1 = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') - data2 = fluid.layers.data( - name='matmul_input', shape=[16, 32, 32], dtype='float32' + data1 = paddle.static.data( + name='image', shape=[-1, 1, 32, 32], dtype='float32' + ) + data2 = 
paddle.static.data( + name='matmul_input', shape=[-1, 16, 32, 32], dtype='float32' ) - label = fluid.layers.data(name='label', shape=[1], dtype='int64') + label = paddle.static.data(name='label', shape=[-1, 1], dtype='int64') hidden = data1 for _ in range(num): conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True) short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None) - hidden = paddle.nn.functional.relu(paddle.add(x=conv, y=short)) + hidden = paddle.add(x=conv, y=short) + hidden = paddle.nn.functional.relu(hidden) hidden = paddle.matmul(hidden, data2, True, True) if isinstance(quant_skip_pattern, str): - with fluid.name_scope(quant_skip_pattern): + with paddle.static.name_scope(quant_skip_pattern): pool1 = paddle.nn.functional.avg_pool2d( - x=hidden, kernel_size=2, stride=2 + hidden, kernel_size=2, stride=2 ) pool2 = paddle.nn.functional.max_pool2d( - x=hidden, kernel_size=2, stride=2 + hidden, kernel_size=2, stride=2 ) - pool_add = paddle.nn.functional.relu(paddle.add(x=pool1, y=pool2)) + pool_add = paddle.add(pool1, pool2) + pool_add = paddle.nn.functional.relu(pool_add) elif isinstance(quant_skip_pattern, list): assert ( len(quant_skip_pattern) > 1 ), 'test config error: the len of quant_skip_pattern list should be greater than 1.' - with fluid.name_scope(quant_skip_pattern[0]): + with paddle.static.name_scope(quant_skip_pattern[0]): pool1 = paddle.nn.functional.avg_pool2d( - x=hidden, kernel_size=2, stride=2 + hidden, kernel_size=2, stride=2 ) pool2 = paddle.nn.functional.max_pool2d( - x=hidden, kernel_size=2, stride=2 + hidden, kernel_size=2, stride=2 ) - with fluid.name_scope(quant_skip_pattern[1]): - pool_add = paddle.nn.functional.relu(paddle.add(x=pool1, y=pool2)) + with paddle.static.name_scope(quant_skip_pattern[1]): + pool_add = paddle.add(pool1, pool2) + pool_add = paddle.nn.functional.relu(pool_add) else: - pool1 = paddle.nn.functional.avg_pool2d( - x=hidden, kernel_size=2, stride=2 - ) - pool2 = paddle.nn.functional.max_pool2d( - x=hidden, kernel_size=2, stride=2 - ) - pool_add = paddle.nn.functional.relu(paddle.add(x=pool1, y=pool2)) - fc = fluid.layers.fc(input=pool_add, size=10) + pool1 = paddle.nn.functional.avg_pool2d(hidden, kernel_size=2, stride=2) + pool2 = paddle.nn.functional.max_pool2d(hidden, kernel_size=2, stride=2) + pool_add = paddle.add(pool1, pool2) + pool_add = paddle.nn.functional.relu(pool_add) + fc = paddle.static.nn.fc(pool_add, size=10) loss = paddle.nn.functional.cross_entropy( input=fc, label=label, reduction='none', use_softmax=False ) @@ -814,16 +831,16 @@ def check_graph(self, graph, skip_pattern=None): def residual_block_quant( self, quantizable_op_type, skip_pattern=None, for_ci=True ): - main = fluid.Program() - startup = fluid.Program() - with fluid.program_guard(main, startup): + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): loss = quant_dequant_residual_block(2, skip_pattern) - opt = fluid.optimizer.Adam(learning_rate=0.001) + opt = paddle.optimizer.Adam(learning_rate=0.001) opt.minimize(loss) - place = fluid.CPUPlace() + place = paddle.CPUPlace() graph = IrGraph(core.Graph(main.desc), for_test=False) add_quant_dequant_pass = AddQuantDequantPass( - scope=fluid.global_scope(), + scope=paddle.static.global_scope(), place=place, skip_pattern=skip_pattern, quantizable_op_type=quantizable_op_type, @@ -904,16 +921,16 @@ def check_program(self, program): def linear_fc_quant( self, activation_quant_type, weight_quantize_type, for_ci=True ): - main = fluid.Program() 
- startup = fluid.Program() - with fluid.program_guard(main, startup): + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): loss = linear_fc(3) - opt = fluid.optimizer.Adam(learning_rate=0.001) + opt = paddle.optimizer.Adam(learning_rate=0.001) opt.minimize(loss) - place = fluid.CPUPlace() + place = paddle.CPUPlace() graph = IrGraph(core.Graph(main.desc), for_test=False) transform_pass = QuantizationTransformPassV2( - scope=fluid.global_scope(), + scope=paddle.static.global_scope(), place=place, activation_quantize_type=activation_quant_type, weight_quantize_type=weight_quantize_type, @@ -952,16 +969,16 @@ def residual_block_quant( quantizable_op_type, for_ci=True, ): - main = fluid.Program() - startup = fluid.Program() - with fluid.program_guard(main, startup): + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): loss = residual_block(2) - opt = fluid.optimizer.Adam(learning_rate=0.001) + opt = paddle.optimizer.Adam(learning_rate=0.001) opt.minimize(loss) - place = fluid.CPUPlace() + place = paddle.CPUPlace() graph = IrGraph(core.Graph(main.desc), for_test=False) transform_pass = QuantizationTransformPass( - scope=fluid.global_scope(), + scope=paddle.static.global_scope(), place=place, activation_quantize_type=activation_quant_type, weight_quantize_type=weight_quantize_type, diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py b/python/paddle/static/quantization/tests/test_quantization_scale_pass.py similarity index 71% rename from python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py rename to python/paddle/static/quantization/tests/test_quantization_scale_pass.py index 49393b5a89f126..6e9a1bf11f342a 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py +++ b/python/paddle/static/quantization/tests/test_quantization_scale_pass.py @@ -13,19 +13,22 @@ # limitations under the license. 
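A minimal sketch of the transform-pass flow exercised by the tests above, using only calls that appear in the patch (QuantizationTransformPass with scope/place and quantize types, apply(), to_program()); the tiny fc network and the chosen quantize types are illustrative:

import paddle
from paddle.fluid.framework import IrGraph
from paddle.framework import core
from paddle.static.quantization import QuantizationTransformPass

paddle.enable_static()

main = paddle.static.Program()
startup = paddle.static.Program()
with paddle.static.program_guard(main, startup):
    x = paddle.static.data(name='x', shape=[-1, 32], dtype='float32')
    out = paddle.static.nn.fc(x, size=10)

place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
exe.run(startup)

graph = IrGraph(core.Graph(main.desc), for_test=False)
transform_pass = QuantizationTransformPass(
    scope=paddle.static.global_scope(),
    place=place,
    activation_quantize_type='moving_average_abs_max',
    weight_quantize_type='abs_max',
)
transform_pass.apply(graph)
quant_program = graph.to_program()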
import os -import unittest import random -import numpy as np import tempfile -import paddle.fluid as fluid +import unittest + +import numpy as np + import paddle from paddle.fluid.framework import IrGraph -from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass -from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass -from paddle.fluid.contrib.slim.quantization import OutScaleForTrainingPass -from paddle.fluid.contrib.slim.quantization import OutScaleForInferencePass -from paddle.fluid.contrib.slim.quantization import AddQuantDequantPass -from paddle.fluid import core +from paddle.framework import core +from paddle.static.quantization import ( + AddQuantDequantPass, + OutScaleForInferencePass, + OutScaleForTrainingPass, + QuantizationFreezePass, + QuantizationTransformPass, +) paddle.enable_static() @@ -34,27 +37,27 @@ def conv_net(img, label): - conv_pool_1 = fluid.nets.simple_img_conv_pool( + conv_out_1 = paddle.static.nn.conv2d( input=img, filter_size=5, num_filters=20, - pool_size=2, - pool_stride=2, - pool_type='max', - act="relu", + act='relu', + ) + conv_pool_1 = paddle.nn.functional.max_pool2d( + conv_out_1, kernel_size=2, stride=2 ) conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) - conv_pool_2 = fluid.nets.simple_img_conv_pool( + conv_out_2 = paddle.static.nn.conv2d( input=conv_pool_1, filter_size=5, - num_filters=50, - pool_size=2, - pool_stride=2, - pool_type='avg', - act="relu", + num_filters=20, + act='relu', ) - hidden = fluid.layers.fc(input=conv_pool_2, size=100, act='relu') - prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + conv_pool_2 = paddle.nn.functional.avg_pool2d( + conv_out_2, kernel_size=2, stride=2 + ) + hidden = paddle.static.nn.fc(conv_pool_2, size=100, activation='relu') + prediction = paddle.static.nn.fc(hidden, size=10, activation='softmax') loss = paddle.nn.functional.cross_entropy( input=prediction, label=label, reduction='none', use_softmax=False ) @@ -74,36 +77,36 @@ def quantization_scale( def build_program(main, startup, is_test): main.random_seed = seed startup.random_seed = seed - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - img = fluid.layers.data( - name='image', shape=[1, 28, 28], dtype='float32' + with paddle.utils.unique_name.guard(): + with paddle.static.program_guard(main, startup): + img = paddle.static.data( + name='image', shape=[-1, 1, 28, 28], dtype='float32' ) - label = fluid.layers.data( - name='label', shape=[1], dtype='int64' + label = paddle.static.data( + name='label', shape=[-1, 1], dtype='int64' ) loss = conv_net(img, label) if not is_test: - opt = fluid.optimizer.Adam(learning_rate=0.0001) + opt = paddle.optimizer.Adam(learning_rate=0.0001) opt.minimize(loss) return [img, label], loss random.seed(0) np.random.seed(0) - main = fluid.Program() - startup = fluid.Program() - test_program = fluid.Program() + main = paddle.static.Program() + startup = paddle.static.Program() + test_program = paddle.static.Program() feeds, loss = build_program(main, startup, False) build_program(test_program, startup, True) test_program = test_program.clone(for_test=True) main_graph = IrGraph(core.Graph(main.desc), for_test=False) test_graph = IrGraph(core.Graph(test_program.desc), for_test=True) - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - scope = fluid.Scope() - with fluid.scope_guard(scope): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + exe = paddle.static.Executor(place) + scope = 
paddle.static.global_scope() + with paddle.static.scope_guard(scope): exe.run(startup) transform_pass = QuantizationTransformPass( @@ -135,13 +138,13 @@ def build_program(main, startup, is_test): marked_nodes.add(op) test_graph.draw('.', 'test_scale' + dev_name, marked_nodes) - build_strategy = fluid.BuildStrategy() + build_strategy = paddle.static.BuildStrategy() build_strategy.memory_optimize = False build_strategy.enable_inplace = False build_strategy.fuse_all_reduce_ops = False - binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel( - loss_name=loss.name, build_strategy=build_strategy - ) + binary = paddle.static.CompiledProgram( + main_graph.graph + ).with_data_parallel(loss_name=loss.name, build_strategy=build_strategy) iters = 5 batch_size = 8 @@ -149,8 +152,8 @@ def build_program(main, startup, is_test): paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=500), batch_size=batch_size, ) - feeder = fluid.DataFeeder(feed_list=feeds, place=place) - with fluid.scope_guard(scope): + feeder = paddle.fluid.DataFeeder(feed_list=feeds, place=place) + with paddle.static.scope_guard(scope): for _ in range(iters): data = next(train_reader()) loss_v = exe.run( @@ -184,20 +187,24 @@ def build_program(main, startup, is_test): with open(mapping_table_path, 'w') as f: f.write(str(server_program)) - with fluid.scope_guard(scope): - fluid.io.save_inference_model( + with paddle.static.scope_guard(scope): + feed_list = ['image', 'label'] + feed_vars = [ + server_program.global_block().var(name) for name in feed_list + ] + paddle.static.save_inference_model( save_path, - ['image', 'label'], + feed_vars, [loss], exe, - server_program, + program=server_program, clip_extra=True, ) tempdir.cleanup() def test_quant_scale_cuda(self): - if fluid.core.is_compiled_with_cuda(): - with fluid.unique_name.guard(): + if core.is_compiled_with_cuda(): + with paddle.utils.unique_name.guard(): self.quantization_scale( True, seed=1, @@ -207,7 +214,7 @@ def test_quant_scale_cuda(self): ) def test_quant_scale_cpu(self): - with fluid.unique_name.guard(): + with paddle.utils.unique_name.guard(): self.quantization_scale( False, seed=2, diff --git a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py b/python/paddle/static/quantization/tests/test_user_defined_quantization.py similarity index 75% rename from python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py rename to python/paddle/static/quantization/tests/test_user_defined_quantization.py index 63813f7ebfc177..b5856854b44fd9 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py +++ b/python/paddle/static/quantization/tests/test_user_defined_quantization.py @@ -12,23 +12,24 @@ # see the license for the specific language governing permissions and # limitations under the license. 
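The save path above also changes shape: paddle.static.save_inference_model takes Variable objects as feed targets and a path prefix, rather than fluid.io's feed-name list plus directory. A hedged round-trip sketch, with a placeholder path prefix and a toy network standing in for the test's server_program:

```python
# Hedged round-trip sketch of the 2.x inference-model API used above:
# feed_vars are Variable objects looked up from the program, and the first
# argument is a path prefix. Paths and the toy network are placeholders.
import paddle

paddle.enable_static()

main = paddle.static.Program()
startup = paddle.static.Program()
with paddle.static.program_guard(main, startup):
    image = paddle.static.data(name='image', shape=[-1, 1, 28, 28], dtype='float32')
    out = paddle.static.nn.fc(image, size=10)

exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(startup)

feed_vars = [main.global_block().var('image')]   # Variables, not feed names
paddle.static.save_inference_model(
    './quant_scale_infer',                       # placeholder path prefix
    feed_vars,
    [out],
    exe,
    program=main,
    clip_extra=True,
)

[infer_program, feed_names, fetch_targets] = paddle.static.load_inference_model(
    './quant_scale_infer', exe
)
```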
-import os -import unittest import json +import os import random -import numpy as np import tempfile -import paddle.fluid as fluid +import unittest + +import numpy as np + import paddle from paddle.fluid.framework import IrGraph -from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass -from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass -from paddle.fluid.contrib.slim.quantization import OutScaleForTrainingPass -from paddle.fluid.contrib.slim.quantization import OutScaleForInferencePass -from paddle.fluid.contrib.slim.quantization import AddQuantDequantPass -from paddle.fluid import core -from paddle.fluid.layer_helper import LayerHelper -import paddle.nn.functional as F +from paddle.framework import LayerHelper, core +from paddle.static.quantization import ( + AddQuantDequantPass, + OutScaleForInferencePass, + OutScaleForTrainingPass, + QuantizationFreezePass, + QuantizationTransformPass, +) paddle.enable_static() @@ -37,27 +38,27 @@ def conv_net(img, label): - conv_pool_1 = fluid.nets.simple_img_conv_pool( + conv_out_1 = paddle.static.nn.conv2d( input=img, filter_size=5, num_filters=20, - pool_size=2, - pool_stride=2, - pool_type='max', - act="relu", + act='relu', + ) + conv_pool_1 = paddle.nn.functional.max_pool2d( + conv_out_1, kernel_size=2, stride=2 ) conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) - conv_pool_2 = fluid.nets.simple_img_conv_pool( + conv_out_2 = paddle.static.nn.conv2d( input=conv_pool_1, filter_size=5, - num_filters=50, - pool_size=2, - pool_stride=2, - pool_type='avg', - act="relu", + num_filters=20, + act='relu', ) - hidden = fluid.layers.fc(input=conv_pool_2, size=100, act='relu') - prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + conv_pool_2 = paddle.nn.functional.avg_pool2d( + conv_out_2, kernel_size=2, stride=2 + ) + hidden = paddle.static.nn.fc(conv_pool_2, size=100, activation='relu') + prediction = paddle.static.nn.fc(hidden, size=10, activation='softmax') loss = paddle.nn.functional.cross_entropy( input=prediction, label=label, reduction='none', use_softmax=False ) @@ -69,15 +70,17 @@ def pact(x, name=None): helper = LayerHelper("pact", **locals()) dtype = 'float32' init_thres = 20 - u_param_attr = fluid.ParamAttr( + u_param_attr = paddle.ParamAttr( name=x.name + '_pact', - initializer=fluid.initializer.ConstantInitializer(value=init_thres), - regularizer=fluid.regularizer.L2Decay(0.0001), + initializer=paddle.nn.initializer.Constant(value=init_thres), + regularizer=paddle.regularizer.L2Decay(0.0001), learning_rate=1, ) u_param = helper.create_parameter(attr=u_param_attr, shape=[1], dtype=dtype) - x = paddle.subtract(x, F.relu(paddle.subtract(x, u_param))) - x = paddle.add(x, F.relu(paddle.subtract(-u_param, x))) + x = paddle.subtract( + x, paddle.nn.functional.relu(paddle.subtract(x, u_param)) + ) + x = paddle.add(x, paddle.nn.functional.relu(paddle.subtract(-u_param, x))) return x @@ -98,23 +101,23 @@ def quantization_scale( def build_program(main, startup, is_test): main.random_seed = seed startup.random_seed = seed - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - img = fluid.layers.data( - name='image', shape=[1, 28, 28], dtype='float32' + with paddle.utils.unique_name.guard(): + with paddle.static.program_guard(main, startup): + img = paddle.static.data( + name='image', shape=[-1, 1, 28, 28], dtype='float32' ) img.stop_gradient = False - label = fluid.layers.data( - name='label', shape=[1], dtype='int64' + label = paddle.static.data( + name='label', 
shape=[-1, 1], dtype='int64' ) loss = conv_net(img, label) if not is_test: - opt = fluid.optimizer.SGD(learning_rate=0.0001) + opt = paddle.optimizer.SGD(learning_rate=0.0001) opt.minimize(loss) return [img, label], loss def get_optimizer(): - return fluid.optimizer.MomentumOptimizer(0.0001, 0.9) + return paddle.optimizer.Momentum(0.0001, 0.9) def load_dict(mapping_table_path): with open(mapping_table_path, 'r') as file: @@ -131,19 +134,19 @@ def save_dict(Dict, mapping_table_path): tempdir = tempfile.TemporaryDirectory() mapping_table_path = os.path.join(tempdir.name, 'inference') - main = fluid.Program() - startup = fluid.Program() - test_program = fluid.Program() + main = paddle.static.Program() + startup = paddle.static.Program() + test_program = paddle.static.Program() feeds, loss = build_program(main, startup, False) build_program(test_program, startup, True) test_program = test_program.clone(for_test=True) main_graph = IrGraph(core.Graph(main.desc), for_test=False) test_graph = IrGraph(core.Graph(test_program.desc), for_test=True) - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - scope = fluid.Scope() - with fluid.scope_guard(scope): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + exe = paddle.static.Executor(place) + scope = paddle.static.global_scope() + with paddle.static.scope_guard(scope): exe.run(startup) train_transform_pass = QuantizationTransformPass( scope=scope, @@ -183,13 +186,13 @@ def save_dict(Dict, mapping_table_path): dev_name = '_gpu' if use_cuda else '_cpu' - build_strategy = fluid.BuildStrategy() + build_strategy = paddle.static.BuildStrategy() build_strategy.memory_optimize = False build_strategy.enable_inplace = False build_strategy.fuse_all_reduce_ops = False - binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel( - loss_name=loss.name, build_strategy=build_strategy - ) + binary = paddle.static.CompiledProgram( + main_graph.graph + ).with_data_parallel(loss_name=loss.name, build_strategy=build_strategy) iters = 5 batch_size = 8 @@ -197,8 +200,8 @@ def save_dict(Dict, mapping_table_path): paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=500), batch_size=batch_size, ) - feeder = fluid.DataFeeder(feed_list=feeds, place=place) - with fluid.scope_guard(scope): + feeder = paddle.fluid.DataFeeder(feed_list=feeds, place=place) + with paddle.static.scope_guard(scope): for _ in range(iters): data = next(train_reader()) loss_v = exe.run( @@ -223,8 +226,8 @@ def save_dict(Dict, mapping_table_path): tempdir.cleanup() def test_act_preprocess_cuda(self): - if fluid.core.is_compiled_with_cuda(): - with fluid.unique_name.guard(): + if core.is_compiled_with_cuda(): + with paddle.utils.unique_name.guard(): self.quantization_scale( True, seed=1, @@ -235,7 +238,7 @@ def test_act_preprocess_cuda(self): ) def test_act_preprocess_cpu(self): - with fluid.unique_name.guard(): + with paddle.utils.unique_name.guard(): self.quantization_scale( False, seed=2, @@ -246,8 +249,8 @@ def test_act_preprocess_cpu(self): ) def test_weight_preprocess_cuda(self): - if fluid.core.is_compiled_with_cuda(): - with fluid.unique_name.guard(): + if core.is_compiled_with_cuda(): + with paddle.utils.unique_name.guard(): self.quantization_scale( True, seed=1, @@ -258,7 +261,7 @@ def test_weight_preprocess_cuda(self): ) def test_weight_preprocess_cpu(self): - with fluid.unique_name.guard(): + with paddle.utils.unique_name.guard(): self.quantization_scale( False, seed=2, @@ -269,8 +272,8 @@ def 
test_weight_preprocess_cpu(self): ) def test_act_quantize_cuda(self): - if fluid.core.is_compiled_with_cuda(): - with fluid.unique_name.guard(): + if core.is_compiled_with_cuda(): + with paddle.utils.unique_name.guard(): self.quantization_scale( True, seed=1, @@ -281,7 +284,7 @@ def test_act_quantize_cuda(self): ) def test_act_quantize_cpu(self): - with fluid.unique_name.guard(): + with paddle.utils.unique_name.guard(): self.quantization_scale( False, seed=2, @@ -292,8 +295,8 @@ def test_act_quantize_cpu(self): ) def test_weight_quantize_cuda(self): - if fluid.core.is_compiled_with_cuda(): - with fluid.unique_name.guard(): + if core.is_compiled_with_cuda(): + with paddle.utils.unique_name.guard(): self.quantization_scale( True, seed=1, @@ -304,7 +307,7 @@ def test_weight_quantize_cuda(self): ) def test_weight_quantize_cpu(self): - with fluid.unique_name.guard(): + with paddle.utils.unique_name.guard(): self.quantization_scale( False, seed=2, diff --git a/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py b/python/paddle/static/quantization/tests/test_weight_quantization_mobilenetv1.py similarity index 88% rename from python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py rename to python/paddle/static/quantization/tests/test_weight_quantization_mobilenetv1.py index 8a8099df945e1b..e85064f5847a0e 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py +++ b/python/paddle/static/quantization/tests/test_weight_quantization_mobilenetv1.py @@ -12,13 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import unittest import os import time +import unittest + import numpy as np -from paddle.dataset.common import download, DATA_HOME -from paddle.fluid.contrib.slim.quantization import WeightQuantization + import paddle +from paddle.dataset.common import DATA_HOME, download +from paddle.static.quantization import WeightQuantization paddle.enable_static() @@ -73,6 +75,8 @@ def cache_unzipping(self, target_folder, zip_path): def quantize_to_int( self, model_name, + model_filename, + params_filename, model_data_url, model_data_md5, weight_bits, @@ -93,7 +97,11 @@ def quantize_to_int( model_name + "_wq_" + str(weight_bits) + "_" + timestamp, ) - weight_quant = WeightQuantization(model_dir=load_model_dir) + weight_quant = WeightQuantization( + model_dir=load_model_dir, + model_filename=model_filename, + params_filename=params_filename, + ) weight_quant.quantize_weight_to_int( save_model_dir=save_model_dir, weight_bits=weight_bits, @@ -183,7 +191,7 @@ def run_models( inference_program, feed_target_names, fetch_targets, - ] = paddle.fluid.io.load_inference_model( + ] = paddle.static.load_inference_model( model_dir, exe, model_filename=model_filename, @@ -193,10 +201,10 @@ def run_models( if is_fp16_model: for var in inference_program.list_vars(): if ( - (var.type == paddle.fluid.core.VarDesc.VarType.RAW) + (var.type == paddle.framework.core.VarDesc.VarType.RAW) or (not var.persistable) or (var.name in ['feed', 'fetch']) - or (var.dtype != paddle.fluid.core.VarDesc.VarType.FP16) + or (var.dtype != paddle.framework.core.VarDesc.VarType.FP16) ): continue tensor = _load_variable_data(scope, var.name) @@ -228,9 +236,11 @@ def test_weight_quantization_mobilenetv1_8bit_abs_max(self): generate_test_model = True threshold_rate = 0.0 self.quantize_to_int( - self.nocomb_model_name, - self.nocomb_model_data_url, - self.nocomb_model_data_md5, + self.comb_model_name, + 
'__model__', + '__params__', + self.comb_model_data_url, + self.comb_model_data_md5, weight_bits, quantizable_op_type, weight_quantize_type, @@ -245,9 +255,11 @@ def test_weight_quantization_mobilenetv1_8bit_channel_wise_abs_max(self): generate_test_model = True threshold_rate = 0.0 self.quantize_to_int( - self.nocomb_model_name, - self.nocomb_model_data_url, - self.nocomb_model_data_md5, + self.comb_model_name, + '__model__', + '__params__', + self.comb_model_data_url, + self.comb_model_data_md5, weight_bits, quantizable_op_type, weight_quantize_type, @@ -262,9 +274,11 @@ def test_weight_quantization_mobilenetv1_16bit_abs_max(self): generate_test_model = False threshold_rate = 0 self.quantize_to_int( - self.nocomb_model_name, - self.nocomb_model_data_url, - self.nocomb_model_data_md5, + self.comb_model_name, + '__model__', + '__params__', + self.comb_model_data_url, + self.comb_model_data_md5, weight_bits, quantizable_op_type, weight_quantize_type, @@ -279,9 +293,11 @@ def test_weight_quantization_mobilenetv1_16bit_channel_wise_abs_max(self): generate_test_model = False threshold_rate = 1e-9 self.quantize_to_int( - self.nocomb_model_name, - self.nocomb_model_data_url, - self.nocomb_model_data_md5, + self.comb_model_name, + '__model__', + '__params__', + self.comb_model_data_url, + self.comb_model_data_md5, weight_bits, quantizable_op_type, weight_quantize_type, @@ -300,17 +316,6 @@ def test_mobilenetv1_fp16_combined(self): params_filename, ) - def test_mobilenetv1_fp16_nocombined(self): - model_filename = None - params_filename = None - self.convert_to_fp16( - self.nocomb_model_name, - self.nocomb_model_data_url, - self.nocomb_model_data_md5, - model_filename, - params_filename, - ) - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/contrib/slim/quantization/utils.py b/python/paddle/static/quantization/utils.py similarity index 98% rename from python/paddle/fluid/contrib/slim/quantization/utils.py rename to python/paddle/static/quantization/utils.py index 5f5fc99b44c7db..05e197bb750490 100644 --- a/python/paddle/fluid/contrib/slim/quantization/utils.py +++ b/python/paddle/static/quantization/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,9 +13,10 @@ # limitations under the License. import sys + import numpy as np -from ....framework import IrNode -from ....framework import Operator + +from ...fluid.framework import IrNode, Operator _weight_supported_quantizable_op_type = [ 'conv2d', @@ -158,7 +159,6 @@ "reshape": [["X"], ["Out"]], "reshape2": [["X"], ["Out"]], "transpose2": [["X"], ["Out"]], - "bilinear_interp": [["X"], ["Out"]], "nearest_interp": [["X"], ["Out"]], "trilinear_interp": [["X"], ["Out"]], "slice": [["Input"], ["Out"]], @@ -185,7 +185,6 @@ "flatten": [["X"], ["Out"]], "flatten2": [["X"], ["Out"]], "unsqueeze2": [["X"], ["Out"]], - "unsqueeze2": [["X"], ["Out"]], "flatten_contiguous_range": [["X"], ["Out"]], "split": [["X"], ["Out"]], "squeeze2": [["X"], ["Out"]], diff --git a/python/paddle/tensor/array.py b/python/paddle/tensor/array.py index fdee6500e114cc..70b606c3c6fbea 100644 --- a/python/paddle/tensor/array.py +++ b/python/paddle/tensor/array.py @@ -26,7 +26,7 @@ def array_length(array): This OP is used to get the length of the input array. 
Args: - array (list|Tensor): The input array that will be used to compute the length. In dynamic mode, ``array`` is a Python list. But in static mode, array is a Tensor whose VarType is LOD_TENSOR_ARRAY. + array (list|Tensor): The input array that will be used to compute the length. In dynamic mode, ``array`` is a Python list. But in static graph mode, array is a Tensor whose VarType is LOD_TENSOR_ARRAY. Returns: Tensor: 1-D Tensor with shape [1], which is the length of array. @@ -88,7 +88,7 @@ def array_read(array, i): output = [0.4, 0.2] Args: - array (list|Tensor): The input array. In dynamic mode, ``array`` is a Python list. But in static mode, array is a Tensor whose ``VarType`` is ``LOD_TENSOR_ARRAY``. + array (list|Tensor): The input array. In dynamic mode, ``array`` is a Python list. But in static graph mode, array is a Tensor whose ``VarType`` is ``LOD_TENSOR_ARRAY``. i (Tensor): 1-D Tensor, whose shape is [1] and dtype is int64. It represents the specified read position of ``array``. @@ -150,7 +150,7 @@ def array_write(x, i, array=None): ``x`` is written. array (list|Tensor, optional): The array into which ``x`` is written. The default value is None, when a new array will be created and returned as a result. In dynamic mode, ``array`` is a Python list. - But in static mode, array is a Tensor whose ``VarType`` is ``LOD_TENSOR_ARRAY``. + But in static graph mode, array is a Tensor whose ``VarType`` is ``LOD_TENSOR_ARRAY``. Returns: list|Tensor: The input ``array`` after ``x`` is written into. @@ -230,7 +230,7 @@ def create_array(dtype, initialized_list=None): All values in initialized list should be a Tensor. Returns: - list|Tensor: An empty array. In dynamic mode, ``array`` is a Python list. But in static mode, array is a Tensor + list|Tensor: An empty array. In dynamic mode, ``array`` is a Python list. But in static graph mode, array is a Tensor whose ``VarType`` is ``LOD_TENSOR_ARRAY``. Examples: @@ -258,7 +258,7 @@ def create_array(dtype, initialized_list=None): ) array = list(initialized_list) - # NOTE: Only support plain list like [x, y,...], not support nested list in static mode. + # NOTE: Only support plain list like [x, y,...], not support nested list in static graph mode. 
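To make the dynamic-versus-static-graph wording in these docstrings concrete, here is a small dynamic-mode sketch of the array_* helpers; in static graph mode the same calls would instead operate on a LOD_TENSOR_ARRAY variable. Shapes and values are illustrative, and the paddle.tensor namespace is assumed to expose these helpers as in the surrounding docstring examples:

```python
# Dynamic-mode sketch of the array_* helpers documented above; in static
# graph mode the same calls operate on a LOD_TENSOR_ARRAY variable instead
# of a Python list. Shapes and values are illustrative.
import paddle

arr = paddle.tensor.create_array(dtype='float32')   # a plain Python list in dynamic mode
i = paddle.zeros(shape=[1], dtype='int64')           # write/read position
x = paddle.to_tensor([5.0, 3.0], dtype='float32')

arr = paddle.tensor.array_write(x, i, array=arr)
length = paddle.tensor.array_length(arr)             # 1-D Tensor with shape [1], value 1
item = paddle.tensor.array_read(arr, i)               # the tensor stored at position 0
```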
for val in array: if not isinstance(val, Variable): raise TypeError( diff --git a/python/paddle/tensor/layer_function_generator.py b/python/paddle/tensor/layer_function_generator.py index a30fbdde5e77d5..299e41d2aea94e 100644 --- a/python/paddle/tensor/layer_function_generator.py +++ b/python/paddle/tensor/layer_function_generator.py @@ -334,6 +334,7 @@ def generate_inplace_fn(inplace_op_type): origin_op_type = inplace_op_type[:-1] def func(x, name=None): + if in_dygraph_mode(): if hasattr(_C_ops, inplace_op_type): op = getattr(_C_ops, inplace_op_type) @@ -343,7 +344,7 @@ def func(x, name=None): return op(x) else: warnings.warn( - "In static mode, {}() is the same as {}() and does not perform inplace operation.".format( + "In static graph mode, {}() is the same as {}() and does not perform inplace operation.".format( inplace_op_type, origin_op_type ) ) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 17ded2c21466b3..be215c287f9832 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1018,7 +1018,9 @@ def svd_norm(input, porder, axis=[-1]): def empty_tensor(input, shape): if in_dygraph_mode(): return input.reshape(shape) - raise ValueError("only support x is nonempty tensor in static mode") + raise ValueError( + "only support x is nonempty tensor in static graph mode" + ) x_shape = list(x.shape) if not len(x_shape) >= 2: diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 15b327d22211ca..842deaac991a9c 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -1163,7 +1163,7 @@ def concat(x, axis=0, name=None): if input[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY: # NOTE(liym27): Don't remove this if branch! # This feature is supported for Dynamic-to-Static, because after transformed, the type of inputs[0] - # is LOD_TENSOR_ARRAY in some scenarios. And this feature can be used in static mode. + # is LOD_TENSOR_ARRAY in some scenarios. And this feature can be used in static graph mode. assert len(input) == 1, ( "If the elements of 'input' in concat are Variable(LoDTensorArray), " @@ -3450,7 +3450,7 @@ def reshape(x, shape, name=None): Args: x (Tensor): An N-D Tensor. The data type is ``float32``, ``float64``, ``int32``, ``int64`` or ``bool`` shape (list|tuple|Tensor): Define the target shape. At most one dimension of the target shape can be -1. - The data type is ``int32`` . If ``shape`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. + The data type is ``int32`` . If ``shape`` is a list or tuple, the elements of it should be integers or Tensors with shape []. If ``shape`` is an Tensor, it should be an 1-D Tensor . name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -3574,10 +3574,6 @@ def get_attr_shape(list_shape): shape.stop_gradient = True inputs["Shape"] = shape elif isinstance(shape, (list, tuple)): - assert len(shape) > 0, ( - "The size of 'shape' in reshape can't be zero, " - "but received %s." 
% len(shape) - ) attrs["shape"] = get_attr_shape(shape) if utils._contain_var(shape): inputs['ShapeTensor'] = utils._convert_to_tensor_list(shape) diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index 98b346d02b4dd9..e2dcbd178ea46c 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -218,7 +218,7 @@ def std(x, axis=None, unbiased=True, keepdim=False, name=None): def numel(x, name=None): """ - Returns the number of elements for a tensor, which is a int64 Tensor with shape [1] in static mode + Returns the number of elements for a tensor, which is a int64 Tensor with shape [1] in static graph mode or a scalar value in imperative mode. Args: diff --git a/python/paddle/utils/flops.py b/python/paddle/utils/flops.py index a930e0ef5488d8..71f54ee29cbe9f 100644 --- a/python/paddle/utils/flops.py +++ b/python/paddle/utils/flops.py @@ -73,7 +73,7 @@ def _c_embedding_flops(input_shapes, attrs): def _dropout_flops(input_shapes, attrs): """FLOPs computation for dropout op. For dropout(input): - equation: flops = 0 + equation: flops = 0 """ return 0 @@ -191,7 +191,7 @@ def _matmul_v2_flops(input_shapes, attrs): """FLOPs computation for matmul_v2 op. For matmul_v2(input,other): input_shapes = [shape_of_input, shape_of_ohther] - shape_of_input = [dim1, dim2 ...dim_n_1, dim_n] length:n + shape_of_input = [dim1, dim2 ...dim_n_1, dim_n] length:n shape_of_other = [odim1, odim2 ... odim(n-m) ... odim_m_1, dim_m] length:m suppose n > m and dim_n = odim_m_1: shape_of_output = [dim1, dim2 ... max(dim(n-m), odim(n-m)), max(dim(n-m+1), odim(n-m+1))...dim_n_1, dim_m] @@ -216,13 +216,43 @@ def _matmul_v2_flops(input_shapes, attrs): return 2 * macs -@register_flops("relu") -def _relu_flops(input_shapes, attrs): - """FLOPs computation for relu op. - For relu(input): +def _relu_class_flops(input_shapes, attrs): + """FLOPs computation for relu_like ops. + For elu/leaky_relu/prelu/relu/relu6/silu (input): equation: flops = (numel)total number of elements in the input tensor. """ - return prod(input_shapes.get('X')[0]) + input = input_shapes.get('X')[0] + return prod(input) + + +@register_flops("elu") +def _elu_flops(input_shapes, attrs): + return _relu_class_flops(input_shapes, attrs) + + +@register_flops("leaky_relu") +def _leaky_relu_flops(input_shapes, attrs): + return _relu_class_flops(input_shapes, attrs) + + +@register_flops("prelu") +def _prelu_flops(input_shapes, attrs): + return _relu_class_flops(input_shapes, attrs) + + +@register_flops("relu") +def _relu_flops(input_shapes, attrs): + return _relu_class_flops(input_shapes, attrs) + + +@register_flops("relu6") +def _relu6_flops(input_shapes, attrs): + return _relu_class_flops(input_shapes, attrs) + + +@register_flops("silu") +def _silu_flops(input_shapes, attrs): + return _relu_class_flops(input_shapes, attrs) @register_flops("reshape2") diff --git a/python/paddle/utils/inplace_utils.py b/python/paddle/utils/inplace_utils.py index 48a40847708ce8..65cac04350ca44 100644 --- a/python/paddle/utils/inplace_utils.py +++ b/python/paddle/utils/inplace_utils.py @@ -20,14 +20,14 @@ # NOTE(pangyoki): The Inplace APIs with underline(`_`) is only valid for the method of calling `_C_ops` -# in dygraph mode. If static mode is used, the inplace mechanism will not be used, and the static method +# in dygraph mode. If static graph mode is used, the inplace mechanism will not be used, and the static method # of the original API will be called. 
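The relu-family change above funnels six activation ops through one shared rule, flops = numel(X). A standalone illustration of that registry pattern follows; it is a simplified mimic, not the actual paddle/utils/flops.py internals:

```python
# Standalone mimic (not the actual paddle/utils/flops.py internals) of the
# decorator registry used above: op types map to FLOPs functions, and the
# relu-like ops share one rule, flops = number of elements in input X.
from functools import reduce

_FLOPS_FUNC_MAP = {}


def prod(shape):
    return reduce(lambda a, b: a * b, shape, 1)


def register_flops(op_type):
    def register(func):
        _FLOPS_FUNC_MAP[op_type] = func
        return func

    return register


def _relu_class_flops(input_shapes, attrs):
    # one element-wise operation per element of the input tensor
    return prod(input_shapes.get('X')[0])


for _op in ('elu', 'leaky_relu', 'prelu', 'relu', 'relu6', 'silu'):
    register_flops(_op)(_relu_class_flops)

print(_FLOPS_FUNC_MAP['relu']({'X': [[8, 16, 32, 32]]}, {}))  # 131072
```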
def _inplace_apis_in_dygraph_only_(func): def __impl__(*args, **kwargs): if not _non_static_mode(): origin_api_name = func.__name__[:-1] warnings.warn( - "In static mode, {}() is the same as {}() and does not perform inplace operation.".format( + "In static graph mode, {}() is the same as {}() and does not perform inplace operation.".format( func.__name__, origin_api_name ) ) diff --git a/python/paddle/vision/transforms/functional.py b/python/paddle/vision/transforms/functional.py index 91a600efd38658..5ab8d576dfbc61 100644 --- a/python/paddle/vision/transforms/functional.py +++ b/python/paddle/vision/transforms/functional.py @@ -34,7 +34,7 @@ def _is_pil_image(img): def _is_tensor_image(img): """ - Return True if img is a Tensor for dynamic mode or Variable for static mode. + Return True if img is a Tensor for dynamic mode or Variable for static graph mode. """ return isinstance(img, (paddle.Tensor, Variable)) diff --git a/python/paddle/vision/transforms/functional_tensor.py b/python/paddle/vision/transforms/functional_tensor.py index b321f62f6aa28a..2b9a6ca658fd0c 100644 --- a/python/paddle/vision/transforms/functional_tensor.py +++ b/python/paddle/vision/transforms/functional_tensor.py @@ -776,7 +776,7 @@ def resize(img, size, interpolation='bilinear', data_format='CHW'): if isinstance(size, int): w, h = _get_image_size(img, data_format) - # TODO(Aurelius84): In static mode, w and h will be -1 for dynamic shape. + # TODO(Aurelius84): In static graph mode, w and h will be -1 for dynamic shape. # We should consider to support this case in future. if w <= 0 or h <= 0: raise NotImplementedError( diff --git a/python/setup.py.in b/python/setup.py.in old mode 100755 new mode 100644 index 00c9f738040145..d4f9ef9b7cfc4e --- a/python/setup.py.in +++ b/python/setup.py.in @@ -106,7 +106,7 @@ __all__ = ['cuda', 'cudnn', 'show'] def show(): """Get the version of paddle if `paddle` package if tagged. Otherwise, output the corresponding commit id. - + Returns: If paddle package is not tagged, the commit-id of paddle will be output. Otherwise, the following information will be output. @@ -118,13 +118,13 @@ def show(): minor: the minor version of paddle patch: the patch level version of paddle - + rc: whether it's rc version cuda: the cuda version of package. It will return `False` if CPU version paddle package is installed cudnn: the cudnn version of package. It will return `False` if CPU version paddle package is installed - + Examples: .. 
code-block:: python @@ -338,10 +338,6 @@ packages=['paddle', 'paddle.fluid.layers', 'paddle.fluid.dataloader', 'paddle.fluid.contrib', - 'paddle.fluid.contrib.quantize', - 'paddle.fluid.contrib.slim', - 'paddle.fluid.contrib.slim.quantization', - 'paddle.fluid.contrib.slim.quantization.imperative', 'paddle.fluid.contrib.extend_optimizer', 'paddle.fluid.contrib.mixed_precision', 'paddle.fluid.contrib.mixed_precision.bf16', @@ -405,6 +401,9 @@ packages=['paddle', 'paddle.static', 'paddle.static.nn', 'paddle.static.amp', + 'paddle.static.quantization', + 'paddle.quantization', + 'paddle.quantization.imperative', 'paddle.tensor', 'paddle.onnx', 'paddle.autograd', diff --git a/setup.py b/setup.py index 5408843083ce2e..092af614010060 100644 --- a/setup.py +++ b/setup.py @@ -1209,10 +1209,6 @@ def get_setup_parameters(): 'paddle.fluid.layers', 'paddle.fluid.dataloader', 'paddle.fluid.contrib', - 'paddle.fluid.contrib.quantize', - 'paddle.fluid.contrib.slim', - 'paddle.fluid.contrib.slim.quantization', - 'paddle.fluid.contrib.slim.quantization.imperative', 'paddle.fluid.contrib.extend_optimizer', 'paddle.fluid.contrib.mixed_precision', 'paddle.fluid.contrib.mixed_precision.bf16', @@ -1276,6 +1272,9 @@ def get_setup_parameters(): 'paddle.static', 'paddle.static.nn', 'paddle.static.amp', + 'paddle.static.quantization', + 'paddle.quantization', + 'paddle.quantization.imperative', 'paddle.tensor', 'paddle.onnx', 'paddle.autograd', diff --git a/tools/get_single_test_cov.py b/tools/get_single_test_cov.py index 8cdf32f348f090..b3b1df8afe29a5 100644 --- a/tools/get_single_test_cov.py +++ b/tools/get_single_test_cov.py @@ -12,12 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json import os import re +import subprocess import sys def getFNDAFile(rootPath, test): + # load base fnda + fnda_base_dict = {} + find_file_cmd = os.popen("find %s -name %s.cc" % (rootPath, test)) + if find_file_cmd.read() != "": + print("%s is a c++ unittest" % test) + with open( + "%s/build/ut_map/simple_precision_test/base_fnda.json" % rootPath, + 'r', + ) as load_f: + fnda_base_dict = json.load(load_f) + # analyse fnda filename = '%s/build/ut_map/%s/coverage.info.tmp' % (rootPath, test) fn_filename = '%s/build/ut_map/%s/fnda.tmp' % (rootPath, test) os.system('touch %s' % fn_filename) @@ -27,15 +40,28 @@ def getFNDAFile(rootPath, test): except FileNotFoundError: print("%s is not found." 
% filename) return - lines = f.readlines() - for line in lines: - line = line.replace('\n', '') - if line.startswith(('SF:')): - os.system('echo %s >> %s' % (line, fn_filename)) - elif line.startswith(('FNDA:')): - hit = int(line.split('FNDA:')[1].split(',')[0]) - if hit != 0: - os.system('echo %s >> %s' % (line, fn_filename)) + all_data = f.read().split('TN:') + del all_data[0] + for gcov_data in all_data: + message_list = gcov_data.split('\n') + os.system('echo %s >> %s' % (message_list[1], fn_filename)) + if 'FNH:0' not in gcov_data: + for message in message_list: + if message.startswith(('FNDA:')) and ( + not message.startswith(('FNDA:0,')) + ): + tmp_data = message.split('FNDA:')[1].split(',') + hit = int(tmp_data[0]) + symbol = tmp_data[1] + if symbol in fnda_base_dict: + if (hit - fnda_base_dict[symbol]) > 0: + fnda_str = 'FNDA:%s,%s' % ( + str(hit - fnda_base_dict[symbol]), + symbol, + ) + os.system('echo %s >> %s' % (fnda_str, fn_filename)) + else: + os.system('echo %s >> %s' % (message, fn_filename)) f.close() @@ -112,10 +138,55 @@ def analysisFNDAFile(rootPath, test): f.close() +def getBaseFnda(rootPath, test): + filename = '%s/build/ut_map/%s/coverage.info.tmp' % (rootPath, test) + try: + f = open(filename) + print("open %s successfully" % filename) + except FileNotFoundError: + print("%s is not found." % filename) + symbol_fnda = {} + all_data = f.read().split('TN:') + del all_data[0] + for gcov_data in all_data: + message_list = gcov_data.split('\n') + # only for cc file + if ".cc" in message_list[1]: + for message in message_list: + if message.startswith(('FNDA:')) and ( + not message.startswith(('FNDA:0,')) + ): + tmp_data = message.split('FNDA:')[1].split(',') + symbol_fnda[tmp_data[1]] = int(tmp_data[0]) + f.close() + + with open("%s/build/ut_map/%s/base_fnda.json" % (rootPath, test), "w") as f: + json.dump(symbol_fnda, f, indent=4) + + def getCovinfo(rootPath, test): ut_map_path = '%s/build/ut_map/%s' % (rootPath, test) + print("start get fluid ===>") + cmd_fluid = 'lcov --capture -d ./paddle/fluid/ -o ./paddle/fluid/coverage_fluid.info --rc lcov_branch_coverage=0' + p_fluid = subprocess.Popen(cmd_fluid, shell=True, stdout=subprocess.DEVNULL) + + print("start get phi ===>") + cmd_phi = 'lcov --capture -d ./paddle/phi -o ./paddle/phi/coverage_phi.info --rc lcov_branch_coverage=0' + p_phi = subprocess.Popen(cmd_phi, shell=True, stdout=subprocess.DEVNULL) + + print("start get utils ===>") + cmd_utils = 'lcov --capture -d ./paddle/utils -o ./paddle/utils/coverage_utils.info --rc lcov_branch_coverage=0' + p_utils = subprocess.Popen(cmd_utils, shell=True, stdout=subprocess.DEVNULL) + + print("start wait fluid ===>") + p_fluid.wait() + print("start wait phi ===>") + p_phi.wait() + print("start wait utils ===>") + p_utils.wait() + print("end wait...") os.system( - 'cd %s && lcov --capture -d . 
-o coverage.info --rc lcov_branch_coverage=0 > /dev/null 2>&1' + 'cd %s && lcov -a paddle/fluid/coverage_fluid.info -a paddle/phi/coverage_phi.info -a paddle/utils/coverage_utils.info -o coverage.info --rc lcov_branch_coverage=0 > /dev/null 2>&1' % ut_map_path ) coverage_info_path = ut_map_path + '/coverage.info' @@ -139,8 +210,11 @@ def getCovinfo(rootPath, test): os.system('rm -rf %s/paddle' % ut_map_path) os.system('rm -rf %s/coverage.info' % ut_map_path) - getFNDAFile(rootPath, test) - analysisFNDAFile(rootPath, test) + if test == "simple_precision_test": + getBaseFnda(rootPath, test) + else: + getFNDAFile(rootPath, test) + analysisFNDAFile(rootPath, test) os.system('rm -rf %s/coverage.info.tmp' % ut_map_path) diff --git a/tools/handle_h_cu_file.py b/tools/handle_h_cu_file.py index 1a096fa894e463..a33c1cd6681191 100644 --- a/tools/handle_h_cu_file.py +++ b/tools/handle_h_cu_file.py @@ -72,6 +72,31 @@ def insert_pile_to_h_file(rootPath): os.system('echo "\n#endif" >> %s' % line) +def add_simple_cxx_test(rootPath): + variant_test_path = '%s/paddle/utils/variant_test.cc' % rootPath + variant_test_cmakeflie_path = '%s/paddle/utils/CMakeLists.txt' % rootPath + if os.path.exists(variant_test_path) and os.path.exists( + variant_test_cmakeflie_path + ): + simple_test_path = '%s/paddle/utils/simple_precision_test.cc' % rootPath + os.system('touch %s' % simple_test_path) + os.system( + "echo '#include \"gtest/gtest.h\"\n' >> %s" % simple_test_path + ) + os.system( + 'echo "TEST(interface_test, type) { }\n" >> %s' % simple_test_path + ) + os.system('echo "cc_test(" >> %s' % variant_test_cmakeflie_path) + os.system( + 'echo " simple_precision_test" >> %s' % variant_test_cmakeflie_path + ) + os.system( + 'echo " SRCS simple_precision_test.cc" >> %s' + % variant_test_cmakeflie_path + ) + os.system('echo " DEPS gtest)\n" >> %s' % variant_test_cmakeflie_path) + + def remove_pile_from_h_file(rootPath): h_cu_files = '%s/tools/h_cu_files.log' % rootPath f = open(h_cu_files) @@ -130,6 +155,7 @@ def main(rootPath, dir_path): elif func == 'insert_pile_to_h_file': rootPath = sys.argv[2] insert_pile_to_h_file(rootPath) + add_simple_cxx_test(rootPath) elif func == 'analy_h_cu_file': dir_path = sys.argv[2] rootPath = sys.argv[3] diff --git a/tools/nvcc_lazy.sh b/tools/nvcc_lazy.sh index 011ac564cf91ba..3c6f0e140f250a 100755 --- a/tools/nvcc_lazy.sh +++ b/tools/nvcc_lazy.sh @@ -65,12 +65,14 @@ echo "sed -i -e '/LIBRARIES=/{s/\s//g;s/\"\"/ /g}' \${BUILDSH}.pre" >> $1 echo -e >> $1 echo "/usr/bin/env bash \${BUILDSH}.pre" >> $1 echo "STUBF=\$(find \$BUILDDIR -name *.cudafe1.stub.c)" >> $1 -echo "CUFILE=\$(basename -s '.cudafe1.stub.c' \$STUBF)" >> $1 -echo "sed -i -e '/__sti____cudaRegisterAll.*__attribute__/a static void __try____cudaRegisterAll(int);' \$STUBF" >> $1 -echo "sed -i -e 's/__sti____cudaRegisterAll\(.*{\)/__do____cudaRegisterAll\1/' \$STUBF" >> $1 -echo "# sed -i -e \"/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\\\"CUDA_MODULE_LOADING\\\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; fprintf(stderr,\\\"===> \${CUFILE} lazy-load? 
%d\\\\\\\\n\\\", l); __do____cudaRegisterAll();}\" \$STUBF" >> $1 -echo "sed -i -e \"/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\\\"CUDA_MODULE_LOADING\\\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; __do____cudaRegisterAll();}\" \$STUBF" >> $1 -echo "sed -i -e '/__try____cudaRegisterAll\(.*{\)/a static void __sti____cudaRegisterAll(void){__try____cudaRegisterAll(0);}' \$STUBF" >> $1 -echo "sed -i -e 's/{\(__device_stub__\)/{__try____cudaRegisterAll(1);\1/' \$STUBF" >> $1 +echo "if [ ! -z \"\$STUBF\" ]; then" >> $1 +echo " CUFILE=\$(basename -s '.cudafe1.stub.c' \$STUBF)" >> $1 +echo " sed -i -e '/__sti____cudaRegisterAll.*__attribute__/a static void __try____cudaRegisterAll(int);' \$STUBF" >> $1 +echo " sed -i -e 's/__sti____cudaRegisterAll\(.*{\)/__do____cudaRegisterAll\1/' \$STUBF" >> $1 +echo " # sed -i -e \"/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\\\"CUDA_MODULE_LOADING\\\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; fprintf(stderr,\\\"===> \${CUFILE} lazy-load? %d\\\\\\\\n\\\", l); __do____cudaRegisterAll();}\" \$STUBF" >> $1 +echo " sed -i -e \"/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\\\"CUDA_MODULE_LOADING\\\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; __do____cudaRegisterAll();}\" \$STUBF" >> $1 +echo " sed -i -e '/__try____cudaRegisterAll\(.*{\)/a static void __sti____cudaRegisterAll(void){__try____cudaRegisterAll(0);}' \$STUBF" >> $1 +echo " sed -i -e 's/{\(__device_stub__\)/{__try____cudaRegisterAll(1);\1/' \$STUBF" >> $1 +echo "fi" >> $1 echo "/usr/bin/env bash \${BUILDSH}.post" >> $1 echo "rm -rf \$BUILDDIR" >> $1 diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index fb09e4e2580277..e4812aa45fce84 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -486,7 +486,7 @@ def get_filenames(full_test=False): ''' global whl_error import paddle # noqa: F401 - import paddle.fluid.contrib.slim.quantization # noqa: F401 + import paddle.static.quantization # noqa: F401 whl_error = [] if full_test:
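The get_single_test_cov.py changes earlier in this diff capture fluid/phi/utils coverage in parallel lcov runs, record a baseline FNDA map from simple_precision_test, and then keep only the function hits a real unit test adds beyond that baseline. A standalone sketch of that subtraction logic, simplified and with made-up lcov records:

```python
# Standalone sketch of the coverage-diff idea from tools/get_single_test_cov.py
# above: parse FNDA (function hit-count) records from lcov output, record a
# baseline from simple_precision_test, and keep only the hits a real unit test
# adds on top of it. The lcov records here are made up for illustration.
def parse_fnda(lcov_text):
    """Return {symbol: hits} for every FNDA record with a non-zero count."""
    counts = {}
    for record in lcov_text.split('TN:')[1:]:
        for line in record.split('\n'):
            if line.startswith('FNDA:') and not line.startswith('FNDA:0,'):
                hits, symbol = line[len('FNDA:'):].split(',', 1)
                counts[symbol] = int(hits)
    return counts


def subtract_baseline(test_counts, base_counts):
    """Keep only symbols whose hit count exceeds the baseline run."""
    return {
        symbol: hits - base_counts.get(symbol, 0)
        for symbol, hits in test_counts.items()
        if hits - base_counts.get(symbol, 0) > 0
    }


base = parse_fnda("TN:\nSF:a.cc\nFNDA:3,Setup\nend_of_record\n")
test = parse_fnda("TN:\nSF:a.cc\nFNDA:3,Setup\nFNDA:7,Kernel\nend_of_record\n")
print(subtract_baseline(test, base))  # {'Kernel': 7}
```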