diff --git a/paddle/fluid/distributed/fleet.h b/paddle/fluid/distributed/fleet.h
index 697dbb9170f184..be7fe8ea23fac1 100644
--- a/paddle/fluid/distributed/fleet.h
+++ b/paddle/fluid/distributed/fleet.h
@@ -36,7 +36,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-class Tensor;
 class Scope;
 class SelectedRows;
 class Variable;
diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc
index 7e820a38581af6..deb3d26527727e 100644
--- a/paddle/fluid/distributed/fleet_executor/dist_model.cc
+++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc
@@ -145,8 +145,8 @@ bool DistModel::LoadParameters() {
   return true;
 }
 
-void DistModel::Run(const std::vector<framework::Tensor> &input_data,
-                    std::vector<framework::Tensor> *output_data) {
+void DistModel::Run(const std::vector<paddle::framework::Tensor> &input_data,
+                    std::vector<paddle::framework::Tensor> *output_data) {
   /* TODO(fleet exe dev): implement this funct */
 }
diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.h b/paddle/fluid/distributed/fleet_executor/dist_model.h
index 57bfd88147746b..182c5a508098ed 100644
--- a/paddle/fluid/distributed/fleet_executor/dist_model.h
+++ b/paddle/fluid/distributed/fleet_executor/dist_model.h
@@ -18,6 +18,7 @@
 #include <vector>
 
 #include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h"
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/place.h"
 
@@ -25,7 +26,6 @@
 namespace paddle {
 namespace framework {
 class ProgramDesc;
 class Scope;
-class Tensor;
 }
 
 namespace distributed {
@@ -45,8 +45,8 @@ class DistModel {
  public:
   explicit DistModel(const DistModelConfig& config) : config_(config) {}
   bool Init();
-  void Run(const std::vector<framework::Tensor>& input_data,
-           std::vector<framework::Tensor>* output_data);
+  void Run(const std::vector<paddle::framework::Tensor>& input_data,
+           std::vector<paddle::framework::Tensor>* output_data);
   ~DistModel() = default;
 
  private:
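Note on the recurring forward-declaration change above and throughout this diff: once `paddle::framework::Tensor` becomes a type alias for `pten::DenseTensor` (see the tensor.h hunk later in this PR), it can no longer be forward-declared with `class Tensor;`, so headers that only name the type forward-declare the underlying pten class instead. A minimal sketch of the pattern (the function below is a hypothetical example, not from this PR):

```cpp
// Before (ill-formed once the alias lands): re-declaring the alias as a class.
// namespace paddle {
// namespace framework {
// class Tensor;
// }  // namespace framework
// }  // namespace paddle

// After: forward-declare the real class in its home namespace.
namespace pten {
class DenseTensor;
}  // namespace pten

// Pointers and references compile against the forward declaration; anything
// needing the complete type includes "paddle/fluid/framework/tensor.h".
void Inspect(const pten::DenseTensor* t);
```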
diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/service/brpc_utils.cc
index 4d9f84fdc6e0f3..147758abfd5553 100644
--- a/paddle/fluid/distributed/service/brpc_utils.cc
+++ b/paddle/fluid/distributed/service/brpc_utils.cc
@@ -20,10 +20,13 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 class Variable;
-class Tensor;
 }  // namespace framework
 }  // namespace paddle
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
index c0c1fda4c4fca1..f83c7bdb15fa1c 100644
--- a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
+++ b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
@@ -31,11 +31,14 @@ class PSClient;
 class PSServer;
 }  // namespace distributed
 namespace framework {
-class Tensor;
 class Variable;
 }  // namespace framework
 }  // namespace paddle
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace operators = paddle::operators;
diff --git a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
index 471750feaefef7..f9c2b55eb4fee2 100644
--- a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
+++ b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
@@ -32,11 +32,14 @@ class PSClient;
 class PSServer;
 }  // namespace distributed
 namespace framework {
-class Tensor;
 class Variable;
 }  // namespace framework
 }  // namespace paddle
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace operators = paddle::operators;
diff --git a/paddle/fluid/eager/legacy/prepared_operator.h b/paddle/fluid/eager/legacy/prepared_operator.h
index 0e00b52e0481aa..87720fd8f005b8 100644
--- a/paddle/fluid/eager/legacy/prepared_operator.h
+++ b/paddle/fluid/eager/legacy/prepared_operator.h
@@ -29,7 +29,6 @@ DECLARE_bool(use_mkldnn);
 namespace paddle {
 namespace framework {
-class Tensor;
 class Variable;
 }  // namespace framework
 namespace platform {
@@ -37,6 +36,10 @@ class DeviceContext;
 }  // namespace platform
 }  // namespace paddle
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace egr {
 namespace legacy {
diff --git a/paddle/fluid/framework/copy_same_tensor_test.cc b/paddle/fluid/framework/copy_same_tensor_test.cc
index 14bef7fe023f63..496991fd6862d1 100644
--- a/paddle/fluid/framework/copy_same_tensor_test.cc
+++ b/paddle/fluid/framework/copy_same_tensor_test.cc
@@ -68,7 +68,7 @@ static bool CopySameTensorTestMain(const DDim &dims,
   if (sync_copy) {
     TensorCopySync(src_tensor, dst_place, &src_tensor);
   } else {
-    TensorCopy(src_tensor, dst_place, &src_tensor);
+    paddle::framework::TensorCopy(src_tensor, dst_place, &src_tensor);
     platform::DeviceContextPool::Instance().Get(src_place)->Wait();
     platform::DeviceContextPool::Instance().Get(dst_place)->Wait();
   }
diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc
index d06f5a0227af74..1a4f283f511da4 100644
--- a/paddle/fluid/framework/data_device_transform.cc
+++ b/paddle/fluid/framework/data_device_transform.cc
@@ -28,8 +28,9 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
   // NOTE(zhiqiu): Special case for CPU->NPU, avoid stream sync.
   if (platform::is_cpu_place(in.place()) &&
       platform::is_npu_place(dst_place)) {
-    TensorCopy(in, dst_place,
-               *platform::DeviceContextPool::Instance().Get(dst_place), out);
+    paddle::framework::TensorCopy(
+        in, dst_place, *platform::DeviceContextPool::Instance().Get(dst_place),
+        out);
     return;
   }
diff --git a/paddle/fluid/framework/data_device_transform.h b/paddle/fluid/framework/data_device_transform.h
index 60b52a5e7069fb..8ff97646cfce79 100644
--- a/paddle/fluid/framework/data_device_transform.h
+++ b/paddle/fluid/framework/data_device_transform.h
@@ -21,8 +21,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-class Tensor;
-
 void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
                      Tensor* out);
diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h
index 2533acaa6d35ac..313ee9cd68a0b7 100644
--- a/paddle/fluid/framework/data_feed.h
+++ b/paddle/fluid/framework/data_feed.h
@@ -50,12 +50,15 @@ DECLARE_bool(enable_slotrecord_reset_shrink);
 namespace paddle {
 namespace framework {
 class DataFeedDesc;
-class Tensor;
 class Scope;
 class Variable;
 }  // namespace framework
 }  // namespace paddle
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h
index f7b4a36d2f4001..182ffe65c3c7ec 100644
--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -25,7 +25,6 @@
 namespace paddle {
 namespace framework {
 class OpKernelType;
-class Tensor;
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/data_transform.h b/paddle/fluid/framework/data_transform.h
index 2bbdac52ee49fd..f8b36b48c308ea 100644
--- a/paddle/fluid/framework/data_transform.h
+++ b/paddle/fluid/framework/data_transform.h
@@ -31,7 +31,6 @@
 namespace paddle {
 namespace framework {
 class OpKernelType;
-class Tensor;
 class Variable;
 
 void TransformData(const OpKernelType &expected_kernel_type,
diff --git a/paddle/fluid/framework/data_type_transform.h b/paddle/fluid/framework/data_type_transform.h
index 678764430f0ffa..76cea64dc47550 100644
--- a/paddle/fluid/framework/data_type_transform.h
+++ b/paddle/fluid/framework/data_type_transform.h
@@ -25,7 +25,6 @@
 namespace paddle {
 namespace framework {
 class OpKernelType;
-class Tensor;
 
 using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
index b8fac755709e76..052860cd0ab404 100644
--- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
@@ -169,7 +169,7 @@ FetchResultType AsyncSSAGraphExecutor::Run(
       std::vector<const LoDTensor *> lodtensor_ptrs;
       lodtensor_ptrs.push_back(&(BOOST_GET(LoDTensor, val.at(fetch_idx))));
       LoDTensor var;
-      var.MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
+      MergeLoDTensor(&var, lodtensor_ptrs, platform::CPUPlace());
       ret.emplace_back(var);
     } else {
       auto array = BOOST_GET(LoDTensorArray, val.at(fetch_idx));
@@ -179,7 +179,8 @@ FetchResultType AsyncSSAGraphExecutor::Run(
         std::vector<const LoDTensor *> lodtensor_ptrs;
         lodtensor_ptrs.push_back(&array[i]);
         item_array.emplace_back();
-        item_array.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
+        MergeLoDTensor(&(item_array.back()), lodtensor_ptrs,
+                       platform::CPUPlace());
       }
       ret.emplace_back(item_array);
     }
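The `paddle::framework::` qualifications added to `TensorCopy`/`TensorCopySync` call sites above (and throughout the rest of this diff) are needed because the unqualified calls used to be found by argument-dependent lookup: the argument type `framework::Tensor` lived in the same namespace as the helpers. With `Tensor` aliased to `pten::DenseTensor`, ADL now searches `pten` instead, so the calls must be qualified explicitly. A reduced, self-contained sketch of the mechanism (stand-in types, not Paddle code):

```cpp
#include <iostream>

namespace pten {
class DenseTensor {};  // stand-in for the real class
}  // namespace pten

namespace paddle {
namespace framework {
// An alias does not change the type's home namespace for ADL purposes.
using Tensor = pten::DenseTensor;
inline void TensorCopy(const Tensor&) { std::cout << "copied\n"; }
}  // namespace framework
}  // namespace paddle

int main() {
  paddle::framework::Tensor t;
  // TensorCopy(t);                  // error: ADL searches pten, not framework
  paddle::framework::TensorCopy(t);  // OK: explicitly qualified
  return 0;
}
```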
diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.h b/paddle/fluid/framework/details/fetch_async_op_handle.h
index 41df0d90aaf817..3e9563ab1eda47 100644
--- a/paddle/fluid/framework/details/fetch_async_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_async_op_handle.h
@@ -22,14 +22,18 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/device_context.h"
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
-class Tensor;
 namespace ir {
 class Node;
 }  // namespace ir
 }  // namespace framework
+
 namespace platform {
 class DeviceContext;
 }  // namespace platform
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 0a116cd9d8abba..60e58fafa41983 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -81,7 +81,7 @@ void FetchOpHandle::WaitAndMergeCPUFetchVars() const {
     }
     auto &val = BOOST_GET(FetchList, *data_);
     LoDTensor var;
-    var.MergeLoDTensor(tensors_ptr, platform::CPUPlace());
+    MergeLoDTensor(&var, tensors_ptr, platform::CPUPlace());
     val.at(offset_) = std::move(var);
   } else {
     auto &array = BOOST_GET_CONST(LoDTensorArray, tensors_[0]);
@@ -99,7 +99,7 @@ void FetchOpHandle::WaitAndMergeCPUFetchVars() const {
         tensors_ptr.push_back(&element[i]);
       }
       tmp_array.emplace_back();
-      tmp_array.back().MergeLoDTensor(tensors_ptr, platform::CPUPlace());
+      MergeLoDTensor(&(tmp_array.back()), tensors_ptr, platform::CPUPlace());
     }
     auto &val = BOOST_GET(FetchList, *data_);
     val.at(offset_) = std::move(tmp_array);
diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc
index 4d31069dd06eeb..74f5deed45557c 100644
--- a/paddle/fluid/framework/details/gather_op_handle.cc
+++ b/paddle/fluid/framework/details/gather_op_handle.cc
@@ -16,11 +16,10 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
 
-namespace paddle {
-namespace framework {
-class Tensor;
-}  // namespace framework
-}  // namespace paddle
+
+namespace pten {
+class DenseTensor;
+}  // namespace pten
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.h b/paddle/fluid/framework/details/nan_inf_utils_detail.h
index 10b7ab0bc9c534..fa2cbb550339a3 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.h
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.h
@@ -19,11 +19,9 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/place.h"
 
-namespace paddle {
-namespace framework {
-class Tensor;
-}  // namespace framework
-}  // namespace paddle
+namespace pten {
+class DenseTensor;
+}  // namespace pten
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
index 51063f68d4cbd6..936e84a6c82b9a 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
@@ -275,7 +275,7 @@ FetchResultType ParallelSSAGraphExecutor::Run(
       }
       if (lodtensor_ptrs.size() != 0) {
         LoDTensor var;
-        var.MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
+        MergeLoDTensor(&var, lodtensor_ptrs, platform::CPUPlace());
         ret.emplace_back(var);
       } else {
         LoDTensorArray var_array(lodtensorarray_ptrs[0]->size());
@@ -285,7 +285,7 @@ FetchResultType ParallelSSAGraphExecutor::Run(
         for (size_t j = 0; j < lodtensorarray_ptrs.size(); ++j) {
           ptrs.push_back(&(lodtensorarray_ptrs[j]->at(i)));
         }
-        var.MergeLoDTensor(ptrs, platform::CPUPlace());
+        MergeLoDTensor(&var, ptrs, platform::CPUPlace());
         var_array[i] = std::move(var);
       }
       ret.emplace_back(var_array);
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
index a2f7cc6fcecbf7..3d877dbbde248c 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -18,11 +18,9 @@
 #include "paddle/fluid/platform/profiler.h"
 
-namespace paddle {
-namespace framework {
-class Tensor;
-}  // namespace framework
-}  // namespace paddle
+namespace pten {
+class DenseTensor;
+}  // namespace pten
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
index 1225e2ee025b2e..434ba325ae3acf 100644
--- a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
+++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
@@ -22,7 +22,6 @@
 namespace paddle {
 namespace framework {
 class Scope;
-class Tensor;
 class Variable;
 
 namespace ir {
@@ -31,6 +30,10 @@ class MemOptVarInfo;
 }  // namespace framework
 }  // namespace paddle
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
 namespace details {
diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc
index 82078555013845..be1371542f5306 100644
--- a/paddle/fluid/framework/details/variable_visitor.cc
+++ b/paddle/fluid/framework/details/variable_visitor.cc
@@ -16,9 +16,12 @@
 #include "paddle/fluid/framework/selected_rows.h"
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
-class Tensor;
 class Variable;
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/variable_visitor.h b/paddle/fluid/framework/details/variable_visitor.h
index a882d5120bc668..a689c47a1611f4 100644
--- a/paddle/fluid/framework/details/variable_visitor.h
+++ b/paddle/fluid/framework/details/variable_visitor.h
@@ -19,7 +19,6 @@
 namespace paddle {
 namespace framework {
-class Tensor;
 class Variable;
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/device_worker.cc b/paddle/fluid/framework/device_worker.cc
index 3b70ef737f5bef..e191979c505223 100644
--- a/paddle/fluid/framework/device_worker.cc
+++ b/paddle/fluid/framework/device_worker.cc
@@ -14,10 +14,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/device_worker.h"
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
 
-class Tensor;
 class Scope;
 
 void DeviceWorker::SetRootScope(Scope* root_scope) { root_scope_ = root_scope; }
diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 332a5840491274..edb87a378dd4c3 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -43,7 +43,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-class Tensor;
 class ProgramDesc;
 class Scope;
 }  // namespace framework
diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h
index 03ed8884925ce4..ff4cf23da6e965 100644
--- a/paddle/fluid/framework/dlpack_tensor.h
+++ b/paddle/fluid/framework/dlpack_tensor.h
@@ -21,8 +21,6 @@
 namespace paddle {
 namespace framework {
 
-class Tensor;
-
 class DLPackTensor {
  public:
   using LaneType = decltype(::DLTensor::dtype.lanes);  // uint16_t
diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc
index cc97af4b1969d2..83d5a2efa342e5 100644
--- a/paddle/fluid/framework/downpour_worker.cc
+++ b/paddle/fluid/framework/downpour_worker.cc
@@ -15,9 +15,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/device_worker.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
-class Tensor;
 class Variable;
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc
index 0c3aafd85f2835..6454874c028b85 100644
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -19,10 +19,13 @@ limitations under the License. */
 #include <string>
 #include "glog/logging.h"
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
 
-class Tensor;
 class Variable;
 
 void SetFeedVariable(Scope* scope, const LoDTensor& input,
diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h
index dc9310ff5b2632..89c4bff922bbdc 100644
--- a/paddle/fluid/framework/feed_fetch_method.h
+++ b/paddle/fluid/framework/feed_fetch_method.h
@@ -20,10 +20,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/string_array.h"
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
 
-class Tensor;
 class Scope;
 
 void SetFeedVariable(Scope* scope, const LoDTensor& input,
diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
index c883412a9a4c32..30a1de15cb0528 100644
--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
@@ -18,9 +18,12 @@
 #include "paddle/fluid/framework/op_version_registry.h"
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
-class Tensor;
 class Scope;
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
index 6443d0594a9c5a..194686825ff2d8 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
@@ -19,9 +19,12 @@
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
-class Tensor;
 class Scope;
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/delete_dropout_op_pass.cc b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc
index c0a4f099e39d42..ae9c873e14113f 100644
--- a/paddle/fluid/framework/ir/delete_dropout_op_pass.cc
+++ b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc
@@ -15,11 +15,9 @@
 #include "paddle/fluid/framework/ir/delete_dropout_op_pass.h"
 
-namespace paddle {
-namespace framework {
-class Tensor;
-}  // namespace framework
-}  // namespace paddle
+namespace pten {
+class DenseTensor;
+}  // namespace pten
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc
index af75646551e285..aecbd8619a67d9 100644
--- a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc
+++ b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc
@@ -16,11 +16,9 @@
 #include
 
-namespace paddle {
-namespace framework {
-class Tensor;
-}  // namespace framework
-}  // namespace paddle
+namespace pten {
+class DenseTensor;
+}  // namespace pten
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
index 09fd6b8dd11167..74937313d130fe 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
@@ -22,11 +22,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_code.h"
 #include "paddle/fluid/platform/float16.h"
 
-namespace paddle {
-namespace framework {
-class Tensor;
-}  // namespace framework
-}  // namespace paddle
+namespace pten {
+class DenseTensor;
+}  // namespace pten
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
@@ -206,9 +204,11 @@ void TestMainImpl(std::string func_name, std::string code_str,
       for (int64_t i = 0; i < cpu_tensors[id].numel(); ++i) {
         tmp_cpu_ptr[i] = paddle::platform::float16(cpu_ptr[i]);
       }
-      TensorCopySync(tmp_cpu_tensors[id], place, &gpu_tensors[id]);
+      paddle::framework::TensorCopySync(tmp_cpu_tensors[id], place,
+                                        &gpu_tensors[id]);
     } else {
-      TensorCopySync(cpu_tensors[id], place, &gpu_tensors[id]);
+      paddle::framework::TensorCopySync(cpu_tensors[id], place,
+                                        &gpu_tensors[id]);
     }
     args.push_back(&gpu_ptrs[id]);
   }
@@ -234,8 +234,8 @@ void TestMainImpl(std::string func_name, std::string code_str,
       paddle::platform::float16* tmp_cpu_ptr =
          tmp_cpu_tensors[id].mutable_data<paddle::platform::float16>(
              cpu_tensors[id].dims(), paddle::platform::CPUPlace());
-      TensorCopySync(gpu_tensors[id], paddle::platform::CPUPlace(),
-                     &tmp_cpu_tensors[id]);
+      paddle::framework::TensorCopySync(
+          gpu_tensors[id], paddle::platform::CPUPlace(), &tmp_cpu_tensors[id]);
 
       float* cpu_ptr = cpu_tensors[id].mutable_data<float>(
           cpu_tensors[id].dims(), paddle::platform::CPUPlace());
@@ -243,8 +243,8 @@ void TestMainImpl(std::string func_name, std::string code_str,
        cpu_ptr[i] = static_cast<float>(tmp_cpu_ptr[i]);
       }
     } else {
-      TensorCopySync(gpu_tensors[id], paddle::platform::CPUPlace(),
-                     &cpu_tensors[id]);
+      paddle::framework::TensorCopySync(
+          gpu_tensors[id], paddle::platform::CPUPlace(), &cpu_tensors[id]);
     }
   }
 }
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index 4681933a66cd34..48ba7cc0a2a8ac 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -319,14 +319,47 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
   TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
 }
 
-std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
-    const std::vector<platform::Place> places) const {
+LoD ConvertToLengthBasedLoD(const LoD &offset_lod) {
+  LoD length_lod;
+  length_lod.reserve(offset_lod.size());
+  for (size_t lvl = 0; lvl < offset_lod.size(); ++lvl) {
+    std::vector<size_t> level;
+    if (offset_lod[lvl].size() > 0) {
+      level.reserve(offset_lod[lvl].size() - 1);
+    }
+    for (size_t idx = 0; idx < offset_lod[lvl].size() - 1; ++idx) {
+      level.push_back(offset_lod[lvl][idx + 1] - offset_lod[lvl][idx]);
+    }
+    length_lod.push_back(level);
+  }
+  return length_lod;
+}
+
+LoD ConvertToOffsetBasedLoD(const LoD &length_lod) {
+  LoD offset_lod;
+  offset_lod.reserve(length_lod.size());
+  for (size_t lvl = 0; lvl < length_lod.size(); ++lvl) {
+    std::vector<size_t> level;
+    level.reserve(length_lod[lvl].size() + 1);
+    size_t tmp = 0;
+    level.push_back(tmp);
+    for (size_t idx = 0; idx < length_lod[lvl].size(); ++idx) {
+      tmp += length_lod[lvl][idx];
+      level.push_back(tmp);
+    }
+    offset_lod.push_back(level);
+  }
+  return offset_lod;
+}
+
+std::vector<LoDTensor> SplitLoDTensor(
+    const LoDTensor &src, const std::vector<platform::Place> places) {
   PADDLE_ENFORCE_GT(places.size(), 0,
                     platform::errors::InvalidArgument(
                         "Place number cannot be empty when splitting."));
-  check_memory_size();
-  size_t batch_size =
-      lod().empty() ? static_cast<size_t>(dims()[0]) : lod()[0].size() - 1;
+  src.check_memory_size();
+  size_t batch_size = src.lod().empty() ? static_cast<size_t>(src.dims()[0])
+                                        : src.lod()[0].size() - 1;
 
   // if batch_size is 0, just return #places.size() copys of empty
   // tensors.
@@ -335,10 +368,10 @@ std::vector<LoDTensor> SplitLoDTensor(
     empty_results.reserve(places.size());
     for (size_t i = 0; i < places.size(); ++i) {
       LoDTensor dst;
-      dst.Resize(dims());
-      dst.mutable_data(places[i], type());
-      if (!lod().empty()) {
-        dst.set_lod(lod());
+      dst.Resize(src.dims());
+      dst.mutable_data(places[i], src.type());
+      if (!src.lod().empty()) {
+        dst.set_lod(src.lod());
       }
       empty_results.emplace_back(std::move(dst));
     }
@@ -360,17 +393,18 @@ std::vector<LoDTensor> SplitLoDTensor(
                           begin, end));
 
     LoDTensor dst;
-    if (lod().empty()) {
-      auto src = Slice(begin, end);
+    if (src.lod().empty()) {
+      auto sliced_src = src.Slice(begin, end);
       auto &dst_place = places[i];
-      framework::TensorCopy(src, dst_place, &dst);
+      framework::TensorCopy(sliced_src, dst_place, &dst);
     } else {
-      auto lod_and_offset = GetSubLoDAndAbsoluteOffset(lod(), begin, end, 0);
+      auto lod_and_offset =
+          GetSubLoDAndAbsoluteOffset(src.lod(), begin, end, 0);
 
       auto &offset = lod_and_offset.second;
-      auto src = Slice(offset.first, offset.second);
+      auto sliced_src = src.Slice(offset.first, offset.second);
       auto &dst_place = places[i];
-      framework::TensorCopy(src, dst_place, &dst);
+      framework::TensorCopy(sliced_src, dst_place, &dst);
 
       LoD my_lod;
       for (auto &l : lod_and_offset.first) {
@@ -388,9 +422,9 @@ std::vector<LoDTensor> SplitLoDTensor(
   return results;
 }
 
-void LoDTensor::MergeLoDTensor(
-    const std::vector<const LoDTensor *> &lod_tensors,
-    platform::Place dst_place) {
+void MergeLoDTensor(LoDTensor *target,
+                    const std::vector<const LoDTensor *> &lod_tensors,
+                    platform::Place dst_place) {
   PADDLE_ENFORCE_EQ(lod_tensors.empty(), false,
                     platform::errors::InvalidArgument(
                         "The LoDTensors to be merged are empty."));
@@ -449,10 +483,10 @@ void MergeLoDTensor(
       }
     }
   }
-  Resize(new_dim);
-  set_layout(new_layout);
-  set_lod(new_lod);
-  mutable_data(dst_place, new_type);
+  target->Resize(new_dim);
+  target->set_layout(new_layout);
+  target->set_lod(new_lod);
+  target->mutable_data(dst_place, new_type);
 
   int begin = 0;
   for (auto *src : lod_tensors) {
@@ -460,44 +494,11 @@ void MergeLoDTensor(
     if (end == begin) {
       continue;
     }
-    auto dst = Slice(begin, end);
+    auto dst = target->Slice(begin, end);
     framework::TensorCopy(*src, dst_place, &dst);
     begin = end;
   }
 }
 
-LoD ConvertToLengthBasedLoD(const LoD &offset_lod) {
-  LoD length_lod;
-  length_lod.reserve(offset_lod.size());
-  for (size_t lvl = 0; lvl < offset_lod.size(); ++lvl) {
-    std::vector<size_t> level;
-    if (offset_lod[lvl].size() > 0) {
-      level.reserve(offset_lod[lvl].size() - 1);
-    }
-    for (size_t idx = 0; idx < offset_lod[lvl].size() - 1; ++idx) {
-      level.push_back(offset_lod[lvl][idx + 1] - offset_lod[lvl][idx]);
-    }
-    length_lod.push_back(level);
-  }
-  return length_lod;
-}
-
-LoD ConvertToOffsetBasedLoD(const LoD &length_lod) {
-  LoD offset_lod;
-  offset_lod.reserve(length_lod.size());
-  for (size_t lvl = 0; lvl < length_lod.size(); ++lvl) {
-    std::vector<size_t> level;
-    level.reserve(length_lod[lvl].size() + 1);
-    size_t tmp = 0;
-    level.push_back(tmp);
-    for (size_t idx = 0; idx < length_lod[lvl].size(); ++idx) {
-      tmp += length_lod[lvl][idx];
-      level.push_back(tmp);
-    }
-    offset_lod.push_back(level);
-  }
-  return offset_lod;
-}
-
 }  // namespace framework
 }  // namespace paddle
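`SplitLoDTensor` and `MergeLoDTensor` are now free functions in `paddle::framework` that take the tensor explicitly, matching the declarations added to lod_tensor.h just below. A sketch of the migrated call pattern (the round-trip helper is hypothetical; the signatures come from this diff):

```cpp
#include <vector>

#include "paddle/fluid/framework/lod_tensor.h"

namespace fw = paddle::framework;

// Split `src` across `places`, then merge the pieces back together.
void RoundTrip(const fw::LoDTensor& src,
               const std::vector<paddle::platform::Place>& places) {
  // Previously: src.SplitLoDTensor(places)
  std::vector<fw::LoDTensor> pieces = fw::SplitLoDTensor(src, places);

  std::vector<const fw::LoDTensor*> piece_ptrs;
  for (auto& piece : pieces) piece_ptrs.push_back(&piece);

  fw::LoDTensor merged;
  // Previously: merged.MergeLoDTensor(piece_ptrs, platform::CPUPlace())
  fw::MergeLoDTensor(&merged, piece_ptrs, paddle::platform::CPUPlace());
}
```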
diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h
index bbb8f8005168ca..41cd6b83fd1d58 100644
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -36,7 +36,15 @@ class DeviceContext;
 namespace paddle {
 namespace framework {
 
-using LoDTensor = paddle::framework::Tensor;
+using LoDTensor = pten::DenseTensor;
+
+// Split Tensor and copy to each place specified in places.
+std::vector<LoDTensor> SplitLoDTensor(
+    const LoDTensor& src, const std::vector<platform::Place> places);
+
+void MergeLoDTensor(LoDTensor* target,
+                    const std::vector<const LoDTensor*>& lod_tensors,
+                    platform::Place dst_place);
 
 /*
  * LoD is short for Level of Details.
diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
index e3223e67fc94df..917bb7cc096c26 100644
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -147,7 +147,7 @@ TEST(LoD, SplitLoDTensor) {
   lod1.push_back(std::vector<size_t>({0, 1, 2}));
   lod1.push_back(std::vector<size_t>({0, 2, 7}));
 
-  auto lods = lod_tensor.SplitLoDTensor(places);
+  auto lods = SplitLoDTensor(lod_tensor, places);
   EXPECT_EQ(lods[0].lod(), lod0);
   EXPECT_EQ(lods[1].lod(), lod1);
 }
@@ -167,7 +167,7 @@ TEST(LoD, SplitLoDTensorWithZeroBatchSize) {
   LoD lod_res;
   lod_res.push_back(std::vector<size_t>({0}));
 
-  auto lods = lod_tensor.SplitLoDTensor(places);
+  auto lods = SplitLoDTensor(lod_tensor, places);
   EXPECT_EQ(lods[0].lod(), lod_res);
   EXPECT_EQ(lods[1].lod(), lod_res);
 }
@@ -213,7 +213,7 @@ TEST(LoD, MergeLoDTensor) {
   std::vector<const LoDTensor*> lods{&lod_tensor0, &lod_tensor1, &lod_tensor2};
 
   LoDTensor lod_tensor;
-  lod_tensor.MergeLoDTensor(lods, place);
+  MergeLoDTensor(&lod_tensor, lods, place);
   EXPECT_EQ(lod_tensor.lod(), lod);
 }
diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h
index f706eabb47988a..a74917e7e69c8f 100644
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
@@ -24,6 +24,10 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
 
@@ -31,7 +35,6 @@ namespace framework {
  * Simple, intuitive and effective. Only single thread is supported, and
  * currently designed for inference.
  */
-class Tensor;
 class ProgramDesc;
 class Scope;
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index e3f0fbbdfdc4a5..ea45ef857dfa0b 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -32,11 +32,10 @@ limitations under the License. */
 #include "paddle/pten/common/scalar.h"
 #include "paddle/pten/common/scalar_array.h"
 
-namespace paddle {
-namespace framework {
-class Tensor;
-}  // namespace framework
-}  // namespace paddle
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/device/xpu/xpu_info.h"
 #include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index d6c1c4cb6acc0f..8767b55062cdae 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -1048,7 +1048,7 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
       VLOG(3) << "Split " << (is_persistable ? "persistable" : "no persistable")
              << " data (" << pair.first << "), dim:" << pair.second.dims()
              << ", place: " << pair.second.place();
-      auto lod_tensors = pair.second.SplitLoDTensor(member_->places_);
+      auto lod_tensors = SplitLoDTensor(pair.second, member_->places_);
       bool is_cpu_place = platform::is_cpu_place(member_->places_.front());
       if (!is_persistable && num_places != lod_tensors.size() &&
           !allow_partial_feed) {
"persistable" : "no persistable") << " data (" << pair.first << "), dim:" << pair.second.dims() << ", place: " << pair.second.place(); - auto lod_tensors = pair.second.SplitLoDTensor(member_->places_); + auto lod_tensors = SplitLoDTensor(pair.second, member_->places_); bool is_cpu_place = platform::is_cpu_place(member_->places_.front()); if (!is_persistable && num_places != lod_tensors.size() && !allow_partial_feed) { diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index 62d6ba09735478..acd742a5822e2a 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -14,10 +14,13 @@ limitations under the License. */ #include #include "paddle/fluid/framework/device_worker.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { -class Tensor; class Scope; class Variable; diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 3634ccca95126e..445f446ef2f4ae 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -30,8 +30,6 @@ limitations under the License. */ namespace paddle { namespace framework { -class Tensor; - class SelectedRows { /* * @brief We can use the SelectedRows structure to reproduce a sparse table. diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 6aa10a058081b8..bb8d7df7457501 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -18,105 +18,13 @@ limitations under the License. */ DECLARE_bool(use_stream_safe_cuda_allocator); namespace paddle { -namespace framework { - -Tensor Tensor::Slice(int64_t begin_idx, int64_t end_idx) const { - check_memory_size(); - PADDLE_ENFORCE_GE(begin_idx, 0, - paddle::platform::errors::OutOfRange( - "The start row index must be greater than 0." - "But received the start index is d%.", - begin_idx)); - PADDLE_ENFORCE_LE(end_idx, meta_.dims[0], - paddle::platform::errors::OutOfRange( - "The end row index is out of bound.")); - PADDLE_ENFORCE_LT( - begin_idx, end_idx, - paddle::platform::errors::InvalidArgument( - "The start row index must be less than the end row index." - "But received the start index = %d, the end index = %d.", - begin_idx, end_idx)); - - if (meta_.dims[0] == 1) { - return *this; - } else { - size_t base = numel() / meta_.dims[0]; - Tensor dst; - dst.storage_ = pten::make_intrusive( - storage_->data_shared()); - dst.meta_.layout = meta_.layout; - dst.meta_.dtype = meta_.dtype; - DDim dst_dims = meta_.dims; - dst_dims[0] = end_idx - begin_idx; - dst.Resize(dst_dims); - dst.meta_.offset = meta_.offset + begin_idx * base * SizeOf(dtype()); - return dst; - } -} - -std::vector Tensor::Split(int64_t split_size, int64_t axis) const { - check_memory_size(); - - PADDLE_ENFORCE_GE(meta_.dims.size(), 0, - paddle::platform::errors::OutOfRange( - "split expects at least a 1-dimensional tensor")); - - PADDLE_ENFORCE_GE( - split_size, 0, - paddle::platform::errors::OutOfRange( - "split expects split_size be non-negative, but got split_size is %d", - split_size)); - - int64_t numel_size = meta_.dims[axis]; - - int64_t num_splits = 1; - if (split_size != 0) { - num_splits = - std::max((numel_size + split_size - 1) / split_size, 1); - } - - std::vector splits(num_splits); - int64_t last_split_size = split_size - (split_size * num_splits - numel_size); - - for (int64_t i = 0; i < num_splits; ++i) { - int64_t length = i < num_splits - 1 ? 
-    splits[i] = Slice(i * split_size, i * split_size + length);
-  }
-  return splits;
-}
-
-std::vector<Tensor> Tensor::Chunk(int64_t chunks, int64_t axis) const {
-  check_memory_size();
-  PADDLE_ENFORCE_GE(meta_.dims.size(), 0,
-                    paddle::platform::errors::OutOfRange(
-                        "split expects at least a 1-dimensional tensor"));
-  PADDLE_ENFORCE_GE(
-      chunks, 0,
-      paddle::platform::errors::OutOfRange(
-          "chunks expects to be greater than 0, but got chunks is %d", chunks));
-
-  int64_t numel_size = meta_.dims[axis];
-  int64_t split_size = (numel_size + chunks - 1) / chunks;
-  return Split(split_size, axis);
-}
-
-Tensor& Tensor::ShareDataWith(const Tensor& src) {
-  src.check_memory_size();
-  // Preserve LoD
-  auto lod = meta_.lod;
-  *this = src;
-  meta_.lod = lod;
-  return *this;
-}
-Tensor& Tensor::ShareInplaceVersionCounterWith(const Tensor& src) {
-  PADDLE_ENFORCE_NOT_NULL(
-      inplace_version_counter_,
-      platform::errors::PreconditionNotMet(
-          "Tensor does not hold inplace_version_counter_."));
-
-  inplace_version_counter_ = src.inplace_version_counter_;
-  return *this;
-}
+namespace memory {
+namespace allocation {
+class Allocation;
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
 
-}  // namespace framework
+namespace paddle {
+namespace framework {}  // namespace framework
 }  // namespace paddle
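The method bodies deleted from tensor.cc above are not lost: post-change code still calls `Slice` on `LoDTensor` (see the `src.Slice(...)` and `target->Slice(...)` calls kept in lod_tensor.cc), so `Slice` must now be implemented on `pten::DenseTensor`, and the tensor.h hunk below reduces `Tensor` to an alias of it. A sketch of what keeps compiling unchanged, assuming `Split`/`Chunk` were re-homed to `pten::DenseTensor` the same way:

```cpp
#include "paddle/fluid/framework/tensor.h"

void SliceDemo(const paddle::framework::Tensor& t) {
  // Tensor is now an alias of pten::DenseTensor, so these member calls
  // resolve to the pten implementations instead of the removed
  // framework::Tensor ones.
  auto first_row = t.Slice(0, 1);        // same signature as before
  auto halves = t.Chunk(2, /*axis=*/0);  // assumed to mirror the old API
  (void)first_row;
  (void)halves;
}
```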
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 95405820a48d98..8c7345f3e2f614 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -69,35 +69,7 @@ using LoD = std::vector<std::vector<size_t>>;
    Variable object but not a pointer.
 */
 
-class Tensor : public pten::DenseTensor {
- public:
-  using DenseTensor = pten::DenseTensor;
-  using DenseTensor::DenseTensor;
-
-  // Split Tensor and copy to each place specified in places.
-  std::vector<Tensor> SplitLoDTensor(
-      const std::vector<platform::Place> places) const;
-
-  void MergeLoDTensor(const std::vector<const Tensor*>& lod_tensors,
-                      platform::Place place);
-
-  /*! The internal of two tensors share the same memory block. */
-  Tensor& ShareDataWith(const Tensor& src);
-
-  /*! The internal of two tensors share the same inplace version counter. */
-  Tensor& ShareInplaceVersionCounterWith(const Tensor& src);
-
-  Tensor Slice(int64_t begin_idx, int64_t end_idx) const;
-
-  std::vector<Tensor> Split(int64_t split_size, int64_t axis) const;
-
-  std::vector<Tensor> Chunk(int64_t chunks, int64_t axis) const;
-
-  Tensor& Resize(const DDim& dims) {
-    meta_.dims = dims;
-    return *this;
-  }
-};
+using Tensor = pten::DenseTensor;
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 4298b159ead52f..dff48790960569 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -387,18 +387,10 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 Tensor* dst) {
   TensorCopyImpl<Tensor>(src, dst_place, dst);
 }
-void TensorCopy(const pten::DenseTensor& src, const platform::Place& dst_place,
-                pten::DenseTensor* dst) {
-  TensorCopyImpl<pten::DenseTensor>(src, dst_place, dst);
-}
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 const platform::DeviceContext& ctx, Tensor* dst) {
   TensorCopyImpl<Tensor>(src, dst_place, ctx, dst);
 }
-void TensorCopy(const pten::DenseTensor& src, const platform::Place& dst_place,
-                const platform::DeviceContext& ctx, pten::DenseTensor* dst) {
-  TensorCopyImpl<pten::DenseTensor>(src, dst_place, ctx, dst);
-}
 
 void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
                     Tensor* dst) {
@@ -1394,45 +1386,50 @@ std::ostream& operator<<(std::ostream& os, const LoD& lod) {
   return os;
 }
 
-std::ostream& operator<<(std::ostream& os, const Tensor& t) {
+}  // namespace framework
+}  // namespace paddle
+
+namespace pten {
+
+std::ostream& operator<<(std::ostream& os, const pten::DenseTensor& t) {
   if (t.lod().size() > 0) {
     os << "  - lod: " << t.lod() << "\n";
   }
 
   os << "  - place: " << t.place() << "\n";
   os << "  - shape: [" << t.dims() << "]\n";
-  os << "  - layout: " << DataLayoutToString(t.layout()) << "\n";
+  os << "  - layout: " << paddle::framework::DataLayoutToString(t.layout())
+     << "\n";
 
 #ifdef PADDLE_WITH_MKLDNN
   os << "  - format: "
     << dnnl_fmt_tag2str(static_cast<dnnl_format_tag_t>(t.format())) << "\n";
 #endif
 
-  Tensor tensor;
+  DenseTensor tensor;
   tensor.Resize(t.dims());
-  if (platform::is_cpu_place(t.place())) {
+  if (paddle::platform::is_cpu_place(t.place())) {
     tensor.ShareDataWith(t);
   } else {
-    platform::CPUPlace place;
-    framework::TensorCopy(t, place, &tensor);
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    paddle::platform::CPUPlace place;
+    paddle::framework::TensorCopy(t, place, &tensor);
+    paddle::platform::DeviceContextPool& pool =
+        paddle::platform::DeviceContextPool::Instance();
     auto& dev_ctx = *pool.Get(t.place());
     dev_ctx.Wait();
   }
 
-#define PrintTensorCallback(cpp_type, proto_type) \
-  do {                                            \
-    if (tensor.type() == proto_type) {            \
-      os << "  - dtype: " << proto_type << "\n";  \
-      print_tensor<cpp_type>(os, tensor);         \
-      return os;                                  \
-    }                                             \
+#define PrintTensorCallback(cpp_type, proto_type)            \
+  do {                                                       \
+    if (tensor.type() == proto_type) {                       \
+      os << "  - dtype: " << proto_type << "\n";             \
+      paddle::framework::print_tensor<cpp_type>(os, tensor); \
+      return os;                                             \
+    }                                                        \
   } while (0)
 
   _ForEachDataType_(PrintTensorCallback);
   VLOG(1) << "PrintVar: unrecognized data type:" << t.type();
   return os;
 }
-
-}  // namespace framework
-}  // namespace paddle
+}
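tensor_util.cc above moves `operator<<` for tensors out of `paddle::framework` and into namespace `pten`, again so that argument-dependent lookup can find it for the aliased type; tensor_util.h below updates the declarations to match. Streaming a tensor therefore keeps working unchanged at call sites, e.g.:

```cpp
#include <iostream>

#include "paddle/fluid/framework/tensor_util.h"

void Dump(const paddle::framework::Tensor& t) {
  // operator<< is now declared in namespace pten, so argument-dependent
  // lookup finds it from the argument type pten::DenseTensor.
  std::cout << t << std::endl;
}
```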
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index 3cb3c733f4042b..3c62f3c5e43d7e 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -39,9 +39,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-std::ostream& operator<<(std::ostream& os, const LoD& lod);
-std::ostream& operator<<(std::ostream& os, const Tensor& t);
-
 class PrintOptions {
  public:
   static PrintOptions& Instance() {
@@ -76,12 +73,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
 // If ctx_place and src_place are the same, src_ctx.Wait() is added
 // after memory::Copy; if ctx_place and dst_place are the same,
 // src_ctx.Wait() is added before memory::Copy.
-class Tensor;
-
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 const platform::DeviceContext& ctx, Tensor* dst);
-void TensorCopy(const pten::DenseTensor& src, const platform::Place& dst_place,
-                const platform::DeviceContext& ctx, pten::DenseTensor* dst);
 
 // NOTE(zcd): If the src.place() and dst_place are two different GPU,
 // the copy operation is carried out on the dst_place's stream. This is
@@ -92,8 +85,6 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
 // not completed.
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 Tensor* dst);
-void TensorCopy(const pten::DenseTensor& src, const platform::Place& dst_place,
-                pten::DenseTensor* dst);
 
 void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
                     Tensor* dst);
@@ -469,5 +460,11 @@ inline void TensorToVector(const Tensor& src, std::vector<bool>* dst) {
   delete[] array;
 }
 
+std::ostream& operator<<(std::ostream& os, const LoD& lod);
+
 }  // namespace framework
 }  // namespace paddle
+
+namespace pten {
+std::ostream& operator<<(std::ostream& os, const DenseTensor& t);
+}
diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h
index 8bba9492a56868..91d618970e30c8 100644
--- a/paddle/fluid/framework/trainer.h
+++ b/paddle/fluid/framework/trainer.h
@@ -40,7 +40,6 @@ namespace paddle {
 namespace framework {
 
 class Dataset;
-class Tensor;
 class ProgramDesc;
 class PullDenseWorker;
 class Scope;
diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h
index 715e7a14c5529d..008b6829f9fe37 100644
--- a/paddle/fluid/framework/var_type_traits.h
+++ b/paddle/fluid/framework/var_type_traits.h
@@ -47,6 +47,10 @@
 #include "xpu/bkcl.h"
 #endif
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 // Users should add forward declarations here
 namespace paddle {
 
@@ -70,7 +74,6 @@ class BKCLCommunicator;
 namespace framework {
 class LoDRankTable;
 class ScopeBase;
-class Tensor;
 class ReaderHolder;
 class Scope;
 class SelectedRows;
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index 22f016e2cadc1a..09cc480fe17326 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -29,9 +29,12 @@
 DECLARE_bool(use_mkldnn);
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
-class Tensor;
 class Variable;
 }  // namespace framework
 namespace platform {
diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc
index 0c9bedf3dca322..ad518eb96062d2 100644
--- a/paddle/fluid/imperative/reducer.cc
+++ b/paddle/fluid/imperative/reducer.cc
@@ -16,6 +16,7 @@
 
 #include
 
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/string/string_helper.h"
 
@@ -24,6 +25,7 @@
 
 #include "paddle/fluid/imperative/parallel_context.h"
 
+#include "paddle/pten/core/dense_tensor.h"
 namespace paddle {
 namespace imperative {
 
@@ -975,7 +977,8 @@ void Reducer::ProcessUnusedDenseVars() {
       auto *dest_grad_tensor =
           grad_var_base_tmp->MutableVar()->GetMutable<framework::LoDTensor>();
       const auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_);
-      TensorCopy(src_tensor, place_, *dev_ctx, dest_grad_tensor);
+      paddle::framework::TensorCopy(src_tensor, place_, *dev_ctx,
+                                    dest_grad_tensor);
       dest_grad_tensor->Resize(dest_dims);
     }
   }
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index 8bb08b6fdaf2aa..06a353d5622a70 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -90,12 +90,12 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
         temp_tensor.mutable_data<float>(cpu_place);
 
         // Copy the parameter data to a tmp tensor.
-        TensorCopySync(*t, cpu_place, &temp_tensor);
+        paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
         // Reallocation the space on GPU
         t->clear();
 
         // Copy parameter data to newly allocated GPU space.
-        TensorCopySync(temp_tensor, place, t);
+        paddle::framework::TensorCopySync(temp_tensor, place, t);
       }
     }
   }
diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h
index bf67cfed35f892..f0ce652beae11a 100644
--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -35,7 +35,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-class Tensor;
 class Scope;
 }  // namespace framework
diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h
index 857160ad102828..fa5997d92dd231 100644
--- a/paddle/fluid/inference/api/details/reset_tensor_array.h
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.h
@@ -21,9 +21,12 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
-class Tensor;
 class Scope;
 class SelectedRows;
 }  // namespace framework
diff --git a/paddle/fluid/inference/lite/test_tensor_utils.cc b/paddle/fluid/inference/lite/test_tensor_utils.cc
index a8ed703da95c65..b0c7c7448a50ef 100644
--- a/paddle/fluid/inference/lite/test_tensor_utils.cc
+++ b/paddle/fluid/inference/lite/test_tensor_utils.cc
@@ -122,7 +122,7 @@ void test_tensor_copy(const platform::DeviceContext& ctx) {
   }
 #endif
   std::vector<float> result;
-  TensorToVector(lod_tensor_n, ctx, &result);
+  paddle::framework::TensorToVector(lod_tensor_n, ctx, &result);
   ASSERT_EQ(result, vector);
   ASSERT_EQ(lod_tensor_n.lod(), lod_tensor.lod());
 }
@@ -142,7 +142,7 @@ void test_tensor_share(const platform::DeviceContext& ctx) {
   framework::LoDTensor lod_tensor_n;
   TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
   std::vector<float> result;
-  TensorToVector(lod_tensor_n, ctx, &result);
+  paddle::framework::TensorToVector(lod_tensor_n, ctx, &result);
   ASSERT_EQ(result, vector);
   ASSERT_EQ(lod_tensor_n.lod(), lod_tensor.lod());
 }
diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
index 0e661651914741..08e15b22b84cdf 100644
--- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
@@ -82,10 +82,11 @@ class BatchNormOpConverter : public OpConverter {
     platform::CPUPlace cpu_place;
     // copy data from gpu to cpu
-    TensorCopySync((*Bias_t), cpu_place, &bias_tensor);
-    TensorCopySync((*Mean_t), cpu_place, &mean_tensor);
-    TensorCopySync((*Scale_t), cpu_place, &scale_tensor);
-    TensorCopySync((*Variance_t), cpu_place, &variance_tensor);
+    paddle::framework::TensorCopySync((*Bias_t), cpu_place, &bias_tensor);
+    paddle::framework::TensorCopySync((*Mean_t), cpu_place, &mean_tensor);
+    paddle::framework::TensorCopySync((*Scale_t), cpu_place, &scale_tensor);
+    paddle::framework::TensorCopySync((*Variance_t), cpu_place,
+                                      &variance_tensor);
 
     auto* bias_data = bias_tensor.mutable_data<float>(platform::CPUPlace());
     auto* mean_data = mean_tensor.mutable_data<float>(platform::CPUPlace());
diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc
index de5d3110e18903..67e7c78b62e9d2 100644
--- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc
@@ -55,8 +55,8 @@ class LayerNormOpConverter : public OpConverter {
   scale_tensor->Resize(Scale_t->dims());
 
   platform::CPUPlace cpu_place;
-  TensorCopySync((*Bias_t), cpu_place, &(*bias_tensor));
-  TensorCopySync((*Scale_t), cpu_place, &(*scale_tensor));
+  paddle::framework::TensorCopySync((*Bias_t), cpu_place, &(*bias_tensor));
+  paddle::framework::TensorCopySync((*Scale_t), cpu_place, &(*scale_tensor));
 
   auto* bias_data = bias_tensor->mutable_data<float>(platform::CPUPlace());
   auto* scale_data = scale_tensor->mutable_data<float>(platform::CPUPlace());
diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc
index a883d2b5bbb49f..9e81d1177cfe10 100644
--- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc
@@ -46,7 +46,8 @@ class PReluOpConverter : public OpConverter {
     std::unique_ptr<framework::LoDTensor> alpha_tensor_temp(
         new framework::LoDTensor());
     alpha_tensor_temp->Resize(alpha_tensor->dims());
-    TensorCopySync(*alpha_tensor, cpu_place, alpha_tensor_temp.get());
+    paddle::framework::TensorCopySync(*alpha_tensor, cpu_place,
+                                      alpha_tensor_temp.get());
     float* alpha_data = alpha_tensor_temp->mutable_data<float>(cpu_place);
 
     nvinfer1::ILayer* layer = nullptr;
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index cfb25eb2ba8276..1e503b83bbd67b 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -63,7 +63,7 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
     *(temp_data + i) = random(0., 1.);
   }
 
-  TensorCopySync(temp_tensor, place, tensor);
+  paddle::framework::TensorCopySync(temp_tensor, place, tensor);
 }
 
 /*
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 2a35f497ed07f1..aa69463674f742 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -370,7 +370,8 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name,
                         name_with_suffix));
   weight_map[name_with_suffix].reset(new framework::Tensor());
   weight_map[name_with_suffix]->Resize(weight_tensor->dims());
-  TensorCopySync(*weight_tensor, cpu_place, weight_map[name_with_suffix].get());
+  paddle::framework::TensorCopySync(*weight_tensor, cpu_place,
+                                    weight_map[name_with_suffix].get());
   float *weight_data = weight_map[name_with_suffix]->mutable_data<float>(cpu_place);
   name_suffix_counter += 1;
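Note that engine.cc above still does `new framework::Tensor()`: the alias names a concrete, constructible type, so only the forward declaration in engine.h (next hunk) has to go. A minimal sketch of the weight-buffer pattern kept by that file (the helper name and includes are mine, not from the PR):

```cpp
#include <memory>

#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/place.h"

std::unique_ptr<paddle::framework::Tensor> MakeWeightBuffer(
    const paddle::framework::DDim& dims) {
  // Constructing through the alias works exactly as before the unification.
  auto buffer = std::make_unique<paddle::framework::Tensor>();
  buffer->Resize(dims);
  buffer->mutable_data<float>(paddle::platform::CPUPlace());
  return buffer;
}
```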
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 849ec07d07ed7a..1f90ff216adbad 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -35,12 +35,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/utils/any.h"
 
-namespace paddle {
-namespace framework {
-class Tensor;
-}  // namespace framework
-}  // namespace paddle
-
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index c627075bfe95d9..d5cc69ea661d92 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -41,12 +41,12 @@ class TensorRTEngineTest : public ::testing::Test {
 
   void PrepareInputOutput(const std::vector<float> &input,
                           std::vector<int> output_shape) {
-    TensorFromVector(input, *ctx_, &input_);
+    paddle::framework::TensorFromVector(input, *ctx_, &input_);
     output_.Resize(framework::make_ddim(output_shape));
   }
 
   void GetOutput(std::vector<float> *output) {
-    TensorToVector(output_, *ctx_, output);
+    paddle::framework::TensorToVector(output_, *ctx_, output);
   }
 
  protected:
diff --git a/paddle/fluid/operators/abs_op.cu b/paddle/fluid/operators/abs_op.cu
index 86748d4505d287..48e19defd03438 100644
--- a/paddle/fluid/operators/abs_op.cu
+++ b/paddle/fluid/operators/abs_op.cu
@@ -50,9 +50,9 @@ class AbsKernel
     std::vector<const framework::Tensor*> ins = {x};
     std::vector<framework::Tensor*> outs = {out};
     auto functor = CudaAbsFunctor<T>();
-    LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, T,
-                                        math::Real<T>>(dev_ctx, ins, &outs,
-                                                       functor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
+        ElementwiseType::kUnary, T, math::Real<T>>(dev_ctx, ins, &outs,
+                                                   functor);
   }
 };
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 8cced5cd919f24..b4a9386ce0fb8e 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -1368,14 +1368,14 @@ class ELUGradCudaKernel : public framework::OpKernel<T> {
     if (alpha > 0) {
       CudaELUGradFunctor<T> functor;
       functor.alpha = alpha;
-      LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
-          dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
+          ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
     } else {
      CudaELUGradNegativeAlphaFunctor<T> functor;
      functor.alpha = alpha;
      ins.push_back(x);
-      LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
-          dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
+          ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
     }
   }
 };
@@ -1451,8 +1451,8 @@ class ActivationCudaKernel
     for (auto& attr : attrs) {
       *attr.second = ctx.Attr<float>(attr.first);
     }
-    LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
-        dev_ctx, ins, &outs, functor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
+        ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor);
   }
 };
@@ -1481,17 +1481,17 @@ class ActivationGradCudaKernel
     if (static_cast<int>(Functor::FwdDeps()) == static_cast<int>(kDepOut)) {
       // Only need forward output Out
       ins.push_back(out);
-      LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
-          dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
+          ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
     } else if (static_cast<int>(Functor::FwdDeps()) ==
               static_cast<int>(kDepX)) {
       // Only need forward input X
       ins.push_back(x);
-      LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
-          dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
+          ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
     } else {
-      LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
-          dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
+          ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor);
     }
   }
 };
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 6e32860d69c62f..a089f6b4a3c19a 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -2696,8 +2696,8 @@ class PowKernel : public framework::OpKernel<T> {
     auto* factor_data = factor_tensor->data<float>();
     framework::Tensor cpu_factor_tensor;
     if (platform::is_gpu_place(factor_tensor->place())) {
-      TensorCopySync(*factor_tensor, platform::CPUPlace(),
-                     &cpu_factor_tensor);
+      framework::TensorCopySync(*factor_tensor, platform::CPUPlace(),
+                                &cpu_factor_tensor);
       factor_data = cpu_factor_tensor.data<float>();
     }
     auto factor =
@@ -2751,8 +2751,8 @@ class PowGradKernel
     auto* factor_data = factor_tensor->data<float>();
     framework::Tensor cpu_factor_tensor;
     if (platform::is_gpu_place(factor_tensor->place())) {
-      TensorCopySync(*factor_tensor, platform::CPUPlace(),
-                     &cpu_factor_tensor);
+      framework::TensorCopySync(*factor_tensor, platform::CPUPlace(),
+                                &cpu_factor_tensor);
       factor_data = cpu_factor_tensor.data<float>();
     }
     auto factor =
diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
index 6582be7354f636..0a710dd842fd49 100644
--- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
@@ -50,7 +50,7 @@ void Update(const platform::NPUDeviceContext& ctx,
     runner_p2.Run(stream);
 
     std::vector<int> bad_out_data;
-    TensorToVector(*bad_out_tensor, ctx, &bad_out_data);
+    paddle::framework::TensorToVector(*bad_out_tensor, ctx, &bad_out_data);
     if (bad_out_data[0] >= decr_every_n_nan_or_inf) {
       const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
                                           {*updated_loss_scaling_tensor},
@@ -61,7 +61,8 @@ void Update(const platform::NPUDeviceContext& ctx,
       runner_p3.Run(stream);
 
       std::vector<T> new_loss_scaling;
-      TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling);
+      paddle::framework::TensorToVector(*updated_loss_scaling_tensor, ctx,
+                                        &new_loss_scaling);
       float min_value = 1.0;
       if (FLAGS_min_loss_scaling > 1) {
         min_value = static_cast<float>(FLAGS_min_loss_scaling);
@@ -98,7 +99,7 @@ void Update(const platform::NPUDeviceContext& ctx,
     runner_p2.Run(stream);
 
     std::vector<int> good_out_data;
-    TensorToVector(*good_out_tensor, ctx, &good_out_data);
+    paddle::framework::TensorToVector(*good_out_tensor, ctx, &good_out_data);
     if (good_out_data[0] >= incr_every_n_steps) {
       const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
@@ -109,7 +110,8 @@ void Update(const platform::NPUDeviceContext& ctx,
       runner_p3.Run(stream);
 
       std::vector<T> new_loss_scaling;
-      TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling);
+      paddle::framework::TensorToVector(*updated_loss_scaling_tensor, ctx,
+                                        &new_loss_scaling);
       if (!std::isfinite(new_loss_scaling[0])) {
         // updated_loss_scaling_data = pre_loss_scaling_data
         const auto& runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
@@ -209,7 +211,8 @@ class UpdateLossScalingNPUKernel : public framework::OpKernel<T> {
                           "FoundInfinite must has only one element."));
 
     std::vector<bool> found_inf_vec;
-    TensorToVector(*found_inf, ctx.device_context(), &found_inf_vec);
paddle::framework::TensorToVector(*found_inf, ctx.device_context(), + &found_inf_vec); LazyZerosNPU{}(dev_ctx, found_inf_vec, xs, outs); const bool stop_update = ctx.Attr("stop_update"); diff --git a/paddle/fluid/operators/assert_op.cc b/paddle/fluid/operators/assert_op.cc index 466e0e793e4e3b..215f6ad4be9ff1 100644 --- a/paddle/fluid/operators/assert_op.cc +++ b/paddle/fluid/operators/assert_op.cc @@ -16,10 +16,13 @@ #include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/operators/tensor_formatter.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { class InferShapeContext; -class Tensor; class OpDesc; class Scope; class Variable; diff --git a/paddle/fluid/operators/assign_op.h b/paddle/fluid/operators/assign_op.h index d9648c9617255e..1dd28c9389daf5 100644 --- a/paddle/fluid/operators/assign_op.h +++ b/paddle/fluid/operators/assign_op.h @@ -25,9 +25,12 @@ class DeviceContext; } // namespace platform } // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { -class Tensor; class Variable; } // namespace framework } // namespace paddle @@ -76,7 +79,7 @@ class AssignFunctor { framework::LoDTensor *out) const { if (lod_tensor.numel() == 0) return; auto &out_tensor = *out; - TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); + paddle::framework::TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); out_tensor.set_lod(lod_tensor.lod()); } diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc index 792d01a5efe430..049cfb8046f80e 100644 --- a/paddle/fluid/operators/assign_op_npu_test.cc +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -47,7 +47,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, init.push_back(static_cast(3.0)); init.push_back(static_cast(4.0)); - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({4}); ctx.Wait(); @@ -62,7 +62,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, op->Run(*scope, place); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index e3dc54e17cd7fd..5f32d697bae408 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -382,7 +382,8 @@ class BatchNormKernel if (ctx.HasInput("MomentumTensor")) { const auto *mom_tensor = ctx.Input("MomentumTensor"); Tensor mom_cpu; - TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu); + paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), + &mom_cpu); momentum = mom_cpu.data()[0]; } diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc index be4847da51f187..aa8ceca5416200 100644 --- a/paddle/fluid/operators/batch_norm_op_npu.cc +++ b/paddle/fluid/operators/batch_norm_op_npu.cc @@ -86,7 +86,8 @@ class NPUBatchNormOpKernel : public framework::OpKernel { if (ctx.HasInput("MomentumTensor")) { const auto *mom_tensor = ctx.Input("MomentumTensor"); Tensor mom_cpu; - TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu); + paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), + &mom_cpu); momentum = mom_cpu.data()[0]; } diff --git a/paddle/fluid/operators/batch_norm_op_xpu.cc 
b/paddle/fluid/operators/batch_norm_op_xpu.cc index d232891f3d6840..505acbbdbde1b0 100644 --- a/paddle/fluid/operators/batch_norm_op_xpu.cc +++ b/paddle/fluid/operators/batch_norm_op_xpu.cc @@ -87,7 +87,8 @@ class BatchNormXPUKernel : public framework::OpKernel { if (ctx.HasInput("MomentumTensor")) { const auto *mom_tensor = ctx.Input("MomentumTensor"); Tensor mom_cpu; - TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu); + paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), + &mom_cpu); momentum = mom_tensor->data()[0]; } diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu index da96aa92cd25a9..d493dad132992a 100644 --- a/paddle/fluid/operators/bce_loss_op.cu +++ b/paddle/fluid/operators/bce_loss_op.cu @@ -91,8 +91,8 @@ class BCELossGradCUDAKernel : public framework::OpKernel { std::vector outs = {dx}; auto& dev_ctx = ctx.template device_context(); auto functor = BCELossGradFunctor(); - LaunchSameDimsElementwiseCudaKernel( - dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kTernary, T, T>(dev_ctx, ins, &outs, functor); } }; diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc index 603eec4d52232e..ace2b656e8efb3 100644 --- a/paddle/fluid/operators/benchmark/op_tester.cc +++ b/paddle/fluid/operators/benchmark/op_tester.cc @@ -308,7 +308,7 @@ void OpTester::SetupTensor(framework::LoDTensor *tensor, } if (!platform::is_cpu_place(place_)) { - TensorCopySync(cpu_tensor, place_, tensor); + paddle::framework::TensorCopySync(cpu_tensor, place_, tensor); } } diff --git a/paddle/fluid/operators/bincount_op.cu b/paddle/fluid/operators/bincount_op.cu index 34facf1ea1fa90..cf189193d1c11a 100644 --- a/paddle/fluid/operators/bincount_op.cu +++ b/paddle/fluid/operators/bincount_op.cu @@ -77,8 +77,10 @@ void BincountCUDAInner(const framework::ExecutionContext& context) { input_min_scala.device(*place) = input_x.minimum(); Tensor input_min_cpu, input_max_cpu; - TensorCopySync(input_max_t, platform::CPUPlace(), &input_max_cpu); - TensorCopySync(input_min_t, platform::CPUPlace(), &input_min_cpu); + paddle::framework::TensorCopySync(input_max_t, platform::CPUPlace(), + &input_max_cpu); + paddle::framework::TensorCopySync(input_min_t, platform::CPUPlace(), + &input_min_cpu); InputT input_min = input_min_cpu.data()[0]; diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index e10fdf522ff7c5..849cdb715049ba 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -100,7 +100,7 @@ void CopyInputDataToPlace(const framework::Scope& scope, for (const auto& var_name : scope.LocalVarNames()) { const auto& src_tensor = scope.GetVar(var_name)->Get(); auto* dst_tensor = dst_scope->Var(var_name)->GetMutable(); - TensorCopySync(src_tensor, dst_place, dst_tensor); + paddle::framework::TensorCopySync(src_tensor, dst_place, dst_tensor); } } @@ -135,10 +135,12 @@ TEST(CinnLaunchOpTest, TestElementwiseAddPass) { elementwise_add_op->Run(scope, run_place); LoDTensor test_out, expected_out; - TensorCopySync(scope.Var(test_out_name)->Get(), - platform::CPUPlace(), &test_out); - TensorCopySync(scope.Var(expected_out_name)->Get(), - platform::CPUPlace(), &expected_out); + paddle::framework::TensorCopySync( + scope.Var(test_out_name)->Get(), platform::CPUPlace(), + &test_out); + paddle::framework::TensorCopySync( + 
scope.Var(expected_out_name)->Get(), platform::CPUPlace(), + &expected_out); ASSERT_TRUE(test_out.IsInitialized()); ASSERT_TRUE(expected_out.IsInitialized()); diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h index 3672fa983e495c..fb41dc16d65129 100644 --- a/paddle/fluid/operators/clip_op.h +++ b/paddle/fluid/operators/clip_op.h @@ -64,7 +64,8 @@ class ClipKernel : public framework::OpKernel { auto* max_t = context.Input("Max"); auto* max_data = max_t->data(); if (platform::is_gpu_place(max_t->place())) { - TensorCopySync(*max_t, platform::CPUPlace(), &max_cpu); + paddle::framework::TensorCopySync(*max_t, platform::CPUPlace(), + &max_cpu); max_data = max_cpu.data(); } max = max_data[0]; @@ -77,7 +78,8 @@ class ClipKernel : public framework::OpKernel { auto* min_t = context.Input("Min"); auto* min_data = min_t->data(); if (platform::is_gpu_place(min_t->place())) { - TensorCopySync(*min_t, platform::CPUPlace(), &min_cpu); + paddle::framework::TensorCopySync(*min_t, platform::CPUPlace(), + &min_cpu); min_data = min_cpu.data(); } min = min_data[0]; @@ -101,7 +103,8 @@ class ClipKernel : public framework::OpKernel { std::vector ins = {x}; std::vector outs = {out}; auto functor = ClipFunctor(min, max); - LaunchSameDimsElementwiseCudaKernel( + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kUnary, T, T>( context.template device_context(), ins, &outs, functor); #endif @@ -141,7 +144,8 @@ class ClipGradKernel : public framework::OpKernel { auto* max_t = context.Input("Max"); auto* max_data = max_t->data(); if (platform::is_gpu_place(max_t->place())) { - TensorCopySync(*max_t, platform::CPUPlace(), &max_cpu); + paddle::framework::TensorCopySync(*max_t, platform::CPUPlace(), + &max_cpu); max_data = max_cpu.data(); } max = max_data[0]; @@ -154,7 +158,8 @@ class ClipGradKernel : public framework::OpKernel { auto* min_t = context.Input("Min"); auto* min_data = min_t->data(); if (platform::is_gpu_place(min_t->place())) { - TensorCopySync(*min_t, platform::CPUPlace(), &min_cpu); + paddle::framework::TensorCopySync(*min_t, platform::CPUPlace(), + &min_cpu); min_data = min_cpu.data(); } min = min_data[0]; diff --git a/paddle/fluid/operators/clip_op_xpu.cc b/paddle/fluid/operators/clip_op_xpu.cc index 7d4b02af418bef..c53bb2d9e4d0cb 100644 --- a/paddle/fluid/operators/clip_op_xpu.cc +++ b/paddle/fluid/operators/clip_op_xpu.cc @@ -36,7 +36,8 @@ class ClipXPUKernel : public framework::OpKernel { auto* max_t = ctx.Input("Max"); auto* max_data = max_t->data(); if (platform::is_xpu_place(max_t->place())) { - TensorCopySync(*max_t, platform::CPUPlace(), &max_cpu); + paddle::framework::TensorCopySync(*max_t, platform::CPUPlace(), + &max_cpu); max_data = max_cpu.data(); } max = max_data[0]; @@ -48,7 +49,8 @@ class ClipXPUKernel : public framework::OpKernel { auto* min_t = ctx.Input("Min"); auto* min_data = min_t->data(); if (platform::is_xpu_place(min_t->place())) { - TensorCopySync(*min_t, platform::CPUPlace(), &min_cpu); + paddle::framework::TensorCopySync(*min_t, platform::CPUPlace(), + &min_cpu); min_data = min_cpu.data(); } min = min_data[0]; diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc index b1e09e487fb3fb..ecf682aa52432a 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc @@ -139,7 +139,7 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { } 
PrintDebugInfo("input data", init); - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({num1, num2}); ctx.Wait(); @@ -165,7 +165,7 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { ctx.Wait(); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); PrintDebugInfo("output data", out_vec); diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc index b998aaa3e689ce..fa134b60e28deb 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc @@ -139,7 +139,7 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { } PrintDebugInfo("input data", init); - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({num1, num2}); ctx.Wait(); @@ -164,7 +164,7 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { ctx.Wait(); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); PrintDebugInfo("output data", out_vec); diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 714dc4e19f9b13..0e4210ea7304ae 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -144,7 +144,7 @@ inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx, try { const auto& runner_mean = paddle::operators::NpuOpRunner( "ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}}); - TensorToVector(mean, dev_ctx, &vec); + paddle::framework::TensorToVector(mean, dev_ctx, &vec); } catch (...) 
{ LOG(WARNING) << "ContainsNan catch exception"; return true; diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc index edbc19eea23b61..3e91220423e6a5 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -146,7 +146,7 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, auto place = ctx.GetPlace(); - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({num1, num2}); ctx.Wait(); @@ -170,7 +170,7 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, ctx.Wait(); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); PrintDebugInfo("output data", out_vec); diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc index 2ea217afb776fd..1ea34c8200333f 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc @@ -133,7 +133,7 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { } PrintDebugInfo("input data", init); - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({num, num}); ctx.Wait(); @@ -159,7 +159,7 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { ctx.Wait(); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); PrintDebugInfo("output data", out_vec); diff --git a/paddle/fluid/operators/collective/c_embedding_op_npu.cc b/paddle/fluid/operators/collective/c_embedding_op_npu.cc index 1ad809dbfc469c..3e96f15d5d3dd6 100644 --- a/paddle/fluid/operators/collective/c_embedding_op_npu.cc +++ b/paddle/fluid/operators/collective/c_embedding_op_npu.cc @@ -71,8 +71,8 @@ void shard_index(const Tensor &table_t, const Tensor &ids_t, int64_t start_idx, #if (CANN_VERSION_CODE >= 503003) Tensor factor_tensor(ids_t.type()); factor_tensor.mutable_data({1}, context.GetPlace()); - TensorFromVector(std::vector{static_cast(start_idx)}, - context.device_context(), &factor_tensor); + paddle::framework::TensorFromVector(std::vector{static_cast(start_idx)}, + context.device_context(), &factor_tensor); sub_runner.SetType("Sub") .AddInput(ids_t) .AddInput(factor_tensor) diff --git a/paddle/fluid/operators/collective/c_identity_op.h b/paddle/fluid/operators/collective/c_identity_op.h index c8577a96174898..a5e28235c168c1 100644 --- a/paddle/fluid/operators/collective/c_identity_op.h +++ b/paddle/fluid/operators/collective/c_identity_op.h @@ -48,7 +48,7 @@ class CIdentityOpKernel : public framework::OpKernel { "The ring_id (%d) for c_identity op must be non-negative.", rid)); out->mutable_data(ctx.GetPlace()); - TensorCopy(*x, out->place(), out); + paddle::framework::TensorCopy(*x, out->place(), out); } }; diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc index 1919b8ee35edf5..d589d0a25e694c 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc @@ -137,7 +137,7 @@ void TestHCCLReduceOp(f::Scope* scope, 
const p::DeviceContext& ctx, int iter) { auto place = ctx.GetPlace(); - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({num1, num2}); ctx.Wait(); @@ -161,7 +161,7 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { ctx.Wait(); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); PrintDebugInfo("output data", out_vec); diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc index 5fa0df97c655f3..db78652f87980e 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc @@ -137,7 +137,7 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { } PrintDebugInfo("input data", init); - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({num1, num2}); ctx.Wait(); @@ -166,7 +166,7 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { } std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); PrintDebugInfo("output data", out_vec); diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc index 45613715b8260c..5778a270f19926 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc @@ -56,9 +56,9 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { init_y.push_back(static_cast(2.0)); } - TensorFromVector(init_x, ctx, tensor_x); + paddle::framework::TensorFromVector(init_x, ctx, tensor_x); tensor_x->Resize({10, 10}); - TensorFromVector(init_y, ctx, tensor_y); + paddle::framework::TensorFromVector(init_y, ctx, tensor_y); tensor_y->Resize({10, 10}); f::AttributeMap attrs; @@ -85,7 +85,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { sync_op->Run(*scope, place); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); // sync op copy auto sync_op2 = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc index a6c5149ed283ad..e7017835686940 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc @@ -136,7 +136,7 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { } std::cout << std::endl; - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({num, num}); ctx.Wait(); @@ -169,7 +169,7 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { // ctx.Wait(); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); EXPECT_EQ(out_vec.size(), init.size()); for (uint32_t i = 0; i < out_vec.size(); i++) { diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc 
index ac1559f87d1f66..2be37cc456b973 100644 --- a/paddle/fluid/operators/collective/checknumeric_npu_test.cc +++ b/paddle/fluid/operators/collective/checknumeric_npu_test.cc @@ -64,7 +64,7 @@ bool Check(T value, int size = 2 * 512 * 8192) { init.push_back(static_cast(value)); } - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); bool result = paddle::operators::ContainsNan(ctx, ctx.stream(), tensor_x); return result; } diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc index c1f55df5e8d860..edd4b18b35a6d3 100644 --- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc @@ -145,7 +145,7 @@ void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx) { } VLOG(3) << "Run op recv_v2"; std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); EXPECT_EQ(out_vec == init, true); diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc index e47ae646b148ec..b2470ab4c0570e 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc @@ -119,7 +119,7 @@ void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx) { std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); int rank_id = atoi(getenv("RANK_ID")); VLOG(3) << "rank id:" << rank_id; - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({num, num}); ctx.Wait(); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/controlflow/bitwise_op.cu b/paddle/fluid/operators/controlflow/bitwise_op.cu index 2f4098c2608220..3a4d5303953ac4 100644 --- a/paddle/fluid/operators/controlflow/bitwise_op.cu +++ b/paddle/fluid/operators/controlflow/bitwise_op.cu @@ -35,8 +35,9 @@ class BinaryBitwiseOpKernel std::vector outs = {out}; const auto& cuda_ctx = ctx.template device_context(); - LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, -1, functor); + paddle::operators::LaunchElementwiseCudaKernel(cuda_ctx, ins, &outs, -1, + functor); } }; @@ -56,8 +57,8 @@ class UnaryBitwiseOpKernel std::vector outs = {out}; const auto& cuda_ctx = ctx.template device_context(); - LaunchSameDimsElementwiseCudaKernel( - cuda_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kUnary, T, T>(cuda_ctx, ins, &outs, functor); } }; diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cu b/paddle/fluid/operators/controlflow/compare_all_op.cu index 64a96ae9e8ee12..54f59c40a205d7 100644 --- a/paddle/fluid/operators/controlflow/compare_all_op.cu +++ b/paddle/fluid/operators/controlflow/compare_all_op.cu @@ -55,8 +55,8 @@ class CompareReduceOpKernel context.template device_context(); std::vector ins = {x, y}; std::vector outs = {&tmp}; - LaunchSameDimsElementwiseCudaKernel( - cuda_ctx, ins, &outs, Functor()); + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kBinary, T, bool>(cuda_ctx, ins, &outs, Functor()); // Reduce by 'bitwise and' operator std::vector reduce_dims; diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu index fc7dce208c4869..f03a11d906f4e7 100644 --- 
a/paddle/fluid/operators/controlflow/compare_op.cu +++ b/paddle/fluid/operators/controlflow/compare_op.cu @@ -35,7 +35,8 @@ class CompareOpKernel ctx.template device_context(); int axis = PackTensorsIntoVector(ctx, &ins, &outs); - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( cuda_ctx, ins, &outs, axis, functor); } }; diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index 99b16d9b692538..ed4995d4fbeda2 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -39,12 +39,13 @@ static void DataCopy(const framework::LoDTensor &src_item, : paddle::platform::MKLDNNDeviceContext::tls() .get_cur_paddle_data_layout(), src_item, &out, platform::CPUPlace()); - TensorCopySync(out, platform::CPUPlace(), dst_item); + paddle::framework::TensorCopySync(out, platform::CPUPlace(), dst_item); } else { - TensorCopySync(src_item, platform::CPUPlace(), dst_item); + paddle::framework::TensorCopySync(src_item, platform::CPUPlace(), + dst_item); } #else - TensorCopySync(src_item, platform::CPUPlace(), dst_item); + paddle::framework::TensorCopySync(src_item, platform::CPUPlace(), dst_item); #endif } else { // Not copy, if the src tensor is empty. diff --git a/paddle/fluid/operators/controlflow/fetch_v2_op.cc b/paddle/fluid/operators/controlflow/fetch_v2_op.cc index 9bb9e481034bd5..d7f74c44bd522a 100644 --- a/paddle/fluid/operators/controlflow/fetch_v2_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_v2_op.cc @@ -50,12 +50,13 @@ static void DeepCopy(const framework::LoDTensor &src_item, : paddle::platform::MKLDNNDeviceContext::tls() .get_cur_paddle_data_layout(), src_item, &out, platform::CPUPlace()); - TensorCopySync(out, platform::CPUPlace(), dst_item); + paddle::framework::TensorCopySync(out, platform::CPUPlace(), dst_item); } else { - TensorCopySync(src_item, platform::CPUPlace(), dst_item); + paddle::framework::TensorCopySync(src_item, platform::CPUPlace(), + dst_item); } #else - TensorCopySync(src_item, platform::CPUPlace(), dst_item); + paddle::framework::TensorCopySync(src_item, platform::CPUPlace(), dst_item); #endif } else { // Not copy, if the src tensor is empty. 
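
Note on the recurring pattern in the hunks above and below: this patch does two things in lockstep. Headers that only needed a forward declaration swap `class Tensor;` in `paddle::framework` for a `pten::DenseTensor` forward declaration, and call sites of helpers such as `TensorCopy`, `TensorCopySync`, `TensorFromVector`, `TensorToVector`, and the `Launch*ElementwiseCudaKernel` functions gain full namespace qualification. The two go together: once `framework::Tensor` stops being a class of its own and the real argument type lives in namespace `pten`, argument-dependent lookup (ADL) searches `pten` instead of `paddle::framework`, so the formerly unqualified calls either stop resolving or become ambiguous. The standalone sketch below (not Paddle code; it assumes `framework::Tensor` becomes an alias of `pten::DenseTensor`, which is what the header changes suggest, and every name in it is illustrative) reproduces the effect:

    // adl_sketch.cc -- why the unqualified helper calls stop compiling.
    namespace pten {
    class DenseTensor {};  // the concrete tensor type after the refactor
    }  // namespace pten

    namespace paddle {
    namespace framework {
    // Tensor is now only an alias. ADL is computed from the underlying
    // type pten::DenseTensor, so it no longer pulls in this namespace.
    using Tensor = pten::DenseTensor;

    // Stand-in for the real TensorCopySync(); the body is irrelevant here.
    inline void TensorCopySync(const Tensor&, int /*place*/, Tensor*) {}
    }  // namespace framework
    }  // namespace paddle

    namespace paddle {
    namespace operators {
    inline void Demo() {
      framework::Tensor src, dst;
      // TensorCopySync(src, 0, &dst);  // error: ordinary lookup searches
      //                                // operators -> paddle -> ::, and ADL
      //                                // searches pten -- never framework.
      paddle::framework::TensorCopySync(src, 0, &dst);  // OK: qualified.
    }
    }  // namespace operators
    }  // namespace paddle

    int main() { paddle::operators::Demo(); }

The same reasoning plausibly covers the `paddle::operators::Launch*ElementwiseCudaKernel` qualifications: those helpers are called with vectors of tensor pointers, so once ADL starts searching `pten`, spelling out the namespace keeps the calls unambiguous against any same-named helpers reachable there.
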
diff --git a/paddle/fluid/operators/controlflow/logical_op.cu b/paddle/fluid/operators/controlflow/logical_op.cu index 4a3fc6c895174c..53261160205350 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cu +++ b/paddle/fluid/operators/controlflow/logical_op.cu @@ -33,10 +33,12 @@ class BinaryLogicalOpKernel int axis = PackTensorsIntoVector(ctx, &ins, &outs); if (ins.size() == 1) { - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( cuda_ctx, ins, &outs, axis, functor); } else { - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( cuda_ctx, ins, &outs, axis, functor); } } diff --git a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc index c4451c3b583c72..8e46c7acf09181 100644 --- a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc +++ b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc @@ -55,7 +55,7 @@ class WriteToArrayOp : public ArrayOp { platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); - TensorCopy(x_tensor, place, dev_ctx, out_tensor); + paddle::framework::TensorCopy(x_tensor, place, dev_ctx, out_tensor); } else { VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " "nothing has been written to output array[" diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h index 8ef12ca05e36a6..46c61842914a78 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.h +++ b/paddle/fluid/operators/controlflow/while_op_helper.h @@ -22,9 +22,12 @@ #include "paddle/fluid/operators/controlflow/op_variant.h" #include "paddle/fluid/platform/variant.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { -class Tensor; class ProgramDesc; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/copy_cross_scope_op.cc b/paddle/fluid/operators/copy_cross_scope_op.cc index 721354954c7035..08f900884d612f 100644 --- a/paddle/fluid/operators/copy_cross_scope_op.cc +++ b/paddle/fluid/operators/copy_cross_scope_op.cc @@ -65,7 +65,8 @@ class CopyCrossScopeOp : public framework::OperatorBase { auto id_tensor = id_var->GetMutable(); auto it = scope.kids().begin(); framework::Tensor cpu_id_tensor; - TensorCopySync(*id_tensor, platform::CPUPlace(), &cpu_id_tensor); + paddle::framework::TensorCopySync(*id_tensor, platform::CPUPlace(), + &cpu_id_tensor); auto id_value = cpu_id_tensor.data(); for (auto i = 0; i < *id_value; i++) { it++; @@ -87,7 +88,8 @@ class CopyCrossScopeOp : public framework::OperatorBase { x_name)); auto dst_tensor = dst_var->GetMutable(); auto main_tensor = main_var->GetMutable(); - TensorCopySync(*dst_tensor, main_tensor->place(), main_tensor); + paddle::framework::TensorCopySync(*dst_tensor, main_tensor->place(), + main_tensor); } return; } @@ -107,7 +109,8 @@ class CopyCrossScopeOp : public framework::OperatorBase { "No variable with name %s found in destination scope.", x_name)); auto src_tensor = source_var->GetMutable(); auto dst_tensor = dst_var->GetMutable(); - TensorCopySync(*src_tensor, dst_tensor->place(), dst_tensor); + paddle::framework::TensorCopySync(*src_tensor, dst_tensor->place(), + dst_tensor); if (ToM) { auto* main_var = scope.FindVar(x_name); @@ -116,7 +119,8 @@ class CopyCrossScopeOp : public framework::OperatorBase { platform::errors::NotFound( "No variable with name %s found in destination scope.", x_name)); 
auto main_tensor = main_var->GetMutable(); - TensorCopySync(*dst_tensor, main_tensor->place(), main_tensor); + paddle::framework::TensorCopySync(*dst_tensor, main_tensor->place(), + main_tensor); } } }; diff --git a/paddle/fluid/operators/copy_cross_scope_test.cc b/paddle/fluid/operators/copy_cross_scope_test.cc index 37bc32d745edab..5f951ad337e8e7 100644 --- a/paddle/fluid/operators/copy_cross_scope_test.cc +++ b/paddle/fluid/operators/copy_cross_scope_test.cc @@ -43,18 +43,18 @@ void Compare1(f::Scope* scope, const p::DeviceContext& ctx, auto var_x = scope->Var("tmp"); auto x = var_x->GetMutable(); std::vector main_x = {1.0}; - TensorFromVector(main_x, ctx, x); + paddle::framework::TensorFromVector(main_x, ctx, x); auto var_id = scope->Var("Id"); auto id = var_id->GetMutable(); std::vector main_id = {1}; - TensorFromVector(main_id, ctx, id); + paddle::framework::TensorFromVector(main_id, ctx, id); for (int i = 0; i < 3; i++) { auto& child_scope = scope->NewScope(); auto child_var = child_scope.Var("tmp"); auto tensor_x = child_var->GetMutable(); std::vector init_x = {static_cast(i)}; - TensorFromVector(init_x, ctx, tensor_x); + paddle::framework::TensorFromVector(init_x, ctx, tensor_x); } ctx.Wait(); @@ -78,7 +78,7 @@ void Compare1(f::Scope* scope, const p::DeviceContext& ctx, auto* tensor_out = dst_var->GetMutable(); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); int expected = 1; EXPECT_EQ(static_cast(out_vec[0]), expected); @@ -91,18 +91,18 @@ void Compare2(f::Scope* scope, const p::DeviceContext& ctx, auto var_x = scope->Var("tmp"); auto x = var_x->GetMutable(); std::vector main_x = {1.0}; - TensorFromVector(main_x, ctx, x); + paddle::framework::TensorFromVector(main_x, ctx, x); auto var_id = scope->Var("Id"); auto id = var_id->GetMutable(); std::vector main_id = {0}; - TensorFromVector(main_id, ctx, id); + paddle::framework::TensorFromVector(main_id, ctx, id); for (int i = 0; i < 3; i++) { auto& child_scope = scope->NewScope(); auto child_var = child_scope.Var("tmp"); auto tensor_x = child_var->GetMutable(); std::vector init_x = {static_cast(i)}; - TensorFromVector(init_x, ctx, tensor_x); + paddle::framework::TensorFromVector(init_x, ctx, tensor_x); } ctx.Wait(); @@ -121,7 +121,7 @@ void Compare2(f::Scope* scope, const p::DeviceContext& ctx, auto* tensor_out = dst_var->GetMutable(); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); int expected = 0; EXPECT_EQ(static_cast(out_vec[0]), expected); diff --git a/paddle/fluid/operators/crop_op_npu.cc b/paddle/fluid/operators/crop_op_npu.cc index 4096d872cd8e1e..013ad5dd8cb86c 100644 --- a/paddle/fluid/operators/crop_op_npu.cc +++ b/paddle/fluid/operators/crop_op_npu.cc @@ -29,7 +29,8 @@ class CropNPUKernel : public framework::OpKernel { std::vector offset_list; if (ctx.HasInput("Offsets")) { auto* offsets_tensor = ctx.Input("Offsets"); - TensorToVector(*offsets_tensor, ctx.device_context(), &offset_list); + paddle::framework::TensorToVector(*offsets_tensor, ctx.device_context(), + &offset_list); if (offset_list.empty()) { offset_list.resize(x->dims().size(), 0); } diff --git a/paddle/fluid/operators/crop_tensor_op.h b/paddle/fluid/operators/crop_tensor_op.h index 54666c8482c021..9ee10e49fcb5ac 100644 --- a/paddle/fluid/operators/crop_tensor_op.h +++ b/paddle/fluid/operators/crop_tensor_op.h @@ -42,7 +42,7 @@ inline std::vector get_new_data( tensor->dims())); if 
(platform::is_gpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_data.push_back(static_cast(*temp.data())); } else { @@ -111,7 +111,8 @@ static std::vector GetShape(const framework::ExecutionContext& ctx) { auto* shape_data = shape_tensor->data(); framework::Tensor cpu_shape_tensor; if (platform::is_gpu_place(shape_tensor->place())) { - TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); + paddle::framework::TensorCopySync(*shape_tensor, platform::CPUPlace(), + &cpu_shape_tensor); shape_data = cpu_shape_tensor.data(); } res = std::vector(shape_data, shape_data + shape_tensor->numel()); diff --git a/paddle/fluid/operators/dequantize_abs_max_op.h b/paddle/fluid/operators/dequantize_abs_max_op.h index 0d9d20fc120ca4..6f4a76f9b4af1c 100644 --- a/paddle/fluid/operators/dequantize_abs_max_op.h +++ b/paddle/fluid/operators/dequantize_abs_max_op.h @@ -20,11 +20,9 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -} // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/dequantize_log_op.h b/paddle/fluid/operators/dequantize_log_op.h index 67ce9cc84d3a85..75b7f04645fc3c 100644 --- a/paddle/fluid/operators/dequantize_log_op.h +++ b/paddle/fluid/operators/dequantize_log_op.h @@ -19,11 +19,9 @@ limitations under the License. */ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/op_registry.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -} // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/dequeue_op.cc b/paddle/fluid/operators/dequeue_op.cc index f8ab97040ee7c2..fb5d53dacf0ed6 100644 --- a/paddle/fluid/operators/dequeue_op.cc +++ b/paddle/fluid/operators/dequeue_op.cc @@ -70,7 +70,8 @@ class DequeueOp : public framework::OperatorBase { "Op(dequeue), but poped %d element.", lod_tensor_vec.size())); for (size_t j = 0; j < lod_tensor_vec.size(); ++j) { - TensorCopySync(lod_tensor_vec[j], dev_place, out_tensor); + paddle::framework::TensorCopySync(lod_tensor_vec[j], dev_place, + out_tensor); } } } diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index 60cb16ce6c0470..eddb25d57b47cc 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -93,7 +93,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { auto score_in = score_ins[i]; if (multi_rois_num.size() > 0) { framework::Tensor temp; - TensorCopySync(*multi_rois_num[i], platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*multi_rois_num[i], + platform::CPUPlace(), &temp); const int* length_in = temp.data(); lod_size = multi_rois_num[i]->numel(); for (size_t n = 0; n < lod_size; ++n) { diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index a9a6dcea1bbe5f..355a35d4dd21b1 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ 
b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -200,7 +200,8 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { } if (multi_rois_num.size() > 0) { Tensor* rois_num_t = multi_rois_num[i]; - TensorCopySync(sub_lod, dev_ctx.GetPlace(), rois_num_t); + paddle::framework::TensorCopySync(sub_lod, dev_ctx.GetPlace(), + rois_num_t); rois_num_t->Resize({lod_size}); } framework::LoD lod; diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h index e3c125b0a68885..f1b454913f7424 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h @@ -33,7 +33,8 @@ inline std::vector GetLodFromRoisNum(const Tensor* rois_num) { auto* rois_num_data = rois_num->data(); Tensor cpu_tensor; if (platform::is_gpu_place(rois_num->place())) { - TensorCopySync(*rois_num, platform::CPUPlace(), &cpu_tensor); + paddle::framework::TensorCopySync(*rois_num, platform::CPUPlace(), + &cpu_tensor); rois_num_data = cpu_tensor.data(); } rois_lod.push_back(static_cast(0)); diff --git a/paddle/fluid/operators/detection/locality_aware_nms_op.cc b/paddle/fluid/operators/detection/locality_aware_nms_op.cc index 8422a1fa6ccbfd..8cc0ebcab61f7b 100644 --- a/paddle/fluid/operators/detection/locality_aware_nms_op.cc +++ b/paddle/fluid/operators/detection/locality_aware_nms_op.cc @@ -320,8 +320,10 @@ class LocalityAwareNMSKernel : public framework::OpKernel { LoDTensor scores; LoDTensor boxes; - TensorCopySync(*scores_input, platform::CPUPlace(), &scores); - TensorCopySync(*boxes_input, platform::CPUPlace(), &boxes); + paddle::framework::TensorCopySync(*scores_input, platform::CPUPlace(), + &scores); + paddle::framework::TensorCopySync(*boxes_input, platform::CPUPlace(), + &boxes); std::vector>> all_indices; std::vector batch_starts = {0}; int64_t batch_size = score_dims[0]; diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index 2ddcc7a06f6797..fbf631f75b61f9 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -384,7 +384,8 @@ class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel { roi2image_data[j] = i; } } - TensorCopySync(roi2image, ctx.GetPlace(), &roi2image_dev); + paddle::framework::TensorCopySync(roi2image, ctx.GetPlace(), + &roi2image_dev); int out_size = rois_num * transformed_height * transformed_width * channels; auto stream = ctx.cuda_device_context().stream(); diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h index 33fa7a092768c4..d7db7dddce3887 100644 --- a/paddle/fluid/operators/dropout_impl_util.h +++ b/paddle/fluid/operators/dropout_impl_util.h @@ -30,7 +30,8 @@ inline void GetSeedDataAndIncrement(const platform::CUDADeviceContext& dev_ctx, if (seed) { framework::Tensor seed_cpu_tensor; - TensorCopySync(*seed, platform::CPUPlace(), &seed_cpu_tensor); + paddle::framework::TensorCopySync(*seed, platform::CPUPlace(), + &seed_cpu_tensor); *seed_data = static_cast(seed_cpu_tensor.data()[0]); *increment = offset; } else if (gen_cuda->GetIsInitPy() && (!is_fix_seed)) { diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc index cf6401db926007..98a38a07dadaac 100644 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ 
b/paddle/fluid/operators/dropout_op_npu.cc @@ -73,7 +73,8 @@ class DropoutNPUKernel : public framework::OpKernel { float keep_prob = 1. - dropout_prob; if (seed_tensor) { std::vector seed_data; - TensorToVector(*seed_tensor, ctx.device_context(), &seed_data); + paddle::framework::TensorToVector(*seed_tensor, ctx.device_context(), + &seed_data); seed = seed_data[0]; } else { seed = ctx.Attr("fix_seed") ? ctx.Attr("seed") : 0; diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc index 3e401d1c4f9f4f..5c9be588419e34 100644 --- a/paddle/fluid/operators/dropout_op_test.cc +++ b/paddle/fluid/operators/dropout_op_test.cc @@ -45,7 +45,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { init.push_back(1.0); } - TensorFromVector(init, ctx, tensor); + paddle::framework::TensorFromVector(init, ctx, tensor); auto place = ctx.GetPlace(); auto out_var = scope->Var("Out"); @@ -70,7 +70,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { dropout_op->Run(*scope, place); std::vector out_vec; - TensorToVector(*out_tensor, ctx, &out_vec); + paddle::framework::TensorToVector(*out_tensor, ctx, &out_vec); std::vector std_out = { 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu index 3202b0a7d254bb..9b146fe7279dea 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu @@ -28,7 +28,8 @@ class ElementwiseFloorDivKernel ctx.template device_context(); int axis = PackTensorsIntoVector(ctx, &ins, &outs); - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( cuda_ctx, ins, &outs, axis, FloorDivFunctor()); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cu b/paddle/fluid/operators/elementwise/elementwise_max_op.cu index eaf77744285657..7433c505f472a2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cu @@ -28,8 +28,9 @@ class ElementwiseMaxKernel ctx.template device_context(); int axis = PackTensorsIntoVector(ctx, &ins, &outs); - LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, axis, MaxFunctor()); + paddle::operators::LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, axis, + MaxFunctor()); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cu b/paddle/fluid/operators/elementwise/elementwise_min_op.cu index a733b4a66f1294..5af985567d898d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cu @@ -28,8 +28,9 @@ class ElementwiseMinKernel ctx.template device_context(); int axis = PackTensorsIntoVector(ctx, &ins, &outs); - LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, axis, MinFunctor()); + paddle::operators::LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, axis, + MinFunctor()); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu index 4ef957c617870e..379684aa9ba63b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu @@ -30,8 +30,9 @@ class ElementwiseModKernel const auto& cuda_ctx = ctx.template device_context(); int axis = PackTensorsIntoVector(ctx, &ins, &outs); - LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, 
ModFunctor()); + paddle::operators::LaunchElementwiseCudaKernel(cuda_ctx, ins, &outs, + axis, ModFunctor()); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 5ece5cadc603fa..86a803106347d2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -38,7 +38,8 @@ class ElementwiseMulKernel std::vector outs; int axis = PackTensorsIntoVector(ctx, &ins, &outs, &x_for_selectedrows); - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( cuda_ctx, ins, &outs, axis, MulFunctor()); } else if (x_var->IsType()) { auto* x_lod = ctx.Input("X"); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 7cd04318d3f49c..3fddb553e117e6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -211,8 +211,9 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx, const auto &dev_ctx = ctx.template device_context(); - LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, axis, func); + paddle::operators::LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, + axis, func); #endif return; } @@ -1271,8 +1272,8 @@ void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, outs = {&tmp_dx, &tmp_dy}; } - LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, - axis, func); + paddle::operators::LaunchElementwiseCudaKernel( + dev_ctx, ins, &outs, axis, func); if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) { ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); @@ -1301,7 +1302,8 @@ void GetGradXOrYOut(const platform::CUDADeviceContext &dev_ctx, outs = {dxy}; } - LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, axis, func); + paddle::operators::LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, + axis, func); if (dxy->dims() != dout->dims()) { ReduceWrapper(dev_ctx, axis, &tmp_dxy, dxy); } diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index f06dbd26873a60..3cd9729d3443c5 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -57,9 +57,9 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx, init_y.push_back(static_cast(2.0)); } - TensorFromVector(init_x, ctx, tensor_x); + paddle::framework::TensorFromVector(init_x, ctx, tensor_x); tensor_x->Resize({10, 10}); - TensorFromVector(init_y, ctx, tensor_y); + paddle::framework::TensorFromVector(init_y, ctx, tensor_y); tensor_y->Resize({10, 10}); auto place = ctx.GetPlace(); @@ -74,7 +74,7 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx, op->Run(*scope, place); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); float expected; @@ -116,7 +116,7 @@ void CompareGrad(f::Scope *scope, const p::DeviceContext &ctx, init_dout.push_back(static_cast(1.0)); } - TensorFromVector(init_dout, ctx, tensor_dout); + paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout); tensor_dout->Resize({2, 3, 5}); // run @@ -129,10 +129,10 @@ void CompareGrad(f::Scope *scope, const p::DeviceContext &ctx, op->Run(*scope, place); std::vector dx_vec; - TensorToVector(*tensor_dx, ctx, &dx_vec); + paddle::framework::TensorToVector(*tensor_dx, 
ctx, &dx_vec); std::vector dy_vec; - TensorToVector(*tensor_dy, ctx, &dy_vec); + paddle::framework::TensorToVector(*tensor_dy, ctx, &dy_vec); ctx.Wait(); float expected_x, expected_y; diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu index 722a53d188061b..1b24d5be3442fd 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu @@ -27,8 +27,9 @@ class ElementwisePowKernel ctx.template device_context(); int axis = PackTensorsIntoVector(ctx, &ins, &outs); - LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, PowFunctor()); + paddle::operators::LaunchElementwiseCudaKernel(cuda_ctx, ins, &outs, + axis, PowFunctor()); } }; diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h index 809bad1d6c1eec..05cd893b057af7 100644 --- a/paddle/fluid/operators/expand_op.h +++ b/paddle/fluid/operators/expand_op.h @@ -32,18 +32,21 @@ inline std::vector get_expand_times( auto* expand_data = expand_tensor->data(); framework::Tensor cpu_expand_tensor; if (platform::is_gpu_place(expand_tensor->place())) { - TensorCopySync(*expand_tensor, platform::CPUPlace(), &cpu_expand_tensor); + paddle::framework::TensorCopySync(*expand_tensor, platform::CPUPlace(), + &cpu_expand_tensor); expand_data = cpu_expand_tensor.data(); } #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(expand_tensor->place())) { - TensorCopySync(*expand_tensor, platform::CPUPlace(), &cpu_expand_tensor); + paddle::framework::TensorCopySync(*expand_tensor, platform::CPUPlace(), + &cpu_expand_tensor); expand_data = cpu_expand_tensor.data(); } #endif #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(expand_tensor->place())) { - TensorCopySync(*expand_tensor, platform::CPUPlace(), &cpu_expand_tensor); + paddle::framework::TensorCopySync(*expand_tensor, platform::CPUPlace(), + &cpu_expand_tensor); expand_data = cpu_expand_tensor.data(); } #endif @@ -61,13 +64,13 @@ inline std::vector get_expand_times( auto tensor = list_expand_times_tensor[i]; if (platform::is_gpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_epxand_times.push_back(*temp.data()); } #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(tensor->place())) { // NOLINT framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_epxand_times.push_back(*temp.data()); } #endif diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc index 880eb341f2093b..7de2bf2e6990db 100644 --- a/paddle/fluid/operators/expand_op_npu_test.cc +++ b/paddle/fluid/operators/expand_op_npu_test.cc @@ -46,8 +46,9 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { auto expand_times_t = expand_times->GetMutable(); auto place = ctx.GetPlace(); - TensorFromVector(std::vector(3 * 1 * 7, 1), ctx, in_t); - TensorFromVector(std::vector({1, 10, 1}), ctx, expand_times_t); + paddle::framework::TensorFromVector(std::vector(3 * 1 * 7, 1), ctx, in_t); + paddle::framework::TensorFromVector(std::vector({1, 10, 1}), ctx, + expand_times_t); in_t->Resize(f::make_ddim({3, 1, 7})); expand_times_t->Resize(f::make_ddim({3})); diff --git a/paddle/fluid/operators/expand_v2_op.h b/paddle/fluid/operators/expand_v2_op.h index fd7c6b3f27e75a..dd1625013444b6 
100644 --- a/paddle/fluid/operators/expand_v2_op.h +++ b/paddle/fluid/operators/expand_v2_op.h @@ -33,18 +33,21 @@ inline std::vector get_expand_shape( auto* shape_data = shape_tensor->data(); framework::Tensor cpu_shape_tensor; if (platform::is_gpu_place(shape_tensor->place())) { - TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); + paddle::framework::TensorCopySync(*shape_tensor, platform::CPUPlace(), + &cpu_shape_tensor); shape_data = cpu_shape_tensor.data(); } #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(shape_tensor->place())) { - TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); + paddle::framework::TensorCopySync(*shape_tensor, platform::CPUPlace(), + &cpu_shape_tensor); shape_data = cpu_shape_tensor.data(); } #endif #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(shape_tensor->place())) { - TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); + paddle::framework::TensorCopySync(*shape_tensor, platform::CPUPlace(), + &cpu_shape_tensor); shape_data = cpu_shape_tensor.data(); } #endif @@ -62,20 +65,20 @@ inline std::vector get_expand_shape( auto tensor = list_expand_shapes_tensor[i]; if (platform::is_gpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_epxand_shape.push_back(*temp.data()); } #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(tensor->place())) { // NOLINT framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_epxand_shape.push_back(*temp.data()); } #endif #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(tensor->place())) { // NOLINT framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_epxand_shape.push_back(*temp.data()); } #endif diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu index fe631500a3de52..9f44c39a92c5ef 100644 --- a/paddle/fluid/operators/feed_forward_test.cu +++ b/paddle/fluid/operators/feed_forward_test.cu @@ -409,9 +409,9 @@ class TestFeedForward { void CheckOut(const T diff, bool is_relative_atol = false) { std::vector out(size_output_); std::vector bias_out(size_output_); - TensorToVector(out_, *ctx_, &out); + paddle::framework::TensorToVector(out_, *ctx_, &out); if (has_bias_) { - TensorToVector(bias_out_, *ctx_, &bias_out); + paddle::framework::TensorToVector(bias_out_, *ctx_, &bias_out); } ctx_->Wait(); @@ -437,7 +437,7 @@ class TestFeedForward { // check backward correctness between baseline and results of feedforward. 
void CheckGrad(const T diff, bool is_relative_atol = false) { std::vector h_dinput(size_src_); - TensorToVector(dinput_, *ctx_, &h_dinput); + paddle::framework::TensorToVector(dinput_, *ctx_, &h_dinput); for (int i = 0; i < size_src_; i++) { if (is_relative_atol) { EXPECT_LT( @@ -448,7 +448,7 @@ class TestFeedForward { } } std::vector h_dweight(size_weight_); - TensorToVector(dweight_, *ctx_, &h_dweight); + paddle::framework::TensorToVector(dweight_, *ctx_, &h_dweight); for (int i = 0; i < size_weight_; i++) { if (is_relative_atol) { EXPECT_LT(std::abs((h_dweight[i] - base_dweight_vec_[i]) / @@ -460,7 +460,7 @@ class TestFeedForward { } if (has_bias_) { std::vector h_dbias(size_bias_); - TensorToVector(dbias_, *ctx_, &h_dbias); + paddle::framework::TensorToVector(dbias_, *ctx_, &h_dbias); for (int i = 0; i < size_bias_; i++) { if (is_relative_atol) { EXPECT_LT( diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index 17c7321122b174..32cd07c916b330 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -81,7 +81,8 @@ class FillConstantKernel : public framework::OpKernel { auto tmp_place = value_tensor->place(); if (platform::is_gpu_place(tmp_place) || platform::is_xpu_place(tmp_place)) { - TensorCopySync(*value_tensor, platform::CPUPlace(), &cpu_tensor); + paddle::framework::TensorCopySync(*value_tensor, platform::CPUPlace(), + &cpu_tensor); tensor_data = cpu_tensor.data(); } value = tensor_data[0]; diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h index e0873608fa2814..b9b881cf83e0bd 100644 --- a/paddle/fluid/operators/fused/attn_gemm.h +++ b/paddle/fluid/operators/fused/attn_gemm.h @@ -67,7 +67,8 @@ class AttnMatMul { ins.emplace_back(bias); outs.emplace_back(bias_out); int elewise_add_axis = -1; - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( dev_ctx_, ins, &outs, elewise_add_axis, AddFunctor()); } } diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index c5995fe3554b4e..74307c3ba79175 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -164,11 +164,11 @@ void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, scope.Var("ReserveSpace")->GetMutable(); auto place = ctx.GetPlace(); - TensorCopySync(cpu_x, place, x); - TensorCopySync(cpu_scale, place, scale); - TensorCopySync(cpu_bias, place, bias); - TensorCopySync(*cpu_mean, place, mean); - TensorCopySync(*cpu_var, place, var); + paddle::framework::TensorCopySync(cpu_x, place, x); + paddle::framework::TensorCopySync(cpu_scale, place, scale); + paddle::framework::TensorCopySync(cpu_bias, place, bias); + paddle::framework::TensorCopySync(*cpu_mean, place, mean); + paddle::framework::TensorCopySync(*cpu_var, place, var); int64_t channels = x->dims()[3]; scale->Resize({channels}); @@ -195,11 +195,13 @@ void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, attrs); op->Run(scope, ctx.GetPlace()); - TensorCopySync(*y, platform::CPUPlace(), cpu_y); - TensorCopySync(*mean, platform::CPUPlace(), cpu_mean); - TensorCopySync(*var, platform::CPUPlace(), cpu_var); - TensorCopySync(*saved_mean, platform::CPUPlace(), cpu_saved_mean); - TensorCopySync(*saved_var, platform::CPUPlace(), cpu_saved_var); + paddle::framework::TensorCopySync(*y, platform::CPUPlace(), cpu_y); + 
paddle::framework::TensorCopySync(*mean, platform::CPUPlace(), cpu_mean); + paddle::framework::TensorCopySync(*var, platform::CPUPlace(), cpu_var); + paddle::framework::TensorCopySync(*saved_mean, platform::CPUPlace(), + cpu_saved_mean); + paddle::framework::TensorCopySync(*saved_var, platform::CPUPlace(), + cpu_saved_var); // reserved_space will stay on GPU and used in grad op. saved_reserve_space->ShareDataWith(*reserve_space); } @@ -226,12 +228,12 @@ void ComputeFusedBNAddReluForward(const platform::CUDADeviceContext &ctx, scope.Var("ReserveSpace")->GetMutable(); auto place = ctx.GetPlace(); - TensorCopySync(cpu_x, place, x); - TensorCopySync(cpu_z, place, z); - TensorCopySync(cpu_scale, place, scale); - TensorCopySync(cpu_bias, place, bias); - TensorCopySync(*cpu_mean, place, mean); - TensorCopySync(*cpu_var, place, var); + paddle::framework::TensorCopySync(cpu_x, place, x); + paddle::framework::TensorCopySync(cpu_z, place, z); + paddle::framework::TensorCopySync(cpu_scale, place, scale); + paddle::framework::TensorCopySync(cpu_bias, place, bias); + paddle::framework::TensorCopySync(*cpu_mean, place, mean); + paddle::framework::TensorCopySync(*cpu_var, place, var); int64_t channels = x->dims()[3]; scale->Resize({channels}); @@ -253,11 +255,13 @@ void ComputeFusedBNAddReluForward(const platform::CUDADeviceContext &ctx, attrs); op->Run(scope, ctx.GetPlace()); - TensorCopySync(*y, platform::CPUPlace(), cpu_y); - TensorCopySync(*mean, platform::CPUPlace(), cpu_mean); - TensorCopySync(*var, platform::CPUPlace(), cpu_var); - TensorCopySync(*saved_mean, platform::CPUPlace(), cpu_saved_mean); - TensorCopySync(*saved_var, platform::CPUPlace(), cpu_saved_var); + paddle::framework::TensorCopySync(*y, platform::CPUPlace(), cpu_y); + paddle::framework::TensorCopySync(*mean, platform::CPUPlace(), cpu_mean); + paddle::framework::TensorCopySync(*var, platform::CPUPlace(), cpu_var); + paddle::framework::TensorCopySync(*saved_mean, platform::CPUPlace(), + cpu_saved_mean); + paddle::framework::TensorCopySync(*saved_var, platform::CPUPlace(), + cpu_saved_var); // reserved_space will stay on GPU and used in grad op. 
saved_reserve_space->ShareDataWith(*reserve_space); } @@ -285,13 +289,13 @@ void ComputeFusedBNAddReluBackward( auto *dbias = scope.Var("Bias@GRAD")->GetMutable(); auto place = ctx.GetPlace(); - TensorCopySync(cpu_x, place, x); - TensorCopySync(cpu_y, place, y); - TensorCopySync(cpu_dy, place, dy); - TensorCopySync(cpu_scale, place, scale); - TensorCopySync(cpu_bias, place, bias); - TensorCopySync(cpu_saved_mean, place, saved_mean); - TensorCopySync(cpu_saved_var, place, saved_var); + paddle::framework::TensorCopySync(cpu_x, place, x); + paddle::framework::TensorCopySync(cpu_y, place, y); + paddle::framework::TensorCopySync(cpu_dy, place, dy); + paddle::framework::TensorCopySync(cpu_scale, place, scale); + paddle::framework::TensorCopySync(cpu_bias, place, bias); + paddle::framework::TensorCopySync(cpu_saved_mean, place, saved_mean); + paddle::framework::TensorCopySync(cpu_saved_var, place, saved_var); reserve_space->ShareDataWith(saved_reserve_space); int64_t channels = x->dims()[3]; @@ -324,10 +328,10 @@ void ComputeFusedBNAddReluBackward( attrs); op->Run(scope, ctx.GetPlace()); - TensorCopySync(*dx, platform::CPUPlace(), cpu_dx); - TensorCopySync(*dz, platform::CPUPlace(), cpu_dz); - TensorCopySync(*dscale, platform::CPUPlace(), cpu_dscale); - TensorCopySync(*dbias, platform::CPUPlace(), cpu_dbias); + paddle::framework::TensorCopySync(*dx, platform::CPUPlace(), cpu_dx); + paddle::framework::TensorCopySync(*dz, platform::CPUPlace(), cpu_dz); + paddle::framework::TensorCopySync(*dscale, platform::CPUPlace(), cpu_dscale); + paddle::framework::TensorCopySync(*dbias, platform::CPUPlace(), cpu_dbias); } template @@ -527,10 +531,10 @@ class CudnnBNAddReluTester { ComputeSumAndSquareSum(cpu_x, &cpu_sum, &cpu_sum_of_square); auto place = ctx.GetPlace(); - TensorCopySync(cpu_sum, place, sum); - TensorCopySync(cpu_sum_of_square, place, sum_of_square); - TensorCopySync(cpu_bn_scale, place, bn_scale); - TensorCopySync(cpu_bn_bias, place, bn_bias); + paddle::framework::TensorCopySync(cpu_sum, place, sum); + paddle::framework::TensorCopySync(cpu_sum_of_square, place, sum_of_square); + paddle::framework::TensorCopySync(cpu_bn_scale, place, bn_scale); + paddle::framework::TensorCopySync(cpu_bn_bias, place, bn_bias); bn_scale->Resize({1, 1, 1, channels_}); bn_bias->Resize({1, 1, 1, channels_}); @@ -572,9 +576,9 @@ class CudnnBNAddReluTester { framework::Tensor bn_bias_z; auto place = ctx.GetPlace(); - TensorCopySync(cpu_x_, place, &x); + paddle::framework::TensorCopySync(cpu_x_, place, &x); if (fuse_add_ || has_shortcut_) { - TensorCopySync(cpu_z_, place, &z); + paddle::framework::TensorCopySync(cpu_z_, place, &z); } framework::Tensor mean_x; @@ -595,12 +599,12 @@ class CudnnBNAddReluTester { framework::Tensor bitmask; InitMeanVar(cpu_mean_x, cpu_var_x, cpu_saved_mean_x, cpu_saved_var_x); - TensorCopySync(*cpu_mean_x, place, &mean_x); - TensorCopySync(*cpu_var_x, place, &var_x); + paddle::framework::TensorCopySync(*cpu_mean_x, place, &mean_x); + paddle::framework::TensorCopySync(*cpu_var_x, place, &var_x); if (has_shortcut_) { InitMeanVar(cpu_mean_z, cpu_var_z, cpu_saved_mean_z, cpu_saved_var_z); - TensorCopySync(*cpu_mean_z, place, &mean_z); - TensorCopySync(*cpu_var_z, place, &var_z); + paddle::framework::TensorCopySync(*cpu_mean_z, place, &mean_z); + paddle::framework::TensorCopySync(*cpu_var_z, place, &var_z); } // 1. 
BN Stats Finalize @@ -634,18 +638,24 @@ class CudnnBNAddReluTester { sbar_op.Forward(ctx, x, equiv_scale_x, equiv_bias_x, &z, &equiv_scale_z, &equiv_bias_z, &y, &bitmask); - TensorCopySync(mean_x, platform::CPUPlace(), cpu_mean_x); - TensorCopySync(var_x, platform::CPUPlace(), cpu_var_x); - TensorCopySync(saved_mean_x, platform::CPUPlace(), cpu_saved_mean_x); - TensorCopySync(saved_var_x, platform::CPUPlace(), cpu_saved_var_x); + paddle::framework::TensorCopySync(mean_x, platform::CPUPlace(), cpu_mean_x); + paddle::framework::TensorCopySync(var_x, platform::CPUPlace(), cpu_var_x); + paddle::framework::TensorCopySync(saved_mean_x, platform::CPUPlace(), + cpu_saved_mean_x); + paddle::framework::TensorCopySync(saved_var_x, platform::CPUPlace(), + cpu_saved_var_x); if (has_shortcut_) { - TensorCopySync(mean_z, platform::CPUPlace(), cpu_mean_z); - TensorCopySync(var_z, platform::CPUPlace(), cpu_var_z); - TensorCopySync(saved_mean_z, platform::CPUPlace(), cpu_saved_mean_z); - TensorCopySync(saved_var_z, platform::CPUPlace(), cpu_saved_var_z); + paddle::framework::TensorCopySync(mean_z, platform::CPUPlace(), + cpu_mean_z); + paddle::framework::TensorCopySync(var_z, platform::CPUPlace(), cpu_var_z); + paddle::framework::TensorCopySync(saved_mean_z, platform::CPUPlace(), + cpu_saved_mean_z); + paddle::framework::TensorCopySync(saved_var_z, platform::CPUPlace(), + cpu_saved_var_z); } - TensorCopySync(y, platform::CPUPlace(), cpu_y); - TensorCopySync(bitmask, platform::CPUPlace(), cpu_bitmask); + paddle::framework::TensorCopySync(y, platform::CPUPlace(), cpu_y); + paddle::framework::TensorCopySync(bitmask, platform::CPUPlace(), + cpu_bitmask); } // Get backward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu @@ -664,13 +674,13 @@ class CudnnBNAddReluTester { framework::Tensor dbias; auto place = ctx.GetPlace(); - TensorCopySync(cpu_dy_, place, &dy); - TensorCopySync(cpu_x_, place, &x); - TensorCopySync(cpu_bn_scale_x_, place, &bn_scale); - TensorCopySync(cpu_bn_bias_x_, place, &bn_bias); - TensorCopySync(cpu_saved_mean_x_, place, &saved_mean); - TensorCopySync(cpu_saved_var_x_, place, &saved_var); - TensorCopySync(cpu_bitmask_, place, &bitmask); + paddle::framework::TensorCopySync(cpu_dy_, place, &dy); + paddle::framework::TensorCopySync(cpu_x_, place, &x); + paddle::framework::TensorCopySync(cpu_bn_scale_x_, place, &bn_scale); + paddle::framework::TensorCopySync(cpu_bn_bias_x_, place, &bn_bias); + paddle::framework::TensorCopySync(cpu_saved_mean_x_, place, &saved_mean); + paddle::framework::TensorCopySync(cpu_saved_var_x_, place, &saved_var); + paddle::framework::TensorCopySync(cpu_bitmask_, place, &bitmask); bn_scale.Resize({1, 1, 1, channels_}); bn_bias.Resize({1, 1, 1, channels_}); @@ -692,10 +702,10 @@ class CudnnBNAddReluTester { sbar_op.Backward(ctx, dy, x, bn_scale, bn_bias, saved_mean, saved_var, &bitmask, &dx, &dz, &dscale, &dbias, eps_); - TensorCopySync(dx, platform::CPUPlace(), cpu_dx); - TensorCopySync(dz, platform::CPUPlace(), cpu_dz); - TensorCopySync(dscale, platform::CPUPlace(), cpu_dscale); - TensorCopySync(dbias, platform::CPUPlace(), cpu_dbias); + paddle::framework::TensorCopySync(dx, platform::CPUPlace(), cpu_dx); + paddle::framework::TensorCopySync(dz, platform::CPUPlace(), cpu_dz); + paddle::framework::TensorCopySync(dscale, platform::CPUPlace(), cpu_dscale); + paddle::framework::TensorCopySync(dbias, platform::CPUPlace(), cpu_dbias); } private: diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc 
index 23983d447e4788..425782d7900b48 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -101,8 +101,8 @@ void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, auto *output = scope.Var("Output")->GetMutable(); auto place = ctx.GetPlace(); - TensorCopySync(cpu_input, place, input); - TensorCopySync(cpu_filter, place, filter); + paddle::framework::TensorCopySync(cpu_input, place, input); + paddle::framework::TensorCopySync(cpu_filter, place, filter); framework::AttributeMap attrs; bool use_cudnn = true; @@ -119,7 +119,7 @@ void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, {{"Output", {"Output"}}}, attrs); op->Run(scope, ctx.GetPlace()); - TensorCopySync(*output, platform::CPUPlace(), cpu_output); + paddle::framework::TensorCopySync(*output, platform::CPUPlace(), cpu_output); } // Use Paddle conv2d_grad op results as baseline @@ -140,9 +140,9 @@ void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx, scope.Var("Filter@GRAD")->GetMutable(); auto place = ctx.GetPlace(); - TensorCopySync(cpu_input, place, input); - TensorCopySync(cpu_filter, place, filter); - TensorCopySync(cpu_output_grad, place, output_grad); + paddle::framework::TensorCopySync(cpu_input, place, input); + paddle::framework::TensorCopySync(cpu_filter, place, filter); + paddle::framework::TensorCopySync(cpu_output_grad, place, output_grad); framework::AttributeMap attrs; bool use_cudnn = true; @@ -172,8 +172,10 @@ void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx, attrs); op->Run(scope, ctx.GetPlace()); - TensorCopySync(*input_grad, platform::CPUPlace(), cpu_input_grad); - TensorCopySync(*filter_grad, platform::CPUPlace(), cpu_filter_grad); + paddle::framework::TensorCopySync(*input_grad, platform::CPUPlace(), + cpu_input_grad); + paddle::framework::TensorCopySync(*filter_grad, platform::CPUPlace(), + cpu_filter_grad); } template @@ -313,8 +315,8 @@ class CudnnNormConvolutionTester { framework::Tensor sum_of_square; auto place = ctx.GetPlace(); - TensorCopySync(cpu_input_, place, &input); - TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc); + paddle::framework::TensorCopySync(cpu_input_, place, &input); + paddle::framework::TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc); output.Resize(framework::make_ddim( {batch_size_, out_height_, out_width_, output_channels_})); @@ -329,9 +331,10 @@ class CudnnNormConvolutionTester { dilation_, group_); conv_op.Forward(ctx, input, filter_nhwc, &output, &sum, &sum_of_square); - TensorCopySync(output, platform::CPUPlace(), cpu_output); - TensorCopySync(sum, platform::CPUPlace(), cpu_sum); - TensorCopySync(sum_of_square, platform::CPUPlace(), cpu_sum_of_square); + paddle::framework::TensorCopySync(output, platform::CPUPlace(), cpu_output); + paddle::framework::TensorCopySync(sum, platform::CPUPlace(), cpu_sum); + paddle::framework::TensorCopySync(sum_of_square, platform::CPUPlace(), + cpu_sum_of_square); } void FusedBackward(const platform::CUDADeviceContext &ctx, @@ -344,9 +347,9 @@ class CudnnNormConvolutionTester { framework::Tensor filter_grad; auto place = ctx.GetPlace(); - TensorCopySync(cpu_input_, place, &input); - TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc); - TensorCopySync(cpu_output_grad_, place, &output_grad); + paddle::framework::TensorCopySync(cpu_input_, place, &input); + paddle::framework::TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc); + paddle::framework::TensorCopySync(cpu_output_grad_, place, &output_grad); 
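An alternative to qualifying every call site would be a using-declaration at function scope, which restores the unqualified spelling through ordinary lookup rather than ADL; the diff opts for explicit qualification instead, plausibly because it keeps the change mechanical and avoids injecting names into widely included headers (a guess, not something the diff states). A sketch of that alternative, again with toy stand-ins:

#include <cassert>

namespace pten {
struct DenseTensor { int v = 0; };
}  // namespace pten

namespace paddle {
namespace framework {
using Tensor = pten::DenseTensor;
inline void TensorCopySync(const Tensor& src, Tensor* dst) { dst->v = src.v; }
}  // namespace framework
}  // namespace paddle

void Caller() {
  // Re-enable the unqualified spelling via ordinary lookup, not ADL.
  using paddle::framework::TensorCopySync;
  paddle::framework::Tensor src, dst;
  src.v = 42;
  TensorCopySync(src, &dst);  // found through the using-declaration
  assert(dst.v == 42);
}

int main() { Caller(); }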
input_grad.Resize(input.dims()); filter_grad.Resize(filter_nhwc.dims()); @@ -360,8 +363,10 @@ class CudnnNormConvolutionTester { conv_grad_op.Backward(ctx, input, filter_nhwc, output_grad, &input_grad, &filter_grad); - TensorCopySync(input_grad, platform::CPUPlace(), cpu_input_grad); - TensorCopySync(filter_grad, platform::CPUPlace(), cpu_filter_grad); + paddle::framework::TensorCopySync(input_grad, platform::CPUPlace(), + cpu_input_grad); + paddle::framework::TensorCopySync(filter_grad, platform::CPUPlace(), + cpu_filter_grad); } private: diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 066e7e15e88312..8c080f97cba82c 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -119,7 +119,8 @@ class FMHARef { ins.emplace_back(src_mask_tensor); outs.emplace_back(src_mask_out_tensor); int elewise_add_axis = -1; - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( dev_ctx_, ins, &outs, elewise_add_axis, AddFunctor()); SoftmaxForwardCUDAKernelDriver(dev_ctx_, *src_mask_out_tensor, diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index 173ef48b83dc2d..581fc45e268c2c 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -494,7 +494,8 @@ class FusedAttentionGradKernel : public framework::OpKernel { ins.emplace_back(d_x); outs.emplace_back(d_x); int elewise_add_axis = -1; - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( ctx.cuda_device_context(), ins, &outs, elewise_add_axis, AddFunctor()); } diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index a241e3c3027250..934ce78e715bbe 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -308,7 +308,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { ins[1] = d_x; outs[0] = d_x; int elewise_add_axis = -1; - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( ctx, ins, &outs, elewise_add_axis, AddFunctor()); } diff --git a/paddle/fluid/operators/fused/fusion_group_op_test.cc b/paddle/fluid/operators/fused/fusion_group_op_test.cc index 55b4dce4929b8c..e49a71cf720be5 100644 --- a/paddle/fluid/operators/fused/fusion_group_op_test.cc +++ b/paddle/fluid/operators/fused/fusion_group_op_test.cc @@ -110,7 +110,8 @@ void CheckOutputs(framework::Scope* scope, for (size_t j = 0; j < output_names.size(); ++j) { auto* var = scope->Var(output_names[j]); const auto& dev_tensor = var->Get(); - TensorCopySync(dev_tensor, platform::CPUPlace(), &(cpu_outputs[j])); + paddle::framework::TensorCopySync(dev_tensor, platform::CPUPlace(), + &(cpu_outputs[j])); cpu_tensors->at(num_inputs + j) .mutable_data(dev_tensor.dims(), platform::CPUPlace()); @@ -159,7 +160,7 @@ void TestMain(const std::vector& input_names, SetupRandomCPUTensor(&(cpu_tensors[i]), input_shapes[i]); framework::Tensor* dev_tensor = CreateTensor(&scope, place, input_names[i], input_shapes[i]); - TensorCopySync(cpu_tensors[i], place, dev_tensor); + paddle::framework::TensorCopySync(cpu_tensors[i], place, dev_tensor); } // Create output tensors. 
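Note that `LaunchElementwiseCudaKernel` lives in `paddle::operators` itself, so ordinary lookup still finds it at these call sites and the explicit `paddle::operators::` prefix looks redundant. One plausible reason, though the diff does not say so, is that ADL on `pten::DenseTensor*` arguments can pull a same-named entry point from namespace `pten` into overload resolution and make the unqualified call ambiguous. A toy program showing that failure mode (all names here are hypothetical):

#include <vector>

namespace pten {
struct DenseTensor {};
template <typename T>
void LaunchKernel(const std::vector<DenseTensor*>&) {}  // pten-side entry
}  // namespace pten

namespace paddle {
namespace operators {
using Tensor = pten::DenseTensor;
template <typename T>
void LaunchKernel(const std::vector<Tensor*>&) {}  // fluid-side entry

void Caller() {
  std::vector<Tensor*> ins;
  // LaunchKernel<float>(ins);            // error: ambiguous, because ADL on
  //                                      // vector<pten::DenseTensor*> also
  //                                      // finds pten::LaunchKernel
  paddle::operators::LaunchKernel<float>(ins);  // qualification disambiguates
}
}  // namespace operators
}  // namespace paddle

int main() { paddle::operators::Caller(); }

Either way, qualifying the call pins down exactly one candidate, which is all these hunks need.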
std::vector empty_shape; diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc index 31e19d8f600c39..f50c4f5528e741 100644 --- a/paddle/fluid/operators/gather_op_npu_test.cc +++ b/paddle/fluid/operators/gather_op_npu_test.cc @@ -54,7 +54,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, } // [[1, 2],[3, 4],[5, 6]] - TensorFromVector(init_x, ctx, tensor_x); + paddle::framework::TensorFromVector(init_x, ctx, tensor_x); tensor_x->Resize(paddle::framework::make_ddim({3, 2})); std::vector init_index = {1, 2}; @@ -75,7 +75,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, op->Run(*scope, place); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); @@ -114,11 +114,11 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, tensor_index->Resize(paddle::framework::make_ddim({2})); std::vector init_x = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - TensorFromVector(init_x, ctx, tensor_x); + paddle::framework::TensorFromVector(init_x, ctx, tensor_x); tensor_x->Resize(paddle::framework::make_ddim({3, 2})); std::vector init_dout = {5.0, 10.0, 2.0, 3.0}; - TensorFromVector(init_dout, ctx, tensor_dout); + paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout); tensor_dout->Resize(paddle::framework::make_ddim({2, 2})); ctx.Wait(); @@ -136,7 +136,7 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, op->Run(*scope, place); std::vector dx_vec; - TensorToVector(*tensor_dx, ctx, &dx_vec); + paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec); ctx.Wait(); diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu index 2f72fbff2668b7..da7ed05ddf55c9 100644 --- a/paddle/fluid/operators/gelu_op.cu +++ b/paddle/fluid/operators/gelu_op.cu @@ -219,10 +219,12 @@ class GeluKernel } } #endif - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor()); } else { - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( dev_ctx, ins, &outs, 0, GeluWithoutApproximateFunctor()); } } @@ -291,10 +293,12 @@ class GeluGradKernel } } #endif - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( dev_ctx, ins, &outs, 0, GeluWithApproximateGradFunctor()); } else { - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( dev_ctx, ins, &outs, 0, GeluWithoutApproximateGradFunctor()); } } diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc index 830dcd59839015..f47250c96817a7 100644 --- a/paddle/fluid/operators/gelu_op_npu_test.cc +++ b/paddle/fluid/operators/gelu_op_npu_test.cc @@ -46,7 +46,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { init_x.push_back(static_cast(1.0)); } - TensorFromVector(init_x, ctx, tensor_x); + paddle::framework::TensorFromVector(init_x, ctx, tensor_x); tensor_x->Resize({10, 10}); auto out = scope->Var("Out"); @@ -82,7 +82,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { // eval value std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); float expected = 0.841192; for (uint32_t i = 0; i < out_vec.size(); i++) { @@ -108,9 +108,9 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { init_x.push_back(static_cast(1.0)); } - TensorFromVector(init_dout, ctx, tensor_dout); + 
paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout); tensor_dout->Resize({10, 10}); - TensorFromVector(init_x, ctx, tensor_x); + paddle::framework::TensorFromVector(init_x, ctx, tensor_x); tensor_x->Resize({10, 10}); auto dx = scope->Var("DX"); @@ -147,7 +147,7 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { // eval value std::vector dx_vec; - TensorToVector(*tensor_dx, ctx, &dx_vec); + paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec); float expected = 1.082964; for (uint32_t i = 0; i < dx_vec.size(); i++) { diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc index 080dadeacaae71..73fc79004b97cb 100644 --- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc +++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc @@ -18,11 +18,9 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -} // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/group_norm_op_npu.cc b/paddle/fluid/operators/group_norm_op_npu.cc index 3fd4e27de3251b..08f3169c283097 100644 --- a/paddle/fluid/operators/group_norm_op_npu.cc +++ b/paddle/fluid/operators/group_norm_op_npu.cc @@ -136,7 +136,7 @@ class GroupNormNPUKernel : public framework::OpKernel { xnorm.Resize({x->dims()[0], x->dims()[3], x->dims()[1], x->dims()[2]}); F.Transpose(x, &xnorm, std::vector{0, 3, 1, 2}); } else { - TensorCopy(*x, platform::NPUPlace(), &xnorm); + paddle::framework::TensorCopy(*x, platform::NPUPlace(), &xnorm); } auto N = xnorm.dims()[0]; auto C = xnorm.dims()[1]; diff --git a/paddle/fluid/operators/gumbel_softmax_op.cu b/paddle/fluid/operators/gumbel_softmax_op.cu index 51d912f451b925..4e91e689fa58c6 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.cu +++ b/paddle/fluid/operators/gumbel_softmax_op.cu @@ -98,7 +98,7 @@ struct OneHotGenerator { Tensor input_tensor; input_tensor.mutable_data(Out->dims(), platform::CUDAPlace()); - TensorCopy(*Out, context.GetPlace(), &input_tensor); + paddle::framework::TensorCopy(*Out, context.GetPlace(), &input_tensor); math::set_constant(context, Out, 0.0); OneHotCUDAKernel< T, thread_size><<>>( diff --git a/paddle/fluid/operators/histogram_op.cu b/paddle/fluid/operators/histogram_op.cu index b9419cbcc57b58..2bf259f7d7a7a3 100644 --- a/paddle/fluid/operators/histogram_op.cu +++ b/paddle/fluid/operators/histogram_op.cu @@ -108,8 +108,10 @@ class HistogramCUDAKernel : public framework::OpKernel { input_max_scala.device(*place) = input_x.maximum(); Tensor input_min_cpu, input_max_cpu; - TensorCopySync(input_min_t, platform::CPUPlace(), &input_min_cpu); - TensorCopySync(input_max_t, platform::CPUPlace(), &input_max_cpu); + paddle::framework::TensorCopySync(input_min_t, platform::CPUPlace(), + &input_min_cpu); + paddle::framework::TensorCopySync(input_max_t, platform::CPUPlace(), + &input_max_cpu); output_min = input_min_cpu.data()[0]; output_max = input_max_cpu.data()[0]; diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h index 760d6a63de13ac..39ff7ea40aaa8c 100644 --- a/paddle/fluid/operators/im2sequence_op.h +++ b/paddle/fluid/operators/im2sequence_op.h @@ -53,7 +53,8 @@ class Im2SequenceKernel : public framework::OpKernel { const Tensor* imgrealsize = ctx.Input("Y"); auto out_stride = 
ctx.Attr>("out_stride"); Tensor cpu_shape_tensor; - TensorCopySync(*imgrealsize, platform::CPUPlace(), &cpu_shape_tensor); + paddle::framework::TensorCopySync(*imgrealsize, platform::CPUPlace(), + &cpu_shape_tensor); std::vector imgreal_h; std::vector imgreal_w; std::vector output_height; diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc index bde349b0a33b9d..ca9420c04a2933 100644 --- a/paddle/fluid/operators/increment_op_npu_test.cc +++ b/paddle/fluid/operators/increment_op_npu_test.cc @@ -45,7 +45,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, std::vector init; init.push_back(static_cast(1.0)); - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({1}); ctx.Wait(); @@ -61,7 +61,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, op->Run(*scope, place); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); diff --git a/paddle/fluid/operators/index_sample_op.h b/paddle/fluid/operators/index_sample_op.h index 1e6b4a457ed936..ab03c17fd1a6fc 100644 --- a/paddle/fluid/operators/index_sample_op.h +++ b/paddle/fluid/operators/index_sample_op.h @@ -44,8 +44,10 @@ void IndexSampleInner(const framework::ExecutionContext &context, std::vector input_vec; std::vector index_vec; - TensorToVector(input, context.device_context(), &input_vec); - TensorToVector(index, context.device_context(), &index_vec); + paddle::framework::TensorToVector(input, context.device_context(), + &input_vec); + paddle::framework::TensorToVector(index, context.device_context(), + &index_vec); std::vector res(index_ids_num); for (int i = 0; i < index_ids_num; i++) { @@ -117,8 +119,10 @@ void IndexSampleGradInner(const framework::ExecutionContext &context, LoDTensor *x_grad) { std::vector out_grad_vec; std::vector index_vec; - TensorToVector(out_grad, context.device_context(), &out_grad_vec); - TensorToVector(index, context.device_context(), &index_vec); + paddle::framework::TensorToVector(out_grad, context.device_context(), + &out_grad_vec); + paddle::framework::TensorToVector(index, context.device_context(), + &index_vec); auto index_dims = index.dims(); auto x_grad_dims = x_grad->dims(); diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h index 0c90a3869a2a20..baa292319d36e4 100644 --- a/paddle/fluid/operators/interpolate_op.h +++ b/paddle/fluid/operators/interpolate_op.h @@ -39,7 +39,7 @@ inline std::vector get_new_shape( tensor->dims())); if (platform::is_gpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_shape.push_back(static_cast(*temp.data())); } else { vec_new_shape.push_back(static_cast(*tensor->data())); @@ -55,7 +55,8 @@ inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { auto* new_data = new_data_tensor->data(); framework::Tensor cpu_starts_tensor; if (platform::is_gpu_place(new_data_tensor->place())) { - TensorCopySync(*new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor); + paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), + &cpu_starts_tensor); new_data = cpu_starts_tensor.data(); } vec_new_data = std::vector(new_data, new_data + new_data_tensor->numel()); diff --git a/paddle/fluid/operators/interpolate_op_xpu.cc 
b/paddle/fluid/operators/interpolate_op_xpu.cc index 882edc00f231b6..6d98e0220d3d13 100644 --- a/paddle/fluid/operators/interpolate_op_xpu.cc +++ b/paddle/fluid/operators/interpolate_op_xpu.cc @@ -35,7 +35,7 @@ inline std::vector get_new_shape_xpu( platform::errors::InvalidArgument("shape of dim tensor should be [1]")); if (platform::is_xpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_shape.push_back(static_cast(*temp.data())); } else { vec_new_shape.push_back(static_cast(*tensor->data())); @@ -52,7 +52,8 @@ inline std::vector get_new_data_from_tensor_xpu( auto* new_data = new_data_tensor->data(); framework::Tensor cpu_starts_tensor; if (platform::is_xpu_place(new_data_tensor->place())) { - TensorCopySync(*new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor); + paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), + &cpu_starts_tensor); new_data = cpu_starts_tensor.data(); } vec_new_data = std::vector(new_data, new_data + new_data_tensor->numel()); diff --git a/paddle/fluid/operators/interpolate_v2_op.h b/paddle/fluid/operators/interpolate_v2_op.h index 0af799eca0c55c..a5afb18b3ff6f4 100644 --- a/paddle/fluid/operators/interpolate_v2_op.h +++ b/paddle/fluid/operators/interpolate_v2_op.h @@ -39,7 +39,7 @@ inline std::vector get_new_shape( tensor->dims())); if (platform::is_gpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_shape.push_back(static_cast(*temp.data())); } else { vec_new_shape.push_back(static_cast(*tensor->data())); @@ -55,12 +55,14 @@ inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { auto* new_data = new_data_tensor->data(); framework::Tensor cpu_starts_tensor; if (platform::is_gpu_place(new_data_tensor->place())) { - TensorCopySync(*new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor); + paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), + &cpu_starts_tensor); new_data = cpu_starts_tensor.data(); } #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(new_data_tensor->place())) { - TensorCopySync(*new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor); + paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), + &cpu_starts_tensor); new_data = cpu_starts_tensor.data(); } #endif diff --git a/paddle/fluid/operators/interpolate_v2_op_xpu.cc b/paddle/fluid/operators/interpolate_v2_op_xpu.cc index c960f9a58be07f..33f49297a16893 100644 --- a/paddle/fluid/operators/interpolate_v2_op_xpu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_xpu.cc @@ -34,7 +34,7 @@ inline std::vector get_new_shape_xpu( tensor->dims(), framework::make_ddim({1}), platform::errors::InvalidArgument("shape of dim tensor should be [1]")); framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_shape.push_back(static_cast(*temp.data())); } @@ -46,7 +46,8 @@ inline std::vector get_new_data_from_tensor_xpu( const Tensor* new_data_tensor) { std::vector vec_new_data; framework::Tensor cpu_starts_tensor; - TensorCopySync(*new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor); + paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), + &cpu_starts_tensor); auto* new_data = cpu_starts_tensor.data(); 
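The other recurring change, visible in the isfinite headers just below and in many files above, swaps the old `class Tensor;` forward declaration for `namespace pten { class DenseTensor; }`. An alias cannot be forward-declared, so headers that only pass the type by pointer or reference must now name the underlying class in its real namespace. A compilable sketch of the constraint:

// A class can be forward-declared in its real namespace:
namespace pten {
class DenseTensor;
}  // namespace pten

// The old spelling cannot survive the refactor: an alias is not a class, and
// declaring "class Tensor;" inside paddle::framework would conflict with
// "using Tensor = pten::DenseTensor;" the moment tensor.h is included.
//
//   namespace paddle { namespace framework { class Tensor; } }  // ill-formed now

// Pointer and reference parameters need only the forward declaration:
void Inspect(const pten::DenseTensor* t);

int main() {}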
vec_new_data = std::vector(new_data, new_data + new_data_tensor->numel()); return vec_new_data; diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h index a54134910d0b86..99db1c7e081dad 100644 --- a/paddle/fluid/operators/isfinite_op.h +++ b/paddle/fluid/operators/isfinite_op.h @@ -22,11 +22,9 @@ #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/transform.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -} // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/isfinite_v2_op.h b/paddle/fluid/operators/isfinite_v2_op.h index 332c50d75513f8..9edf3493b678c9 100644 --- a/paddle/fluid/operators/isfinite_v2_op.h +++ b/paddle/fluid/operators/isfinite_v2_op.h @@ -23,11 +23,9 @@ #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/transform.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -} // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/label_smooth_op.cu b/paddle/fluid/operators/label_smooth_op.cu index 2c7a08de0f65b8..7979d3a74bb7d0 100644 --- a/paddle/fluid/operators/label_smooth_op.cu +++ b/paddle/fluid/operators/label_smooth_op.cu @@ -87,8 +87,8 @@ class LabelSmoothGPUKernel : public framework::OpKernel { std::vector ins = {in_t}; std::vector outs = {out_t}; auto functor = LabelSmoothFunctor(epsilon, label_dim); - LaunchSameDimsElementwiseCudaKernel( - dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor); } } }; @@ -107,8 +107,8 @@ class LabelSmoothGradGPUKernel : public framework::OpKernel { std::vector ins = {d_out_t}; std::vector outs = {d_in_t}; auto functor = LabelSmoothGradFunctor(epsilon); - LaunchSameDimsElementwiseCudaKernel( - dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor); } }; } // namespace operators diff --git a/paddle/fluid/operators/lgamma_op.cu b/paddle/fluid/operators/lgamma_op.cu index da40518d9b4b2c..64d1a479627ce1 100644 --- a/paddle/fluid/operators/lgamma_op.cu +++ b/paddle/fluid/operators/lgamma_op.cu @@ -39,8 +39,8 @@ class LgammaKernel std::vector ins = {x}; std::vector outs = {out}; auto functor = CudaLgammaFunctor(); - LaunchSameDimsElementwiseCudaKernel( - dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor); } }; diff --git a/paddle/fluid/operators/lite/ut_helper.h b/paddle/fluid/operators/lite/ut_helper.h index 08dd41e7b341b2..210bb7b3f0bc9a 100644 --- a/paddle/fluid/operators/lite/ut_helper.h +++ b/paddle/fluid/operators/lite/ut_helper.h @@ -95,7 +95,7 @@ void RandomizeTensor(framework::LoDTensor* tensor, for (size_t i = 0; i < num_elements; i++) { *(temp_data + i) = random(0., 1.); } - TensorCopySync(temp_tensor, place, tensor); + paddle::framework::TensorCopySync(temp_tensor, place, tensor); } void CreateTensor(framework::Scope* scope, const std::string& name, diff --git a/paddle/fluid/operators/load_combine_op.h b/paddle/fluid/operators/load_combine_op.h index a02b0e61d9278e..3a97b1cd848192 100644 --- a/paddle/fluid/operators/load_combine_op.h +++ b/paddle/fluid/operators/load_combine_op.h @@ -106,7 
+106,7 @@ class LoadCombineOpKernel : public framework::OpKernel { auto *tensor = out_vars[i]->GetMutable(); // Get data from fin to tensor - DeserializeFromStream(*buffer, tensor, dev_ctx); + paddle::framework::DeserializeFromStream(*buffer, tensor, dev_ctx); auto in_dtype = tensor->type(); auto out_dtype = diff --git a/paddle/fluid/operators/load_op.h b/paddle/fluid/operators/load_op.h index 3e58a6462d860d..66160695c3d5aa 100644 --- a/paddle/fluid/operators/load_op.h +++ b/paddle/fluid/operators/load_op.h @@ -75,9 +75,10 @@ class LoadOpKernel : public framework::OpKernel { platform::errors::InvalidArgument( "seek witn tensor must great than or equal to 0")); auto shape = ctx.Attr>("shape"); - DeserializeFromStream(fin, tensor, dev_ctx, seek, shape); + paddle::framework::DeserializeFromStream(fin, tensor, dev_ctx, seek, + shape); } else { - DeserializeFromStream(fin, tensor, dev_ctx); + paddle::framework::DeserializeFromStream(fin, tensor, dev_ctx); } auto load_as_fp16 = ctx.Attr("load_as_fp16"); diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index f241caa857a07a..f78c5b9d36187e 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -86,7 +86,7 @@ void SetValueCompute(const framework::ExecutionContext& ctx, // be two ops points to the output in graph: op1 -> output <- set_value. // In this case, we have to find a way to handle the running order of // set_value is what we want. - TensorCopy(*in, place, out); + paddle::framework::TensorCopy(*in, place, out); Tensor slice_tensor(dtype), pad_tensor(dtype); slice_tensor.mutable_data(slice_dims, place); diff --git a/paddle/fluid/operators/masked_select_op_npu.cc b/paddle/fluid/operators/masked_select_op_npu.cc index cb21c687e9982f..828a3b002c20d1 100644 --- a/paddle/fluid/operators/masked_select_op_npu.cc +++ b/paddle/fluid/operators/masked_select_op_npu.cc @@ -60,7 +60,7 @@ class MaskedSelectedNPUKernel : public framework::OpKernel { sum_runner.AddOutput(out_size); sum_runner.AddAttr("keep_dims", false); sum_runner.Run(stream); - TensorToVector(out_size, dev_ctx, &out_size_vec); + paddle::framework::TensorToVector(out_size, dev_ctx, &out_size_vec); } out->Resize({out_size_vec[0]}); @@ -135,7 +135,7 @@ class MaskedSelectedGradNPUKernel : public framework::OpKernel { sum_runner.AddOutput(out_size); sum_runner.AddAttr("keep_dims", false); sum_runner.Run(stream); - TensorToVector(out_size, dev_ctx, &out_size_vec); + paddle::framework::TensorToVector(out_size, dev_ctx, &out_size_vec); } Tensor topkv2_out, indices; diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc index c52ba68331580c..410abd265430c9 100644 --- a/paddle/fluid/operators/math/beam_search.cc +++ b/paddle/fluid/operators/math/beam_search.cc @@ -14,10 +14,12 @@ limitations under the License. */ #include "paddle/fluid/operators/math/beam_search.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { -namespace framework { -class Tensor; -} // namespace framework +namespace framework {} // namespace framework namespace platform { class CPUDeviceContext; } // namespace platform diff --git a/paddle/fluid/operators/math/beam_search_npu.cc b/paddle/fluid/operators/math/beam_search_npu.cc index 5aede02263dd53..2d5a3dae33b32f 100644 --- a/paddle/fluid/operators/math/beam_search_npu.cc +++ b/paddle/fluid/operators/math/beam_search_npu.cc @@ -15,10 +15,12 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/beam_search.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { -namespace framework { -class Tensor; -} // namespace framework +namespace framework {} // namespace framework namespace platform { class NPUDeviceContext; } // namespace platform diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc index 0df06621d9bab7..ec2e9516fcd4b9 100644 --- a/paddle/fluid/operators/math/beam_search_test.cc +++ b/paddle/fluid/operators/math/beam_search_test.cc @@ -80,10 +80,10 @@ void TestBeamSearch() { PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores); - TensorCopySync(cpu_ids, *place, &ids); - TensorCopySync(cpu_scores, *place, &scores); - TensorCopySync(cpu_pre_ids, *place, &pre_ids); - TensorCopySync(cpu_pre_scores, *place, &pre_scores); + paddle::framework::TensorCopySync(cpu_ids, *place, &ids); + paddle::framework::TensorCopySync(cpu_scores, *place, &scores); + paddle::framework::TensorCopySync(cpu_pre_ids, *place, &pre_ids); + paddle::framework::TensorCopySync(cpu_pre_scores, *place, &pre_scores); ids.set_lod(cpu_ids.lod()); scores.set_lod(cpu_scores.lod()); @@ -110,10 +110,10 @@ void TestBeamSearch() { cpu_selected_ids = selected_ids; cpu_selected_scores = selected_scores; } else { - TensorCopySync(selected_ids, paddle::platform::CPUPlace(), - &cpu_selected_ids); - TensorCopySync(selected_scores, paddle::platform::CPUPlace(), - &cpu_selected_scores); + paddle::framework::TensorCopySync( + selected_ids, paddle::platform::CPUPlace(), &cpu_selected_ids); + paddle::framework::TensorCopySync( + selected_scores, paddle::platform::CPUPlace(), &cpu_selected_scores); cpu_selected_ids.set_lod(selected_ids.lod()); cpu_selected_scores.set_lod(selected_scores.lod()); } diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 2be7695e6a8c47..f17cc3094f7fc0 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -20,7 +20,6 @@ namespace paddle { namespace framework { class ExecutionContext; -class Tensor; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 45effd404cfb32..2d23f52c0b27b0 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -17,10 +17,12 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #endif +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { -namespace framework { -class Tensor; -} // namespace framework +namespace framework {} // namespace framework namespace platform { class CPUDeviceContext; struct bfloat16; diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index 0122e6cdeb4744..1400b9d105ce10 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -63,7 +63,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - TensorCopySync(input_tmp, *place, &input); + paddle::framework::TensorCopySync(input_tmp, *place, &input); } output_cfo.mutable_data( {1, filter_size, filter_size, output_height, output_width}, *place); @@ -88,7 +88,8 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output_cfo.data(); } else { - TensorCopySync(output_cfo, paddle::platform::CPUPlace(), &output_tmp); + paddle::framework::TensorCopySync(output_cfo, paddle::platform::CPUPlace(), + &output_tmp); out_cfo_ptr = output_tmp.data(); } for (int i = 0; i < 6; ++i) { @@ -99,7 +100,8 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_ocf_ptr = output_ocf.data(); } else { - TensorCopySync(output_ocf, paddle::platform::CPUPlace(), &output_tmp); + paddle::framework::TensorCopySync(output_ocf, paddle::platform::CPUPlace(), + &output_tmp); out_ocf_ptr = output_tmp.data(); } @@ -120,7 +122,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - TensorCopySync(input_tmp, *place, &input); + paddle::framework::TensorCopySync(input_tmp, *place, &input); } col2im(*context, output_cfo, dilation, stride, padding, &input); @@ -129,7 +131,8 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp); + paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(), + &input_tmp); in_ptr = input_tmp.data(); } for (int i = 0; i < 6; ++i) { @@ -141,7 +144,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - TensorCopySync(input_tmp, *place, &input); + paddle::framework::TensorCopySync(input_tmp, *place, &input); } col2im_ocf(*context, output_ocf, dilation, stride, padding, &input); @@ -149,7 +152,8 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp); + paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(), + &input_tmp); in_ptr = input_tmp.data(); } for (int i = 0; i < 6; ++i) { diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 6ca3abe0f05a57..f2d1e79f03524a 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -284,13 +284,6 @@ struct ElementwiseAddTo { auto& place = *(ctx->eigen_device()); out.device(place) = out + in; } - void operator()(platform::CPUDeviceContext* ctx, const pten::DenseTensor& src, - pten::DenseTensor* dst) { - auto in = pten::EigenVector::Flatten(src); - auto out = pten::EigenVector::Flatten(*dst); - auto& place = *(ctx->eigen_device()); - out.device(place) = out + in; - } }; template struct ElementwiseAddTo; diff --git 
a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 6e2547145cfed2..960453dbe65ddf 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -283,13 +283,6 @@ struct ElementwiseAddTo { auto& place = *(ctx->eigen_device()); out.device(place) = out + in; } - void operator()(platform::CUDADeviceContext* ctx, - const pten::DenseTensor& src, pten::DenseTensor* dst) { - auto in = pten::EigenVector::Flatten(src); - auto out = pten::EigenVector::Flatten(*dst); - auto& place = *(ctx->eigen_device()); - out.device(place) = out + in; - } }; template struct ElementwiseAddTo diff --git a/paddle/fluid/operators/math/matrix_solve.cu.cc b/paddle/fluid/operators/math/matrix_solve.cu.cc index f0b41f98dc0cd7..8aaac0295c818d 100644 --- a/paddle/fluid/operators/math/matrix_solve.cu.cc +++ b/paddle/fluid/operators/math/matrix_solve.cu.cc @@ -68,7 +68,7 @@ class MatrixSolveFunctor { Tensor tmp_a(a.type()); tmp_a.Resize(a.dims()); tmp_a.mutable_data(context.GetPlace()); - TensorCopy(a, context.GetPlace(), &tmp_a); + framework::TensorCopy(a, context.GetPlace(), &tmp_a); // copy input B to a temporary tensor tmp_b, and transpose tmp_b, // because cuBlas assumes column-major while Paddle uses row-majar. diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc index 491d40d3ae5676..8f533c446026b1 100644 --- a/paddle/fluid/operators/math/sequence_padding.cc +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -14,10 +14,12 @@ limitations under the License. */ #include "paddle/fluid/operators/math/sequence_padding.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { -namespace framework { -class Tensor; -} // namespace framework +namespace framework {} // namespace framework namespace platform { class CPUDeviceContext; } // namespace platform diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu index 19c3af03411b8c..65bf77f0d152b9 100644 --- a/paddle/fluid/operators/math/sequence_padding.cu +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -133,7 +133,8 @@ class UnpaddingLoDTensorFunctor { step_width, layout); /* if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) { - TensorCopy(pad_tensor, context.GetPlace(), context, seq_tensor); + paddle::framework::TensorCopy(pad_tensor, context.GetPlace(), context, + seq_tensor); seq_tensor->Resize(seq_tensor_dims); return; } diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc index ea31b10c5558f6..6e7aae2ec7d9f6 100644 --- a/paddle/fluid/operators/math/sequence_padding_test.cc +++ b/paddle/fluid/operators/math/sequence_padding_test.cc @@ -42,7 +42,7 @@ void TestSequencePadding(const DeviceContext &context, if (paddle::platform::is_cpu_place(place)) { seq = cpu_seq; } else { - TensorCopySync(cpu_seq, place, &seq); + paddle::framework::TensorCopySync(cpu_seq, place, &seq); seq.set_lod(lod); } @@ -62,7 +62,7 @@ void TestSequencePadding(const DeviceContext &context, if (paddle::platform::is_cpu_place(place)) { pad_value = cpu_pad_value; } else { - TensorCopySync(cpu_pad_value, place, &pad_value); + paddle::framework::TensorCopySync(cpu_pad_value, place, &pad_value); } paddle::operators::math::PaddingLoDTensorFunctor()( @@ -78,7 +78,8 @@ void TestSequencePadding(const DeviceContext &context, if (paddle::platform::is_cpu_place(place)) { cpu_seq_back = 
seq_back; } else { - TensorCopySync(seq_back, paddle::platform::CPUPlace(), &cpu_seq_back); + paddle::framework::TensorCopySync(seq_back, paddle::platform::CPUPlace(), + &cpu_seq_back); cpu_seq_back.set_lod(lod); } diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc index 775d8029bfd3ac..38db6b7b7e527b 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -39,7 +39,7 @@ void TestSequencePoolingSum(const DeviceContext &context, if (paddle::platform::is_cpu_place(place)) { out_grad = cpu_out_grad; } else { - TensorCopySync(cpu_out_grad, place, &out_grad); + paddle::framework::TensorCopySync(cpu_out_grad, place, &out_grad); } // construct in_grad @@ -73,7 +73,8 @@ void TestSequencePoolingSum(const DeviceContext &context, if (paddle::platform::is_cpu_place(place)) { cpu_in_grad = in_grad; } else { - TensorCopySync(in_grad, paddle::platform::CPUPlace(), &cpu_in_grad); + paddle::framework::TensorCopySync(in_grad, paddle::platform::CPUPlace(), + &cpu_in_grad); cpu_in_grad.set_lod(in_grad.lod()); } diff --git a/paddle/fluid/operators/math/sequence_scale.cc b/paddle/fluid/operators/math/sequence_scale.cc index f4193bb71fabb6..3c2956f20889f5 100644 --- a/paddle/fluid/operators/math/sequence_scale.cc +++ b/paddle/fluid/operators/math/sequence_scale.cc @@ -14,11 +14,9 @@ limitations under the License. */ #include "paddle/fluid/operators/math/sequence_scale.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -} // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/sequence_scale.h b/paddle/fluid/operators/math/sequence_scale.h index c6c84bb55dfa7a..574c2945dc2050 100644 --- a/paddle/fluid/operators/math/sequence_scale.h +++ b/paddle/fluid/operators/math/sequence_scale.h @@ -17,12 +17,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device_context.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -} // namespace paddle - namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/sparse.h b/paddle/fluid/operators/math/sparse.h index 4ac68a3bdc4c66..7a5880bbfe7da1 100644 --- a/paddle/fluid/operators/math/sparse.h +++ b/paddle/fluid/operators/math/sparse.h @@ -20,7 +20,6 @@ namespace paddle { namespace framework { class ExecutionContext; -class Tensor; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc index 5a8e7fcc2a76c2..8cd2824465879d 100644 --- a/paddle/fluid/operators/math/vol2col_test.cc +++ b/paddle/fluid/operators/math/vol2col_test.cc @@ -85,7 +85,8 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output.data(); } else { - TensorCopySync(output, paddle::platform::CPUPlace(), &output_tmp); + paddle::framework::TensorCopySync(output, paddle::platform::CPUPlace(), + &output_tmp); out_cfo_ptr = output_tmp.data(); } @@ -99,7 +100,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - TensorCopySync(input_tmp, *place, &input); + paddle::framework::TensorCopySync(input_tmp, *place, &input); } paddle::operators::math::Col2VolFunctor col2vol; @@ -109,7 +110,8 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp); + paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(), + &input_tmp); in_ptr = input_tmp.data(); } diff --git a/paddle/fluid/operators/matrix_rank_op.cu b/paddle/fluid/operators/matrix_rank_op.cu index 87c8abc1c432ee..1891a7be24e456 100644 --- a/paddle/fluid/operators/matrix_rank_op.cu +++ b/paddle/fluid/operators/matrix_rank_op.cu @@ -81,7 +81,7 @@ class MatrixRankGPUKernel : public framework::OpKernel { // Must Copy X once, because the gesvdj will destory the content when exit. 
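A few hunks, such as `matrix_solve.cu.cc` above and `matrix_rank_op.cu` here, use the shorter `framework::TensorCopy` spelling rather than the fully qualified name. Both work for the same reason: a qualified name is looked up directly and ADL never participates, so qualifying relative to the enclosing `paddle` namespace is already enough. A toy sketch:

namespace pten {
struct DenseTensor {};
}  // namespace pten

namespace paddle {
namespace framework {
using Tensor = pten::DenseTensor;
inline void TensorCopy(const Tensor& src, int /*place*/, Tensor* dst) {
  *dst = src;
}
}  // namespace framework
}  // namespace paddle

namespace paddle {
namespace operators {
void Caller() {
  framework::Tensor a, b;
  // Qualified relative to the enclosing ::paddle namespace: direct lookup,
  // no ADL involved, so the alias change is harmless here.
  framework::TensorCopy(a, 0, &b);
}
}  // namespace operators
}  // namespace paddle

int main() { paddle::operators::Caller(); }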
Tensor x_tmp; - TensorCopy(*x, context.GetPlace(), &x_tmp); + paddle::framework::TensorCopy(*x, context.GetPlace(), &x_tmp); auto info = memory::Alloc(dev_ctx, sizeof(int) * batches); int* info_ptr = reinterpret_cast(info->ptr()); diff --git a/paddle/fluid/operators/memcpy_d2h_op.h b/paddle/fluid/operators/memcpy_d2h_op.h index 94eed5cf83fee5..fb5610dda70d9b 100644 --- a/paddle/fluid/operators/memcpy_d2h_op.h +++ b/paddle/fluid/operators/memcpy_d2h_op.h @@ -22,9 +22,12 @@ class DeviceContext; } // namespace platform } // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { -class Tensor; class Variable; class SelectedRows; } // namespace framework diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index cc6e771d105ae0..e84dedd9112b74 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -23,9 +23,12 @@ class DeviceContext; } // namespace platform } // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { -class Tensor; class Variable; class SelectedRows; } // namespace framework diff --git a/paddle/fluid/operators/memcpy_op.h b/paddle/fluid/operators/memcpy_op.h index b270d87ad00ea2..d2a081ac3c2ade 100644 --- a/paddle/fluid/operators/memcpy_op.h +++ b/paddle/fluid/operators/memcpy_op.h @@ -25,9 +25,12 @@ class DeviceContext; } // namespace platform } // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { -class Tensor; class Variable; class SelectedRows; } // namespace framework diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index dae598ef64220b..653283b604f072 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -14,10 +14,13 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { class InferShapeContext; -class Tensor; class OpDesc; class Scope; } // namespace framework diff --git a/paddle/fluid/operators/meshgrid_op.h b/paddle/fluid/operators/meshgrid_op.h index e01469f26d74fa..39a62384a2740a 100644 --- a/paddle/fluid/operators/meshgrid_op.h +++ b/paddle/fluid/operators/meshgrid_op.h @@ -94,8 +94,9 @@ class MeshgridKernel : public framework::OpKernel { view_shape[i] = shape[i]; framework::Tensor reshape_ins_tensor; - TensorCopy(*ins[i], context.GetPlace(), context.device_context(), - &reshape_ins_tensor); + paddle::framework::TensorCopy(*ins[i], context.GetPlace(), + context.device_context(), + &reshape_ins_tensor); framework::DDim out_dims_reshape = framework::make_ddim(view_shape); reshape_ins_tensor.Resize(out_dims_reshape); framework::DDim out_dims = framework::make_ddim(shape); diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 8630515a9fdafb..0cb074beb60d79 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -16,10 +16,12 @@ #include "paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { -namespace framework { -class Tensor; -} // namespace framework +namespace framework {} // namespace framework namespace platform { class MKLDNNDeviceContext; } // namespace platform diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index bf95ffdc11eccc..07f9183d9f8001 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -15,10 +15,12 @@ limitations under the License. */ #include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { -namespace framework { -class Tensor; -} // namespace framework +namespace framework {} // namespace framework namespace platform { class MKLDNNDeviceContext; } // namespace platform diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 754b46c823b28f..f6a6c6940a79d3 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -17,10 +17,12 @@ limitations under the License. */ #include "paddle/fluid/operators/fc_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { -namespace framework { -class Tensor; -} // namespace framework +namespace framework {} // namespace framework namespace platform { class MKLDNNDeviceContext; } // namespace platform diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index 49c896ef80fcc2..1b9d9b8f31d357 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -17,10 +17,12 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/mul_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { -namespace framework { -class Tensor; -} // namespace framework +namespace framework {} // namespace framework namespace platform { class MKLDNNDeviceContext; } // namespace platform diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 5a19584ae380b9..0c442f2fe4d596 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -27,10 +27,12 @@ #include "paddle/fluid/operators/sum_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { -namespace framework { -class Tensor; -} // namespace framework +namespace framework {} // namespace framework namespace platform { class CPUDeviceContext; class MKLDNNDeviceContext; diff --git a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc index 6e392bcc75e824..9da3a4c48728e7 100644 --- a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc +++ b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc @@ -49,7 +49,7 @@ void Compare(fw::Scope* scope, const plat::DeviceContext& ctx, for (int64_t i = 0; i < num * num; ++i) { init_x.push_back(static_cast(i - 50)); } - TensorFromVector(init_x, ctx, tensor_x); + paddle::framework::TensorFromVector(init_x, ctx, tensor_x); tensor_x->Resize({num, num}); auto place = ctx.GetPlace(); @@ -80,7 +80,7 @@ void Compare(fw::Scope* scope, const plat::DeviceContext& ctx, // eval value std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); @@ -108,9 +108,9 @@ void CompareGrad(fw::Scope* scope, const plat::DeviceContext& ctx, init_out.push_back(static_cast(i - 50)); } - TensorFromVector(init_dout, ctx, tensor_dout); + paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout); tensor_dout->Resize({num, num}); - TensorFromVector(init_out, ctx, tensor_out); + paddle::framework::TensorFromVector(init_out, ctx, tensor_out); tensor_out->Resize({num, num}); auto dx = scope->Var("DX"); @@ -143,7 +143,7 @@ void CompareGrad(fw::Scope* scope, const plat::DeviceContext& ctx, // eval value std::vector dx_vec; - TensorToVector(*tensor_dx, ctx, &dx_vec); + paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec); ctx.Wait(); diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu index 5a212bcacae50d..0a32ee96fb6938 100644 --- a/paddle/fluid/operators/multiplex_op.cu +++ b/paddle/fluid/operators/multiplex_op.cu @@ -41,7 +41,7 @@ class MultiplexGPUKernel : public framework::OpKernel { auto cols = ins[0]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); + paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); platform::CUDAPlace place = ctx.GetPlace(); @@ -84,7 +84,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel { auto cols = d_ins[idx]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); + paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = 
ctx.cuda_device_context().stream(); diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu index 3da7a3afcc93dc..2b021748048c76 100644 --- a/paddle/fluid/operators/one_hot_op.cu +++ b/paddle/fluid/operators/one_hot_op.cu @@ -68,7 +68,8 @@ class OneHotCUDAKernel : public framework::OpKernel { auto* depth_tensor = context.Input("depth_tensor"); if (platform::is_gpu_place(depth_tensor->place())) { framework::Tensor temp; - TensorCopySync(*depth_tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*depth_tensor, platform::CPUPlace(), + &temp); depth = *temp.data(); } else { depth = *depth_tensor->data(); diff --git a/paddle/fluid/operators/one_hot_v2_op.cu b/paddle/fluid/operators/one_hot_v2_op.cu index 22eb6c81845d15..115c9460846838 100644 --- a/paddle/fluid/operators/one_hot_v2_op.cu +++ b/paddle/fluid/operators/one_hot_v2_op.cu @@ -69,7 +69,8 @@ class OneHotV2CUDAKernel : public framework::OpKernel { auto* depth_tensor = context.Input("depth_tensor"); if (platform::is_gpu_place(depth_tensor->place())) { framework::Tensor temp; - TensorCopySync(*depth_tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*depth_tensor, platform::CPUPlace(), + &temp); depth = *temp.data(); } else { depth = *depth_tensor->data(); diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu index 3b9cf159f1b6b1..1ef46ef085c5d7 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cu +++ b/paddle/fluid/operators/optimizers/adam_op.cu @@ -176,8 +176,8 @@ class AdamOpCUDAKernel : public framework::OpKernel { "Input(SkipUpdate) size must be 1, but get %d", skip_update_tensor->numel())); std::vector skip_update_vec; - TensorToVector(*skip_update_tensor, ctx.device_context(), - &skip_update_vec); + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); skip_update = skip_update_vec[0]; } // skip_update=true, just copy input to output, and TensorCopy will call diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 233bef22d83af0..bb044b4b4986e3 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -33,11 +33,13 @@ static inline float GetAttrFromTensor(const framework::Tensor* tensor) { const float* tensor_data = tensor->data(); framework::Tensor cpu_tensor; if (platform::is_gpu_place(tensor->place())) { - TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), + &cpu_tensor); tensor_data = cpu_tensor.data(); } if (platform::is_xpu_place(tensor->place())) { - TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), + &cpu_tensor); tensor_data = cpu_tensor.data(); } return tensor_data[0]; @@ -431,8 +433,8 @@ class AdamOpKernel : public framework::OpKernel { "Input(SkipUpdate) size must be 1, but get %d", skip_update_tensor->numel())); std::vector skip_update_vec; - TensorToVector(*skip_update_tensor, ctx.device_context(), - &skip_update_vec); + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); skip_update = skip_update_vec[0]; } // skip_update=true, just copy input to output, and TensorCopy will call diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc index 744fcd3b412c45..c1846f148fd920 100644 --- 
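`GetAttrFromTensor` in the Adam hunks illustrates the pattern behind many of these call sites: an attribute scalar may live on GPU or XPU, so the kernel synchronously copies the one-element tensor to the CPU before dereferencing it. A toy stand-in, with plain vectors playing the role of tensors and the copy standing in for `paddle::framework::TensorCopySync`:

#include <vector>

// illustrative only, not Paddle's signature
float GetAttrFromTensorSketch(const std::vector<float>& tensor,
                              bool on_device) {
  const float* data = tensor.data();
  std::vector<float> cpu_tensor;
  if (on_device) {
    cpu_tensor = tensor;       // "TensorCopySync(*tensor, CPUPlace(), &cpu_tensor)"
    data = cpu_tensor.data();  // only the host copy is safe to dereference
  }
  return data[0];
}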
a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc @@ -66,8 +66,8 @@ class AdamNPUKernel : public framework::OpKernel { "Input(SkipUpdate) size must be 1, but get %d", skip_update_tensor->numel())); std::vector skip_update_vec; - TensorToVector(*skip_update_tensor, ctx.device_context(), - &skip_update_vec); + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); skip_update = skip_update_vec[0]; } // skip_update=true, just copy input to output, and TensorCopy will call @@ -239,8 +239,8 @@ class AdamWNPUKernel : public AdamNPUKernel { "Input(SkipUpdate) size must be 1, but get %d", skip_update_tensor->numel())); std::vector skip_update_vec; - TensorToVector(*skip_update_tensor, ctx.device_context(), - &skip_update_vec); + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); skip_update = skip_update_vec[0]; } VLOG(3) << "Skip update" << skip_update; diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc index cb06b06824be15..0a653c40117194 100644 --- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc @@ -68,8 +68,8 @@ class AdamOpXPUKernel : public framework::OpKernel { "Input(SkipUpdate) size must be 1, but get %d", skip_update_tensor->numel())); std::vector skip_update_vec; - TensorToVector(*skip_update_tensor, ctx.device_context(), - &skip_update_vec); + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); skip_update = skip_update_vec[0]; } // skip_update=true, just copy input to output, and TensorCopy will call @@ -138,8 +138,10 @@ class AdamOpXPUKernel : public framework::OpKernel { Tensor xpu_beta2_pow; if (beta1_pow.place() == platform::CPUPlace() && beta2_pow.place() == platform::CPUPlace()) { - TensorCopy(beta1_pow, ctx.GetPlace(), dev_ctx, &xpu_beta1_pow); - TensorCopy(beta2_pow, ctx.GetPlace(), dev_ctx, &xpu_beta2_pow); + paddle::framework::TensorCopy(beta1_pow, ctx.GetPlace(), dev_ctx, + &xpu_beta1_pow); + paddle::framework::TensorCopy(beta2_pow, ctx.GetPlace(), dev_ctx, + &xpu_beta2_pow); dev_ctx.Wait(); beta1_pow_ptr = xpu_beta1_pow.template data(); beta2_pow_ptr = xpu_beta2_pow.template data(); diff --git a/paddle/fluid/operators/optimizers/adamw_op.cu b/paddle/fluid/operators/optimizers/adamw_op.cu index 8b152bc67a30bd..a8b16e73dbfffe 100644 --- a/paddle/fluid/operators/optimizers/adamw_op.cu +++ b/paddle/fluid/operators/optimizers/adamw_op.cu @@ -192,8 +192,8 @@ class AdamWOpCUDAKernel : public framework::OpKernel { "Input(SkipUpdate) size must be 1, but get %d", skip_update_tensor->numel())); std::vector skip_update_vec; - TensorToVector(*skip_update_tensor, ctx.device_context(), - &skip_update_vec); + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); skip_update = skip_update_vec[0]; } diff --git a/paddle/fluid/operators/optimizers/adamw_op.h b/paddle/fluid/operators/optimizers/adamw_op.h index 1904db4f7d6116..efd3a2b691f72d 100644 --- a/paddle/fluid/operators/optimizers/adamw_op.h +++ b/paddle/fluid/operators/optimizers/adamw_op.h @@ -177,8 +177,8 @@ class AdamWOpKernel : public AdamOpKernel { "Input(SkipUpdate) size must be 1, but get %d", skip_update_tensor->numel())); std::vector skip_update_vec; - TensorToVector(*skip_update_tensor, ctx.device_context(), - &skip_update_vec); + 
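The XPU Adam hunks pair `TensorCopy` with an immediate `dev_ctx.Wait()` because, as the explicit wait suggests, the copy is queued asynchronously on the device stream and the `beta*_pow` pointers may not hold valid data until the wait returns. A rough analogue with a background task standing in for the stream (the asynchrony here is an assumption drawn from that call pattern):

#include <future>
#include <vector>

int main() {
  std::vector<float> beta1_pow{0.9f}, xpu_beta1_pow(1);
  // plays the role of TensorCopy(beta1_pow, place, dev_ctx, &xpu_beta1_pow)
  auto copy = std::async(std::launch::async,
                         [&] { xpu_beta1_pow[0] = beta1_pow[0]; });
  copy.wait();  // plays the role of dev_ctx.Wait()
  return xpu_beta1_pow[0] == 0.9f ? 0 : 1;  // data is only now safe to read
}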
paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); skip_update = skip_update_vec[0]; } VLOG(3) << "Skip update" << skip_update; diff --git a/paddle/fluid/operators/optimizers/adamw_op_xpu.cc b/paddle/fluid/operators/optimizers/adamw_op_xpu.cc index c20bd6a9fadc0e..56fa11d2b08576 100644 --- a/paddle/fluid/operators/optimizers/adamw_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adamw_op_xpu.cc @@ -68,8 +68,8 @@ class AdamwOpXPUKernel : public framework::OpKernel { "Input(SkipUpdate) size must be 1, but get %d", skip_update_tensor->numel())); std::vector skip_update_vec; - TensorToVector(*skip_update_tensor, ctx.device_context(), - &skip_update_vec); + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); skip_update = skip_update_vec[0]; } auto& dev_ctx = ctx.template device_context(); @@ -129,8 +129,10 @@ class AdamwOpXPUKernel : public framework::OpKernel { Tensor xpu_beta2_pow; if (beta1_pow.place() == platform::CPUPlace() && beta2_pow.place() == platform::CPUPlace()) { - TensorCopy(beta1_pow, ctx.GetPlace(), dev_ctx, &xpu_beta1_pow); - TensorCopy(beta2_pow, ctx.GetPlace(), dev_ctx, &xpu_beta2_pow); + paddle::framework::TensorCopy(beta1_pow, ctx.GetPlace(), dev_ctx, + &xpu_beta1_pow); + paddle::framework::TensorCopy(beta2_pow, ctx.GetPlace(), dev_ctx, + &xpu_beta2_pow); dev_ctx.Wait(); beta1_pow_ptr = xpu_beta1_pow.template data(); beta2_pow_ptr = xpu_beta2_pow.template data(); diff --git a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc index a3a39e36e8244c..6a962b241fafb5 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc @@ -26,7 +26,8 @@ static inline float GetAttrFromTensor(const framework::Tensor* tensor) { framework::Tensor cpu_tensor; if (platform::is_gpu_place(tensor->place()) || platform::is_xpu_place(tensor->place())) { - TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), + &cpu_tensor); tensor_data = cpu_tensor.data(); } return tensor_data[0]; diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index b2a9ca6f937427..88e94ba039ac27 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -133,8 +133,8 @@ class PnormCUDAKernel : public framework::OpKernel { const auto& cuda_ctx = ctx.template device_context(); - LaunchSameDimsElementwiseCudaKernel>( + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kUnary, MT, T, UnsignedPowFunctor>( cuda_ctx, ins, &outs, func); framework::Tensor tmp_y; tmp_y.mutable_data(ndim, ctx.GetPlace()); @@ -145,8 +145,8 @@ class PnormCUDAKernel : public framework::OpKernel { outs = {out_norm}; auto func_inverse = UnsignedPowFunctor(1. / porder); - LaunchSameDimsElementwiseCudaKernel>( + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kUnary, MT, T, UnsignedPowFunctor>( cuda_ctx, ins, &outs, func_inverse); } } @@ -215,14 +215,14 @@ class PnormGradCUDAKernel : public framework::OpKernel { std::vector ins = {in_norm}; std::vector outs = {&tmp_norm}; auto pow_functor = PowFunctor(1. 
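The p_norm changes qualify a function template that is invoked with explicit template arguments; writing `paddle::operators::LaunchSameDimsElementwiseCudaKernel<...>` makes the lookup independent of which namespaces the argument types live in. A compilable toy with the same call shape (names are illustrative, not Paddle's):

#include <cstdio>
#include <vector>

namespace ops {
// stand-in for LaunchSameDimsElementwiseCudaKernel
template <int Arity, typename InT, typename OutT>
void LaunchKernel(const std::vector<InT>& in, std::vector<OutT>* out) {
  out->assign(in.begin(), in.end());
  std::printf("launched, arity=%d, n=%zu\n", Arity, out->size());
}
}  // namespace ops

int main() {
  std::vector<float> ins{1.f, 2.f}, outs;
  ops::LaunchKernel<1, float, float>(ins, &outs);  // fully qualified call
}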
- porder); - LaunchSameDimsElementwiseCudaKernel>(cuda_ctx, ins, &outs, - pow_functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kUnary, T, T, PowFunctor>(cuda_ctx, ins, &outs, + pow_functor); ins = {in_x}; outs = {out_dx}; auto unsigned_pow = UnsignedPowFunctor(porder - 1.); - LaunchSameDimsElementwiseCudaKernel>( + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kUnary, T, T, UnsignedPowFunctor>( cuda_ctx, ins, &outs, unsigned_pow); const framework::Tensor* tmp_norm_const = &tmp_norm; LaunchReduceGradKernel>( diff --git a/paddle/fluid/operators/pad3d_op_npu.cc b/paddle/fluid/operators/pad3d_op_npu.cc index 37dbebd0762583..12501c9bebfdf6 100644 --- a/paddle/fluid/operators/pad3d_op_npu.cc +++ b/paddle/fluid/operators/pad3d_op_npu.cc @@ -26,7 +26,8 @@ static inline std::vector GetPaddings( std::vector paddings(6); auto* paddings_t = context.Input("Paddings"); if (paddings_t) { - TensorToVector(*paddings_t, context.device_context(), &paddings); + paddle::framework::TensorToVector(*paddings_t, context.device_context(), + &paddings); } else { auto pads = context.Attr>("paddings"); std::copy(pads.begin(), pads.end(), paddings.data()); diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc index cef2993fc30d5f..11ec3c2842b791 100644 --- a/paddle/fluid/operators/print_op.cc +++ b/paddle/fluid/operators/print_op.cc @@ -16,10 +16,13 @@ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/tensor_formatter.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { class InferShapeContext; -class Tensor; class OpDesc; class Scope; } // namespace framework diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu index 9eddb03828b5d4..c8b6404830cdac 100644 --- a/paddle/fluid/operators/qr_op.cu +++ b/paddle/fluid/operators/qr_op.cu @@ -74,7 +74,7 @@ class QrGPUKernel : public framework::OpKernel { context.GetPlace(), size_t(batch_size * m * n * sizeof(math::Real))); // BatchedGeqrf performs computation in-place and 'qr' must be a copy of // input - TensorCopy(x, context.GetPlace(), &qr); + paddle::framework::TensorCopy(x, context.GetPlace(), &qr); // Prepare tau auto tau_dims_vec = framework::vectorize(x_dims); diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc index f2f395314c0cc8..081cafdf67b99b 100644 --- a/paddle/fluid/operators/range_op_npu_test.cc +++ b/paddle/fluid/operators/range_op_npu_test.cc @@ -43,21 +43,21 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, auto tensor_start = start->GetMutable(); std::vector init_start; init_start.push_back(static_cast(1)); - TensorFromVector(init_start, ctx, tensor_start); + paddle::framework::TensorFromVector(init_start, ctx, tensor_start); tensor_start->Resize({1}); auto end = scope->Var("End"); auto tensor_end = end->GetMutable(); std::vector init_end; init_end.push_back(static_cast(10)); - TensorFromVector(init_end, ctx, tensor_end); + paddle::framework::TensorFromVector(init_end, ctx, tensor_end); tensor_end->Resize({1}); auto step = scope->Var("Step"); auto tensor_step = step->GetMutable(); std::vector init_step; init_step.push_back(static_cast(2)); - TensorFromVector(init_step, ctx, tensor_step); + paddle::framework::TensorFromVector(init_step, ctx, tensor_step); tensor_step->Resize({1}); ctx.Wait(); @@ -74,7 +74,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, op->Run(*scope, 
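`HANDLE_DIM` is a macro, which makes the qualification doubly important: a macro body is expanded verbatim at the call site, so an unqualified `ReduceFunctor` would be looked up in whatever namespace the expansion happens to land in. A small demonstration of the hazard:

#include <cstdio>

namespace lib {
inline void Run() { std::printf("lib::Run\n"); }
}  // namespace lib

#define HANDLE_UNQUALIFIED() Run()       // resolves wherever it is expanded
#define HANDLE_QUALIFIED() ::lib::Run()  // always names the intended function

namespace client {
inline void Demo() {
  // HANDLE_UNQUALIFIED();  // error: no client::Run in scope here
  HANDLE_QUALIFIED();       // fine from any namespace
}
}  // namespace client

int main() { client::Demo(); }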
place); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); EXPECT_EQ(static_cast(out_vec.size()), static_cast(5)); diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 7adf7962e1987c..fcfdb8b72ecde6 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -14,10 +14,13 @@ limitations under the License. */ #include "paddle/fluid/operators/recurrent_op.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { class InferShapeContext; -class Tensor; class OpDesc; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.h b/paddle/fluid/operators/reduce_ops/logsumexp_op.h index a478690976bd39..06c9f23dd2c26f 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.h +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.h @@ -23,7 +23,8 @@ namespace operators { #define HANDLE_DIM(NDIM, RDIM) \ if (ndim == NDIM && rdim == RDIM) { \ - ReduceFunctor( \ + paddle::operators::ReduceFunctor( \ context.template device_context(), *input, output, \ axis, keepdim); \ } diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index eb4d4a5c1680ec..25603b07c7ad3a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -37,7 +37,8 @@ namespace operators { #define HANDLE_DIM(NDIM, RDIM) \ if (ndim == NDIM && rdim == RDIM) { \ - ReduceFunctor( \ + paddle::operators::ReduceFunctor( \ context.template device_context(), *input, output, \ dims, keep_dim); \ } @@ -131,7 +132,7 @@ void HandleLargeDim(const framework::ExecutionContext& context, shuffled_input.Resize({unreduced, reduced}); DDim output_dim = output->dims(); output->Resize({unreduced}); - ReduceFunctor( + paddle::operators::ReduceFunctor( context.template device_context(), shuffled_input, output, {1}, keep_dim); output->Resize(output_dim); diff --git a/paddle/fluid/operators/renorm_op.cu b/paddle/fluid/operators/renorm_op.cu index b21b9fde56f247..1d76eaf27e8189 100644 --- a/paddle/fluid/operators/renorm_op.cu +++ b/paddle/fluid/operators/renorm_op.cu @@ -151,8 +151,8 @@ class CUDARenormKernel : public framework::OpKernel { const auto& cuda_ctx = context.template device_context(); - LaunchSameDimsElementwiseCudaKernel>( + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kUnary, MT, T, UnsignedPowFunctor>( cuda_ctx, ins, &outs, func); std::vector reduce_axis = {0, 2}; TensorReduceFunctorImpl>( diff --git a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc index 4ba071032162a4..fe2c38850fba07 100644 --- a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc +++ b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc @@ -15,10 +15,13 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { class LoDRankTable; -class Tensor; class OpDesc; class Scope; } // namespace framework diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 9e343517e3fbf6..01e13cd1dcf836 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -56,7 +56,7 @@ inline std::vector get_new_shape( if (platform::is_gpu_place(tensor->place()) || platform::is_xpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_shape.push_back(static_cast(*temp.data())); } else { @@ -410,7 +410,8 @@ class ReshapeKernel { if (platform::is_gpu_place(tensor->place()) || platform::is_xpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), + &temp); pt_vec_shape.push_back( std::move(*(paddle::experimental::MakePtenDenseTensor(temp)))); } else { @@ -424,7 +425,8 @@ class ReshapeKernel { if (platform::is_gpu_place(shape_tensor->place()) || platform::is_xpu_place(shape_tensor->place())) { framework::Tensor temp; - TensorCopySync(*shape_tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*shape_tensor, platform::CPUPlace(), + &temp); pt_shape = paddle::experimental::MakePtenDenseTensor(temp); } else { pt_shape = paddle::experimental::MakePtenDenseTensor(*shape_tensor); diff --git a/paddle/fluid/operators/reverse_op.h b/paddle/fluid/operators/reverse_op.h index bf91e2f57a6676..d5e331e2fe5f69 100644 --- a/paddle/fluid/operators/reverse_op.h +++ b/paddle/fluid/operators/reverse_op.h @@ -66,7 +66,7 @@ class ReverseKernel : public framework::OpKernel { auto* out_tensor = &out_array->at(out_offset); out_tensor->set_lod(x_tensor.lod()); - TensorCopy(x_tensor, context.GetPlace(), out_tensor); + paddle::framework::TensorCopy(x_tensor, context.GetPlace(), out_tensor); } return; } diff --git a/paddle/fluid/operators/roll_op.h b/paddle/fluid/operators/roll_op.h index affb5f226ed555..413c7bcfc15eb1 100644 --- a/paddle/fluid/operators/roll_op.h +++ b/paddle/fluid/operators/roll_op.h @@ -100,7 +100,8 @@ class RollKernel : public framework::OpKernel { std::vector dims = context.Attr>("axis"); std::vector out_vec; - TensorToVector(input, context.device_context(), &out_vec); + paddle::framework::TensorToVector(input, context.device_context(), + &out_vec); size_t nums = shifts.size(); DDim input_dim = input.dims(); @@ -143,7 +144,8 @@ class RollGradKernel : public framework::OpKernel { std::vector dims = context.Attr>("axis"); std::vector out_vec; - TensorToVector(input, context.device_context(), &out_vec); + paddle::framework::TensorToVector(input, context.device_context(), + &out_vec); size_t nums = shifts.size(); DDim input_dim = input.dims(); diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h index da7c8c607a92a9..4faa23b6c16048 100644 --- a/paddle/fluid/operators/scale_op.h +++ b/paddle/fluid/operators/scale_op.h @@ -30,7 +30,8 @@ static inline T GetAttrFromTensor(const framework::Tensor* tensor) { framework::Tensor cpu_tensor; if (platform::is_gpu_place(tensor->place()) || platform::is_npu_place(tensor->place())) { - TensorCopySync(*tensor, platform::CPUPlace(), 
&cpu_tensor); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), + &cpu_tensor); tensor_data = cpu_tensor.data(); } return tensor_data[0]; diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h index 2ce0b02d437b77..a9de2d683fee92 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h @@ -86,7 +86,8 @@ class SequenceMaskKernel : public framework::OpKernel { "But received Input(MaxLenTensor) is NULL")); if (platform::is_gpu_place(max_len_tensor->place())) { framework::Tensor temp; - TensorCopySync(*max_len_tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*max_len_tensor, platform::CPUPlace(), + &temp); maxlen = *temp.data(); } else { maxlen = *max_len_tensor->data(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc index a69961afe02214..675ea175a16a96 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc @@ -36,7 +36,8 @@ class SequenceMaskNPUKernel : public framework::OpKernel { "Input(MaxLenTensor) should not be NULL." "But received Input(MaxLenTensor) is NULL")); framework::Tensor temp; - TensorCopySync(*max_len_tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*max_len_tensor, platform::CPUPlace(), + &temp); maxlen = *temp.data(); PADDLE_ENFORCE_GT( maxlen, 0, diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index 71eb03895404d2..1580ef140ada1c 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -233,7 +233,7 @@ class SetValueKernel : public framework::OpKernel { // be two ops points to the output in graph: op1 -> output <- set_value. // In this case, we have to find a way to handle the running order of // set_value is what we want. 
- TensorCopy(*in, place, out); + paddle::framework::TensorCopy(*in, place, out); Tensor slice_tensor(dtype), pad_tensor(dtype); slice_tensor.mutable_data(slice_dims, place); @@ -441,7 +441,7 @@ class SetValueGradKernel : public framework::OpKernel { if (grad_input) { // Set gradient of `Input` - TensorCopy(*in, context.GetPlace(), grad_input); + paddle::framework::TensorCopy(*in, context.GetPlace(), grad_input); auto grad_input_t = framework::EigenTensor { slice_dims_for_assign = framework::make_ddim(slice_dims_with_none); } - TensorCopy(*in, ctx.GetPlace(), out); + paddle::framework::TensorCopy(*in, ctx.GetPlace(), out); auto starts_indices = std::vector(in_dims.size(), 0); auto ends_indices = std::vector(in_dims.size(), 0); diff --git a/paddle/fluid/operators/size_op.h b/paddle/fluid/operators/size_op.h index e8c53d6e683305..8840fde287d662 100644 --- a/paddle/fluid/operators/size_op.h +++ b/paddle/fluid/operators/size_op.h @@ -36,7 +36,7 @@ class SizeKernel : public framework::OpKernel { auto cpu_data = cpu_tensor.mutable_data(out_t->dims(), cpu_place); cpu_data[0] = in_t->numel(); - TensorCopy(cpu_tensor, place, out_t); + paddle::framework::TensorCopy(cpu_tensor, place, out_t); } } }; diff --git a/paddle/fluid/operators/size_op_npu.cc b/paddle/fluid/operators/size_op_npu.cc index 08b73c2040a626..5826d2b4a8742b 100644 --- a/paddle/fluid/operators/size_op_npu.cc +++ b/paddle/fluid/operators/size_op_npu.cc @@ -30,8 +30,9 @@ class SizeNPUKernel : public framework::OpKernel { auto cpu_data = cpu_tensor.mutable_data(out->dims(), platform::CPUPlace()); cpu_data[0] = x->numel(); - TensorCopy(cpu_tensor, ctx.GetPlace(), - ctx.template device_context(), out); + paddle::framework::TensorCopy( + cpu_tensor, ctx.GetPlace(), + ctx.template device_context(), out); ctx.template device_context().Wait(); } }; diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h index 658939a91f39a7..15d52880ed9ca1 100644 --- a/paddle/fluid/operators/slice_op.h +++ b/paddle/fluid/operators/slice_op.h @@ -59,7 +59,7 @@ inline void DealTensorArray(const framework::ExecutionContext& ctx, auto in_tensor = in_array->at(i + start); out_tensor->set_lod(in_tensor.lod()); if (in_tensor.memory_size() > 0) { - TensorCopy(in_tensor, ctx.GetPlace(), out_tensor); + paddle::framework::TensorCopy(in_tensor, ctx.GetPlace(), out_tensor); } else { VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " "nothing has been written to output array[" @@ -69,7 +69,7 @@ inline void DealTensorArray(const framework::ExecutionContext& ctx, } else { auto out = ctx.Output("Out"); auto in_tensor = in_array->at(start); - TensorCopy(in_tensor, ctx.GetPlace(), out); + paddle::framework::TensorCopy(in_tensor, ctx.GetPlace(), out); } } @@ -309,12 +309,13 @@ class SliceGradKernel : public framework::OpKernel { ctx.Input(framework::GradVarName("Out")); int d_out_size = d_out_arr->size(); for (int i = 0; i < d_out_size; ++i) { - TensorCopy(d_out_arr->at(i), ctx.GetPlace(), - &(d_in_arr->at(start + i))); + paddle::framework::TensorCopy(d_out_arr->at(i), ctx.GetPlace(), + &(d_in_arr->at(start + i))); } } else { auto* d_out = ctx.Input(framework::GradVarName("Out")); - TensorCopy(*d_out, ctx.GetPlace(), &(d_in_arr->at(start))); + paddle::framework::TensorCopy(*d_out, ctx.GetPlace(), + &(d_in_arr->at(start))); } return; } diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc index d20b3ac04bf95c..8e9e077b845cea 100644 --- 
a/paddle/fluid/operators/softmax_op_npu_test.cc +++ b/paddle/fluid/operators/softmax_op_npu_test.cc @@ -44,7 +44,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { init.push_back(static_cast(i)); } - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({2, 3}); ctx.Wait(); @@ -70,7 +70,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { ctx.Wait(); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); for (int i = 0; i < static_cast(out_vec.size()); ++i) { VLOG(3) << "out_vec[" << i << "] : " << out_vec[i]; @@ -96,7 +96,7 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { out_init.push_back(static_cast(0.4112)); out_init.push_back(static_cast(0.5457)); - TensorFromVector(out_init, ctx, tensor_out); + paddle::framework::TensorFromVector(out_init, ctx, tensor_out); tensor_out->Resize({2, 3}); ctx.Wait(); @@ -109,7 +109,7 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { dout_init.push_back(static_cast(1.0)); } - TensorFromVector(dout_init, ctx, tensor_dout); + paddle::framework::TensorFromVector(dout_init, ctx, tensor_dout); tensor_dout->Resize({2, 3}); ctx.Wait(); @@ -144,7 +144,7 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { ctx.Wait(); std::vector out_vec; - TensorToVector(*tensor_dx, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_dx, ctx, &out_vec); ctx.Wait(); diff --git a/paddle/fluid/operators/spectral_norm_op.h b/paddle/fluid/operators/spectral_norm_op.h index 954edc796914c8..b8a15579e5345a 100644 --- a/paddle/fluid/operators/spectral_norm_op.h +++ b/paddle/fluid/operators/spectral_norm_op.h @@ -138,15 +138,15 @@ class SpectralNormKernel : public framework::OpKernel { for (int i = 0; i < rank; i++) { real_dims.push_back(i); } - TensorCopySync(*weight, ctx.GetPlace(), &weight_mat); + paddle::framework::TensorCopySync(*weight, ctx.GetPlace(), &weight_mat); } weight_mat = weight_mat.Resize({h, w}); Tensor sigma; sigma.mutable_data(weight_mat.dims(), ctx.GetPlace()); Tensor uu, vv; - TensorCopySync(*u, ctx.GetPlace(), &uu); - TensorCopySync(*v, ctx.GetPlace(), &vv); + paddle::framework::TensorCopySync(*u, ctx.GetPlace(), &uu); + paddle::framework::TensorCopySync(*v, ctx.GetPlace(), &vv); CalcMatrixSigmaAndNormWeight( &sigma, &(uu.Resize({h, 1})), &(vv.Resize({w, 1})), &weight_mat, power_iters, eps, ctx); @@ -167,7 +167,8 @@ class SpectralNormKernel : public framework::OpKernel { rank, weight_mat.Resize(framework::make_ddim(real_dims)), out, perm, dev_ctx); } else { - TensorCopySync(weight_mat.Resize(dims), ctx.GetPlace(), out); + paddle::framework::TensorCopySync(weight_mat.Resize(dims), ctx.GetPlace(), + out); } } }; @@ -217,8 +218,9 @@ class SpectralNormGradKernel : public framework::OpKernel { for (int i = 0; i < rank; i++) { real_dims.push_back(i); } - TensorCopySync(*weight, ctx.GetPlace(), &weight_mat); - TensorCopySync(*out_grad, ctx.GetPlace(), &out_grad_mat); + paddle::framework::TensorCopySync(*weight, ctx.GetPlace(), &weight_mat); + paddle::framework::TensorCopySync(*out_grad, ctx.GetPlace(), + &out_grad_mat); } weight_mat = weight_mat.Resize({h, w}); out_grad_mat = out_grad_mat.Resize({h, w}); @@ -226,8 +228,8 @@ class SpectralNormGradKernel : public framework::OpKernel { Tensor sigma; sigma.mutable_data(weight_mat.dims(), ctx.GetPlace()); Tensor uu, vv; - TensorCopySync(*u, ctx.GetPlace(), &uu); - TensorCopySync(*v, ctx.GetPlace(), &vv); + 
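`CalcMatrixSigmaAndNormWeight` itself is not shown in this diff, but the surrounding code (working copies of `u` and `v`, a `power_iters` attribute) matches the usual power-iteration estimate of the largest singular value, which is what spectral normalization divides the weight by. A standalone numeric sketch under that assumption:

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int h = 2, w = 2;
  std::vector<double> W{3, 0, 0, 1};  // singular values 3 and 1
  std::vector<double> u{1, 1}, v(w, 0);
  for (int it = 0; it < 50; ++it) {
    for (int j = 0; j < w; ++j) {  // v = normalize(W^T u)
      v[j] = 0;
      for (int i = 0; i < h; ++i) v[j] += W[i * w + j] * u[i];
    }
    double nv = std::sqrt(v[0] * v[0] + v[1] * v[1]);
    for (double& x : v) x /= nv;
    for (int i = 0; i < h; ++i) {  // u = normalize(W v)
      u[i] = 0;
      for (int j = 0; j < w; ++j) u[i] += W[i * w + j] * v[j];
    }
    double nu = std::sqrt(u[0] * u[0] + u[1] * u[1]);
    for (double& x : u) x /= nu;
  }
  double sigma = 0;  // sigma = u^T W v
  for (int i = 0; i < h; ++i)
    for (int j = 0; j < w; ++j) sigma += u[i] * W[i * w + j] * v[j];
  std::printf("sigma ~= %.4f (expect 3)\n", sigma);
}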
paddle::framework::TensorCopySync(*u, ctx.GetPlace(), &uu); + paddle::framework::TensorCopySync(*v, ctx.GetPlace(), &vv); CalcMatrixSigmaAndNormWeight( &sigma, &(uu.Resize({h, 1})), &(vv.Resize({w, 1})), &weight_mat, power_iters, eps, ctx); @@ -266,7 +268,8 @@ class SpectralNormGradKernel : public framework::OpKernel { rank, weight_grad_mat.Resize(framework::make_ddim(real_dims)), weight_grad, perm, dev_ctx); } else { - TensorCopySync(weight_grad_mat.Resize(dims), ctx.GetPlace(), weight_grad); + paddle::framework::TensorCopySync(weight_grad_mat.Resize(dims), + ctx.GetPlace(), weight_grad); } } }; diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index 0ff622d3299195..9c22fa4797219f 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -15,10 +15,13 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { class InferShapeContext; -class Tensor; class OpDesc; class Scope; } // namespace framework diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc index 1de7ca8c7bdbf4..3f6c43d7af2fe0 100644 --- a/paddle/fluid/operators/squeeze_op_npu_test.cc +++ b/paddle/fluid/operators/squeeze_op_npu_test.cc @@ -50,7 +50,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { init.push_back(static_cast(0.1)); } - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({dim0, dim1, dim2}); ctx.Wait(); @@ -75,7 +75,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { EXPECT_EQ((uint32_t)tensor_out->dims()[1], uint32_t(dim1)); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); for (uint32_t i = 0; i < out_vec.size(); i++) { EXPECT_EQ(out_vec[i], static_cast(0.1)); } diff --git a/paddle/fluid/operators/strided_slice_op.h b/paddle/fluid/operators/strided_slice_op.h index 9eae27cca6840d..eaef9496a92dcf 100644 --- a/paddle/fluid/operators/strided_slice_op.h +++ b/paddle/fluid/operators/strided_slice_op.h @@ -376,7 +376,8 @@ class StridedSliceKernel : public framework::OpKernel { auto* out_tensor = &out_array->at(out_offset); out_tensor->set_lod(in_tensor.lod()); - TensorCopy(in_tensor, context.GetPlace(), out_tensor); + paddle::framework::TensorCopy(in_tensor, context.GetPlace(), + out_tensor); } } else { @@ -608,7 +609,8 @@ class StridedSliceGradKernel : public framework::OpKernel { in_offset)); d_out_tensor->set_lod(in_tensor.lod()); - TensorCopy(in_tensor, context.GetPlace(), d_out_tensor); + paddle::framework::TensorCopy(in_tensor, context.GetPlace(), + d_out_tensor); } else { d_out_tensor->Resize(dim); diff --git a/paddle/fluid/operators/strided_slice_op_npu.cc b/paddle/fluid/operators/strided_slice_op_npu.cc index 6a8726ce3351dc..95c4357f3280b3 100644 --- a/paddle/fluid/operators/strided_slice_op_npu.cc +++ b/paddle/fluid/operators/strided_slice_op_npu.cc @@ -145,12 +145,12 @@ class StridedSliceNPUKernel : public framework::OpKernel { ends_indices_tensor.mutable_data({D}, place); strides_indices_tensor.mutable_data({D}, place); - TensorFromVector(starts_indices_vector, ctx.device_context(), - &starts_indices_tensor); - TensorFromVector(ends_indices_vector, ctx.device_context(), - &ends_indices_tensor); 
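The NPU strided-slice kernel materializes `starts`, `ends`, and `strides` as index tensors; the semantics they encode are the familiar Python-style slice. A 1-D reference for positive strides (negative strides are handled via the `ReverseV2` path shown just below):

#include <cassert>
#include <vector>

std::vector<int> StridedSlice1D(const std::vector<int>& x, int start, int end,
                                int stride) {
  std::vector<int> out;
  for (int i = start; i < end; i += stride) out.push_back(x[i]);
  return out;
}

int main() {
  assert((StridedSlice1D({0, 1, 2, 3, 4, 5, 6}, 1, 6, 2) ==
          std::vector<int>{1, 3, 5}));
}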
- TensorFromVector(strides_indices_vector, ctx.device_context(), - &strides_indices_tensor); + paddle::framework::TensorFromVector( + starts_indices_vector, ctx.device_context(), &starts_indices_tensor); + paddle::framework::TensorFromVector( + ends_indices_vector, ctx.device_context(), &ends_indices_tensor); + paddle::framework::TensorFromVector( + strides_indices_vector, ctx.device_context(), &strides_indices_tensor); auto out_dims_origin = out_dims; if (decrease_axis.size() > 0) { @@ -199,9 +199,9 @@ class StridedSliceNPUKernel : public framework::OpKernel { if (need_reverse) { Tensor out_tmp; out_tmp.mutable_data(out_dims, place); - TensorCopy(*out, place, - ctx.template device_context(), - &out_tmp); + paddle::framework::TensorCopy( + *out, place, ctx.template device_context(), + &out_tmp); Tensor reverse_axis; std::vector reverse_axis_vector; @@ -212,8 +212,8 @@ class StridedSliceNPUKernel : public framework::OpKernel { } reverse_axis.mutable_data( {static_cast(reverse_axis_vector.size())}, place); - TensorFromVector(reverse_axis_vector, ctx.device_context(), - &reverse_axis); + paddle::framework::TensorFromVector(reverse_axis_vector, + ctx.device_context(), &reverse_axis); const auto& runner_reverse = NpuOpRunner("ReverseV2", {out_tmp, reverse_axis}, {*out}); @@ -346,16 +346,20 @@ class StridedSliceGradNPUKernel : public framework::OpKernel { ends_indices_tensor.mutable_data({D}, place); strides_indices_tensor.mutable_data({D}, place); - TensorFromVector(starts_indices_vector, dev_ctx, &starts_indices_tensor); - TensorFromVector(ends_indices_vector, dev_ctx, &ends_indices_tensor); - TensorFromVector(strides_indices_vector, dev_ctx, &strides_indices_tensor); + paddle::framework::TensorFromVector(starts_indices_vector, dev_ctx, + &starts_indices_tensor); + paddle::framework::TensorFromVector(ends_indices_vector, dev_ctx, + &ends_indices_tensor); + paddle::framework::TensorFromVector(strides_indices_vector, dev_ctx, + &strides_indices_tensor); std::vector input_dims_vector; for (int i = 0; i < input_dims.size(); i++) { input_dims_vector.push_back(input_dims[i]); } Tensor input_dims_tensor; - TensorFromVector(input_dims_vector, dev_ctx, &input_dims_tensor); + paddle::framework::TensorFromVector(input_dims_vector, dev_ctx, + &input_dims_tensor); bool need_reverse = false; for (size_t axis = 0; axis < axes.size(); axis++) { @@ -382,7 +386,8 @@ class StridedSliceGradNPUKernel : public framework::OpKernel { } reverse_axis.mutable_data( {static_cast(reverse_axis_vector.size())}, place); - TensorFromVector(reverse_axis_vector, dev_ctx, &reverse_axis); + paddle::framework::TensorFromVector(reverse_axis_vector, dev_ctx, + &reverse_axis); Tensor dout_tmp; dout_tmp.mutable_data(dout->dims(), place); diff --git a/paddle/fluid/operators/sum_op_npu.cc b/paddle/fluid/operators/sum_op_npu.cc index a2e446e1525ad5..ec7ba1d03237bf 100644 --- a/paddle/fluid/operators/sum_op_npu.cc +++ b/paddle/fluid/operators/sum_op_npu.cc @@ -38,7 +38,7 @@ class SumNPUKernel : public framework::OpKernel { int n = static_cast(x.size()); if (n == 1) { - TensorCopy(*x[0], place, out); + paddle::framework::TensorCopy(*x[0], place, out); return; } diff --git a/paddle/fluid/operators/svd_op.cu b/paddle/fluid/operators/svd_op.cu index f17e92e47b7312..e987589e83c19c 100644 --- a/paddle/fluid/operators/svd_op.cu +++ b/paddle/fluid/operators/svd_op.cu @@ -55,7 +55,7 @@ class SvdGPUKernel : public framework::OpKernel { // then view A as n x m and do A^T SVD, we can avoid transpose // Must Copy X once, because the gesvdj will change 
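The "Must Copy X once" comment (continued just below) is about cuSOLVER's `gesvdj`, which, as the comment says, changes the original input matrix; the kernel therefore factorizes a copy so `x` stays intact for later use. The shape of that defensive copy:

#include <cassert>
#include <vector>

void InPlaceFactor(std::vector<double>* a) {  // stands in for gesvdj
  for (double& v : *a) v *= 2;                // overwrites its input
}

int main() {
  std::vector<double> x{1, 2, 3};
  std::vector<double> x_tmp(x);  // "TensorCopy(*x, context.GetPlace(), &x_tmp)"
  InPlaceFactor(&x_tmp);         // x_tmp is consumed, x is preserved
  assert(x[0] == 1 && x_tmp[0] == 2);
}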
the origin input matrix Tensor x_tmp; - TensorCopy(*x, context.GetPlace(), &x_tmp); + paddle::framework::TensorCopy(*x, context.GetPlace(), &x_tmp); auto info = memory::Alloc(dev_ctx, sizeof(int) * batch_count); int* info_ptr = reinterpret_cast(info->ptr()); diff --git a/paddle/fluid/operators/sync_batch_norm_op_npu.cc b/paddle/fluid/operators/sync_batch_norm_op_npu.cc index a89bb0861c9cee..c666d5a11fa312 100644 --- a/paddle/fluid/operators/sync_batch_norm_op_npu.cc +++ b/paddle/fluid/operators/sync_batch_norm_op_npu.cc @@ -49,7 +49,7 @@ void training_or_inference( { common_mean_tile_1.Resize({C}); common_mean_tile_1.mutable_data(place); - TensorCopySync(*common_mean, place, &common_mean_tile_1); + paddle::framework::TensorCopySync(*common_mean, place, &common_mean_tile_1); if (layout == framework::DataLayout::kNCHW) common_mean_tile_1.Resize({1, C, 1, 1}); else if (layout == framework::DataLayout::kNHWC) @@ -70,7 +70,7 @@ void training_or_inference( { common_var_tile_1.Resize({C}); common_var_tile_1.mutable_data(place); - TensorCopySync(*common_var, place, &common_var_tile_1); + paddle::framework::TensorCopySync(*common_var, place, &common_var_tile_1); if (layout == framework::DataLayout::kNCHW) common_var_tile_1.Resize({1, C, 1, 1}); else if (layout == framework::DataLayout::kNHWC) @@ -129,7 +129,7 @@ void training_or_inference( { scale_tile_1.Resize({C}); scale_tile_1.mutable_data(place); - TensorCopySync(*scale, place, &scale_tile_1); + paddle::framework::TensorCopySync(*scale, place, &scale_tile_1); if (layout == framework::DataLayout::kNCHW) scale_tile_1.Resize({1, C, 1, 1}); else if (layout == framework::DataLayout::kNHWC) @@ -159,7 +159,7 @@ void training_or_inference( { bias_tile_1.Resize({C}); bias_tile_1.mutable_data(place); - TensorCopySync(*bias, place, &bias_tile_1); + paddle::framework::TensorCopySync(*bias, place, &bias_tile_1); if (layout == framework::DataLayout::kNCHW) bias_tile_1.Resize({1, C, 1, 1}); else if (layout == framework::DataLayout::kNHWC) @@ -339,11 +339,11 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { if (test_mode) { // inference // cacl saved_mean saved_mean->mutable_data(place); - TensorCopySync(*mean, place, saved_mean); + paddle::framework::TensorCopySync(*mean, place, saved_mean); // cacl saved_variance saved_variance->mutable_data(place); - TensorCopySync(*variance, place, saved_variance); + paddle::framework::TensorCopySync(*variance, place, saved_variance); // cacl y training_or_inference(ctx, stream, place, layout, test_mode, N, C, H, @@ -354,7 +354,8 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { if (ctx.HasInput("MomentumTensor")) { const auto *mom_tensor = ctx.Input("MomentumTensor"); Tensor mom_cpu; - TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu); + paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), + &mom_cpu); momentum = mom_cpu.data()[0]; } @@ -417,8 +418,8 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { } std::vector device_count_vec(1); - TensorToVector(device_count_tensor, ctx.device_context(), - &device_count_vec); + paddle::framework::TensorToVector( + device_count_tensor, ctx.device_context(), &device_count_vec); device_counts = device_count_vec[0]; // HcclAllReduce x_sum @@ -560,8 +561,8 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { } std::vector device_count_vec(1); - TensorToVector(device_count_tensor, ctx.device_context(), - &device_count_vec); + paddle::framework::TensorToVector( + device_count_tensor, ctx.device_context(), 
&device_count_vec); device_counts = device_count_vec[0]; PADDLE_ENFORCE_GE(device_counts, 2, platform::errors::PreconditionNotMet( "device_counts should >= 2.")); @@ -626,7 +627,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { { saved_mean_tile_1.Resize({C}); saved_mean_tile_1.mutable_data(place); - TensorCopySync(*saved_mean, place, &saved_mean_tile_1); + paddle::framework::TensorCopySync(*saved_mean, place, &saved_mean_tile_1); if (layout == framework::DataLayout::kNCHW) saved_mean_tile_1.Resize({1, C, 1, 1}); else if (layout == framework::DataLayout::kNHWC) @@ -656,7 +657,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { { var_ref_tile_1.Resize({C}); var_ref_tile_1.mutable_data(place); - TensorCopySync(var_ref, place, &var_ref_tile_1); + paddle::framework::TensorCopySync(var_ref, place, &var_ref_tile_1); if (layout == framework::DataLayout::kNCHW) var_ref_tile_1.Resize({1, C, 1, 1}); else if (layout == framework::DataLayout::kNHWC) @@ -793,7 +794,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { { dy_mean_tile_1.Resize({C}); dy_mean_tile_1.mutable_data(place); - TensorCopySync(dy_mean, place, &dy_mean_tile_1); + paddle::framework::TensorCopySync(dy_mean, place, &dy_mean_tile_1); if (layout == framework::DataLayout::kNCHW) dy_mean_tile_1.Resize({1, C, 1, 1}); else if (layout == framework::DataLayout::kNHWC) @@ -842,8 +843,8 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { { dy_mul_x_sub_mean_mean_tile_1.Resize({C}); dy_mul_x_sub_mean_mean_tile_1.mutable_data(place); - TensorCopySync(dy_mul_x_sub_mean_mean, place, - &dy_mul_x_sub_mean_mean_tile_1); + paddle::framework::TensorCopySync(dy_mul_x_sub_mean_mean, place, + &dy_mul_x_sub_mean_mean_tile_1); if (layout == framework::DataLayout::kNCHW) dy_mul_x_sub_mean_mean_tile_1.Resize({1, C, 1, 1}); else if (layout == framework::DataLayout::kNHWC) @@ -900,7 +901,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { { scale_tile_1.Resize({C}); scale_tile_1.mutable_data(place); - TensorCopySync(*scale, place, &scale_tile_1); + paddle::framework::TensorCopySync(*scale, place, &scale_tile_1); if (layout == framework::DataLayout::kNCHW) scale_tile_1.Resize({1, C, 1, 1}); else if (layout == framework::DataLayout::kNHWC) diff --git a/paddle/fluid/operators/tensor_formatter.cc b/paddle/fluid/operators/tensor_formatter.cc index 558f5f2a3128f4..b65ebee9b2662f 100644 --- a/paddle/fluid/operators/tensor_formatter.cc +++ b/paddle/fluid/operators/tensor_formatter.cc @@ -124,7 +124,7 @@ void TensorFormatter::FormatData(const framework::LoDTensor& print_tensor, data = print_tensor.data(); } else { platform::CPUPlace cpu_place; - TensorCopy(print_tensor, cpu_place, &cpu_tensor); + paddle::framework::TensorCopy(print_tensor, cpu_place, &cpu_tensor); #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(print_tensor.place())) { platform::DeviceContextPool::Instance().Get(print_tensor.place())->Wait(); diff --git a/paddle/fluid/operators/tensor_formatter.h b/paddle/fluid/operators/tensor_formatter.h index 38e3e7a94a5240..5181e65faf50a4 100644 --- a/paddle/fluid/operators/tensor_formatter.h +++ b/paddle/fluid/operators/tensor_formatter.h @@ -18,11 +18,9 @@ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/var_type.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -} // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten namespace paddle { namespace operators { diff --git 
a/paddle/fluid/operators/tile_op.h b/paddle/fluid/operators/tile_op.h index 5211d72336124e..7dda2865cd802f 100644 --- a/paddle/fluid/operators/tile_op.h +++ b/paddle/fluid/operators/tile_op.h @@ -35,7 +35,8 @@ inline std::vector get_repeat_times( if (platform::is_gpu_place(repeat_tensor->place()) || platform::is_xpu_place(repeat_tensor->place()) || platform::is_npu_place(repeat_tensor->place())) { - TensorCopySync(*repeat_tensor, platform::CPUPlace(), &cpu_repeat_tensor); + paddle::framework::TensorCopySync(*repeat_tensor, platform::CPUPlace(), + &cpu_repeat_tensor); repeat_data = cpu_repeat_tensor.data(); } auto vec_repeat_times = @@ -54,7 +55,7 @@ inline std::vector get_repeat_times( platform::is_xpu_place(tensor->place()) || platform::is_npu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_repeat_times.push_back(*temp.data()); } else { vec_repeat_times.push_back(*tensor->data()); diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc index 5d7e423590b1cf..843f7620cac44e 100755 --- a/paddle/fluid/operators/top_k_v2_op_npu.cc +++ b/paddle/fluid/operators/top_k_v2_op_npu.cc @@ -41,7 +41,7 @@ class TopkV2NPUKernel : public framework::OpKernel { if (k_tensor != nullptr) { std::vector v_tmp(1); - TensorToVector( + paddle::framework::TensorToVector( *k_tensor, context.template device_context(), &v_tmp); diff --git a/paddle/fluid/operators/transfer_layout_op.h b/paddle/fluid/operators/transfer_layout_op.h index 28135e37ed7bbe..74d086015eeb46 100644 --- a/paddle/fluid/operators/transfer_layout_op.h +++ b/paddle/fluid/operators/transfer_layout_op.h @@ -27,9 +27,12 @@ class DeviceContext; } // namespace platform } // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { -class Tensor; class Variable; } // namespace framework } // namespace paddle @@ -78,7 +81,7 @@ class TransferLayoutFunctor { } else { // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel // Do transform via MKLDNN lib - innerTransDataLayoutFromMKLDNN( + paddle::framework::innerTransDataLayoutFromMKLDNN( in_layout, paddle::platform::MKLDNNDeviceContext::tls() .get_cur_paddle_data_layout(), in_tensor, &out_tensor, dev_ctx_.GetPlace()); diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc index f6712814e1e3b8..91923da819dc5e 100644 --- a/paddle/fluid/operators/transpose_op_npu_test.cc +++ b/paddle/fluid/operators/transpose_op_npu_test.cc @@ -48,7 +48,8 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { int dim0 = 2; int dim1 = 3; - TensorFromVector(std::vector({0, 1, 2, 3, 4, 5}), ctx, x_t); + paddle::framework::TensorFromVector(std::vector({0, 1, 2, 3, 4, 5}), ctx, + x_t); ctx.Wait(); x_t->Resize({dim0, dim1}); out_t->Resize({dim0, dim1}); @@ -66,7 +67,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { op->Run(*scope, place); ctx.Wait(); std::vector out_v; - TensorToVector(*out_t, ctx, &out_v); + paddle::framework::TensorToVector(*out_t, ctx, &out_v); ctx.Wait(); EXPECT_EQ(out_t->numel(), dim0 * dim1); @@ -93,7 +94,8 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { int dim1 = 3; auto place = ctx.GetPlace(); - TensorFromVector(std::vector({0, 1, 2, 3, 4, 5}), ctx, out_grad_t); + paddle::framework::TensorFromVector(std::vector({0, 1, 2, 3, 4, 5}), ctx, + out_grad_t); ctx.Wait(); 
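The transpose test below feeds `{0, 1, 2, 3, 4, 5}` as a 2x3 tensor, so the expected output is its 3x2 transpose. A reference implementation of exactly that check:

#include <cassert>
#include <vector>

std::vector<int> Transpose2D(const std::vector<int>& in, int rows, int cols) {
  std::vector<int> out(in.size());
  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < cols; ++j) out[j * rows + i] = in[i * cols + j];
  return out;
}

int main() {
  // matches the tensor the NPU test builds: dim0 = 2, dim1 = 3
  assert((Transpose2D({0, 1, 2, 3, 4, 5}, 2, 3) ==
          std::vector<int>{0, 3, 1, 4, 2, 5}));
}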
x_grad_t->Resize({dim0, dim1}); @@ -112,7 +114,7 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { op->Run(*scope, place); ctx.Wait(); std::vector out_v; - TensorToVector(*x_grad_t, ctx, &out_v); + paddle::framework::TensorToVector(*x_grad_t, ctx, &out_v); ctx.Wait(); EXPECT_EQ(x_grad_t->numel(), dim0 * dim1); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc index 8d135e698f204e..f88fefd1c6a78a 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc @@ -31,7 +31,8 @@ class TruncatedGaussianRandomNPUKernel : public framework::OpKernel { Tensor shape_tensor(framework::proto::VarType::INT32); shape_tensor.mutable_data({static_cast(shape.size())}, ctx.GetPlace()); - TensorFromVector(shape, ctx.device_context(), &shape_tensor); + paddle::framework::TensorFromVector(shape, ctx.device_context(), + &shape_tensor); float mean = ctx.Attr("mean"); Tensor mean_tensor(framework::proto::VarType::FP32); mean_tensor.mutable_data({1}, ctx.GetPlace()); diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 18a4154be30ac7..f4ae8d82690566 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -29,8 +29,8 @@ inline std::vector GetNewDataFromShapeTensor( auto* new_data = new_data_tensor->data(); framework::Tensor cpu_starts_tensor; if (platform::is_gpu_place(new_data_tensor->place())) { - TensorCopySync(*new_data_tensor, platform::CPUPlace(), - &cpu_starts_tensor); + paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), + &cpu_starts_tensor); new_data = cpu_starts_tensor.data(); } std::vector vec_new_data(new_data, @@ -41,8 +41,8 @@ inline std::vector GetNewDataFromShapeTensor( std::vector vec_new_data; framework::Tensor cpu_starts_tensor; if (platform::is_gpu_place(new_data_tensor->place())) { - TensorCopySync(*new_data_tensor, platform::CPUPlace(), - &cpu_starts_tensor); + paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), + &cpu_starts_tensor); new_data = cpu_starts_tensor.data(); } for (int i = 0; i < new_data_tensor->numel(); ++i) { @@ -73,7 +73,7 @@ inline std::vector GetNewDataFromShapeTensorList( if (tensor->type() == framework::proto::VarType::INT32) { if (platform::is_gpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_shape.push_back(static_cast(*temp.data())); } else { vec_new_shape.push_back(static_cast(*tensor->data())); @@ -81,7 +81,7 @@ inline std::vector GetNewDataFromShapeTensorList( } else if (tensor->type() == framework::proto::VarType::INT64) { if (platform::is_gpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_shape.push_back(*temp.data()); } else { vec_new_shape.push_back(*tensor->data()); diff --git a/paddle/fluid/operators/unique_consecutive_op.cu b/paddle/fluid/operators/unique_consecutive_op.cu index 1f0023c467c01c..12bd742c9f9b43 100644 --- a/paddle/fluid/operators/unique_consecutive_op.cu +++ b/paddle/fluid/operators/unique_consecutive_op.cu @@ -96,8 +96,10 @@ void IndexSelect(const framework::ExecutionContext& context, std::vector input_vec; std::vector index_vec; - 
TensorToVector(input, context.device_context(), &input_vec); - TensorToVector(index, context.device_context(), &index_vec); + paddle::framework::TensorToVector(input, context.device_context(), + &input_vec); + paddle::framework::TensorToVector(index, context.device_context(), + &index_vec); std::vector out_vec(output->numel()); for (int i = 0; i < index_size; i++) { diff --git a/paddle/fluid/operators/unique_op.cu b/paddle/fluid/operators/unique_op.cu index 87a46e11d9f91b..98cd13a600f205 100644 --- a/paddle/fluid/operators/unique_op.cu +++ b/paddle/fluid/operators/unique_op.cu @@ -119,8 +119,10 @@ void IndexSelect(const framework::ExecutionContext& context, std::vector input_vec; std::vector index_vec; - TensorToVector(input, context.device_context(), &input_vec); - TensorToVector(index, context.device_context(), &index_vec); + paddle::framework::TensorToVector(input, context.device_context(), + &input_vec); + paddle::framework::TensorToVector(index, context.device_context(), + &index_vec); std::vector out_vec(output->numel()); for (int i = 0; i < index_size; i++) { diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc index a145c914a8621b..cf96ef57a4df08 100644 --- a/paddle/fluid/operators/unsqueeze_op_npu_test.cc +++ b/paddle/fluid/operators/unsqueeze_op_npu_test.cc @@ -49,7 +49,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { init.push_back(static_cast(0.1)); } - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({dim0, dim1}); ctx.Wait(); @@ -75,7 +75,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { EXPECT_EQ((uint32_t)tensor_out->dims()[2], uint32_t(10)); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); for (uint32_t i = 0; i < out_vec.size(); i++) { EXPECT_EQ(out_vec[i], static_cast(0.1)); } diff --git a/paddle/fluid/operators/utils.h b/paddle/fluid/operators/utils.h index 770369e64f46fd..a413c4a331b65b 100644 --- a/paddle/fluid/operators/utils.h +++ b/paddle/fluid/operators/utils.h @@ -27,7 +27,8 @@ inline std::vector GetDataFromTensor(const framework::Tensor* x) { auto* data = x->data(); framework::Tensor cpu_attr_tensor; if (!platform::is_cpu_place(x->place())) { - TensorCopySync(*x, platform::CPUPlace(), &cpu_attr_tensor); + paddle::framework::TensorCopySync(*x, platform::CPUPlace(), + &cpu_attr_tensor); data = cpu_attr_tensor.data(); } vec_new_data = std::vector(data, data + x->numel()); @@ -35,7 +36,8 @@ inline std::vector GetDataFromTensor(const framework::Tensor* x) { auto* data = x->data(); framework::Tensor cpu_attr_tensor; if (!platform::is_cpu_place(x->place())) { - TensorCopySync(*x, platform::CPUPlace(), &cpu_attr_tensor); + paddle::framework::TensorCopySync(*x, platform::CPUPlace(), + &cpu_attr_tensor); data = cpu_attr_tensor.data(); } // NOTE: Converting int64 to int32 may cause data overflow. 
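That overflow note is worth taking literally: `GetDataFromTensor` narrows `int64_t` attribute data to `int`, which silently wraps for values beyond 2^31 - 1. A checked version of the narrowing, purely illustrative and not part of the patch:

#include <cstdint>
#include <limits>
#include <stdexcept>

int32_t CheckedNarrow(int64_t v) {
  if (v > std::numeric_limits<int32_t>::max() ||
      v < std::numeric_limits<int32_t>::min()) {
    throw std::overflow_error("int64 attribute does not fit in int32");
  }
  return static_cast<int32_t>(v);
}

int main() { return CheckedNarrow(123) == 123 ? 0 : 1; }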
@@ -64,7 +66,7 @@ inline std::vector GetDataFromTensorList( if (tensor->type() == framework::proto::VarType::INT32) { if (!platform::is_cpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_data.push_back(static_cast(*temp.data())); } else { vec_new_data.push_back(static_cast(*tensor->data())); @@ -72,7 +74,7 @@ inline std::vector GetDataFromTensorList( } else if (tensor->type() == framework::proto::VarType::INT64) { if (!platform::is_cpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); // NOTE: Converting int64 to int32 may cause data overflow. vec_new_data.push_back(static_cast(*temp.data())); } else { diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu index 086ff05b084612..d40d14435a5fd0 100644 --- a/paddle/fluid/operators/viterbi_decode_op.cu +++ b/paddle/fluid/operators/viterbi_decode_op.cu @@ -66,8 +66,9 @@ struct BinaryOperation { const Tensor& rhs, Tensor* output) { std::vector ins{&lhs, &rhs}; std::vector outs{output}; - LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, -1, BinaryFunctor()); + paddle::operators::LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, -1, + BinaryFunctor()); } }; @@ -78,8 +79,9 @@ struct GetMask { std::vector ins = {&lhs, &rhs}; std::vector outs = {mask}; auto& dev_ctx = ctx.template device_context(); - LaunchSameDimsElementwiseCudaKernel( - dev_ctx, ins, &outs, CompareFunctor()); + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kBinary, int64_t, T>(dev_ctx, ins, &outs, + CompareFunctor()); } }; diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h index 4cce33c3f520f0..56f1d8d97ba618 100644 --- a/paddle/fluid/operators/warpctc_op.h +++ b/paddle/fluid/operators/warpctc_op.h @@ -299,7 +299,8 @@ class WarpCTCKernel : public framework::OpKernel { ctx.AllocateTmpTensor(warpctc_logits_dims, dev_ctx); warpctc_logits.ShareDataWith(warpctc_logits_tmp); if (ctx.HasInput("LogitsLength")) { - TensorCopySync(*logits, ctx.GetPlace(), &warpctc_logits); + paddle::framework::TensorCopySync(*logits, ctx.GetPlace(), + &warpctc_logits); } else { LoDTensor cpu_pad_value; T* pad_value_data = @@ -309,7 +310,8 @@ class WarpCTCKernel : public framework::OpKernel { if (platform::is_cpu_place(ctx.GetPlace())) { pad_value = cpu_pad_value; } else { - TensorCopySync(cpu_pad_value, ctx.GetPlace(), &pad_value); + paddle::framework::TensorCopySync(cpu_pad_value, ctx.GetPlace(), + &pad_value); } math::PaddingLoDTensorFunctor()( @@ -361,10 +363,12 @@ class WarpCTCKernel : public framework::OpKernel { ctx.template device_context(), *label, &gpu_label, label->dims()[1] /*pad_seq_len*/, 0 /*lod_level*/, false /*norm_by_times*/, math::kBatchLengthWidth); - TensorCopySync(gpu_label, platform::CPUPlace(), &warpctc_label); + paddle::framework::TensorCopySync(gpu_label, platform::CPUPlace(), + &warpctc_label); } } else { - TensorCopySync(*label, platform::CPUPlace(), &warpctc_label); + paddle::framework::TensorCopySync(*label, platform::CPUPlace(), + &warpctc_label); } const int* warpctc_label_data = warpctc_label.data(); @@ -381,7 +385,8 @@ class WarpCTCKernel : public framework::OpKernel { sequence_width, num_sequences, blank, warpctc_loss_data); // Copy the loss back - TensorCopy(warpctc_loss, ctx.GetPlace(), 
diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu index 086ff05b084612..d40d14435a5fd0 100644 --- a/paddle/fluid/operators/viterbi_decode_op.cu +++ b/paddle/fluid/operators/viterbi_decode_op.cu @@ -66,8 +66,9 @@ struct BinaryOperation { const Tensor& rhs, Tensor* output) { std::vector<const Tensor*> ins{&lhs, &rhs}; std::vector<Tensor*> outs{output}; - LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>( - dev_ctx, ins, &outs, -1, BinaryFunctor<T>()); + paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary, + T, T>(dev_ctx, ins, &outs, -1, + BinaryFunctor<T>()); } }; @@ -78,8 +79,9 @@ struct GetMask { std::vector<const Tensor*> ins = {&lhs, &rhs}; std::vector<Tensor*> outs = {mask}; auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); - LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kBinary, int64_t, T>( - dev_ctx, ins, &outs, CompareFunctor<int64_t, T>()); + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kBinary, int64_t, T>(dev_ctx, ins, &outs, + CompareFunctor<int64_t, T>()); } };
diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h index 4cce33c3f520f0..56f1d8d97ba618 100644 --- a/paddle/fluid/operators/warpctc_op.h +++ b/paddle/fluid/operators/warpctc_op.h @@ -299,7 +299,8 @@ class WarpCTCKernel : public framework::OpKernel<T> { ctx.AllocateTmpTensor<T, DeviceContext>(warpctc_logits_dims, dev_ctx); warpctc_logits.ShareDataWith(warpctc_logits_tmp); if (ctx.HasInput("LogitsLength")) { - TensorCopySync(*logits, ctx.GetPlace(), &warpctc_logits); + paddle::framework::TensorCopySync(*logits, ctx.GetPlace(), + &warpctc_logits); } else { LoDTensor cpu_pad_value; T* pad_value_data = @@ -309,7 +310,8 @@ class WarpCTCKernel : public framework::OpKernel<T> { if (platform::is_cpu_place(ctx.GetPlace())) { pad_value = cpu_pad_value; } else { - TensorCopySync(cpu_pad_value, ctx.GetPlace(), &pad_value); + paddle::framework::TensorCopySync(cpu_pad_value, ctx.GetPlace(), + &pad_value); } math::PaddingLoDTensorFunctor<DeviceContext, T>()( @@ -361,10 +363,12 @@ class WarpCTCKernel : public framework::OpKernel<T> { ctx.template device_context<DeviceContext>(), *label, &gpu_label, label->dims()[1] /*pad_seq_len*/, 0 /*lod_level*/, false /*norm_by_times*/, math::kBatchLengthWidth); - TensorCopySync(gpu_label, platform::CPUPlace(), &warpctc_label); + paddle::framework::TensorCopySync(gpu_label, platform::CPUPlace(), + &warpctc_label); } } else { - TensorCopySync(*label, platform::CPUPlace(), &warpctc_label); + paddle::framework::TensorCopySync(*label, platform::CPUPlace(), + &warpctc_label); } const int* warpctc_label_data = warpctc_label.data<int>(); @@ -381,7 +385,8 @@ class WarpCTCKernel : public framework::OpKernel<T> { sequence_width, num_sequences, blank, warpctc_loss_data); // Copy the loss back - TensorCopy(warpctc_loss, ctx.GetPlace(), ctx.device_context(), loss); + paddle::framework::TensorCopy(warpctc_loss, ctx.GetPlace(), + ctx.device_context(), loss); } };
diff --git a/paddle/fluid/operators/where_index_op_npu.cc b/paddle/fluid/operators/where_index_op_npu.cc index 9a11f300bcb096..226f1461ed4390 100644 --- a/paddle/fluid/operators/where_index_op_npu.cc +++ b/paddle/fluid/operators/where_index_op_npu.cc @@ -70,7 +70,8 @@ class NPUWhereIndexKernel : public framework::OpKernel<T> { sum_runner.Run(stream); Tensor local_true_num; - TensorCopySync(sumed_true_num, platform::CPUPlace(), &local_true_num); + paddle::framework::TensorCopySync(sumed_true_num, platform::CPUPlace(), + &local_true_num); auto true_num = *local_true_num.data<int64_t>(); out->Resize(framework::make_ddim({true_num, rank}));
diff --git a/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h b/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h index 7bff2c69381e69..a75759e2ae0796 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h +++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h @@ -26,11 +26,9 @@ #include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h" #include "paddle/fluid/platform/device_context.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -} // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten namespace paddle { namespace platform {
diff --git a/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h b/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h index d2389ba409e5eb..f4533c859fcd35 100644 --- a/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h @@ -26,11 +26,9 @@ #include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h" #include "paddle/fluid/platform/device_context.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -} // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten namespace paddle { namespace platform {
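The cudnn_desc.h and miopen_desc.h hunks swap a forward declaration of paddle::framework::Tensor for one of pten::DenseTensor. That is forced by a language rule rather than style: a class can be forward-declared, but a type alias cannot, so once Tensor is merely an alias the headers must name the class that actually exists. A sketch of the rule (Inspect is an invented function):

// fwd_decl_sketch.cc
namespace pten {
class DenseTensor;  // fine: the real class can be forward-declared
}
// There is no way to forward-declare "using Tensor = pten::DenseTensor;",
// so a header that used to write "namespace framework { class Tensor; }"
// has to declare pten::DenseTensor instead.
void Inspect(const pten::DenseTensor*) {}  // a pointer to an incomplete
                                           // type is all a header needs
int main() { Inspect(nullptr); }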
diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.cc b/paddle/fluid/platform/device/npu/npu_op_runner.cc index 78e5cb0ab106e4..b9c92e07612b08 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.cc +++ b/paddle/fluid/platform/device/npu/npu_op_runner.cc @@ -231,7 +231,7 @@ NpuOpRunner &NpuOpRunner::AddInput(std::vector<int32_t> &&dims) { auto *dev_ctx = static_cast<platform::CPUDeviceContext *>(pool.Get(platform::CPUPlace())); Tensor host_tensor; - TensorFromVector(dims, *dev_ctx, &host_tensor); + paddle::framework::TensorFromVector(dims, *dev_ctx, &host_tensor); host_tensors_.emplace_back(host_tensor); // create aclTensorDesc @@ -247,7 +247,7 @@ NpuOpRunner &NpuOpRunner::AddInput(std::vector<int64_t> &&dims) { auto *dev_ctx = static_cast<platform::CPUDeviceContext *>(pool.Get(platform::CPUPlace())); Tensor host_tensor; - TensorFromVector(dims, *dev_ctx, &host_tensor); + paddle::framework::TensorFromVector(dims, *dev_ctx, &host_tensor); host_tensors_.emplace_back(host_tensor); // create aclTensorDesc @@ -263,7 +263,7 @@ NpuOpRunner &NpuOpRunner::AddInput(std::vector<float> &&values) { auto *dev_ctx = static_cast<platform::CPUDeviceContext *>(pool.Get(platform::CPUPlace())); Tensor host_tensor; - TensorFromVector(values, *dev_ctx, &host_tensor); + paddle::framework::TensorFromVector(values, *dev_ctx, &host_tensor); host_tensors_.emplace_back(host_tensor); // create aclTensorDesc @@ -279,7 +279,7 @@ NpuOpRunner &NpuOpRunner::AddInput(std::vector<double> &&values) { auto *dev_ctx = static_cast<platform::CPUDeviceContext *>(pool.Get(platform::CPUPlace())); Tensor host_tensor; - TensorFromVector(values, *dev_ctx, &host_tensor); + paddle::framework::TensorFromVector(values, *dev_ctx, &host_tensor); host_tensors_.emplace_back(host_tensor); // create aclTensorDesc
diff --git a/paddle/fluid/platform/device_code_test.cc b/paddle/fluid/platform/device_code_test.cc index aadfffb59133bf..2307f843838aff 100644 --- a/paddle/fluid/platform/device_code_test.cc +++ b/paddle/fluid/platform/device_code_test.cc @@ -80,8 +80,8 @@ TEST(DeviceCode, cuda) { float* y_data = y.mutable_data<float>(dims, place); float* z_data = z.mutable_data<float>(dims, place); - TensorCopySync(cpu_x, place, &x); - TensorCopySync(cpu_y, place, &y); + paddle::framework::TensorCopySync(cpu_x, place, &x); + paddle::framework::TensorCopySync(cpu_y, place, &y); EXPECT_EQ(code.Compile(), true); @@ -93,7 +93,7 @@ TEST(DeviceCode, cuda) { auto* dev_ctx = paddle::platform::DeviceContextPool::Instance().Get(place); dev_ctx->Wait(); - TensorCopySync(z, paddle::platform::CPUPlace(), &cpu_z); + paddle::framework::TensorCopySync(z, paddle::platform::CPUPlace(), &cpu_z); for (size_t i = 0; i < n; i++) { EXPECT_EQ(cpu_z.data<float>()[i], static_cast<float>(i) * scale + 0.5); }
diff --git a/paddle/fluid/platform/lodtensor_printer.cc b/paddle/fluid/platform/lodtensor_printer.cc index 4a5dfbee15de28..ca5d156802851f 100644 --- a/paddle/fluid/platform/lodtensor_printer.cc +++ b/paddle/fluid/platform/lodtensor_printer.cc @@ -16,9 +16,12 @@ limitations under the License. */ #include "paddle/fluid/framework/scope.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { -class Tensor; class Variable; } // namespace framework } // namespace paddle @@ -47,29 +50,29 @@ void PrintVar(framework::Scope* scope, const std::string& var_name, *sstream << print_info; -#define PrintTensorCallback(cpp_type, proto_type) \ - do { \ - if (tensor->type() == proto_type) { \ - *sstream << "["; \ - const cpp_type* data = nullptr; \ - framework::LoDTensor cpu_tensor; \ - if (is_cpu_place(tensor->place())) { \ - data = tensor->data<cpp_type>(); \ - } else { \ - platform::CPUPlace cpu_place; \ - TensorCopy(*tensor, cpu_place, &cpu_tensor); \ - data = cpu_tensor.data<cpp_type>(); \ - } \ - auto element_num = tensor->numel(); \ - *sstream << element_num << "]:["; \ - if (element_num > 0) { \ - *sstream << data[0]; \ - for (int j = 1; j < element_num; ++j) { \ - *sstream << " " << data[j]; \ - } \ - } \ - *sstream << "]"; \ - } \ +#define PrintTensorCallback(cpp_type, proto_type) \ + do { \ + if (tensor->type() == proto_type) { \ + *sstream << "["; \ + const cpp_type* data = nullptr; \ + framework::LoDTensor cpu_tensor; \ + if (is_cpu_place(tensor->place())) { \ + data = tensor->data<cpp_type>(); \ + } else { \ + platform::CPUPlace cpu_place; \ + paddle::framework::TensorCopy(*tensor, cpu_place, &cpu_tensor); \ + data = cpu_tensor.data<cpp_type>(); \ + } \ + auto element_num = tensor->numel(); \ + *sstream << element_num << "]:["; \ + if (element_num > 0) { \ + *sstream << data[0]; \ + for (int j = 1; j < element_num; ++j) { \ + *sstream << " " << data[j]; \ + } \ + } \ + *sstream << "]"; \ + } \ } while (0) _ForEachDataType_(PrintTensorCallback);
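PrintTensorCallback above is an X-macro: _ForEachDataType_ expands the callback once per (cpp_type, proto_type) pair, turning a runtime dtype tag into a statically typed data<cpp_type>() access without a hand-written switch. A compilable miniature of the same dispatch, with the type list and names invented for the example:

// xmacro_sketch.cc
#include <cstdint>
#include <iostream>

enum class ProtoType { FP32, INT64 };

#define FOR_EACH_DATA_TYPE(callback)  \
  callback(float, ProtoType::FP32);   \
  callback(int64_t, ProtoType::INT64)

void PrintAs(ProtoType runtime_type, const void* data, int n) {
#define PRINT_CALLBACK(cpp_type, proto_type)                  \
  do {                                                        \
    if (runtime_type == proto_type) {                         \
      const cpp_type* p = static_cast<const cpp_type*>(data); \
      for (int i = 0; i < n; ++i) std::cout << p[i] << " ";   \
      std::cout << "\n";                                      \
    }                                                         \
  } while (0)
  FOR_EACH_DATA_TYPE(PRINT_CALLBACK);
#undef PRINT_CALLBACK
}

int main() {
  float f[3] = {1.5f, 2.5f, 3.5f};
  PrintAs(ProtoType::FP32, f, 3);  // prints: 1.5 2.5 3.5
}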
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 3650b44ed0a85b..cc3b066de47256 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1253,7 +1253,8 @@ void BindImperative(py::module *m_ptr) { ->GetMutable<framework::LoDTensor>(); auto *dev_ctx = platform::DeviceContextPool::Instance().Get( tracer->ExpectedPlace()); - TensorFromVector(list_select_idxs, *dev_ctx, idx_tensor); + paddle::framework::TensorFromVector(list_select_idxs, *dev_ctx, + idx_tensor); imperative::NameVarBaseMap ins = {{"X", {self}}, {"Index", {select_index}}};
diff --git a/paddle/pten/api/include/tensor.h b/paddle/pten/api/include/tensor.h index a6e2c4d1037696..c26c9ce8394581 100644 --- a/paddle/pten/api/include/tensor.h +++ b/paddle/pten/api/include/tensor.h @@ -36,6 +36,10 @@ using gpuStream_t = hipStream_t; #include "paddle/pten/common/layout.h" #include "paddle/pten/common/place.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace pten { class TensorBase; } // namespace pten @@ -47,7 +51,6 @@ class DDim; namespace experimental { -class Tensor; class CompatiblePTenTensorUtils; class AbstractAutogradMeta {
diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index f304268bedf45d..93b1957fe14428 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -260,7 +260,8 @@ std::unique_ptr<pten::TensorBase> MakePtenTensorBaseFromVar( const auto& tensor = variable.Get<framework::SelectedRows>(); if (!platform::is_same_place(tensor.value().place(), expected_place)) { framework::Tensor tmp_tensor; - TensorCopySync(tensor.value(), expected_place, &tmp_tensor); + paddle::framework::TensorCopySync( + tensor.value(), expected_place, &tmp_tensor); // TODO(chenweihang): adapt SelectedRows by xiaowei's design return MakePtenDenseTensor(tmp_tensor); } else { @@ -303,7 +304,7 @@ void MovesStorageBase(pten::DenseTensor* src, paddle::framework::Tensor* dst) { dst, platform::errors::InvalidArgument( "The destination Tensor is nullptr when move storage.")); - dst->Resize(src->dims()); + dst->ResizeAndAllocate(src->dims()); dst->set_type(pten::TransToProtoVarType(src->dtype())); auto storage = src->MoveMemoryHolder(); dst->ResetHolderWithType(storage, pten::TransToProtoVarType(src->dtype())); @@ -324,7 +325,7 @@ void SharesStorageBase(pten::DenseTensor* src, paddle::framework::Tensor* dst) { dst, platform::errors::InvalidArgument( "The destination Tensor is nullptr when move allocation.")); - dst->Resize(src->dims()); + dst->ResizeAndAllocate(src->dims()); dst->ResetHolderWithType(src->Holder(), pten::TransToProtoVarType(src->dtype())); dst->set_offset(src->meta().offset); @@ -412,7 +413,8 @@ void ReMakePtenDenseTensorFromVar(const framework::Variable& variable, "argument's definition in kernel.")); if (!platform::is_same_place(tensor.value().place(), expected_place)) { framework::Tensor tmp_tensor; - TensorCopySync(tensor.value(), expected_place, &tmp_tensor); + paddle::framework::TensorCopySync( + tensor.value(), expected_place, &tmp_tensor); // TODO(chenweihang): adapt SelectedRows by xiaowei's design ReMakePtenDenseTensorByArgDef(tmp_tensor, arg_def, dst); } else { @@ -457,7 +459,7 @@ void MakeVariableFromPtenTensor(pten::DenseTensor* src, auto* tensor = variable->GetMutable<framework::LoDTensor>(); auto dtype = pten::TransToProtoVarType(src->dtype()); - tensor->Resize(src->dims()); + tensor->ResizeAndAllocate(src->dims()); SetLoD(tensor->mutable_lod(), src->lod()); if (!tensor->IsInitialized() ||
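The pten changes that follow all revolve around splitting the old Resize in two: ResizeAndAllocate keeps the previous eager behaviour (update dims and, when storage is already attached, allocate immediately), while the new Resize updates metadata only and returns *this so calls can chain, matching fluid's Tensor::Resize. A toy model of the two flavours, using an invented class rather than the real pten::DenseTensor:

// resize_sketch.cc
#include <cstddef>
#include <utility>
#include <vector>

class Buf {
 public:
  // Metadata-only resize; chainable, allocation deferred to mutable_data().
  Buf& Resize(std::vector<int> dims) { dims_ = std::move(dims); return *this; }

  // Resize that also materialises storage, like ResizeAndAllocate.
  void ResizeAndAllocate(std::vector<int> dims) {
    Resize(std::move(dims));
    mutable_data();
  }

  float* mutable_data() {
    std::size_t n = 1;
    for (int d : dims_) n *= static_cast<std::size_t>(d);
    if (data_.size() != n) data_.resize(n);  // (re)allocate lazily
    return data_.data();
  }

 private:
  std::vector<int> dims_;
  std::vector<float> data_;
};

int main() {
  Buf b;
  b.Resize({2, 3}).Resize({6});  // chainable, nothing allocated yet
  b.ResizeAndAllocate({4, 4});   // metadata and storage in one step
}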
diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index fe088a95681468..06531fe8bfd3b8 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -22,6 +22,14 @@ limitations under the License. */ #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/convert_utils.h" +namespace paddle { +namespace framework { +extern void TensorCopy(const pten::DenseTensor& src, + const paddle::platform::Place& dst_place, + pten::DenseTensor* dst); +} +} + namespace pten { DenseTensor::DenseTensor(Allocator* a, const DenseTensorMeta& meta) @@ -198,7 +206,7 @@ void DenseTensor::set_meta(DenseTensorMeta&& meta) { storage_ won't be initialized until the first call to mutable_data(place) */ -void DenseTensor::Resize(const DDim& dims) { +void DenseTensor::ResizeAndAllocate(const DDim& dims) { meta_.dims = dims; if (storage_ != nullptr) { mutable_data(); @@ -519,4 +527,119 @@ size_t DenseTensor::NumElements(size_t level) const { return (meta_.lod)[level].size() - 1; } +DenseTensor& DenseTensor::Resize(const DDim& dims) { + meta_.dims = dims; + return *this; +} + +DenseTensor DenseTensor::Slice(int64_t begin_idx, int64_t end_idx) const { + check_memory_size(); + PADDLE_ENFORCE_GE(begin_idx, + 0, + paddle::platform::errors::OutOfRange( + "The start row index must be greater than or equal to 0. " + "But received the start index is %d.", + begin_idx)); + PADDLE_ENFORCE_LE(end_idx, + meta_.dims[0], + paddle::platform::errors::OutOfRange( + "The end row index is out of bound.")); + PADDLE_ENFORCE_LT( + begin_idx, + end_idx, + paddle::platform::errors::InvalidArgument( + "The start row index must be less than the end row index. " + "But received the start index = %d, the end index = %d.", + begin_idx, + end_idx)); + + if (meta_.dims[0] == 1) { + return *this; + } else { + size_t base = numel() / meta_.dims[0]; + DenseTensor dst; + dst.storage_ = pten::make_intrusive<paddle::experimental::SharedStorage>( + storage_->data_shared()); + dst.meta_.layout = meta_.layout; + dst.meta_.dtype = meta_.dtype; + DDim dst_dims = meta_.dims; + dst_dims[0] = end_idx - begin_idx; + dst.Resize(dst_dims); + dst.meta_.offset = meta_.offset + begin_idx * base * SizeOf(dtype()); + return dst; + } +}
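Slice returns a view, not a copy: the result shares the intrusive storage and differs only in dims[0] and an offset derived from begin_idx (a byte offset via SizeOf(dtype()) in the real code). The scheme in miniature, with element offsets and an invented View type:

// slice_view_sketch.cc
#include <cassert>
#include <cstddef>
#include <memory>
#include <vector>

struct View {
  std::shared_ptr<std::vector<float>> storage;  // shared, refcounted buffer
  int rows, cols;
  std::size_t offset;  // element offset into the buffer

  float* data() { return storage->data() + offset; }

  View Slice(int begin, int end) const {
    assert(0 <= begin && begin < end && end <= rows);
    View v = *this;                                         // same storage
    v.rows = end - begin;                                   // new dims[0]
    v.offset = offset + static_cast<std::size_t>(begin) * cols;
    return v;                                               // no data copied
  }
};

int main() {
  View v{std::make_shared<std::vector<float>>(12, 0.f), 4, 3, 0};
  View s = v.Slice(1, 3);
  s.data()[0] = 42.f;               // writes into the shared buffer
  assert((*v.storage)[3] == 42.f);  // visible through the original view
}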
+std::vector<DenseTensor> DenseTensor::Split(int64_t split_size, + int64_t axis) const { + check_memory_size(); + + PADDLE_ENFORCE_GE(meta_.dims.size(), + 0, + paddle::platform::errors::OutOfRange( + "split expects at least a 1-dimensional tensor")); + + PADDLE_ENFORCE_GE( + split_size, + 0, + paddle::platform::errors::OutOfRange( + "split expects split_size be non-negative, but got split_size is %d", + split_size)); + + int64_t numel_size = meta_.dims[axis]; + + int64_t num_splits = 1; + if (split_size != 0) { + num_splits = + std::max<int64_t>((numel_size + split_size - 1) / split_size, 1); + } + + std::vector<DenseTensor> splits(num_splits); + int64_t last_split_size = split_size - (split_size * num_splits - numel_size); + + for (int64_t i = 0; i < num_splits; ++i) { + int64_t length = i < num_splits - 1 ? split_size : last_split_size; + splits[i] = Slice(i * split_size, i * split_size + length); + } + return splits; +} + +std::vector<DenseTensor> DenseTensor::Chunk(int64_t chunks, + int64_t axis) const { + check_memory_size(); + PADDLE_ENFORCE_GE(meta_.dims.size(), + 0, + paddle::platform::errors::OutOfRange( + "split expects at least a 1-dimensional tensor")); + PADDLE_ENFORCE_GE( + chunks, + 0, + paddle::platform::errors::OutOfRange( + "chunks expects to be greater than 0, but got chunks is %d", chunks)); + + int64_t numel_size = meta_.dims[axis]; + int64_t split_size = (numel_size + chunks - 1) / chunks; + return Split(split_size, axis); +} + +DenseTensor& DenseTensor::ShareDataWith(const DenseTensor& src) { + src.check_memory_size(); + // Preserve LoD + auto lod = meta_.lod; + *this = src; + meta_.lod = lod; + return *this; +} + +DenseTensor& DenseTensor::ShareInplaceVersionCounterWith( + const DenseTensor& src) { + PADDLE_ENFORCE_NOT_NULL( + inplace_version_counter_, + paddle::platform::errors::PreconditionNotMet( + "Tensor does not hold inplace_version_counter_.")); + + inplace_version_counter_ = src.inplace_version_counter_; + return *this; +} + } // namespace pten
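The Split arithmetic rounds the piece count up and shrinks only the last piece: num_splits = ceil(numel / split_size) and last_split_size = split_size - (split_size * num_splits - numel). Chunk then just derives split_size = ceil(numel / chunks) and delegates to Split. A worked check of the formulas:

// split_math_sketch.cc
#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  int64_t numel = 10, split_size = 3;
  int64_t num_splits =
      std::max<int64_t>((numel + split_size - 1) / split_size, 1);
  int64_t last = split_size - (split_size * num_splits - numel);
  std::cout << num_splits << " pieces, last of size " << last << "\n";
  // prints: 4 pieces, last of size 1   (piece sizes 3, 3, 3, 1)

  int64_t chunks = 4;  // Chunk() derives a split size from a piece count
  std::cout << "split_size = " << (numel + chunks - 1) / chunks << "\n";  // 3
}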
diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h index db8d7a2a39c908..216689c9b64173 100644 --- a/paddle/pten/core/dense_tensor.h +++ b/paddle/pten/core/dense_tensor.h @@ -159,7 +159,9 @@ class DenseTensor : public TensorBase, /// \param dims The new dims of the dense tensor. /// \param lod The new lod of the dense tensor. // void Resize(const DDim& dims); - void Resize(const DDim& dims); + void ResizeAndAllocate(const DDim& dims); + + DenseTensor& Resize(const DDim& dims); /// \brief Change the lod information in the metadata. /// \param lod The new lod of the dense tensor. @@ -309,6 +311,18 @@ class DenseTensor : public TensorBase, return *inplace_version_counter_; } + /*! The internal of two tensors share the same memory block. */ + DenseTensor& ShareDataWith(const DenseTensor& src); + + /*! The internal of two tensors share the same inplace version counter. */ + DenseTensor& ShareInplaceVersionCounterWith(const DenseTensor& src); + + DenseTensor Slice(int64_t begin_idx, int64_t end_idx) const; + + std::vector<DenseTensor> Split(int64_t split_size, int64_t axis) const; + + std::vector<DenseTensor> Chunk(int64_t chunks, int64_t axis) const; + protected: std::shared_ptr<TensorInplaceVersion> inplace_version_counter_;
diff --git a/paddle/pten/kernels/cpu/copy_kernel.cc b/paddle/pten/kernels/cpu/copy_kernel.cc index 28623b539d8475..1889838e253c93 100644 --- a/paddle/pten/kernels/cpu/copy_kernel.cc +++ b/paddle/pten/kernels/cpu/copy_kernel.cc @@ -37,7 +37,7 @@ void Copy(const Context& dev_ctx, VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " << dst_place; - dst->Resize(src.dims()); + dst->ResizeAndAllocate(src.dims()); auto* dst_ptr = dst->mutable_data(); if (src_ptr == dst_ptr && src_place == dst_place) {
diff --git a/paddle/pten/kernels/cpu/reduce.h b/paddle/pten/kernels/cpu/reduce.h index 1e9c1e885f44d6..b38f17aa02a556 100644 --- a/paddle/pten/kernels/cpu/reduce.h +++ b/paddle/pten/kernels/cpu/reduce.h @@ -118,7 +118,7 @@ void GetShuffledInput(const DeviceContext& dev_ctx, std::vector<int> perm_axis(input.dims().size()); GetShuffledDim(input.dims(), &shuffled_dims, dims, &perm_axis); - shuffled_input->Resize(shuffled_dims); + shuffled_input->ResizeAndAllocate(shuffled_dims); shuffled_input->mutable_data<OutT>(); pten::math::TransposeNormal<DeviceContext, OutT> trans; @@ -141,12 +141,12 @@ void HandleLargeDim(const DeviceContext& dev_ctx, // transpose to 2D tensor whose shape is {unreduced, reduced}. const int64_t unreduced = output->numel(); const int64_t reduced = shuffled_input.numel() / unreduced; - shuffled_input.Resize({unreduced, reduced}); + shuffled_input.ResizeAndAllocate({unreduced, reduced}); DDim output_dim = output->dims(); - output->Resize({unreduced}); + output->ResizeAndAllocate({unreduced}); ReduceFunctor<DeviceContext, OutT, 2, 1, Functor>( dev_ctx, shuffled_input, output, {1}, keep_dim); - output->Resize(output_dim); + output->ResizeAndAllocate(output_dim); } ////////////// ReduceKernel
diff --git a/paddle/pten/kernels/empty_kernel.cc b/paddle/pten/kernels/empty_kernel.cc index eb67ed6655f479..c133d7fc791349 100644 --- a/paddle/pten/kernels/empty_kernel.cc +++ b/paddle/pten/kernels/empty_kernel.cc @@ -24,7 +24,7 @@ template <typename T, typename Context> void EmptyKernel(const Context& dev_ctx, const ScalarArray& shape, DenseTensor* out) { - out->Resize(paddle::framework::make_ddim(shape.GetData())); + out->ResizeAndAllocate(paddle::framework::make_ddim(shape.GetData())); } template <typename T, typename Context>
diff --git a/paddle/pten/kernels/flatten_grad_kernel.cc b/paddle/pten/kernels/flatten_grad_kernel.cc index 45f3c6558d9c87..e45ac516e16ed3 100644 --- a/paddle/pten/kernels/flatten_grad_kernel.cc +++ b/paddle/pten/kernels/flatten_grad_kernel.cc @@ -28,7 +28,7 @@ void FlattenGradKernel(const Context& dev_ctx, auto x_dims = paddle::framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); pten::Copy(dev_ctx, out_grad, false, x_grad); - x_grad->Resize(x_dims); + x_grad->ResizeAndAllocate(x_dims); } } // namespace pten
diff --git a/paddle/pten/kernels/flatten_kernel.cc b/paddle/pten/kernels/flatten_kernel.cc index 9201a8df9d166c..b0d05803ac351c 100644 --- a/paddle/pten/kernels/flatten_kernel.cc +++ b/paddle/pten/kernels/flatten_kernel.cc @@ -29,7 +29,7 @@ void FlattenKernel(const Context& dev_ctx, DenseTensor* out) { auto out_dims = out->dims(); pten::Copy(dev_ctx, x, false, out); - out->Resize(out_dims); + out->ResizeAndAllocate(out_dims); } // TODO(yuanrisheng): this kernel is for training and xshape is an Intermediate
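HandleLargeDim in cpu/reduce.h uses a classic trick for high-rank reductions: shuffle the reduced axes to the back, reinterpret the buffer as a 2-D {unreduced, reduced} matrix, reduce along dimension 1, then restore the output dims; that is why the temporaries above are resized twice. The idea in scalar form, as a standalone sketch:

// large_dim_reduce_sketch.cc
#include <iostream>
#include <vector>

int main() {
  // After the transpose, any reduction is a row-sum over a 2-D view.
  std::vector<float> data = {1, 2, 3, 4, 5, 6};  // viewed as 2 x 3
  int unreduced = 2, reduced = 3;
  std::vector<float> out(unreduced, 0.f);
  for (int i = 0; i < unreduced; ++i)
    for (int j = 0; j < reduced; ++j) out[i] += data[i * reduced + j];
  std::cout << out[0] << " " << out[1] << "\n";  // prints: 6 15
}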
diff --git a/paddle/pten/kernels/funcs/common_shape.h b/paddle/pten/kernels/funcs/common_shape.h index 9d16d18d6b6ec3..8693fd2b36c4e7 100644 --- a/paddle/pten/kernels/funcs/common_shape.h +++ b/paddle/pten/kernels/funcs/common_shape.h @@ -26,7 +26,7 @@ inline void SetXShape(const DenseTensor &x, DenseTensor *xshape) { for (int i = 0; i < in_dims.size(); ++i) { xshape_dims[i + 1] = in_dims[i]; } - xshape->Resize(paddle::framework::make_ddim(xshape_dims)); + xshape->ResizeAndAllocate(paddle::framework::make_ddim(xshape_dims)); xshape->ResetLoD(x.meta().lod); }
diff --git a/paddle/pten/kernels/gpu/cast_kernel.cu b/paddle/pten/kernels/gpu/cast_kernel.cu index 687519debcb704..aa61155221bf15 100644 --- a/paddle/pten/kernels/gpu/cast_kernel.cu +++ b/paddle/pten/kernels/gpu/cast_kernel.cu @@ -44,9 +44,9 @@ void CastCUDAKernelImpl(const GPUContext& dev_ctx, inputs.emplace_back(&x); outputs.emplace_back(out); out->mutable_data<OutT>(); - funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, InT, OutT>( + pten::funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, + InT, OutT>( dev_ctx, inputs, &outputs, CastFuctor<InT, OutT>()); }
diff --git a/paddle/pten/kernels/gpu/copy_kernel.cu b/paddle/pten/kernels/gpu/copy_kernel.cu index 7eeef85f0f3e61..10b2aa415d45bf 100644 --- a/paddle/pten/kernels/gpu/copy_kernel.cu +++ b/paddle/pten/kernels/gpu/copy_kernel.cu @@ -42,7 +42,7 @@ void Copy(const Context& dev_ctx, VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " << dst_place; - dst->Resize(src.dims()); + dst->ResizeAndAllocate(src.dims()); auto* dst_ptr = dst->mutable_data(); if (src_ptr == dst_ptr && src_place == dst_place) {
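Both Copy kernels guard the degenerate case first: when source and destination already alias the same allocation in the same place, the kernel returns early instead of issuing a copy over identical storage. The guard in miniature:

// self_copy_guard_sketch.cc
#include <cstddef>
#include <cstring>
#include <iostream>

void CopyBuf(const float* src, float* dst, std::size_t n) {
  if (src == dst) {  // same pointer (and, in Paddle, same place): skip
    std::cout << "skip self copy\n";
    return;
  }
  std::memcpy(dst, src, n * sizeof(float));
}

int main() {
  float a[4] = {1, 2, 3, 4}, b[4];
  CopyBuf(a, b, 4);  // real copy
  CopyBuf(a, a, 4);  // guarded, no memcpy issued
}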
diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index 291550a3d6e702..c3ff91e7b15cd6 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -574,7 +574,7 @@ void LaunchElementwiseCudaKernel(const KPDevice &ctx, ? *std::max_element(dims_size.begin(), dims_size.end()) - *std::min_element(dims_size.begin(), dims_size.end()) : axis; - LaunchBroadcastElementwiseCudaKernel<ET, InT, OutT>( + pten::LaunchBroadcastElementwiseCudaKernel<ET, InT, OutT>( ctx, ins, outs, axis, func); } }
diff --git a/paddle/pten/kernels/gpu/reduce.h b/paddle/pten/kernels/gpu/reduce.h index c4e3a0b354d68e..e7d1d2d5f44fc1 100644 --- a/paddle/pten/kernels/gpu/reduce.h +++ b/paddle/pten/kernels/gpu/reduce.h @@ -326,7 +326,7 @@ struct ReduceConfig { const paddle::platform::Place& place, pten::DenseTensor* tmp) { if (should_reduce_again) { - tmp->Resize(paddle::framework::make_ddim( + tmp->ResizeAndAllocate(paddle::framework::make_ddim( {static_cast<int64_t>(left_num * grid.z * grid.y * sizeof(Ty))})); output_data = tmp->mutable_data<Ty>(); } else {
diff --git a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu index b6897efcdd25bd..b49902ff5e300a 100644 --- a/paddle/pten/kernels/gpu/scale_kernel.cu +++ b/paddle/pten/kernels/gpu/scale_kernel.cu @@ -55,7 +55,9 @@ void ScaleKernel(const Context& dev_ctx, inputs.emplace_back(&x); outputs.emplace_back(out); out->mutable_data<T>(); - funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, T, T>( + pten::funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, + T, + T>( dev_ctx, inputs, &outputs,
diff --git a/paddle/pten/kernels/impl/full_kernel_impl.h b/paddle/pten/kernels/impl/full_kernel_impl.h index 79ca63c9b0669b..134a815799de60 100644 --- a/paddle/pten/kernels/impl/full_kernel_impl.h +++ b/paddle/pten/kernels/impl/full_kernel_impl.h @@ -36,7 +36,7 @@ void FullKernel(const Context& dev_ctx, const ScalarArray& shape, const Scalar& val, DenseTensor* out) { - out->Resize(paddle::framework::make_ddim(shape.GetData())); + out->ResizeAndAllocate(paddle::framework::make_ddim(shape.GetData())); FullValue<T>(dev_ctx, out, val.to<T>()); }
diff --git a/paddle/pten/kernels/impl/matmul_kernel_impl.h b/paddle/pten/kernels/impl/matmul_kernel_impl.h index f5f69f327a69f2..5ea9729655ecc8 100644 --- a/paddle/pten/kernels/impl/matmul_kernel_impl.h +++ b/paddle/pten/kernels/impl/matmul_kernel_impl.h @@ -164,7 +164,7 @@ void MatMulFunction(const Context& dev_ctx, std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin()); out_dims.back() = y_dims.back(); } - Out->Resize(paddle::framework::make_ddim(out_dims)); + Out->ResizeAndAllocate(paddle::framework::make_ddim(out_dims)); Out->mutable_data<T>(); if (trans_y) { const int M = Y.numel() / N; @@ -242,7 +242,7 @@ void MatMulFunction(const Context& dev_ctx, } else { std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); } - Out->Resize(paddle::framework::make_ddim(out_dims)); + Out->ResizeAndAllocate(paddle::framework::make_ddim(out_dims)); Out->mutable_data<T>(); if (trans_x) { @@ -330,7 +330,7 @@ void MatMulFunction(const Context& dev_ctx, out_broadcast_dims[ndim - 2] = M; out_broadcast_dims[ndim - 1] = N; - Out->Resize(paddle::framework::make_ddim(out_broadcast_dims)); + Out->ResizeAndAllocate(paddle::framework::make_ddim(out_broadcast_dims)); Out->mutable_data<T>(); const int batch_dim = ndim - 2;
diff --git a/paddle/pten/kernels/reshape_kernel.cc b/paddle/pten/kernels/reshape_kernel.cc index d7e2e2707ee1b9..7f58bbbd3732d0 100644 --- a/paddle/pten/kernels/reshape_kernel.cc +++ b/paddle/pten/kernels/reshape_kernel.cc @@ -28,11 +28,11 @@ void ReshapeKernel(const Context& dev_ctx, DenseTensor* out) { auto out_meta = InferMetaFromVecValue(x.meta(), shape.GetData()); if (x.data() == out->data() && x.numel() == out->numel()) { - out->Resize(out_meta.dims); + out->ResizeAndAllocate(out_meta.dims); return; } pten::Copy(dev_ctx, 
x, false, out); - out->Resize(out_meta.dims); + out->ResizeAndAllocate(out_meta.dims); out->ResetLoD(x.lod()); } diff --git a/paddle/pten/kernels/xpu/copy_kernel.cc b/paddle/pten/kernels/xpu/copy_kernel.cc index f464a4926d3b59..3287fa1f7a8572 100644 --- a/paddle/pten/kernels/xpu/copy_kernel.cc +++ b/paddle/pten/kernels/xpu/copy_kernel.cc @@ -43,7 +43,7 @@ void Copy(const Context& dev_ctx, VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " << dst_place; - dst->Resize(src.dims()); + dst->ResizeAndAllocate(src.dims()); CHECK(dst->layout() == src.layout()); auto size = src.numel() * paddle::framework::SizeOfType(TransToProtoVarType(src.dtype())); diff --git a/paddle/pten/tests/core/test_dense_tensor.cc b/paddle/pten/tests/core/test_dense_tensor.cc index 8564969796c7ec..56722d35f325ec 100644 --- a/paddle/pten/tests/core/test_dense_tensor.cc +++ b/paddle/pten/tests/core/test_dense_tensor.cc @@ -110,7 +110,7 @@ TEST(dense_tensor, resize) { DenseTensor tensor_0(alloc, meta); CHECK_EQ(tensor_0.capacity(), 2u); - tensor_0.Resize({1, 2, 3}); + tensor_0.ResizeAndAllocate({1, 2, 3}); CHECK_EQ(tensor_0.capacity(), 6u); tensor_0.mutable_data(); CHECK_EQ(tensor_0.capacity(), 6u);