diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py
index 594e518c402e87..ff7f09b7f02e68 100644
--- a/python/paddle/amp/grad_scaler.py
+++ b/python/paddle/amp/grad_scaler.py
@@ -18,10 +18,10 @@
 
 import numpy as np
 
+import paddle
 from paddle import _C_ops, _legacy_C_ops
 from paddle.base import core
 from paddle.base.data_feeder import check_type
-from paddle.base.dygraph import to_variable
 from paddle.base.framework import _dygraph_tracer, dygraph_only
 from paddle.framework import in_dynamic_mode
 
@@ -130,20 +130,20 @@ def __init__(
         self._decr_count = 0
         self._use_dynamic_loss_scaling = use_dynamic_loss_scaling
 
-        self._found_inf = to_variable(np.array([0]).astype(np.bool_))
-        self._temp_found_inf_value_false = to_variable(
+        self._found_inf = paddle.to_tensor(np.array([0]).astype(np.bool_))
+        self._temp_found_inf_value_false = paddle.to_tensor(
             np.array([0]).astype(np.bool_)
         )
-        self._temp_found_inf_fp16 = to_variable(
+        self._temp_found_inf_fp16 = paddle.to_tensor(
             np.array([0]).astype(np.bool_)
         )
-        self._temp_found_inf_bf16 = to_variable(
+        self._temp_found_inf_bf16 = paddle.to_tensor(
             np.array([0]).astype(np.bool_)
         )
-        self._temp_found_inf_fp32 = to_variable(
+        self._temp_found_inf_fp32 = paddle.to_tensor(
             np.array([0]).astype(np.bool_)
         )
-        self._scale = to_variable(
+        self._scale = paddle.to_tensor(
             np.array([self._init_loss_scaling]).astype(np.float32)
         )
         self._cache_founf_inf = None
@@ -438,7 +438,7 @@ def set_init_loss_scaling(self, new_init_loss_scaling):
             new_init_loss_scaling(int): The new_init_loss_scaling used to update initial loss scaling factor.s
         """
         self._init_loss_scaling = new_init_loss_scaling
-        self._scale = to_variable(
+        self._scale = paddle.to_tensor(
             np.array([self._init_loss_scaling]).astype(np.float32)
         )
 
@@ -563,7 +563,7 @@ def load_state_dict(self, state_dict):
             )
 
         self._init_loss_scaling = state_dict["scale"][0]
-        self._scale = to_variable(
+        self._scale = paddle.to_tensor(
             np.array([self._init_loss_scaling]).astype(np.float32)
         )
         self._incr_ratio = state_dict["incr_ratio"]
diff --git a/python/paddle/base/dygraph/tensor_patch_methods.py b/python/paddle/base/dygraph/tensor_patch_methods.py
index aed4833188d6c1..7c7a3d60ebf45c 100644
--- a/python/paddle/base/dygraph/tensor_patch_methods.py
+++ b/python/paddle/base/dygraph/tensor_patch_methods.py
@@ -94,12 +94,12 @@ def _to_static_var(self, to_parameter=False, **kwargs):
             .. code-block:: python
 
                 >>> import paddle.base as base
-                >>> from paddle.base.dygraph.base import to_variable
+                >>> import paddle
                 >>> import numpy as np
 
                 >>> data = np.ones([3, 1024], dtype='float32')
                 >>> with base.dygraph.guard():
-                ...     tensor = to_variable(data)
+                ...     tensor = paddle.to_tensor(data)
                 ...     static_var = tensor._to_static_var()
 
         """
@@ -175,14 +175,14 @@ def set_value(self, value):
             .. code-block:: python
 
                 >>> import paddle.base as base
-                >>> from paddle.base.dygraph.base import to_variable
+                >>> import paddle
                 >>> from paddle.nn import Linear
                 >>> import numpy as np
 
                 >>> data = np.ones([3, 1024], dtype='float32')
                 >>> with base.dygraph.guard():
                 ...     linear = Linear(1024, 4)
-                ...     t = to_variable(data)
+                ...     t = paddle.to_tensor(data)
                 ...     linear(t) # call with default weight
                 ...     custom_weight = np.random.randn(1024, 4).astype("float32")
                 ...     linear.weight.set_value(custom_weight) # change existing weight
diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py
index 8f5ecbe2c11dbd..57d74c5130f48e 100644
--- a/python/paddle/base/framework.py
+++ b/python/paddle/base/framework.py
@@ -1505,9 +1505,10 @@ class Variable(metaclass=VariableMetaClass):
 
             >>> import paddle.base as base
             >>> import numpy as np
+            >>> import paddle
 
             >>> with base.dygraph.guard():
-            ...     new_variable = base.dygraph.to_variable(np.arange(10))
+            ...     new_variable = paddle.to_tensor(np.arange(10))
 
     """
 
@@ -1693,14 +1694,13 @@ def numpy(self):
             .. code-block:: python
 
                 >>> import paddle.base as base
-                >>> from paddle.base.dygraph.base import to_variable
                 >>> from paddle.nn import Linear
                 >>> import numpy as np
 
                 >>> data = np.random.uniform(-1, 1, [30, 10, 32]).astype('float32')
                 >>> with base.dygraph.guard():
                 ...     linear = Linear(32, 64)
-                ...     data = to_variable(data)
+                ...     data = paddle.to_tensor(data)
                 ...     x = linear(data)
                 ...     print(x.numpy())
 
@@ -1779,7 +1779,7 @@ def gradient(self):
                 >>> with base.dygraph.guard():
                 ...     inputs2 = []
                 ...     for _ in range(10):
-                ...         tmp = base.dygraph.base.to_variable(x)
+                ...         tmp = paddle.to_tensor(x)
                 ...         tmp.stop_gradient=False
                 ...         inputs2.append(tmp)
                 ...     ret2 = paddle.add_n(inputs2)
@@ -1797,7 +1797,7 @@ def gradient(self):
                 ...         sparse=True)
                 ...     x_data = np.arange(12).reshape(4, 3).astype('int64')
                 ...     x_data = x_data.reshape((-1, 3, 1))
-                ...     x = base.dygraph.base.to_variable(x_data)
+                ...     x = paddle.to_tensor(x_data)
                 ...     out = embedding(x)
                 ...     out.backward()
                 ...     print(embedding.weight.gradient())
@@ -1827,7 +1827,7 @@ def clear_gradient(self):
                 >>> x = np.ones([2, 2], np.float32)
                 >>> inputs2 = []
                 >>> for _ in range(10):
-                >>>     tmp = base.dygraph.base.to_variable(x)
+                >>>     tmp = paddle.to_tensor(x)
                 >>>     tmp.stop_gradient=False
                 >>>     inputs2.append(tmp)
                 >>> ret2 = paddle.add_n(inputs2)
@@ -2052,9 +2052,9 @@ def stop_gradient(self):
                 ...     value2 = np.arange(10).reshape(2, 5).astype("float32")
                 ...     linear = paddle.nn.Linear(13, 5)
                 ...     linear2 = paddle.nn.Linear(3, 3)
-                ...     a = base.dygraph.to_variable(value0)
-                ...     b = base.dygraph.to_variable(value1)
-                ...     c = base.dygraph.to_variable(value2)
+                ...     a = paddle.to_tensor(value0)
+                ...     b = paddle.to_tensor(value1)
+                ...     c = paddle.to_tensor(value2)
                 ...     out1 = linear(a)
                 ...     out2 = linear2(b)
                 ...     out1.stop_gradient = True
diff --git a/python/paddle/base/layers/math_op_patch.py b/python/paddle/base/layers/math_op_patch.py
index de95503099f913..e4b9ed5198a9e1 100644
--- a/python/paddle/base/layers/math_op_patch.py
+++ b/python/paddle/base/layers/math_op_patch.py
@@ -326,11 +326,12 @@ def astype(self, dtype):
             .. code-block:: python
 
                 >>> import paddle.base as base
+                >>> import paddle
                 >>> import numpy as np
 
                 >>> x = np.ones([2, 2], np.float32)
                 >>> with base.dygraph.guard():
-                ...     original_variable = base.dygraph.to_variable(x)
+                ...     original_variable = paddle.to_tensor(x)
                 ...     print("original var's dtype is: {}, numpy dtype is {}".format(original_variable.dtype, original_variable.numpy().dtype))
                 ...     new_variable = original_variable.astype('int64')
                 ...     print("new var's dtype is: {}, numpy dtype is {}".format(new_variable.dtype, new_variable.numpy().dtype))
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
index 6da2dd1a61c29d..2a691c2c4d4fc6 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
@@ -21,7 +21,6 @@
 import paddle
 from paddle import _C_ops, _legacy_C_ops
 from paddle.base import core
-from paddle.base.dygraph import to_variable
 from paddle.common_ops_import import dygraph_only
 from paddle.nn import clip
 
@@ -278,9 +277,9 @@ def unscale_method(self, optimizer):
                 else:
                     param_grads_fp32.append(tgt_grad)
 
-        temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool_))
-        temp_found_inf_bfp16 = to_variable(np.array([0]).astype(np.bool_))
-        temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool_))
+        temp_found_inf_fp16 = paddle.to_tensor(np.array([0]).astype(np.bool_))
+        temp_found_inf_bfp16 = paddle.to_tensor(np.array([0]).astype(np.bool_))
+        temp_found_inf_fp32 = paddle.to_tensor(np.array([0]).astype(np.bool_))
 
         device = paddle.get_device().split(":")[0]
         device = "cpu" if optimizer.offload else device
diff --git a/python/paddle/distributed/fleet/utils/mix_precision_utils.py b/python/paddle/distributed/fleet/utils/mix_precision_utils.py
index 9552ac7e3eef0f..b67f738ff3c5aa 100644
--- a/python/paddle/distributed/fleet/utils/mix_precision_utils.py
+++ b/python/paddle/distributed/fleet/utils/mix_precision_utils.py
@@ -23,7 +23,6 @@
 from paddle.base import framework
 from paddle.base.dygraph import (
     base as imperative_base,
-    to_variable,
 )
 from paddle.distributed import fleet
 from paddle.distributed.fleet.utils.hybrid_parallel_util import (
@@ -215,7 +214,7 @@ def unscale_method(self, optimizer):
             assert param.main_grad.dtype == core.VarDesc.VarType.FP32
             param_grads.append(param.main_grad)
 
-    temp_found_inf = to_variable(np.array([0]).astype(np.bool_))
+    temp_found_inf = paddle.to_tensor(np.array([0]).astype(np.bool_))
    if len(param_grads):
        _legacy_C_ops.check_finite_and_unscale(
            param_grads,