From d0915f8f379ea4d76dcbed2aa9cdc529151494d1 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Thu, 11 Mar 2021 01:58:33 +0000 Subject: [PATCH 01/22] add custom init grad for backward function --- paddle/fluid/imperative/basic_engine.cc | 14 +++-- paddle/fluid/imperative/basic_engine.h | 3 +- paddle/fluid/pybind/imperative.cc | 4 +- .../fluid/dygraph/varbase_patch_methods.py | 24 ++++++++- .../tests/unittests/test_custom_grad_input.py | 53 +++++++++++++++++++ 5 files changed, 89 insertions(+), 9 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_custom_grad_input.py diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 29ba54986801f1..f2613df33cd2cf 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -36,7 +36,7 @@ DECLARE_bool(sort_sum_gradient); namespace paddle { namespace imperative { -void BasicEngine::Init(VarBase* var, bool retain_graph) { +void BasicEngine::Init(VarBase* var, bool retain_graph, VarBase* grad_tensor) { retain_graph_ = retain_graph; init_node_ = var->GradVarBase()->GradNode(); PADDLE_ENFORCE_EQ(var->GradVarBase()->GraphIsFreed(), false, @@ -75,9 +75,15 @@ void BasicEngine::Init(VarBase* var, bool retain_graph) { << " as stop_gradient false"; var->GradVarBase()->InnerSetOverridedStopGradient(false); auto* dev_ctx = platform::DeviceContextPool::Instance().Get(fwd_var.place()); - grad_var->Resize(fwd_var.dims()); - grad_var->mutable_data(fwd_var.place(), fwd_var.type()); - operators::math::set_constant(*dev_ctx, grad_var, 1.0); + if (grad_tensor == nullptr) { + grad_var->Resize(fwd_var.dims()); + grad_var->mutable_data(fwd_var.place(), fwd_var.type()); + operators::math::set_constant(*dev_ctx, grad_var, 1.0); + } else { + paddle::framework::TensorCopy( + grad_tensor->Var().Get(), fwd_var.place(), + *dev_ctx, grad_var); + } } void BasicEngine::CheckBackwardInputs(const OpBase& op) { diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h index a2ad8b5f8aa61e..6a188b073b2d85 100644 --- a/paddle/fluid/imperative/basic_engine.h +++ b/paddle/fluid/imperative/basic_engine.h @@ -30,7 +30,8 @@ class OpBase; class BasicEngine : public Engine { public: - void Init(VarBase* var, bool retain_graph = false); + void Init(VarBase* var, bool retain_graph = false, + VarBase* grad_tensor = nullptr); void Execute() override; diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 58ef177863093d..48f1954a5bc52f 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -920,11 +920,11 @@ void BindImperative(py::module *m_ptr) { )DOC") .def("_run_backward", [](imperative::VarBase &self, const imperative::Tracer &tracer, - bool retain_graph) { + bool retain_graph, imperative::VarBase &grad_tensor) { // TODO(jiabin): when we impl more backward execution we can // select them auto *engine = tracer.GetEngine(); - engine->Init(&self, retain_graph); + engine->Init(&self, retain_graph, &grad_tensor); VLOG(3) << "Start backward"; engine->Execute(); VLOG(3) << "Finish backward"; diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index ac0944c5718908..a065a3c2e47846 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -133,7 +133,7 @@ def set_value(self, value): framework._current_expected_place()) @framework.dygraph_only - def 
backward(self, retain_graph=False): + def backward(self, retain_graph=False, grad_tensor=None): """ Run backward of current Graph which starts from current Tensor. @@ -147,6 +147,10 @@ def backward(self, retain_graph=False): :code:`retain_graph` to True, then the grads will be retained. Thus, seting it to False is much more memory-efficient. Defaults to False. + grad_tensor(Tensor, optional): initial gradient values of `outputs` . If `grad_tensor` is None, + the initial gradient values of `outputs` would be Tensor filled with 1; + if `grad_tensor` is not None, it must have the same length as `outputs`. + Default None. Returns: NoneType: None @@ -168,6 +172,17 @@ def backward(self, retain_graph=False): print("{}".format(x.grad)) # 0. + grad_tensor=paddle.to_tensor(2.) + for i in range(5): + y = paddle.pow(x, 4.0) + y.backward(grad_tensor=grad_tensor) + print("{}: {}".format(i, x.grad)) + # 0: [1000.] + # 1: [2000.] + # 2: [3000.] + # 3: [4000.] + # 4: [5000.] + """ if framework.in_dygraph_mode(): if paddle.is_compiled_with_xpu(): @@ -176,7 +191,12 @@ def backward(self, retain_graph=False): scaled_loss._run_backward(framework._dygraph_tracer(), retain_graph) else: - self._run_backward(framework._dygraph_tracer(), retain_graph) + if grad_tensor is not None: + assert grad_tensor.shape == self.shape, "Variable Shape not match, Variable of grad_tensor [ {} ] with shape {} mismatch Variable [ {} ] with shape {}".format( + grad_tensor.name, grad_tensor.shape, self.name, + self.shape) + self._run_backward(framework._dygraph_tracer(), retain_graph, + grad_tensor) else: raise ValueError( "Variable.backward() is only available in DyGraph mode") diff --git a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py new file mode 100644 index 00000000000000..c545565d86c944 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py @@ -0,0 +1,53 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +import paddle.fluid.dygraph as dg +from op_test import OpTest + + +class TestBackward(unittest.TestCase): + def setUp(self): + self._dtypes = ["float32", "float64"] + self._places = [paddle.CPUPlace()] + if paddle.is_compiled_with_cuda(): + self._places.append(paddle.CUDAPlace(0)) + + def test_all_positive(self): + for dtype in self._dtypes: + x = np.random.random([2, 100]).astype(dtype) + y = np.random.random([100, 2]).astype(dtype) + z = np.matmul(x, y) + grad = np.random.random(z.shape).astype(dtype) + for place in self._places: + with dg.guard(place): + x_tensor = paddle.to_tensor(x, stop_gradient=False) + y_tensor = paddle.to_tensor(y) + z_tensor = paddle.matmul(x_tensor, y_tensor) + + grad_tensor = paddle.to_tensor(grad) + z_tensor.backward(grad_tensor=grad_tensor) + + x_grad = np.matmul(grad, y.T) + + self.assertTrue(np.allclose(x_grad, x_tensor.grad)) + + +if __name__ == '__main__': + unittest.main() From 0bccce663638df4131fc11e0d334916c0efb548b Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Thu, 11 Mar 2021 06:56:57 +0000 Subject: [PATCH 02/22] add custom init grad for backward function --- paddle/fluid/imperative/basic_engine.cc | 11 ++--------- python/paddle/fluid/dygraph/varbase_patch_methods.py | 9 +++++++-- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index f2613df33cd2cf..bc03e178588791 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -75,15 +75,8 @@ void BasicEngine::Init(VarBase* var, bool retain_graph, VarBase* grad_tensor) { << " as stop_gradient false"; var->GradVarBase()->InnerSetOverridedStopGradient(false); auto* dev_ctx = platform::DeviceContextPool::Instance().Get(fwd_var.place()); - if (grad_tensor == nullptr) { - grad_var->Resize(fwd_var.dims()); - grad_var->mutable_data(fwd_var.place(), fwd_var.type()); - operators::math::set_constant(*dev_ctx, grad_var, 1.0); - } else { - paddle::framework::TensorCopy( - grad_tensor->Var().Get(), fwd_var.place(), - *dev_ctx, grad_var); - } + paddle::framework::TensorCopy(grad_tensor->Var().Get(), + fwd_var.place(), *dev_ctx, grad_var); } void BasicEngine::CheckBackwardInputs(const OpBase& op) { diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index a065a3c2e47846..bf89fea31b6b5c 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -191,8 +191,13 @@ def backward(self, retain_graph=False, grad_tensor=None): scaled_loss._run_backward(framework._dygraph_tracer(), retain_graph) else: - if grad_tensor is not None: - assert grad_tensor.shape == self.shape, "Variable Shape not match, Variable of grad_tensor [ {} ] with shape {} mismatch Variable [ {} ] with shape {}".format( + if grad_tensor is None: + grad_tensor = paddle.ones_like(self) + else: + assert isinstance( + grad_tensor, core.VarBase + ), "The type of grad_tensot must be paddle.VarBase" + assert grad_tensor.shape == self.shape, "Variable shape not match, Variable of grad_tensor [ {} ] with shape {} mismatch Variable [ {} ] with shape {}".format( grad_tensor.name, grad_tensor.shape, self.name, self.shape) self._run_backward(framework._dygraph_tracer(), retain_graph, From 5dac8e9568693d985d3d4bbee807a098db00fe1e Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Fri, 12 Mar 2021 
07:29:09 +0000 Subject: [PATCH 03/22] handle when the grad_tensor is none --- paddle/fluid/imperative/basic_engine.cc | 11 +++++++++-- paddle/fluid/pybind/imperative.cc | 6 ++++-- python/paddle/fluid/dygraph/varbase_patch_methods.py | 4 +--- .../fluid/tests/unittests/test_imperative_basic.py | 1 + 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index bc03e178588791..f2613df33cd2cf 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -75,8 +75,15 @@ void BasicEngine::Init(VarBase* var, bool retain_graph, VarBase* grad_tensor) { << " as stop_gradient false"; var->GradVarBase()->InnerSetOverridedStopGradient(false); auto* dev_ctx = platform::DeviceContextPool::Instance().Get(fwd_var.place()); - paddle::framework::TensorCopy(grad_tensor->Var().Get(), - fwd_var.place(), *dev_ctx, grad_var); + if (grad_tensor == nullptr) { + grad_var->Resize(fwd_var.dims()); + grad_var->mutable_data(fwd_var.place(), fwd_var.type()); + operators::math::set_constant(*dev_ctx, grad_var, 1.0); + } else { + paddle::framework::TensorCopy( + grad_tensor->Var().Get(), fwd_var.place(), + *dev_ctx, grad_var); + } } void BasicEngine::CheckBackwardInputs(const OpBase& op) { diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 48f1954a5bc52f..7cd8dcd4004f0b 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -920,15 +920,17 @@ void BindImperative(py::module *m_ptr) { )DOC") .def("_run_backward", [](imperative::VarBase &self, const imperative::Tracer &tracer, - bool retain_graph, imperative::VarBase &grad_tensor) { + bool retain_graph, imperative::VarBase *grad_tensor) { // TODO(jiabin): when we impl more backward execution we can // select them auto *engine = tracer.GetEngine(); - engine->Init(&self, retain_graph, &grad_tensor); + engine->Init(&self, retain_graph, grad_tensor); VLOG(3) << "Start backward"; engine->Execute(); VLOG(3) << "Finish backward"; }, + py::arg("tracer"), py::arg("retain_graph"), + py::arg("grad_tensor") = static_cast(nullptr), py::call_guard()) .def("_grad_name", &imperative::VarBase::GradVarName) .def("_grad_value", diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index bf89fea31b6b5c..814a4bccad2212 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -191,9 +191,7 @@ def backward(self, retain_graph=False, grad_tensor=None): scaled_loss._run_backward(framework._dygraph_tracer(), retain_graph) else: - if grad_tensor is None: - grad_tensor = paddle.ones_like(self) - else: + if grad_tensor is not None: assert isinstance( grad_tensor, core.VarBase ), "The type of grad_tensot must be paddle.VarBase" diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index cb48013902a532..b1f231b1051c22 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -807,4 +807,5 @@ def test_without_guard(self): if __name__ == '__main__': paddle.enable_static() + #paddle.set_device("cpu") unittest.main() From ef4c7b90af5a2cd8ad0982e1a5bd7fa3a67b484b Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Fri, 12 Mar 2021 07:41:13 +0000 Subject: [PATCH 04/22] handle when the grad_tensor is none --- 
python/paddle/fluid/tests/unittests/test_imperative_basic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index b1f231b1051c22..cb48013902a532 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -807,5 +807,4 @@ def test_without_guard(self): if __name__ == '__main__': paddle.enable_static() - #paddle.set_device("cpu") unittest.main() From 33b041687c401069b61561dc2e83ea670f13dec1 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Mon, 15 Mar 2021 07:37:36 +0000 Subject: [PATCH 05/22] fix the args type error on windows platform --- paddle/fluid/pybind/imperative.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 7cd8dcd4004f0b..c9952c01bcec4b 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -920,17 +920,15 @@ void BindImperative(py::module *m_ptr) { )DOC") .def("_run_backward", [](imperative::VarBase &self, const imperative::Tracer &tracer, - bool retain_graph, imperative::VarBase *grad_tensor) { + bool retain_graph, std::shared_ptr &grad_tensor) { // TODO(jiabin): when we impl more backward execution we can // select them auto *engine = tracer.GetEngine(); - engine->Init(&self, retain_graph, grad_tensor); + engine->Init(&self, retain_graph, grad_tensor.get()); VLOG(3) << "Start backward"; engine->Execute(); VLOG(3) << "Finish backward"; }, - py::arg("tracer"), py::arg("retain_graph"), - py::arg("grad_tensor") = static_cast(nullptr), py::call_guard()) .def("_grad_name", &imperative::VarBase::GradVarName) .def("_grad_value", From 837e26be9a4c6d5f9f379b806df26d93fe0b4a41 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Mon, 15 Mar 2021 08:08:55 +0000 Subject: [PATCH 06/22] modify the args order and doc --- python/paddle/fluid/dygraph/varbase_patch_methods.py | 12 ++++++------ .../fluid/tests/unittests/test_custom_grad_input.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 814a4bccad2212..bbb001a05e2e4a 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -133,7 +133,7 @@ def set_value(self, value): framework._current_expected_place()) @framework.dygraph_only - def backward(self, retain_graph=False, grad_tensor=None): + def backward(self, grad_tensor=None, retain_graph=False): """ Run backward of current Graph which starts from current Tensor. @@ -142,15 +142,15 @@ def backward(self, retain_graph=False, grad_tensor=None): You can clear gradient by ``Tensor.clear_grad()`` . Args: + grad_tensor(Tensor, optional): initial gradient values of the current Tensor. If `grad_tensor` is None, + the initial gradient values of the current Tensor would be Tensor filled with 1.0; + if `grad_tensor` is not None, it must have the same length as the current Tensor. + Teh default value is None. + retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter :code:`retain_graph` to True, then the grads will be retained. Thus, seting it to False is much more memory-efficient. Defaults to False. 
- - grad_tensor(Tensor, optional): initial gradient values of `outputs` . If `grad_tensor` is None, - the initial gradient values of `outputs` would be Tensor filled with 1; - if `grad_tensor` is not None, it must have the same length as `outputs`. - Default None. Returns: NoneType: None diff --git a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py index c545565d86c944..73e19197326cc2 100644 --- a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py +++ b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py @@ -42,7 +42,7 @@ def test_all_positive(self): z_tensor = paddle.matmul(x_tensor, y_tensor) grad_tensor = paddle.to_tensor(grad) - z_tensor.backward(grad_tensor=grad_tensor) + z_tensor.backward(grad_tensor) x_grad = np.matmul(grad, y.T) From 19019708c44ec121738d7f4e616782c97c40337a Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Mon, 15 Mar 2021 08:44:38 +0000 Subject: [PATCH 07/22] format code --- paddle/fluid/pybind/imperative.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index c9952c01bcec4b..3f2bb5fbca9ba3 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -920,7 +920,8 @@ void BindImperative(py::module *m_ptr) { )DOC") .def("_run_backward", [](imperative::VarBase &self, const imperative::Tracer &tracer, - bool retain_graph, std::shared_ptr &grad_tensor) { + bool retain_graph, + std::shared_ptr &grad_tensor) { // TODO(jiabin): when we impl more backward execution we can // select them auto *engine = tracer.GetEngine(); From 55e0cfb7350f512e8d23d8b08b53827dabbf9c3f Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Mon, 15 Mar 2021 10:26:52 +0000 Subject: [PATCH 08/22] add grad_tensor to xpu --- .../fluid/dygraph/varbase_patch_methods.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index bbb001a05e2e4a..f9b1d1836683ab 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -157,6 +157,7 @@ def backward(self, grad_tensor=None, retain_graph=False): Examples: .. code-block:: python + import paddle x = paddle.to_tensor(5., stop_gradient=False) for i in range(5): y = paddle.pow(x, 4.0) @@ -175,7 +176,7 @@ def backward(self, grad_tensor=None, retain_graph=False): grad_tensor=paddle.to_tensor(2.) for i in range(5): y = paddle.pow(x, 4.0) - y.backward(grad_tensor=grad_tensor) + y.backward(grad_tensor) print("{}: {}".format(i, x.grad)) # 0: [1000.] # 1: [2000.] @@ -185,19 +186,19 @@ def backward(self, grad_tensor=None, retain_graph=False): """ if framework.in_dygraph_mode(): + if grad_tensor is not None: + assert isinstance( + grad_tensor, core. + VarBase), "The type of grad_tensot must be paddle.VarBase" + assert grad_tensor.shape == self.shape, "Variable shape not match, Variable of grad_tensor [ {} ] with shape {} mismatch Variable [ {} ] with shape {}".format( + grad_tensor.name, grad_tensor.shape, self.name, self.shape) + if paddle.is_compiled_with_xpu(): # TODO(liuyuhui): Currently only for xpu. Will be removed in the future. 
scaled_loss = scale_loss(self) scaled_loss._run_backward(framework._dygraph_tracer(), - retain_graph) + retain_graph, grad_tensor) else: - if grad_tensor is not None: - assert isinstance( - grad_tensor, core.VarBase - ), "The type of grad_tensot must be paddle.VarBase" - assert grad_tensor.shape == self.shape, "Variable shape not match, Variable of grad_tensor [ {} ] with shape {} mismatch Variable [ {} ] with shape {}".format( - grad_tensor.name, grad_tensor.shape, self.name, - self.shape) self._run_backward(framework._dygraph_tracer(), retain_graph, grad_tensor) else: From 8271dc0a4ddc33f800d7bbaf9752f11f3fd3db15 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Tue, 16 Mar 2021 06:17:08 +0000 Subject: [PATCH 09/22] modify the grad_tensor type check --- python/paddle/fluid/dygraph/varbase_patch_methods.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index f9b1d1836683ab..07fabc9cb0a0bd 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -188,9 +188,10 @@ def backward(self, grad_tensor=None, retain_graph=False): if framework.in_dygraph_mode(): if grad_tensor is not None: assert isinstance( - grad_tensor, core. - VarBase), "The type of grad_tensot must be paddle.VarBase" - assert grad_tensor.shape == self.shape, "Variable shape not match, Variable of grad_tensor [ {} ] with shape {} mismatch Variable [ {} ] with shape {}".format( + grad_tensor, paddle. + Tensor), "The type of grad_tensot must be paddle.Tensor" + assert grad_tensor.shape == self.shape, \ + "Tensor shape not match, Tensor of grad_tensor [ {} ] with shape {} mismatch Tensor [ {} ] with shape {}".format( grad_tensor.name, grad_tensor.shape, self.name, self.shape) if paddle.is_compiled_with_xpu(): From 5af3bd00f8f4b2e22e13388eaf5b0b4f590c21f5 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Thu, 18 Mar 2021 02:38:10 +0000 Subject: [PATCH 10/22] add paddle.backward api to support multi tensors gradient compute --- paddle/fluid/imperative/basic_engine.cc | 119 ++++++++++++++---------- paddle/fluid/imperative/basic_engine.h | 9 +- paddle/fluid/pybind/imperative.cc | 20 +++- python/paddle/fluid/dygraph/base.py | 59 +++++++++++- 4 files changed, 152 insertions(+), 55 deletions(-) diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index f2613df33cd2cf..977419f5f84617 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -36,53 +36,72 @@ DECLARE_bool(sort_sum_gradient); namespace paddle { namespace imperative { -void BasicEngine::Init(VarBase* var, bool retain_graph, VarBase* grad_tensor) { +void BasicEngine::Init( + const std::vector>& tensors, + const std::vector>& grad_tensors, + bool retain_graph, bool create_graph, + const std::vector>& inputs) { retain_graph_ = retain_graph; - init_node_ = var->GradVarBase()->GradNode(); - PADDLE_ENFORCE_EQ(var->GradVarBase()->GraphIsFreed(), false, - platform::errors::Unavailable( - "%s trying to backward through the same graph a second " - "time, but this graph have already been freed. 
Please " - "specify Tensor.backward(retain_graph=True) when " - "calling backward at the first time.", - var->Name())); - - if (!retain_graph) { - VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name() - << " because of retain_graph=False when calling backward"; - var->GradVarBase()->SetGraphIsFreed(true); - var->GradVarBase()->ClearGradNode(); - } - if (init_node_ == nullptr || var->OverridedStopGradient()) { - VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " - "stop_gradient=True: " - << var->Name(); - return; - } + PADDLE_ENFORCE_EQ( + tensors.size(), grad_tensors.size(), + platform::errors::Unavailable( + "the size of tensors must equal the size of grad_tensors, but" + "the size of tensors is %s, and the size of grad_tensors is %s.", + tensors.size(), grad_tensors.size())); + + for (size_t i = 0; i < tensors.size(); ++i) { + auto var = tensors[i]; + auto grad_tensor = grad_tensors[i]; + + auto init_node_ = var->GradVarBase()->GradNode(); + PADDLE_ENFORCE_EQ( + var->GradVarBase()->GraphIsFreed(), false, + platform::errors::Unavailable( + "%s trying to backward through the same graph a second " + "time, but this graph have already been freed. Please " + "specify Tensor.backward(retain_graph=True) when " + "calling backward at the first time.", + var->Name())); + + if (!retain_graph) { + VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name() + << " because of retain_graph=False when calling backward"; + var->GradVarBase()->SetGraphIsFreed(true); + var->GradVarBase()->ClearGradNode(); + } - VLOG(3) << "Init first node of backward"; + if (init_node_ == nullptr || var->OverridedStopGradient()) { + VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " + "stop_gradient=True: " + << var->Name(); + continue; + } - PADDLE_ENFORCE_EQ( - var->HasGradVar(), true, - platform::errors::NotFound("Grad variable not exist for variable %s", - var->Name())); - - auto& fwd_var = var->Var().Get(); - auto* grad_var = - var->GradVarBase()->MutableVar()->GetMutable(); - VLOG(6) << "init loss grad:" << var->GradVarBase()->Name() - << " as stop_gradient false"; - var->GradVarBase()->InnerSetOverridedStopGradient(false); - auto* dev_ctx = platform::DeviceContextPool::Instance().Get(fwd_var.place()); - if (grad_tensor == nullptr) { - grad_var->Resize(fwd_var.dims()); - grad_var->mutable_data(fwd_var.place(), fwd_var.type()); - operators::math::set_constant(*dev_ctx, grad_var, 1.0); - } else { - paddle::framework::TensorCopy( - grad_tensor->Var().Get(), fwd_var.place(), - *dev_ctx, grad_var); + VLOG(3) << "Init node of backward"; + + PADDLE_ENFORCE_EQ( + var->HasGradVar(), true, + platform::errors::NotFound("Grad variable not exist for variable %s", + var->Name())); + + auto& fwd_var = var->Var().Get(); + auto* grad_var = + var->GradVarBase()->MutableVar()->GetMutable(); + VLOG(6) << "init loss grad:" << var->GradVarBase()->Name() + << " as stop_gradient false"; + var->GradVarBase()->InnerSetOverridedStopGradient(false); + auto* dev_ctx = + platform::DeviceContextPool::Instance().Get(fwd_var.place()); + if (grad_tensor == nullptr) { + grad_var->Resize(fwd_var.dims()); + grad_var->mutable_data(fwd_var.place(), fwd_var.type()); + operators::math::set_constant(*dev_ctx, grad_var, 1.0); + } else { + paddle::framework::TensorCopy( + grad_tensor->Var().Get(), fwd_var.place(), + *dev_ctx, grad_var); + } } } @@ -241,8 +260,10 @@ void BasicEngine::PrepareDeps() { std::queue q; std::unordered_set visited; - q.push(init_node_.get()); - 
visited.insert(init_node_.get()); + for (size_t i = 0; i < init_nodes_.size(); ++i) { + q.push(init_nodes_[i].get()); + visited.insert(init_nodes_[i].get()); + } while (!q.empty()) { auto* cur_node = q.front(); @@ -269,14 +290,16 @@ void BasicEngine::PrepareDeps() { } void BasicEngine::Execute() { - if (init_node_ == nullptr) { + if (init_nodes_.empty()) { return; } PrepareDeps(); // Start execute Computation graph std::queue> q; - q.push(std::move(init_node_)); + for (size_t i = 0; i < init_nodes_.size(); ++i) { + q.push(std::move(init_nodes_[i])); + } size_t op_num = 0; @@ -476,7 +499,7 @@ void BasicEngine::Execute() { } void BasicEngine::Clear() { - init_node_.reset(); + init_nodes_.clear(); node_deps_.clear(); accumulators_.clear(); accumulators_with_grad_node_.clear(); diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h index 6a188b073b2d85..992066661c9605 100644 --- a/paddle/fluid/imperative/basic_engine.h +++ b/paddle/fluid/imperative/basic_engine.h @@ -30,8 +30,10 @@ class OpBase; class BasicEngine : public Engine { public: - void Init(VarBase* var, bool retain_graph = false, - VarBase* grad_tensor = nullptr); + void Init(const std::vector>& tensors, + const std::vector>& grad_tensors, + bool retain_graph, bool create_graph, + const std::vector>& inputs); void Execute() override; @@ -47,7 +49,7 @@ class BasicEngine : public Engine { void Clear(); private: - std::shared_ptr init_node_; + std::vector> init_nodes_; std::unordered_map node_deps_; // The input and output of Inplace op are the same. If only `var` is used // as the key, then the input and output of inplace op must be gradient @@ -75,6 +77,7 @@ class BasicEngine : public Engine { std::vector leaf_accumulators_; bool retain_graph_; + bool create_graph_; }; } // namespace imperative diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 3f2bb5fbca9ba3..402c2e83e8fadd 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -924,10 +924,10 @@ void BindImperative(py::module *m_ptr) { std::shared_ptr &grad_tensor) { // TODO(jiabin): when we impl more backward execution we can // select them - auto *engine = tracer.GetEngine(); - engine->Init(&self, retain_graph, grad_tensor.get()); + // auto *engine = tracer.GetEngine(); + // engine->Init(&self, retain_graph, grad_tensor.get()); VLOG(3) << "Start backward"; - engine->Execute(); + // engine->Execute(); VLOG(3) << "Finish backward"; }, py::call_guard()) @@ -1413,6 +1413,20 @@ void BindImperative(py::module *m_ptr) { }, py::call_guard()); + m.def( + "dygraph_run_backward", + [](const std::vector> &tensors, + const std::vector> &grad_tensors, + bool retain_graph, bool create_graph, + const std::vector> &inputs, + const imperative::Tracer &tracer) { + auto *engine = tracer.GetEngine(); + engine->Init(tensors, grad_tensors, retain_graph, create_graph, inputs); + VLOG(3) << "Start backward"; + engine->Execute(); + VLOG(3) << "Finish backward"; + }); + #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) py::class_ 0, "{} connot be empyt".format(name) + for each_var in in_out_list: + assert isinstance( + each_var, paddle. 
+ Tensor), "Elements of {} must be paddle.Tensor".format(name) + return in_out_list + else: + assert isinstance( + in_out_list, + paddle.Tensor), "{} must be Tensor or list of Tensor".format( + name) + return [in_out_list] + + tensors = check_tensors(tensors, "tensors") + + if grad_tensors is not None: + if not isinstance(grad_tensors, (list, tuple)): + grad_tensors = [grad_tensors] + + for each_tensor in grad_tensors: + if each_tensor is not None: + assert isinstance( + each_tensor, paddle.Tensor + ), "grad_tensors must be None, Tensor or list containing None or Tensor" + else: + grad_tensors = [] + + if len(grad_tensors) > 0: + assert len(tensors) == len( + grad_tensors), "The length of grad_tensors must be equal to tensors" + + assert isinstance(create_graph, bool), "create_graph must be True or False" + + if retain_graph is None: + retain_graph = create_graph + + assert isinstance(retain_graph, + bool), "retain_graph must be None, True or False" + + if inputs is not None: + assert len(inputs) > 0, "inputs cannot be empty list" + + core.dygraph_run_backward(tensors, grad_tensors, retain_graph, create_graph, + inputs) + + @framework.dygraph_only def to_variable(value, name=None, zero_copy=None, dtype=None): r""" From 1467feb19a8f746e29e302241d2f7ba2cc6b590c Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Thu, 18 Mar 2021 08:37:32 +0000 Subject: [PATCH 11/22] add paddle.backward api to support multi tensors gradient compute --- paddle/fluid/imperative/basic_engine.cc | 2 ++ python/paddle/fluid/dygraph/base.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 977419f5f84617..a5f8bf26578e4a 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -102,6 +102,8 @@ void BasicEngine::Init( grad_tensor->Var().Get(), fwd_var.place(), *dev_ctx, grad_var); } + + init_nodes_.push_back(init_node_); } } diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 8f7ed202670ea0..8c580c03c5f264 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -646,7 +646,7 @@ def check_tensors(in_out_list, name): assert len(inputs) > 0, "inputs cannot be empty list" core.dygraph_run_backward(tensors, grad_tensors, retain_graph, create_graph, - inputs) + inputs, framework._dygraph_tracer()) @framework.dygraph_only From eb267fa3ec5051ab0fd5ac7b5781648990ec0789 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Fri, 19 Mar 2021 02:16:06 +0000 Subject: [PATCH 12/22] add paddle.atuograd module and backward api --- python/paddle/__init__.py | 1 + python/paddle/autograd/__init__.py | 22 ++++++++ python/paddle/autograd/backward_mode.py | 74 +++++++++++++++++++++++++ python/paddle/fluid/dygraph/base.py | 58 +------------------ 4 files changed, 98 insertions(+), 57 deletions(-) create mode 100644 python/paddle/autograd/__init__.py create mode 100644 python/paddle/autograd/backward_mode.py diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 8dabe19f57c58f..02725751cb6694 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -44,6 +44,7 @@ import paddle.device import paddle.regularizer import paddle.incubate +import paddle.autograd # TODO: define alias in tensor and framework directory diff --git a/python/paddle/autograd/__init__.py b/python/paddle/autograd/__init__.py new file mode 100644 index 00000000000000..8b3f3086a4a728 --- /dev/null +++ 
b/python/paddle/autograd/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..fluid.dygraph.base import grad #DEFINE_ALIAS + +from . import backward_mode +from .backward_mode import backward + +__all__ = ['grad'] + +__all__ += backward_mode.__all__ diff --git a/python/paddle/autograd/backward_mode.py b/python/paddle/autograd/backward_mode.py new file mode 100644 index 00000000000000..a045c7b2840ffc --- /dev/null +++ b/python/paddle/autograd/backward_mode.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid import core +from paddle.fluid import framework +import paddle +__all__ = ['backward'] + + +@framework.dygraph_only +def backward(tensors, + grad_tensors, + retain_graph=None, + create_graph=False, + inputs=None): + def check_tensors(in_out_list, name): + assert in_out_list is not None, "{} should not be None".format(name) + + if isinstance(in_out_list, (list, tuple)): + assert len(in_out_list) > 0, "{} connot be empyt".format(name) + for each_var in in_out_list: + assert isinstance( + each_var, paddle. 
+ Tensor), "Elements of {} must be paddle.Tensor".format(name) + return in_out_list + else: + assert isinstance( + in_out_list, + paddle.Tensor), "{} must be Tensor or list of Tensor".format( + name) + return [in_out_list] + + tensors = check_tensors(tensors, "tensors") + + if grad_tensors is not None: + if not isinstance(grad_tensors, (list, tuple)): + grad_tensors = [grad_tensors] + + for each_tensor in grad_tensors: + if each_tensor is not None: + assert isinstance( + each_tensor, paddle.Tensor + ), "grad_tensors must be None, Tensor or list containing None or Tensor" + else: + grad_tensors = [] + + if len(grad_tensors) > 0: + assert len(tensors) == len( + grad_tensors), "The length of grad_tensors must be equal to tensors" + + assert isinstance(create_graph, bool), "create_graph must be True or False" + + if retain_graph is None: + retain_graph = create_graph + + assert isinstance(retain_graph, + bool), "retain_graph must be None, True or False" + + if inputs is not None: + assert len(inputs) > 0, "inputs cannot be empty list" + + core.dygraph_run_backward(tensors, grad_tensors, retain_graph, create_graph, + inputs, framework._dygraph_tracer()) diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 8c580c03c5f264..be5d9ac58311b5 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -30,7 +30,7 @@ __all__ = [ 'no_grad', 'no_grad_', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph', - 'enabled', 'to_variable', 'backward' + 'enabled', 'to_variable' ] @@ -593,62 +593,6 @@ def check_in_out(in_out_list, name): retain_graph, allow_unused, only_inputs) -@framework.dygraph_only -def backward(tensors, - grad_tensors, - retain_graph=None, - create_graph=False, - inputs=None): - def check_tensors(in_out_list, name): - assert in_out_list is not None, "{} should not be None".format(name) - - if isinstance(in_out_list, (list, tuple)): - assert len(in_out_list) > 0, "{} connot be empyt".format(name) - for each_var in in_out_list: - assert isinstance( - each_var, paddle. 
- Tensor), "Elements of {} must be paddle.Tensor".format(name) - return in_out_list - else: - assert isinstance( - in_out_list, - paddle.Tensor), "{} must be Tensor or list of Tensor".format( - name) - return [in_out_list] - - tensors = check_tensors(tensors, "tensors") - - if grad_tensors is not None: - if not isinstance(grad_tensors, (list, tuple)): - grad_tensors = [grad_tensors] - - for each_tensor in grad_tensors: - if each_tensor is not None: - assert isinstance( - each_tensor, paddle.Tensor - ), "grad_tensors must be None, Tensor or list containing None or Tensor" - else: - grad_tensors = [] - - if len(grad_tensors) > 0: - assert len(tensors) == len( - grad_tensors), "The length of grad_tensors must be equal to tensors" - - assert isinstance(create_graph, bool), "create_graph must be True or False" - - if retain_graph is None: - retain_graph = create_graph - - assert isinstance(retain_graph, - bool), "retain_graph must be None, True or False" - - if inputs is not None: - assert len(inputs) > 0, "inputs cannot be empty list" - - core.dygraph_run_backward(tensors, grad_tensors, retain_graph, create_graph, - inputs, framework._dygraph_tracer()) - - @framework.dygraph_only def to_variable(value, name=None, zero_copy=None, dtype=None): r""" From 2bb8f3cea4eb9d2fbaa04221441274bbe95f6116 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Tue, 23 Mar 2021 07:52:33 +0000 Subject: [PATCH 13/22] change tensor.backward func args --- paddle/fluid/pybind/imperative.cc | 15 ++++++---- python/paddle/autograd/backward_mode.py | 2 ++ .../tests/unittests/test_custom_grad_input.py | 30 +++++++++++++++++-- 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 402c2e83e8fadd..db000a94c2567e 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -919,15 +919,17 @@ void BindImperative(py::module *m_ptr) { print(x.grad) # None )DOC") .def("_run_backward", - [](imperative::VarBase &self, const imperative::Tracer &tracer, + [](std::vector> &self, const imperative::Tracer &tracer, bool retain_graph, - std::shared_ptr &grad_tensor) { + std::vector> &grad_tensor) { // TODO(jiabin): when we impl more backward execution we can // select them - // auto *engine = tracer.GetEngine(); - // engine->Init(&self, retain_graph, grad_tensor.get()); + std::vector> inputs; + + auto *engine = tracer.GetEngine(); + engine->Init(self, grad_tensor, retain_graph, false, inputs); VLOG(3) << "Start backward"; - // engine->Execute(); + engine->Execute(); VLOG(3) << "Finish backward"; }, py::call_guard()) @@ -1425,7 +1427,8 @@ void BindImperative(py::module *m_ptr) { VLOG(3) << "Start backward"; engine->Execute(); VLOG(3) << "Finish backward"; - }); + }, + py::call_guard()); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) diff --git a/python/paddle/autograd/backward_mode.py b/python/paddle/autograd/backward_mode.py index a045c7b2840ffc..556763defc0d90 100644 --- a/python/paddle/autograd/backward_mode.py +++ b/python/paddle/autograd/backward_mode.py @@ -69,6 +69,8 @@ def check_tensors(in_out_list, name): if inputs is not None: assert len(inputs) > 0, "inputs cannot be empty list" + else: + inputs = [] core.dygraph_run_backward(tensors, grad_tensors, retain_graph, create_graph, inputs, framework._dygraph_tracer()) diff --git a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py index 
73e19197326cc2..d8efca1f25a325 100644 --- a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py +++ b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py @@ -22,14 +22,14 @@ from op_test import OpTest -class TestBackward(unittest.TestCase): +class TestTensorBackward(unittest.TestCase): def setUp(self): self._dtypes = ["float32", "float64"] self._places = [paddle.CPUPlace()] if paddle.is_compiled_with_cuda(): self._places.append(paddle.CUDAPlace(0)) - def test_all_positive(self): + def test_tensor_backward(self): for dtype in self._dtypes: x = np.random.random([2, 100]).astype(dtype) y = np.random.random([100, 2]).astype(dtype) @@ -48,6 +48,32 @@ def test_all_positive(self): self.assertTrue(np.allclose(x_grad, x_tensor.grad)) +class TestBackwardAPI(unittest.TestCase): + def setUp(self): + self._dtypes = ["float32", "float64"] + self._places = [paddle.CPUPlace()] + if paddle.is_compiled_with_cuda(): + self._places.append(paddle.CUDAPlace(0)) + + def test_backward_api(self): + for dtype in self._dtypes: + x = np.random.random([2, 2]).astype(dtype) + y = np.random.random([2, 2]).astype(dtype) + z = np.matmul(x, y) + grad = np.random.random(z.shape).astype(dtype) + for place in self._places: + with dg.guard(place): + x_tensor = paddle.to_tensor(x, stop_gradient=False) + y_tensor = paddle.to_tensor(y) + z_tensor = paddle.matmul(x_tensor, y_tensor) + + grad_tensor = paddle.to_tensor(grad) + paddle.autograd.backward([z_tensor, z_tensor], [grad_tensor, grad_tensor], True) + + x_grad = np.matmul(grad, y.T) + + self.assertTrue(np.allclose(x_grad*2, x_tensor.grad)) + if __name__ == '__main__': unittest.main() From 41b375fb81f6ccd1164602256e018d483ef7fc7c Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Tue, 23 Mar 2021 09:28:05 +0000 Subject: [PATCH 14/22] modify tensor backward api --- paddle/fluid/pybind/imperative.cc | 14 +++++++++----- .../tests/unittests/test_custom_grad_input.py | 6 ++++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index db000a94c2567e..52008abce2193e 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -720,6 +720,7 @@ void BindImperative(py::module *m_ptr) { Bump the version whenever the Tensor is modified through an inplace operation. 
)DOC") .def("numpy", + [](imperative::VarBase &self) -> py::array { const auto &tensor = self.MutableVar()->Get(); @@ -919,15 +920,18 @@ void BindImperative(py::module *m_ptr) { print(x.grad) # None )DOC") .def("_run_backward", - [](std::vector> &self, const imperative::Tracer &tracer, - bool retain_graph, - std::vector> &grad_tensor) { + [](std::shared_ptr &self, + const imperative::Tracer &tracer, bool retain_graph, + std::shared_ptr &grad_tensor) { // TODO(jiabin): when we impl more backward execution we can // select them + std::vector> tensors{self}; + std::vector> grad_tensors{ + grad_tensor}; std::vector> inputs; - + auto *engine = tracer.GetEngine(); - engine->Init(self, grad_tensor, retain_graph, false, inputs); + engine->Init(tensors, grad_tensors, retain_graph, false, inputs); VLOG(3) << "Start backward"; engine->Execute(); VLOG(3) << "Finish backward"; diff --git a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py index d8efca1f25a325..b6b496b8be8ed6 100644 --- a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py +++ b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py @@ -48,6 +48,7 @@ def test_tensor_backward(self): self.assertTrue(np.allclose(x_grad, x_tensor.grad)) + class TestBackwardAPI(unittest.TestCase): def setUp(self): self._dtypes = ["float32", "float64"] @@ -68,11 +69,12 @@ def test_backward_api(self): z_tensor = paddle.matmul(x_tensor, y_tensor) grad_tensor = paddle.to_tensor(grad) - paddle.autograd.backward([z_tensor, z_tensor], [grad_tensor, grad_tensor], True) + paddle.autograd.backward([z_tensor, z_tensor], + [grad_tensor, grad_tensor], True) x_grad = np.matmul(grad, y.T) - self.assertTrue(np.allclose(x_grad*2, x_tensor.grad)) + self.assertTrue(np.allclose(x_grad * 2, x_tensor.grad)) if __name__ == '__main__': From 6974e5c838c6536fcee5e5a4c4731045d1ff1bf2 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Tue, 23 Mar 2021 09:54:42 +0000 Subject: [PATCH 15/22] remove create_graph intputs args --- paddle/fluid/imperative/basic_engine.cc | 3 +-- paddle/fluid/imperative/basic_engine.h | 3 +-- paddle/fluid/pybind/imperative.cc | 9 +++------ python/paddle/autograd/backward_mode.py | 23 ++++------------------- 4 files changed, 9 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index a5f8bf26578e4a..5902376560f09b 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -39,8 +39,7 @@ namespace imperative { void BasicEngine::Init( const std::vector>& tensors, const std::vector>& grad_tensors, - bool retain_graph, bool create_graph, - const std::vector>& inputs) { + bool retain_graph) { retain_graph_ = retain_graph; PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h index 992066661c9605..d4c0ae84191f97 100644 --- a/paddle/fluid/imperative/basic_engine.h +++ b/paddle/fluid/imperative/basic_engine.h @@ -32,8 +32,7 @@ class BasicEngine : public Engine { public: void Init(const std::vector>& tensors, const std::vector>& grad_tensors, - bool retain_graph, bool create_graph, - const std::vector>& inputs); + bool retain_graph); void Execute() override; diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 52008abce2193e..c16999d1fc52dd 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -928,10 +928,9 @@ void 
BindImperative(py::module *m_ptr) { std::vector> tensors{self}; std::vector> grad_tensors{ grad_tensor}; - std::vector> inputs; auto *engine = tracer.GetEngine(); - engine->Init(tensors, grad_tensors, retain_graph, false, inputs); + engine->Init(tensors, grad_tensors, retain_graph); VLOG(3) << "Start backward"; engine->Execute(); VLOG(3) << "Finish backward"; @@ -1423,11 +1422,9 @@ void BindImperative(py::module *m_ptr) { "dygraph_run_backward", [](const std::vector> &tensors, const std::vector> &grad_tensors, - bool retain_graph, bool create_graph, - const std::vector> &inputs, - const imperative::Tracer &tracer) { + bool retain_graph, const imperative::Tracer &tracer) { auto *engine = tracer.GetEngine(); - engine->Init(tensors, grad_tensors, retain_graph, create_graph, inputs); + engine->Init(tensors, grad_tensors, retain_graph); VLOG(3) << "Start backward"; engine->Execute(); VLOG(3) << "Finish backward"; diff --git a/python/paddle/autograd/backward_mode.py b/python/paddle/autograd/backward_mode.py index 556763defc0d90..f274cab3d75279 100644 --- a/python/paddle/autograd/backward_mode.py +++ b/python/paddle/autograd/backward_mode.py @@ -19,11 +19,7 @@ @framework.dygraph_only -def backward(tensors, - grad_tensors, - retain_graph=None, - create_graph=False, - inputs=None): +def backward(tensors, grad_tensors, retain_graph=False): def check_tensors(in_out_list, name): assert in_out_list is not None, "{} should not be None".format(name) @@ -59,18 +55,7 @@ def check_tensors(in_out_list, name): assert len(tensors) == len( grad_tensors), "The length of grad_tensors must be equal to tensors" - assert isinstance(create_graph, bool), "create_graph must be True or False" + assert isinstance(retain_graph, bool), "retain_graph must be True or False" - if retain_graph is None: - retain_graph = create_graph - - assert isinstance(retain_graph, - bool), "retain_graph must be None, True or False" - - if inputs is not None: - assert len(inputs) > 0, "inputs cannot be empty list" - else: - inputs = [] - - core.dygraph_run_backward(tensors, grad_tensors, retain_graph, create_graph, - inputs, framework._dygraph_tracer()) + core.dygraph_run_backward(tensors, grad_tensors, retain_graph, + framework._dygraph_tracer()) From 1e3e9759a7d61c2af0ae972ea311a3c71ea1185a Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Wed, 24 Mar 2021 03:27:35 +0000 Subject: [PATCH 16/22] add doc and examplex code for backward api --- python/paddle/autograd/backward_mode.py | 45 +++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/python/paddle/autograd/backward_mode.py b/python/paddle/autograd/backward_mode.py index f274cab3d75279..2d44a65f7afae7 100644 --- a/python/paddle/autograd/backward_mode.py +++ b/python/paddle/autograd/backward_mode.py @@ -19,7 +19,48 @@ @framework.dygraph_only -def backward(tensors, grad_tensors, retain_graph=False): +def backward(tensors, grad_tensors=None, retain_graph=False): + """ + Compute the backward gradients of given tensors. + + Args: + tensors(list of Tensors): the tensors which the gradient to be computed. + + grad_tensors(list of Tensors of None, optional): the init gradients of the `tensors`` .If not None, it must have the same length with ``tensors`` , + and if any of the elements is None, then the init gradient is the default value which is filled with 1.0. + If None, all the gradients of the ``tensors`` is the default value which is filled with 1.0. + + retain_graph(bool, optional): If False, the graph used to compute grads will be freed. 
If you would + like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter + :code:`retain_graph` to True, then the grads will be retained. Thus, seting it to False is much more memory-efficient. + Defaults to False. + + Returns: + NoneType: Non + + + Examples: + .. code-block:: python + + import paddle + x = paddle.to_tensor([[1, 2], [3, 4]], dtype='float32', stop_gradient=False) + y = paddle.to_tensor([[3, 2], [3, 4]], dtype='float32') + + grad_tensor = paddle.to_tensor([[1,2], [1, 1]], dtype='float32') + + z = paddle.matmul(x, y) + #Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + # [[9. , 10.], + # [21., 22.]]) + + paddle.autograd.backward([z, z], [grad_tensor, grad_tensor], True) + + print(x.grad) + #[[14. 22.] + # [10. 14.]]] + + """ + def check_tensors(in_out_list, name): assert in_out_list is not None, "{} should not be None".format(name) @@ -49,7 +90,7 @@ def check_tensors(in_out_list, name): each_tensor, paddle.Tensor ), "grad_tensors must be None, Tensor or list containing None or Tensor" else: - grad_tensors = [] + grad_tensors = [None] * len(tensors) if len(grad_tensors) > 0: assert len(tensors) == len( From c7de011fda6b0808c2abea47ed3eff8cd1d386c5 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Wed, 24 Mar 2021 06:34:35 +0000 Subject: [PATCH 17/22] when have the same tensor, throw error --- python/paddle/autograd/backward_mode.py | 41 +++++++++++++------ .../tests/unittests/test_custom_grad_input.py | 5 ++- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/python/paddle/autograd/backward_mode.py b/python/paddle/autograd/backward_mode.py index 2d44a65f7afae7..a3e211893c510d 100644 --- a/python/paddle/autograd/backward_mode.py +++ b/python/paddle/autograd/backward_mode.py @@ -24,7 +24,7 @@ def backward(tensors, grad_tensors=None, retain_graph=False): Compute the backward gradients of given tensors. Args: - tensors(list of Tensors): the tensors which the gradient to be computed. + tensors(list of Tensors): the tensors which the gradient to be computed. The tensors can not contain the same tensor. grad_tensors(list of Tensors of None, optional): the init gradients of the `tensors`` .If not None, it must have the same length with ``tensors`` , and if any of the elements is None, then the init gradient is the default value which is filled with 1.0. @@ -42,22 +42,34 @@ def backward(tensors, grad_tensors=None, retain_graph=False): Examples: .. code-block:: python - import paddle - x = paddle.to_tensor([[1, 2], [3, 4]], dtype='float32', stop_gradient=False) - y = paddle.to_tensor([[3, 2], [3, 4]], dtype='float32') + import paddle + x = paddle.to_tensor([[1, 2], [3, 4]], dtype='float32', stop_gradient=False) + y = paddle.to_tensor([[3, 2], [3, 4]], dtype='float32') - grad_tensor = paddle.to_tensor([[1,2], [1, 1]], dtype='float32') + grad_tensor1 = paddle.to_tensor([[1,2], [2, 3]], dtype='float32') + grad_tensor2 = paddle.to_tensor([[1,1], [1, 1]], dtype='float32') - z = paddle.matmul(x, y) - #Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [[9. , 10.], - # [21., 22.]]) + z1 = paddle.matmul(x, y) + z2 = paddle.matmul(x, y) - paddle.autograd.backward([z, z], [grad_tensor, grad_tensor], True) + paddle.autograd.backward([z1, z2], [grad_tensor1, grad_tensor2], True) + print(x.grad) + #[[12. 18.] + # [17. 25.]] - print(x.grad) - #[[14. 22.] - # [10. 14.]]] + x.clear_grad() + + paddle.autograd.backward([z1, z2], [grad_tensor1, None], True) + print(x.grad) + #[[12. 18.] 
+ # [17. 25.]] + + x.clear_grad() + + paddle.autograd.backward([z1, z2]) + print(x.grad) + #[[10. 14.] + # [10. 14.]] """ @@ -80,6 +92,9 @@ def check_tensors(in_out_list, name): tensors = check_tensors(tensors, "tensors") + assert len(tensors) == len(set( + tensors)), "the arg tensors should not contains same element" + if grad_tensors is not None: if not isinstance(grad_tensors, (list, tuple)): grad_tensors = [grad_tensors] diff --git a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py index b6b496b8be8ed6..ddd92138ec2b8d 100644 --- a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py +++ b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py @@ -66,10 +66,11 @@ def test_backward_api(self): with dg.guard(place): x_tensor = paddle.to_tensor(x, stop_gradient=False) y_tensor = paddle.to_tensor(y) - z_tensor = paddle.matmul(x_tensor, y_tensor) + z_tensor1 = paddle.matmul(x_tensor, y_tensor) + z_tensor2 = paddle.matmul(x_tensor, y_tensor) grad_tensor = paddle.to_tensor(grad) - paddle.autograd.backward([z_tensor, z_tensor], + paddle.autograd.backward([z_tensor1, z_tensor2], [grad_tensor, grad_tensor], True) x_grad = np.matmul(grad, y.T) From 2f2824ce955d0ff1be9696746874ff18fb4ff670 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Wed, 24 Mar 2021 07:40:43 +0000 Subject: [PATCH 18/22] modify test Init func args --- paddle/fluid/imperative/basic_engine.h | 2 +- paddle/fluid/imperative/tests/test_hooks.cc | 8 ++++++-- paddle/fluid/imperative/tests/test_tracer.cc | 9 +++++++-- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h index d4c0ae84191f97..ee245270270e3a 100644 --- a/paddle/fluid/imperative/basic_engine.h +++ b/paddle/fluid/imperative/basic_engine.h @@ -32,7 +32,7 @@ class BasicEngine : public Engine { public: void Init(const std::vector>& tensors, const std::vector>& grad_tensors, - bool retain_graph); + bool retain_graph = false); void Execute() override; diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 7bf5f876681bab..56d2af75f8602b 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -93,8 +93,10 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL); // 3. backward + std::vector> tensors{out}; + std::vector> grad_tensors { nullptr } BasicEngine engine; - engine.Init(out.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); framework::LoDTensor x_grad; @@ -193,8 +195,10 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL); // 3. 
From 8415df41fd6bca9b951060ab5cdc6678a5055b4c Mon Sep 17 00:00:00 2001
From: chentianyu03
Date: Wed, 24 Mar 2021 08:18:51 +0000
Subject: [PATCH 19/22] modify the execute.Init func args in test files

---
 paddle/fluid/imperative/tests/test_hooks.cc | 4 ++--
 python/paddle/autograd/backward_mode.py     | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc
index 56d2af75f8602b..0e538bd44d9867 100644
--- a/paddle/fluid/imperative/tests/test_hooks.cc
+++ b/paddle/fluid/imperative/tests/test_hooks.cc
@@ -94,7 +94,7 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) {
   // 3. backward
   std::vector<std::shared_ptr<VarBase>> tensors{out};
-  std::vector<std::shared_ptr<VarBase>> grad_tensors { nullptr }
+  std::vector<std::shared_ptr<VarBase>> grad_tensors{nullptr};
   BasicEngine engine;
   engine.Init(tensors, grad_tensors);
@@ -196,7 +196,7 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() {
   // 3. backward
   std::vector<std::shared_ptr<VarBase>> tensors{out};
-  std::vector<std::shared_ptr<VarBase>> grad_tensors { nullptr }
+  std::vector<std::shared_ptr<VarBase>> grad_tensors{nullptr};
   BasicEngine engine;
   engine.Init(tensors, grad_tensors);
   engine.Execute();

diff --git a/python/paddle/autograd/backward_mode.py b/python/paddle/autograd/backward_mode.py
index a3e211893c510d..ac19ef3ff102d6 100644
--- a/python/paddle/autograd/backward_mode.py
+++ b/python/paddle/autograd/backward_mode.py
@@ -29,6 +29,7 @@ def backward(tensors, grad_tensors=None, retain_graph=False):
         grad_tensors(list of Tensors or None, optional): the initial gradients of ``tensors``. If not None,
             it must have the same length as ``tensors``, and if any of the elements is None,
             then the initial gradient is the default value which is filled with 1.0.
             If None, all the gradients of ``tensors`` are the default value which is filled with 1.0.
+            Defaults to None.
         retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would
             like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter
             :code:`retain_graph` to True, then the grads will be retained. Thus, setting it to False is much more memory-efficient.
             Defaults to False.
 
     Returns:
-        NoneType: Non
+        NoneType: None
 
 
     Examples:
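Since the docstring now spells out both defaults, a short sketch of how `retain_graph` interacts with repeated calls may help; this is illustrative only and assumes a dygraph-mode build of this branch.

    import paddle

    x = paddle.to_tensor([2.], stop_gradient=False)
    y = x * x

    # Keep the graph alive so it can be traversed a second time.
    paddle.autograd.backward([y], retain_graph=True)
    print(x.grad)  # [4.]

    # Legal only because the first call retained the graph; with the
    # default retain_graph=False the graph would already be freed.
    paddle.autograd.backward([y])
    print(x.grad)  # [8.]  (gradients accumulate)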
From be065e4b2fcad351f54aa67c21588c3c2a618c9c Mon Sep 17 00:00:00 2001
From: chentianyu03
Date: Wed, 24 Mar 2021 08:57:18 +0000
Subject: [PATCH 20/22] add paddle.autograd package in setup.py.in

---
 python/setup.py.in | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/setup.py.in b/python/setup.py.in
index 64cfe6e9ccff74..0114b4c223a5f8 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -216,6 +216,7 @@ packages=['paddle',
           'paddle.static.amp',
           'paddle.tensor',
           'paddle.onnx',
+          'paddle.autograd',
          ]
 
 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
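Without this setup.py.in entry the `paddle.autograd` subpackage would be missing from installed wheels even though it imports fine from a source tree. A one-line smoke test against an installed build (illustrative):

    # Verify the subpackage ships with the wheel and exposes backward().
    import paddle.autograd

    print(hasattr(paddle.autograd, "backward"))  # True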
From 7f8e58c46798c0f05fe8814be0b127763e221fd9 Mon Sep 17 00:00:00 2001
From: chentianyu03
Date: Mon, 29 Mar 2021 03:57:59 +0000
Subject: [PATCH 21/22] modify error msg, remove _run_backward method in class
 Tensor

---
 paddle/fluid/imperative/basic_engine.cc            | 13 ++++++-------
 paddle/fluid/imperative/basic_engine.h             |  1 -
 paddle/fluid/pybind/imperative.cc                  | 17 -----------------
 python/paddle/autograd/backward_mode.py            |  7 ++++---
 .../fluid/dygraph/varbase_patch_methods.py         |  9 +++++----
 5 files changed, 15 insertions(+), 32 deletions(-)

diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc
index 5902376560f09b..6601916d9d583e 100644
--- a/paddle/fluid/imperative/basic_engine.cc
+++ b/paddle/fluid/imperative/basic_engine.cc
@@ -45,15 +45,15 @@ void BasicEngine::Init(
   PADDLE_ENFORCE_EQ(
       tensors.size(), grad_tensors.size(),
       platform::errors::Unavailable(
-          "the size of tensors must equal the size of grad_tensors, but"
-          "the size of tensors is %s, and the size of grad_tensors is %s.",
+          "The size of tensors does not equal the size of grad_tensors, "
+          "the size of tensors is %s, but the size of grad_tensors is %s.",
           tensors.size(), grad_tensors.size()));
 
   for (size_t i = 0; i < tensors.size(); ++i) {
     auto var = tensors[i];
     auto grad_tensor = grad_tensors[i];
 
-    auto init_node_ = var->GradVarBase()->GradNode();
+    auto init_node = var->GradVarBase()->GradNode();
     PADDLE_ENFORCE_EQ(
         var->GradVarBase()->GraphIsFreed(), false,
         platform::errors::Unavailable(
@@ -70,7 +70,7 @@ void BasicEngine::Init(
       var->GradVarBase()->ClearGradNode();
     }
 
-    if (init_node_ == nullptr || var->OverridedStopGradient()) {
+    if (init_node == nullptr || var->OverridedStopGradient()) {
       VLOG(3) << "Skip auto grad since there is no grad op for var or loss is "
                  "stop_gradient=True: "
               << var->Name();
@@ -81,8 +81,7 @@ void BasicEngine::Init(
 
     PADDLE_ENFORCE_EQ(
         var->HasGradVar(), true,
-        platform::errors::NotFound("Grad variable not exist for variable %s",
-                                   var->Name()));
+        platform::errors::NotFound("Tensor %s has no gradient", var->Name()));
 
     auto& fwd_var = var->Var().Get<framework::LoDTensor>();
     auto* grad_var =
@@ -102,7 +101,7 @@ void BasicEngine::Init(
           *dev_ctx, grad_var);
     }
 
-    init_nodes_.push_back(init_node_);
+    init_nodes_.push_back(init_node);
   }
 }

diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h
index ee245270270e3a..49761a8df0b6b1 100644
--- a/paddle/fluid/imperative/basic_engine.h
+++ b/paddle/fluid/imperative/basic_engine.h
@@ -76,7 +76,6 @@ class BasicEngine : public Engine {
   std::vector<GradientAccumulator*> leaf_accumulators_;
 
   bool retain_graph_;
-  bool create_graph_;
 };
 
 }  // namespace imperative

diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index c16999d1fc52dd..501efa0e83c89f 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -919,23 +919,6 @@ void BindImperative(py::module *m_ptr) {
           print(x.stop_gradient) # True
           print(x.grad)          # None
       )DOC")
-      .def("_run_backward",
-           [](std::shared_ptr<imperative::VarBase> &self,
-              const imperative::Tracer &tracer, bool retain_graph,
-              std::shared_ptr<imperative::VarBase> &grad_tensor) {
-             // TODO(jiabin): when we impl more backward execution we can
-             // select them
-             std::vector<std::shared_ptr<imperative::VarBase>> tensors{self};
-             std::vector<std::shared_ptr<imperative::VarBase>> grad_tensors{
-                 grad_tensor};
-
-             auto *engine = tracer.GetEngine();
-             engine->Init(tensors, grad_tensors, retain_graph);
-             VLOG(3) << "Start backward";
-             engine->Execute();
-             VLOG(3) << "Finish backward";
-           },
-           py::call_guard<py::gil_scoped_release>())
       .def("_grad_name", &imperative::VarBase::GradVarName)
       .def("_grad_value",
           [](imperative::VarBase &self) {

diff --git a/python/paddle/autograd/backward_mode.py b/python/paddle/autograd/backward_mode.py
index ac19ef3ff102d6..96e4336abaa6fa 100644
--- a/python/paddle/autograd/backward_mode.py
+++ b/python/paddle/autograd/backward_mode.py
@@ -93,8 +93,9 @@ def check_tensors(in_out_list, name):
 
     tensors = check_tensors(tensors, "tensors")
 
-    assert len(tensors) == len(set(
-        tensors)), "the arg tensors should not contains same element"
+    assert len(tensors) == len(
+        set(tensors)
+    ), "The argument 'tensors' of paddle.autograd.backward contains duplicate paddle.Tensor objects."
 
     if grad_tensors is not None:
         if not isinstance(grad_tensors, (list, tuple)):
             grad_tensors = [grad_tensors]
@@ -104,7 +105,7 @@ def check_tensors(in_out_list, name):
             if each_tensor is not None:
                 assert isinstance(
                     each_tensor, paddle.Tensor
-                ), "grad_tensors must be None, Tensor or list containing None or Tensor"
+                ), "The argument 'grad_tensors' of paddle.autograd.backward is invalid, it can be 'None', 'paddle.Tensor' or 'list[None/paddle.Tensor]'."
         else:
             grad_tensors = [None] * len(tensors)

diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index 07fabc9cb0a0bd..42479d07c20eac 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -197,11 +197,12 @@ def backward(self, grad_tensor=None, retain_graph=False):
             if paddle.is_compiled_with_xpu():
                 # TODO(liuyuhui): Currently only for xpu. Will be removed in the future.
                 scaled_loss = scale_loss(self)
-                scaled_loss._run_backward(framework._dygraph_tracer(),
-                                          retain_graph, grad_tensor)
+                core.dygraph_run_backward([scaled_loss], [grad_tensor],
+                                          retain_graph,
+                                          framework._dygraph_tracer())
             else:
-                self._run_backward(framework._dygraph_tracer(), retain_graph,
-                                   grad_tensor)
+                core.dygraph_run_backward([self], [grad_tensor], retain_graph,
+                                          framework._dygraph_tracer())
         else:
             raise ValueError(
                 "Variable.backward() is only available in DyGraph mode")
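After this refactor the pybind-level `_run_backward` method is gone and both user-facing entry points funnel into `core.dygraph_run_backward`. The public behavior is unchanged; for example, `Tensor.backward` with a custom initial gradient still works as before. A minimal sketch, assuming dygraph mode on this branch:

    import paddle

    x = paddle.to_tensor([1., 2., 3.], stop_gradient=False)
    y = x * x

    # dL/dy is supplied explicitly, so x.grad becomes grad_tensor * 2x.
    y.backward(grad_tensor=paddle.to_tensor([1., 2., 3.]))
    print(x.grad)  # [2., 8., 18.]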
From 0374c0bb25fe3cc327b0a7b1e5059342d6eb2691 Mon Sep 17 00:00:00 2001
From: chentianyu03
Date: Tue, 30 Mar 2021 02:21:29 +0000
Subject: [PATCH 22/22] add test cases for backward api

---
 .../tests/unittests/test_custom_grad_input.py | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py
index ddd92138ec2b8d..a7472e7ffd7609 100644
--- a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py
+++ b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py
@@ -77,6 +77,43 @@ def test_backward_api(self):
 
             self.assertTrue(np.allclose(x_grad * 2, x_tensor.grad))
 
+    def test_backward_single_tensor(self):
+        for dtype in self._dtypes:
+            x = np.random.random([2, 2]).astype(dtype)
+            y = np.random.random([2, 2]).astype(dtype)
+            z = np.matmul(x, y)
+            grad = np.random.random(z.shape).astype(dtype)
+            for place in self._places:
+                with dg.guard(place):
+                    x_tensor = paddle.to_tensor(x, stop_gradient=False)
+                    y_tensor = paddle.to_tensor(y)
+                    z_tensor1 = paddle.matmul(x_tensor, y_tensor)
+
+                    grad_tensor = paddle.to_tensor(grad)
+                    paddle.autograd.backward(z_tensor1, grad_tensor, True)
+
+                    x_grad = np.matmul(grad, y.T)
+
+                    self.assertTrue(np.allclose(x_grad, x_tensor.grad))
+
+    def test_backward_none_grad_tensor(self):
+        for dtype in self._dtypes:
+            x = np.random.random([2, 2]).astype(dtype)
+            y = np.random.random([2, 2]).astype(dtype)
+            z = np.matmul(x, y)
+            grad = np.ones(z.shape).astype(dtype)
+            for place in self._places:
+                with dg.guard(place):
+                    x_tensor = paddle.to_tensor(x, stop_gradient=False)
+                    y_tensor = paddle.to_tensor(y)
+                    z_tensor1 = paddle.matmul(x_tensor, y_tensor)
+
+                    paddle.autograd.backward(z_tensor1, None)
+
+                    x_grad = np.matmul(grad, y.T)
+
+                    self.assertTrue(np.allclose(x_grad, x_tensor.grad))
+
 if __name__ == '__main__':
     unittest.main()
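The two new cases pin down the API's convenience forms: a bare Tensor is accepted where a list is documented, and a None gradient behaves like an all-ones initial gradient. A condensed, self-contained version of what they assert (assuming a dygraph-mode build of this branch):

    import numpy as np
    import paddle

    x = np.random.random([2, 2]).astype("float32")
    y = np.random.random([2, 2]).astype("float32")

    x_t = paddle.to_tensor(x, stop_gradient=False)
    y_t = paddle.to_tensor(y)
    z_t = paddle.matmul(x_t, y_t)

    # A single Tensor (not a list) with a None gradient: the initial
    # gradient defaults to ones, so x.grad == ones @ y^T.
    paddle.autograd.backward(z_t, None)

    expected = np.matmul(np.ones([2, 2], dtype="float32"), y.T)
    print(np.allclose(expected, x_t.grad))  # True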