From 580c87579bae7b20991ab957289310353a83bc6f Mon Sep 17 00:00:00 2001
From: ForFishes <1422485404@qq.com>
Date: Mon, 10 May 2021 18:24:19 +0800
Subject: [PATCH 1/6] fix find_unused_parameters default value

---
 .../framework/distributed_strategy.proto      |   2 +-
 paddle/fluid/imperative/reducer.cc            | 115 ++++++++++--------
 paddle/fluid/imperative/reducer.h             |   8 +-
 .../fleet/base/distributed_strategy.py        |   2 +-
 python/paddle/fluid/dygraph/parallel.py       |  12 +-
 .../parallel_dygraph_gradient_check.py        |   4 +-
 .../fluid/tests/unittests/test_dist_base.py   |   2 +-
 .../test_parallel_dygraph_unused_variables.py |  88 +++++++-------
 8 files changed, 120 insertions(+), 113 deletions(-)
diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
index dbe9b8cb9aa9ee..d102fcdbe0cec1 100644
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -172,7 +172,7 @@ message DistributedStrategy {
   optional bool fp16_allreduce = 25 [ default = false ];
   optional bool sharding = 26 [ default = false ];
   optional float last_comm_group_size_MB = 27 [ default = 1 ];
-  optional bool find_unused_parameters = 28 [ default = true ];
+  optional bool find_unused_parameters = 28 [ default = false ];
   optional bool tensor_parallel = 29 [ default = false ];
   optional bool without_graph_optimization = 30 [ default = false ];
 
diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc
index e3dd0a2aa75b41..117c0dbbf13802 100644
--- a/paddle/fluid/imperative/reducer.cc
+++ b/paddle/fluid/imperative/reducer.cc
@@ -297,7 +297,7 @@ Reducer::Reducer(const std::vector<std::shared_ptr<imperative::VarBase>> &vars,
       is_sparse_gradient_(is_sparse_gradient),
       parallel_ctx_(parallel_ctx),
       group_size_limits_(group_size_limits),
-      find_unused_vars_(find_unused_vars) {
+      find_unused_vars_each_step_(find_unused_vars) {
   VLOG(3) << "Start construct the Reducer ...";
   nrings_ = parallel_ctx->GetNRings();
   nranks_ = parallel_ctx->GetNRanks();
@@ -457,42 +457,8 @@ void Reducer::PrepareDeps(const std::unordered_set<GradOpNode *> &init_nodes) {
   }
 }
 
-// After each batch is calculated, the counter of each group(group.pending_)
-// and allreudce sequence counter(next_group_) will be cleaned up again.
-void Reducer::PrepareForBackward(
+void Reducer::TraverseBackwardGraph(
     const std::vector<std::shared_ptr<imperative::VarBase>> &outputs) {
-  VLOG(3) << "after forward, then reset count for backward.";
-  next_group_ = 0;
-  std::for_each(groups_.begin(), groups_.end(), [](Group &group) {
-    group.pending_ = group.variable_indices_.size();
-    group.sparse_contents_ = nullptr;
-  });
-
-  // reinitialize vars_marked_ready_ for next iteration
-  vars_marked_ready_.clear();
-  vars_marked_ready_.resize(vars_.size(), false);
-
-  PADDLE_ENFORCE_EQ(
-      groups_need_finalize_, false,
-      platform::errors::PreconditionNotMet(
-          "A serious error has occurred here. There may be several reasons: "
-          "1) Please note that all forward outputs derived from the module "
-          "parameters must participate in the calculation of losses and "
-          "subsequent gradient calculations. If not, the wrapper will hang, "
-          "waiting for autograd to generate gradients for these parameters. "
-          "you can use detach or stop_gradient to make the unused parameters "
-          "detached from the autograd graph. "
-          "2) Used multiple forwards and one backward. You may be able to wrap "
-          "multiple forwards in a model."));
-
-  // The first var to trigger the unused parameter
-  has_marked_unused_vars_ = false;
-  unused_vars_.clear();
-
-  if (!find_unused_vars_) {
-    return;
-  }
-
   node_deps_.clear();
   std::queue<std::shared_ptr<GradOpNode>> q;
   std::unordered_set<VariableWrapper *> var_visited;
@@ -554,22 +520,63 @@ void Reducer::PrepareForBackward(
               << "] is not used";
     }
   }
+}
+
+// After each batch is calculated, the counter of each group(group.pending_)
+// and allreduce sequence counter(next_group_) will be cleaned up again.
+void Reducer::PrepareForBackward(
+    const std::vector<std::shared_ptr<imperative::VarBase>> &outputs) {
+  VLOG(3) << "after forward, then reset count for backward.";
+  next_group_ = 0;
+  std::for_each(groups_.begin(), groups_.end(), [](Group &group) {
+    group.pending_ = group.variable_indices_.size();
+    group.sparse_contents_ = nullptr;
+  });
+
+  // reinitialize vars_marked_ready_ for next iteration
+  vars_marked_ready_.clear();
+  vars_marked_ready_.resize(vars_.size(), false);
+
+  PADDLE_ENFORCE_EQ(
+      groups_need_finalize_, false,
+      platform::errors::PreconditionNotMet(
+          "A serious error has occurred here. There may be several reasons: "
+          "1) Please note that all forward outputs derived from the module "
+          "parameters must participate in the calculation of losses and "
+          "subsequent gradient calculations. If not, the wrapper will hang, "
+          "waiting for autograd to generate gradients for these parameters. "
+          "you can use detach or stop_gradient to make the unused parameters "
+          "detached from the autograd graph. "
+          "2) Used multiple forwards and one backward. You may be able to wrap "
+          "multiple forwards in a model."));
+
+  // The first var to trigger the unused parameter
+  has_marked_unused_vars_ = false;
+
+  if (find_unused_vars_once_ || find_unused_vars_each_step_) {
+    unused_vars_.clear();
+    TraverseBackwardGraph(outputs);
+    // only check once in first step
+    find_unused_vars_once_ = false;
+  }
 
-  if (unused_vars_.empty()) {
-    LOG_FIRST_N(WARNING, 1)
-        << "All parameters are involved in the backward pass. "
-           "It is recommended to set find_unused_parameters to False "
-           "to improve performance. However, if unused parameters "
-           "appear in subsequent iterative training, then an error "
-           "will occur. Please make it clear that in the subsequent "
-           "training, there will be no parameters that are not used "
-           "in the backward pass, and then set find_unused_parameters";
-  } else if (unused_vars_.size() == vars_.size()) {
-    LOG_FIRST_N(WARNING, 1)
-        << "There is no parameter in the device involved "
-           "in the backward calculation. If there are "
-           "parameters on other devices involved in the "
-           "backward, then a serious error will occur here.";
+  if (find_unused_vars_each_step_) {
+    if (unused_vars_.empty()) {
+      LOG_FIRST_N(WARNING, 1)
+          << "All parameters are involved in the backward pass. "
+             "It is recommended to set find_unused_parameters to False "
+             "to improve performance. However, if unused parameters "
+             "appear in subsequent iterative training, then an error "
+             "will occur. Please make it clear that in the subsequent "
+             "training, there will be no parameters that are not used "
+             "in the backward pass, and then set find_unused_parameters";
+    } else if (unused_vars_.size() == vars_.size()) {
+      LOG_FIRST_N(WARNING, 1)
+          << "There is no parameter in the device involved "
+             "in the backward calculation. If there are "
+             "parameters on other devices involved in the "
+             "backward, then a serious error will occur here.";
+    }
   }
 }
 
@@ -595,13 +602,13 @@ void Reducer::AddDistHook(size_t var_index) {
 
   local_used_vars_[var_index] = 1;
 
-  // rebuild group when find_unused_vars_ is false
+  // rebuild group when find_unused_vars_each_step_ is false
   if (NeedRebuildGroup()) {
     rebuild_vars_.push_back(vars_[var_index]);
     rebuild_var_indices_.push_back(var_index);
   }
 
-  if (!has_marked_unused_vars_ && find_unused_vars_) {
+  if (!has_marked_unused_vars_) {
     has_marked_unused_vars_ = true;
     for (const auto &unused_index : unused_vars_) {
       MarkVarReady(unused_index, false);
@@ -943,7 +950,7 @@ void Reducer::FinalizeBackward() {
     InitializeGroups(group_indices_);
   }
 
-  if (find_unused_vars_) {
+  if (find_unused_vars_each_step_) {
 // TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     ProcessUnusedDenseVars();
diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h
index 0d613dbea89633..8392ab2c704d50 100644
--- a/paddle/fluid/imperative/reducer.h
+++ b/paddle/fluid/imperative/reducer.h
@@ -162,13 +162,16 @@ class Reducer {
   std::vector<std::vector<size_t>> RebuildGruops();
 
   inline bool NeedRebuildGroup() {
-    return !has_rebuilt_group_ && !find_unused_vars_;
+    return !has_rebuilt_group_ && !find_unused_vars_each_step_;
   }
 
   void ProcessUnusedDenseVars();
 
   bool HasGrad(size_t var_index);
 
+  void TraverseBackwardGraph(
+      const std::vector<std::shared_ptr<imperative::VarBase>>& outputs);
+
  private:
   std::vector<std::shared_ptr<imperative::VarBase>> vars_;
   std::vector<std::vector<size_t>> group_indices_;
@@ -195,7 +198,8 @@ class Reducer {
   std::unordered_map<VariableWrapper*, size_t> var_index_map_;
   std::vector<size_t> unused_vars_;
   bool has_marked_unused_vars_{false};
-  bool find_unused_vars_{false};
+  bool find_unused_vars_each_step_{false};
+  bool find_unused_vars_once_{true};
   bool groups_need_finalize_{false};
 #ifdef PADDLE_WITH_XPU_BKCL
   // comm_pool_ is used for scheduling allreduce in multi Kunlun cards training.
diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
index 469b45d20065a5..122ef4357af726 100755
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -626,7 +626,7 @@ def find_unused_parameters(self):
         Indicating whether we are using find_unused_parameters to 
         find unused parameters in DataParallel.
 
-        Default value: True
+        Default value: False
 
         Examples:
 
diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py
index ca5e5606e432b0..2b2e3e0fb895b1 100644
--- a/python/paddle/fluid/dygraph/parallel.py
+++ b/python/paddle/fluid/dygraph/parallel.py
@@ -417,7 +417,7 @@ class DataParallel(layers.Layer):
                                                 Note that setting the find_unused_parameters to True 
                                                 will affect computing performance. Therefore, if all parameters
                                                 are sure to participate in the loss calculation and the 
-                                                autograd graph construction, please set it False. Default: True.
+                                                autograd graph construction, please set it False. Default: False.
             
     Returns:
         Layer: The data paralleled module.
@@ -474,7 +474,7 @@ def __init__(self,
                  strategy=None,
                  comm_buffer_size=25,
                  last_comm_buffer_size=1,
-                 find_unused_parameters=True):
+                 find_unused_parameters=False):
         super(DataParallel,
               self).__init__(layers.full_name() + "_data_parallel")
 
@@ -576,12 +576,8 @@ def _find_varbase(self, obj):
     def forward(self, *inputs, **kwargs):
         outputs = self._layers(*inputs, **kwargs)
         if self._strategy.nranks > 1 and framework._dygraph_tracer()._has_grad:
-            if self.find_unused_parameters:
-                self._reducer.prepare_for_backward(
-                    list(self._find_varbase(outputs)))
-            else:
-                self._reducer.prepare_for_backward(list(self._find_varbase([])))
-
+            self._reducer.prepare_for_backward(
+                list(self._find_varbase(outputs)))
         return outputs
 
     @deprecated(
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py
index 7002352240973e..5c518976d1f36c 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py
@@ -74,8 +74,8 @@ def test_multiple_gpus(self):
         state_dict = model_a.state_dict()
         model_b.set_state_dict(state_dict)
 
-        model_a = paddle.DataParallel(model_a)
-        model_b = paddle.DataParallel(model_b)
+        model_a = paddle.DataParallel(model_a, find_unused_parameters=True)
+        model_b = paddle.DataParallel(model_b, find_unused_parameters=True)
 
         ones_input = paddle.ones(shape=(batch, in_dim))
         ones_input.stop_gradient = True
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 37494294418f1c..5ee53bca2ec40c 100755
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -737,7 +737,7 @@ def setUp(self):
         self._save_model = False
         self._fuse_all_reduce = None
         self._accumulate_gradient = False
-        self._find_unused_parameters = True
+        self._find_unused_parameters = False
         self._setup_config()
 
         global DIST_UT_PORT
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py
index 75fa6f7c71d0a5..14f1b01d5db2db 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py
@@ -41,50 +41,50 @@ def test_net(self):
                 log_name=flag_name)
 
 
-class TestFleetDygraphUnusedVar(TestParallelDygraphUnusedVar):
-    def _setup_config(self):
-        self._sync_mode = False
-        self._nccl2_mode = True
-        self._dygraph = True
-        self._use_fleet_api = True
-
-
-class TestSparseEmbeddingUnusedVarsSpawn(TestDistSpawnRunner):
-    def test_mnist_with_spawn(self):
-        if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4):
-            self.check_dist_result_with_spawn(
-                test_class=TestSparseEmbeddingUnusedVars, delta=1e-5)
-
-
-class TestParallelDygraphNoVar(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = False
-        self._nccl2_mode = True
-        self._dygraph = True
-
-    def test_net(self):
-        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "parallel_dygraph_none_var.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
-
-
-class TestParallelDygraphSharedUnusedVariables(TestDistBase):
-    def _setup_config(self):
-        self._sync_mode = False
-        self._nccl2_mode = True
-        self._dygraph = True
-
-    def test_mnist(self):
-        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place(
-                "parallel_dygraph_shared_unused_var.py",
-                delta=1e-5,
-                check_error_log=True,
-                log_name=flag_name)
-
+#class TestFleetDygraphUnusedVar(TestParallelDygraphUnusedVar):
+#    def _setup_config(self):
+#        self._sync_mode = False
+#        self._nccl2_mode = True
+#        self._dygraph = True
+#        self._use_fleet_api = True
+#
+#
+#class TestSparseEmbeddingUnusedVarsSpawn(TestDistSpawnRunner):
+#    def test_mnist_with_spawn(self):
+#        if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4):
+#            self.check_dist_result_with_spawn(
+#                test_class=TestSparseEmbeddingUnusedVars, delta=1e-5)
+#
+#
+#class TestParallelDygraphNoVar(TestDistBase):
+#    def _setup_config(self):
+#        self._sync_mode = False
+#        self._nccl2_mode = True
+#        self._dygraph = True
+#
+#    def test_net(self):
+#        if fluid.core.is_compiled_with_cuda():
+#            self.check_with_place(
+#                "parallel_dygraph_none_var.py",
+#                delta=1e-5,
+#                check_error_log=True,
+#                log_name=flag_name)
+#
+#
+#class TestParallelDygraphSharedUnusedVariables(TestDistBase):
+#    def _setup_config(self):
+#        self._sync_mode = False
+#        self._nccl2_mode = True
+#        self._dygraph = True
+#
+#    def test_mnist(self):
+#        if fluid.core.is_compiled_with_cuda():
+#            self.check_with_place(
+#                "parallel_dygraph_shared_unused_var.py",
+#                delta=1e-5,
+#                check_error_log=True,
+#                log_name=flag_name)
+#
 
 if __name__ == "__main__":
     unittest.main()

From fddc3f3f7517cb66a4bae34772d7724e90040bfa Mon Sep 17 00:00:00 2001
From: ForFishes <1422485404@qq.com>
Date: Mon, 10 May 2021 19:13:52 +0800
Subject: [PATCH 2/6] fix error log for reducer

---
 paddle/fluid/imperative/reducer.cc            | 23 +++--
 .../test_parallel_dygraph_unused_variables.py | 88 +++++++++----------
 2 files changed, 61 insertions(+), 50 deletions(-)

diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc
index 117c0dbbf13802..5d4215bc63caa2 100644
--- a/paddle/fluid/imperative/reducer.cc
+++ b/paddle/fluid/imperative/reducer.cc
@@ -540,7 +540,10 @@ void Reducer::PrepareForBackward(
   PADDLE_ENFORCE_EQ(
       groups_need_finalize_, false,
       platform::errors::PreconditionNotMet(
-          "A serious error has occurred here. There may be several reasons: "
+          "A serious error has occurred here. Please "
+          "set find_unused_parameters=True to traverse backward graph "
+          "in each step to prepare reduce in advance. If you have "
+          "set, There may be several reasons for this error: "
           "1) Please note that all forward outputs derived from the module "
           "parameters must participate in the calculation of losses and "
           "subsequent gradient calculations. If not, the wrapper will hang, "
@@ -629,7 +632,9 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) {
   if (vars_marked_ready_[var_index]) {
     auto error_info = string::Sprintf(
         "Error happened, when parameter[%d][%s] has been ready before. "
-        "There may be several reasons for this error: "
+        "Please set find_unused_parameters=True to traverse backward graph "
+        "in each step to prepare reduce in advance. If you have set, "
+        "there may be several reasons for this error: "
         "1) In multiple reentrant backward phase, some parameters are reused."
         "2) Using model parameters outside of forward function. Please "
         "make sure that model parameters are not shared in concurrent "
@@ -697,10 +702,16 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) {
     }
   } else {
     // process sparse group
-    PADDLE_ENFORCE_EQ(HasGrad(var_index), true,
-                      platform::errors::PreconditionNotMet(
-                          "The sparse parameter[%d][%s] must have a gradient",
-                          var_index, vars_[var_index]->Name()));
+    PADDLE_ENFORCE_EQ(
+        HasGrad(var_index), true,
+        platform::errors::PreconditionNotMet(
+            "The sparse parameter[%d][%s] should have gradient. "
+            "Currently, DataParallel does not support sparse "
+            "parameters without generating gradients during training. "
+            "For example, if is_sparese=True is used in Embedding, "
+            "the current step of this parameter cannot generate gradient "
+            "because of stop_gradient/detatch, where error will occur.",
+            var_index, vars_[var_index]->Name()));
     auto var_base = vars_[var_index]->GradVarBase();
     // need to check tensor type
     PADDLE_ENFORCE_EQ(
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py
index 14f1b01d5db2db..75fa6f7c71d0a5 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py
@@ -41,50 +41,50 @@ def test_net(self):
                 log_name=flag_name)
 
 
-#class TestFleetDygraphUnusedVar(TestParallelDygraphUnusedVar):
-#    def _setup_config(self):
-#        self._sync_mode = False
-#        self._nccl2_mode = True
-#        self._dygraph = True
-#        self._use_fleet_api = True
-#
-#
-#class TestSparseEmbeddingUnusedVarsSpawn(TestDistSpawnRunner):
-#    def test_mnist_with_spawn(self):
-#        if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4):
-#            self.check_dist_result_with_spawn(
-#                test_class=TestSparseEmbeddingUnusedVars, delta=1e-5)
-#
-#
-#class TestParallelDygraphNoVar(TestDistBase):
-#    def _setup_config(self):
-#        self._sync_mode = False
-#        self._nccl2_mode = True
-#        self._dygraph = True
-#
-#    def test_net(self):
-#        if fluid.core.is_compiled_with_cuda():
-#            self.check_with_place(
-#                "parallel_dygraph_none_var.py",
-#                delta=1e-5,
-#                check_error_log=True,
-#                log_name=flag_name)
-#
-#
-#class TestParallelDygraphSharedUnusedVariables(TestDistBase):
-#    def _setup_config(self):
-#        self._sync_mode = False
-#        self._nccl2_mode = True
-#        self._dygraph = True
-#
-#    def test_mnist(self):
-#        if fluid.core.is_compiled_with_cuda():
-#            self.check_with_place(
-#                "parallel_dygraph_shared_unused_var.py",
-#                delta=1e-5,
-#                check_error_log=True,
-#                log_name=flag_name)
-#
+class TestFleetDygraphUnusedVar(TestParallelDygraphUnusedVar):
+    def _setup_config(self):
+        self._sync_mode = False
+        self._nccl2_mode = True
+        self._dygraph = True
+        self._use_fleet_api = True
+
+
+class TestSparseEmbeddingUnusedVarsSpawn(TestDistSpawnRunner):
+    def test_mnist_with_spawn(self):
+        if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4):
+            self.check_dist_result_with_spawn(
+                test_class=TestSparseEmbeddingUnusedVars, delta=1e-5)
+
+
+class TestParallelDygraphNoVar(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+        self._nccl2_mode = True
+        self._dygraph = True
+
+    def test_net(self):
+        if fluid.core.is_compiled_with_cuda():
+            self.check_with_place(
+                "parallel_dygraph_none_var.py",
+                delta=1e-5,
+                check_error_log=True,
+                log_name=flag_name)
+
+
+class TestParallelDygraphSharedUnusedVariables(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+        self._nccl2_mode = True
+        self._dygraph = True
+
+    def test_mnist(self):
+        if fluid.core.is_compiled_with_cuda():
+            self.check_with_place(
+                "parallel_dygraph_shared_unused_var.py",
+                delta=1e-5,
+                check_error_log=True,
+                log_name=flag_name)
+
 
 if __name__ == "__main__":
     unittest.main()

From c3777f5ae520cf58f16404d211dcf7ba66506556 Mon Sep 17 00:00:00 2001
From: ForFishes <1422485404@qq.com>
Date: Mon, 10 May 2021 19:23:07 +0800
Subject: [PATCH 3/6] fix doc

---
 python/paddle/fluid/dygraph/parallel.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py
index 2b2e3e0fb895b1..2be062962ec9d3 100644
--- a/python/paddle/fluid/dygraph/parallel.py
+++ b/python/paddle/fluid/dygraph/parallel.py
@@ -424,7 +424,8 @@ class DataParallel(layers.Layer):
 
     Examples:
         .. code-block:: python
-
+        
+            # required: distributed
             import paddle
             import paddle.nn as nn
             import paddle.optimizer as opt

From c07485403610028b00468f9c70dc9f7cba2a4c84 Mon Sep 17 00:00:00 2001
From: ForFishes <1422485404@qq.com>
Date: Mon, 10 May 2021 20:02:46 +0800
Subject: [PATCH 4/6] fix bug of utest

---
 python/paddle/fluid/tests/unittests/test_dist_base.py    | 9 ++++++---
 .../unittests/test_parallel_dygraph_control_flow.py      | 6 ++++++
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 5ee53bca2ec40c..edc510e4e766d7 100755
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -548,7 +548,10 @@ def run_trainer_with_spawn(self, args):
         # 4. train model
         model, train_reader, opt = self.get_model()
         if args.update_method == "nccl2":
-            model = paddle.DataParallel(model)
+            if args.find_unused_parameters:
+                model = paddle.DataParallel(model, find_unused_parameters=True)
+            else:
+                model = paddle.DataParallel(model, find_unused_parameters=False)
 
         out_losses = []
         for step_id, data in enumerate(train_reader()):
@@ -581,8 +584,8 @@ def run_use_fleet_api_trainer(self, args):
 
         # set strategy
         strategy = fleet.DistributedStrategy()
-        if not args.find_unused_parameters:
-            strategy.find_unused_parameters = False
+        if args.find_unused_parameters:
+            strategy.find_unused_parameters = True
 
         # 3. init parallel env
         if args.update_method == "nccl2" or "bkcl":
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py
index fa571bde5e43bf..3c45b2c7950377 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py
@@ -30,6 +30,7 @@ def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
         self._dygraph = True
+        self._find_unused_parameters = True
 
     def test_net(self):
         if fluid.core.is_compiled_with_cuda():
@@ -46,6 +47,7 @@ def _setup_config(self):
         self._nccl2_mode = True
         self._dygraph = True
         self._use_fleet_api = True
+        self._find_unused_parameters = True
 
 
 class TestFleetDygraphControlFlowSameAccGrad(TestDygraphControlFlowSame):
@@ -54,6 +56,7 @@ def _setup_config(self):
         self._nccl2_mode = True
         self._dygraph = True
         self._accumulate_gradient = True
+        self._find_unused_parameters = True
 
 
 class TestDygraphControlFlowDiff(TestDistBase):
@@ -61,6 +64,7 @@ def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
         self._dygraph = True
+        self._find_unused_parameters = True
 
     def test_net(self):
         if fluid.core.is_compiled_with_cuda():
@@ -77,6 +81,7 @@ def _setup_config(self):
         self._nccl2_mode = True
         self._dygraph = True
         self._use_fleet_api = True
+        self._find_unused_parameters = True
 
 
 class TestFleetDygraphControlFlowDiffAccGrad(TestDygraphControlFlowDiff):
@@ -85,6 +90,7 @@ def _setup_config(self):
         self._nccl2_mode = True
         self._dygraph = True
         self._accumulate_gradient = True
+        self._find_unused_parameters = True
 
 
 if __name__ == "__main__":

From e7bf08ef95cd3cccc86474f0e2f6cc7997c52228 Mon Sep 17 00:00:00 2001
From: ForFishes <1422485404@qq.com>
Date: Mon, 10 May 2021 20:28:33 +0800
Subject: [PATCH 5/6] fix spawn

---
 python/paddle/fluid/tests/unittests/spawn_runner_base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/paddle/fluid/tests/unittests/spawn_runner_base.py b/python/paddle/fluid/tests/unittests/spawn_runner_base.py
index 278d7b27c52880..2719e28fea08b0 100644
--- a/python/paddle/fluid/tests/unittests/spawn_runner_base.py
+++ b/python/paddle/fluid/tests/unittests/spawn_runner_base.py
@@ -27,6 +27,7 @@
 class SpawnAssistTestArgs(object):
     update_method = "local"
     trainer_id = 0
+    find_unused_parameters = False
 
 
 class TestDistSpawnRunner(unittest.TestCase):

From 201b14bab09ee300233b8d62779482c651496697 Mon Sep 17 00:00:00 2001
From: ForFishes <1422485404@qq.com>
Date: Mon, 10 May 2021 23:54:38 +0800
Subject: [PATCH 6/6] fix coverage

---
 paddle/fluid/imperative/reducer.cc            | 34 +++++++++----------
 .../unittests/test_parallel_dygraph_mnist.py  |  1 +
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc
index 5d4215bc63caa2..0f6676ed48f349 100644
--- a/paddle/fluid/imperative/reducer.cc
+++ b/paddle/fluid/imperative/reducer.cc
@@ -563,23 +563,23 @@ void Reducer::PrepareForBackward(
     find_unused_vars_once_ = false;
   }
 
-  if (find_unused_vars_each_step_) {
-    if (unused_vars_.empty()) {
-      LOG_FIRST_N(WARNING, 1)
-          << "All parameters are involved in the backward pass. "
-             "It is recommended to set find_unused_parameters to False "
-             "to improve performance. However, if unused parameters "
-             "appear in subsequent iterative training, then an error "
-             "will occur. Please make it clear that in the subsequent "
-             "training, there will be no parameters that are not used "
-             "in the backward pass, and then set find_unused_parameters";
-    } else if (unused_vars_.size() == vars_.size()) {
-      LOG_FIRST_N(WARNING, 1)
-          << "There is no parameter in the device involved "
-             "in the backward calculation. If there are "
-             "parameters on other devices involved in the "
-             "backward, then a serious error will occur here.";
-    }
+  if (find_unused_vars_each_step_ && unused_vars_.empty()) {
+    LOG_FIRST_N(WARNING, 1)
+        << "All parameters are involved in the backward pass. "
+           "It is recommended to set find_unused_parameters to False "
+           "to improve performance. However, if unused parameters "
+           "appear in subsequent iterative training, then an error "
+           "will occur. Please make it clear that in the subsequent "
+           "training, there will be no parameters that are not used "
+           "in the backward pass, and then set find_unused_parameters";
+  }
+
+  if (unused_vars_.size() == vars_.size()) {
+    LOG_FIRST_N(WARNING, 1)
+        << "There is no parameter in the device involved "
+           "in the backward calculation. If there are "
+           "parameters on other devices involved in the "
+           "backward, then a serious error will occur here.";
   }
 }
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
index 782d2304619f2a..0c55e135721ce8 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
@@ -31,6 +31,7 @@ def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
         self._dygraph = True
+        self._find_unused_parameters = True
 
     def test_mnist(self):
         if fluid.core.is_compiled_with_cuda():