From 580c87579bae7b20991ab957289310353a83bc6f Mon Sep 17 00:00:00 2001 From: ForFishes <1422485404@qq.com> Date: Mon, 10 May 2021 18:24:19 +0800 Subject: [PATCH 1/6] fix find_unused_parameters default value --- .../framework/distributed_strategy.proto | 2 +- paddle/fluid/imperative/reducer.cc | 115 ++++++++++-------- paddle/fluid/imperative/reducer.h | 8 +- .../fleet/base/distributed_strategy.py | 2 +- python/paddle/fluid/dygraph/parallel.py | 12 +- .../parallel_dygraph_gradient_check.py | 4 +- .../fluid/tests/unittests/test_dist_base.py | 2 +- .../test_parallel_dygraph_unused_variables.py | 88 +++++++------- 8 files changed, 120 insertions(+), 113 deletions(-) diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index dbe9b8cb9aa9ee..d102fcdbe0cec1 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -172,7 +172,7 @@ message DistributedStrategy { optional bool fp16_allreduce = 25 [ default = false ]; optional bool sharding = 26 [ default = false ]; optional float last_comm_group_size_MB = 27 [ default = 1 ]; - optional bool find_unused_parameters = 28 [ default = true ]; + optional bool find_unused_parameters = 28 [ default = false ]; optional bool tensor_parallel = 29 [ default = false ]; optional bool without_graph_optimization = 30 [ default = false ]; diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index e3dd0a2aa75b41..117c0dbbf13802 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -297,7 +297,7 @@ Reducer::Reducer(const std::vector> &vars, is_sparse_gradient_(is_sparse_gradient), parallel_ctx_(parallel_ctx), group_size_limits_(group_size_limits), - find_unused_vars_(find_unused_vars) { + find_unused_vars_each_step_(find_unused_vars) { VLOG(3) << "Start construct the Reducer ..."; nrings_ = parallel_ctx->GetNRings(); nranks_ = 
parallel_ctx->GetNRanks(); @@ -457,42 +457,8 @@ void Reducer::PrepareDeps(const std::unordered_set &init_nodes) { } } -// After each batch is calculated, the counter of each group(group.pending_) -// and allreudce sequence counter(next_group_) will be cleaned up again. -void Reducer::PrepareForBackward( +void Reducer::TraverseBackwardGraph( const std::vector> &outputs) { - VLOG(3) << "after forward, then reset count for backward."; - next_group_ = 0; - std::for_each(groups_.begin(), groups_.end(), [](Group &group) { - group.pending_ = group.variable_indices_.size(); - group.sparse_contents_ = nullptr; - }); - - // reinitialize vars_marked_ready_ for next iteration - vars_marked_ready_.clear(); - vars_marked_ready_.resize(vars_.size(), false); - - PADDLE_ENFORCE_EQ( - groups_need_finalize_, false, - platform::errors::PreconditionNotMet( - "A serious error has occurred here. There may be several reasons: " - "1) Please note that all forward outputs derived from the module " - "parameters must participate in the calculation of losses and " - "subsequent gradient calculations. If not, the wrapper will hang, " - "waiting for autograd to generate gradients for these parameters. " - "you can use detach or stop_gradient to make the unused parameters " - "detached from the autograd graph. " - "2) Used multiple forwards and one backward. You may be able to wrap " - "multiple forwards in a model.")); - - // The first var to trigger the unused parameter - has_marked_unused_vars_ = false; - unused_vars_.clear(); - - if (!find_unused_vars_) { - return; - } - node_deps_.clear(); std::queue> q; std::unordered_set var_visited; @@ -554,22 +520,63 @@ void Reducer::PrepareForBackward( << "] is not used"; } } +} + +// After each batch is calculated, the counter of each group(group.pending_) +// and allreudce sequence counter(next_group_) will be cleaned up again. 
+void Reducer::PrepareForBackward( + const std::vector> &outputs) { + VLOG(3) << "after forward, then reset count for backward."; + next_group_ = 0; + std::for_each(groups_.begin(), groups_.end(), [](Group &group) { + group.pending_ = group.variable_indices_.size(); + group.sparse_contents_ = nullptr; + }); + + // reinitialize vars_marked_ready_ for next iteration + vars_marked_ready_.clear(); + vars_marked_ready_.resize(vars_.size(), false); + + PADDLE_ENFORCE_EQ( + groups_need_finalize_, false, + platform::errors::PreconditionNotMet( + "A serious error has occurred here. There may be several reasons: " + "1) Please note that all forward outputs derived from the module " + "parameters must participate in the calculation of losses and " + "subsequent gradient calculations. If not, the wrapper will hang, " + "waiting for autograd to generate gradients for these parameters. " + "you can use detach or stop_gradient to make the unused parameters " + "detached from the autograd graph. " + "2) Used multiple forwards and one backward. You may be able to wrap " + "multiple forwards in a model.")); + + // The first var to trigger the unused parameter + has_marked_unused_vars_ = false; + + if (find_unused_vars_once_ || find_unused_vars_each_step_) { + unused_vars_.clear(); + TraverseBackwardGraph(outputs); + // only check once in first step + find_unused_vars_once_ = false; + } - if (unused_vars_.empty()) { - LOG_FIRST_N(WARNING, 1) - << "All parameters are involved in the backward pass. " - "It is recommended to set find_unused_parameters to False " - "to improve performance. However, if unused parameters " - "appear in subsequent iterative training, then an error " - "will occur. 
Please make it clear that in the subsequent " - "training, there will be no parameters that are not used " - "in the backward pass, and then set find_unused_parameters"; - } else if (unused_vars_.size() == vars_.size()) { - LOG_FIRST_N(WARNING, 1) - << "There is no parameter in the device involved " - "in the backward calculation. If there are " - "parameters on other devices involved in the " - "backward, then a serious error will occur here."; + if (find_unused_vars_each_step_) { + if (unused_vars_.empty()) { + LOG_FIRST_N(WARNING, 1) + << "All parameters are involved in the backward pass. " + "It is recommended to set find_unused_parameters to False " + "to improve performance. However, if unused parameters " + "appear in subsequent iterative training, then an error " + "will occur. Please make it clear that in the subsequent " + "training, there will be no parameters that are not used " + "in the backward pass, and then set find_unused_parameters"; + } else if (unused_vars_.size() == vars_.size()) { + LOG_FIRST_N(WARNING, 1) + << "There is no parameter in the device involved " + "in the backward calculation. 
If there are " + "parameters on other devices involved in the " + "backward, then a serious error will occur here."; + } } } @@ -595,13 +602,13 @@ void Reducer::AddDistHook(size_t var_index) { local_used_vars_[var_index] = 1; - // rebuild group when find_unused_vars_ is false + // rebuild group when find_unused_vars_each_step_ is false if (NeedRebuildGroup()) { rebuild_vars_.push_back(vars_[var_index]); rebuild_var_indices_.push_back(var_index); } - if (!has_marked_unused_vars_ && find_unused_vars_) { + if (!has_marked_unused_vars_) { has_marked_unused_vars_ = true; for (const auto &unused_index : unused_vars_) { MarkVarReady(unused_index, false); @@ -943,7 +950,7 @@ void Reducer::FinalizeBackward() { InitializeGroups(group_indices_); } - if (find_unused_vars_) { + if (find_unused_vars_each_step_) { // TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ProcessUnusedDenseVars(); diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index 0d613dbea89633..8392ab2c704d50 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -162,13 +162,16 @@ class Reducer { std::vector> RebuildGruops(); inline bool NeedRebuildGroup() { - return !has_rebuilt_group_ && !find_unused_vars_; + return !has_rebuilt_group_ && !find_unused_vars_each_step_; } void ProcessUnusedDenseVars(); bool HasGrad(size_t var_index); + void TraverseBackwardGraph( + const std::vector>& outputs); + private: std::vector> vars_; std::vector> group_indices_; @@ -195,7 +198,8 @@ class Reducer { std::unordered_map var_index_map_; std::vector unused_vars_; bool has_marked_unused_vars_{false}; - bool find_unused_vars_{false}; + bool find_unused_vars_each_step_{false}; + bool find_unused_vars_once_{true}; bool groups_need_finalize_{false}; #ifdef PADDLE_WITH_XPU_BKCL // comm_pool_ is used for scheduling allreduce in multi Kunlun cards training. 
diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 469b45d20065a5..122ef4357af726 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -626,7 +626,7 @@ def find_unused_parameters(self): Indicating whether we are using find_unused_parameters to find unused parameters in DataParallel. - Default value: True + Default value: False Examples: diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index ca5e5606e432b0..2b2e3e0fb895b1 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -417,7 +417,7 @@ class DataParallel(layers.Layer): Note that setting the find_unused_parameters to True will affect computing performance. Therefore, if all parameters are sure to participate in the loss calculation and the - autograd graph construction, please set it False. Default: True. + autograd graph construction, please set it False. Default: False. Returns: Layer: The data paralleled module. 
@@ -474,7 +474,7 @@ def __init__(self, strategy=None, comm_buffer_size=25, last_comm_buffer_size=1, - find_unused_parameters=True): + find_unused_parameters=False): super(DataParallel, self).__init__(layers.full_name() + "_data_parallel") @@ -576,12 +576,8 @@ def _find_varbase(self, obj): def forward(self, *inputs, **kwargs): outputs = self._layers(*inputs, **kwargs) if self._strategy.nranks > 1 and framework._dygraph_tracer()._has_grad: - if self.find_unused_parameters: - self._reducer.prepare_for_backward( - list(self._find_varbase(outputs))) - else: - self._reducer.prepare_for_backward(list(self._find_varbase([]))) - + self._reducer.prepare_for_backward( + list(self._find_varbase(outputs))) return outputs @deprecated( diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py index 7002352240973e..5c518976d1f36c 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py @@ -74,8 +74,8 @@ def test_multiple_gpus(self): state_dict = model_a.state_dict() model_b.set_state_dict(state_dict) - model_a = paddle.DataParallel(model_a) - model_b = paddle.DataParallel(model_b) + model_a = paddle.DataParallel(model_a, find_unused_parameters=True) + model_b = paddle.DataParallel(model_b, find_unused_parameters=True) ones_input = paddle.ones(shape=(batch, in_dim)) ones_input.stop_gradient = True diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 37494294418f1c..5ee53bca2ec40c 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -737,7 +737,7 @@ def setUp(self): self._save_model = False self._fuse_all_reduce = None self._accumulate_gradient = False - self._find_unused_parameters = True + self._find_unused_parameters = False 
self._setup_config() global DIST_UT_PORT diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py index 75fa6f7c71d0a5..14f1b01d5db2db 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py @@ -41,50 +41,50 @@ def test_net(self): log_name=flag_name) -class TestFleetDygraphUnusedVar(TestParallelDygraphUnusedVar): - def _setup_config(self): - self._sync_mode = False - self._nccl2_mode = True - self._dygraph = True - self._use_fleet_api = True - - -class TestSparseEmbeddingUnusedVarsSpawn(TestDistSpawnRunner): - def test_mnist_with_spawn(self): - if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): - self.check_dist_result_with_spawn( - test_class=TestSparseEmbeddingUnusedVars, delta=1e-5) - - -class TestParallelDygraphNoVar(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._nccl2_mode = True - self._dygraph = True - - def test_net(self): - if fluid.core.is_compiled_with_cuda(): - self.check_with_place( - "parallel_dygraph_none_var.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) - - -class TestParallelDygraphSharedUnusedVariables(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._nccl2_mode = True - self._dygraph = True - - def test_mnist(self): - if fluid.core.is_compiled_with_cuda(): - self.check_with_place( - "parallel_dygraph_shared_unused_var.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) - +#class TestFleetDygraphUnusedVar(TestParallelDygraphUnusedVar): +# def _setup_config(self): +# self._sync_mode = False +# self._nccl2_mode = True +# self._dygraph = True +# self._use_fleet_api = True +# +# +#class TestSparseEmbeddingUnusedVarsSpawn(TestDistSpawnRunner): +# def test_mnist_with_spawn(self): +# if fluid.core.is_compiled_with_cuda() 
and sys.version_info >= (3, 4): +# self.check_dist_result_with_spawn( +# test_class=TestSparseEmbeddingUnusedVars, delta=1e-5) +# +# +#class TestParallelDygraphNoVar(TestDistBase): +# def _setup_config(self): +# self._sync_mode = False +# self._nccl2_mode = True +# self._dygraph = True +# +# def test_net(self): +# if fluid.core.is_compiled_with_cuda(): +# self.check_with_place( +# "parallel_dygraph_none_var.py", +# delta=1e-5, +# check_error_log=True, +# log_name=flag_name) +# +# +#class TestParallelDygraphSharedUnusedVariables(TestDistBase): +# def _setup_config(self): +# self._sync_mode = False +# self._nccl2_mode = True +# self._dygraph = True +# +# def test_mnist(self): +# if fluid.core.is_compiled_with_cuda(): +# self.check_with_place( +# "parallel_dygraph_shared_unused_var.py", +# delta=1e-5, +# check_error_log=True, +# log_name=flag_name) +# if __name__ == "__main__": unittest.main() From fddc3f3f7517cb66a4bae34772d7724e90040bfa Mon Sep 17 00:00:00 2001 From: ForFishes <1422485404@qq.com> Date: Mon, 10 May 2021 19:13:52 +0800 Subject: [PATCH 2/6] fix error log for reducer --- paddle/fluid/imperative/reducer.cc | 23 +++-- .../test_parallel_dygraph_unused_variables.py | 88 +++++++++---------- 2 files changed, 61 insertions(+), 50 deletions(-) diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 117c0dbbf13802..5d4215bc63caa2 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -540,7 +540,10 @@ void Reducer::PrepareForBackward( PADDLE_ENFORCE_EQ( groups_need_finalize_, false, platform::errors::PreconditionNotMet( - "A serious error has occurred here. There may be several reasons: " + "A serious error has occurred here. Please " + "set find_unused_parameters=True to traverse backward graph " + "in each step to prepare reduce in advance. 
If you have " + "set, There may be several reasons for this error: " "1) Please note that all forward outputs derived from the module " "parameters must participate in the calculation of losses and " "subsequent gradient calculations. If not, the wrapper will hang, " @@ -629,7 +632,9 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { if (vars_marked_ready_[var_index]) { auto error_info = string::Sprintf( "Error happened, when parameter[%d][%s] has been ready before. " - "There may be several reasons for this error: " + "Please set find_unused_parameters=True to traverse backward graph " + "in each step to prepare reduce in advance. If you have set, " + "there may be several reasons for this error: " "1) In multiple reentrant backward phase, some parameters are reused." "2) Using model parameters outside of forward function. Please " "make sure that model parameters are not shared in concurrent " @@ -697,10 +702,16 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { } } else { // process sparse group - PADDLE_ENFORCE_EQ(HasGrad(var_index), true, - platform::errors::PreconditionNotMet( - "The sparse parameter[%d][%s] must have a gradient", - var_index, vars_[var_index]->Name())); + PADDLE_ENFORCE_EQ( + HasGrad(var_index), true, + platform::errors::PreconditionNotMet( + "The sparse parameter[%d][%s] should have gradient. " + "Currently, DataParallel does not support sparse " + "parameters without generating gradients during training. 
" + "For example, if is_sparese=True is used in Embedding, " + "the current step of this parameter cannot generate gradient " + "because of stop_gradient/detatch, where error will occur.", + var_index, vars_[var_index]->Name())); auto var_base = vars_[var_index]->GradVarBase(); // need to check tensor type PADDLE_ENFORCE_EQ( diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py index 14f1b01d5db2db..75fa6f7c71d0a5 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py @@ -41,50 +41,50 @@ def test_net(self): log_name=flag_name) -#class TestFleetDygraphUnusedVar(TestParallelDygraphUnusedVar): -# def _setup_config(self): -# self._sync_mode = False -# self._nccl2_mode = True -# self._dygraph = True -# self._use_fleet_api = True -# -# -#class TestSparseEmbeddingUnusedVarsSpawn(TestDistSpawnRunner): -# def test_mnist_with_spawn(self): -# if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): -# self.check_dist_result_with_spawn( -# test_class=TestSparseEmbeddingUnusedVars, delta=1e-5) -# -# -#class TestParallelDygraphNoVar(TestDistBase): -# def _setup_config(self): -# self._sync_mode = False -# self._nccl2_mode = True -# self._dygraph = True -# -# def test_net(self): -# if fluid.core.is_compiled_with_cuda(): -# self.check_with_place( -# "parallel_dygraph_none_var.py", -# delta=1e-5, -# check_error_log=True, -# log_name=flag_name) -# -# -#class TestParallelDygraphSharedUnusedVariables(TestDistBase): -# def _setup_config(self): -# self._sync_mode = False -# self._nccl2_mode = True -# self._dygraph = True -# -# def test_mnist(self): -# if fluid.core.is_compiled_with_cuda(): -# self.check_with_place( -# "parallel_dygraph_shared_unused_var.py", -# delta=1e-5, -# check_error_log=True, -# log_name=flag_name) -# +class 
TestFleetDygraphUnusedVar(TestParallelDygraphUnusedVar): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + self._use_fleet_api = True + + +class TestSparseEmbeddingUnusedVarsSpawn(TestDistSpawnRunner): + def test_mnist_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn( + test_class=TestSparseEmbeddingUnusedVars, delta=1e-5) + + +class TestParallelDygraphNoVar(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + + def test_net(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_none_var.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestParallelDygraphSharedUnusedVariables(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + + def test_mnist(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_shared_unused_var.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + if __name__ == "__main__": unittest.main() From c3777f5ae520cf58f16404d211dcf7ba66506556 Mon Sep 17 00:00:00 2001 From: ForFishes <1422485404@qq.com> Date: Mon, 10 May 2021 19:23:07 +0800 Subject: [PATCH 3/6] fix doc --- python/paddle/fluid/dygraph/parallel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 2b2e3e0fb895b1..2be062962ec9d3 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -424,7 +424,8 @@ class DataParallel(layers.Layer): Examples: .. 
code-block:: python - + + # required: distributed import paddle import paddle.nn as nn import paddle.optimizer as opt From c07485403610028b00468f9c70dc9f7cba2a4c84 Mon Sep 17 00:00:00 2001 From: ForFishes <1422485404@qq.com> Date: Mon, 10 May 2021 20:02:46 +0800 Subject: [PATCH 4/6] fix bug of utest --- python/paddle/fluid/tests/unittests/test_dist_base.py | 9 ++++++--- .../unittests/test_parallel_dygraph_control_flow.py | 6 ++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 5ee53bca2ec40c..edc510e4e766d7 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -548,7 +548,10 @@ def run_trainer_with_spawn(self, args): # 4. train model model, train_reader, opt = self.get_model() if args.update_method == "nccl2": - model = paddle.DataParallel(model) + if args.find_unused_parameters: + model = paddle.DataParallel(model, find_unused_parameters=True) + else: + model = paddle.DataParallel(model, find_unused_parameters=False) out_losses = [] for step_id, data in enumerate(train_reader()): @@ -581,8 +584,8 @@ def run_use_fleet_api_trainer(self, args): # set strategy strategy = fleet.DistributedStrategy() - if not args.find_unused_parameters: - strategy.find_unused_parameters = False + if args.find_unused_parameters: + strategy.find_unused_parameters = True # 3. 
init parallel env if args.update_method == "nccl2" or "bkcl": diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py index fa571bde5e43bf..3c45b2c7950377 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py @@ -30,6 +30,7 @@ def _setup_config(self): self._sync_mode = False self._nccl2_mode = True self._dygraph = True + self._find_unused_parameters = True def test_net(self): if fluid.core.is_compiled_with_cuda(): @@ -46,6 +47,7 @@ def _setup_config(self): self._nccl2_mode = True self._dygraph = True self._use_fleet_api = True + self._find_unused_parameters = True class TestFleetDygraphControlFlowSameAccGrad(TestDygraphControlFlowSame): @@ -54,6 +56,7 @@ def _setup_config(self): self._nccl2_mode = True self._dygraph = True self._accumulate_gradient = True + self._find_unused_parameters = True class TestDygraphControlFlowDiff(TestDistBase): @@ -61,6 +64,7 @@ def _setup_config(self): self._sync_mode = False self._nccl2_mode = True self._dygraph = True + self._find_unused_parameters = True def test_net(self): if fluid.core.is_compiled_with_cuda(): @@ -77,6 +81,7 @@ def _setup_config(self): self._nccl2_mode = True self._dygraph = True self._use_fleet_api = True + self._find_unused_parameters = True class TestFleetDygraphControlFlowDiffAccGrad(TestDygraphControlFlowDiff): @@ -85,6 +90,7 @@ def _setup_config(self): self._nccl2_mode = True self._dygraph = True self._accumulate_gradient = True + self._find_unused_parameters = True if __name__ == "__main__": From e7bf08ef95cd3cccc86474f0e2f6cc7997c52228 Mon Sep 17 00:00:00 2001 From: ForFishes <1422485404@qq.com> Date: Mon, 10 May 2021 20:28:33 +0800 Subject: [PATCH 5/6] fix spawn --- python/paddle/fluid/tests/unittests/spawn_runner_base.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/python/paddle/fluid/tests/unittests/spawn_runner_base.py b/python/paddle/fluid/tests/unittests/spawn_runner_base.py index 278d7b27c52880..2719e28fea08b0 100644 --- a/python/paddle/fluid/tests/unittests/spawn_runner_base.py +++ b/python/paddle/fluid/tests/unittests/spawn_runner_base.py @@ -27,6 +27,7 @@ class SpawnAssistTestArgs(object): update_method = "local" trainer_id = 0 + find_unused_parameters = False class TestDistSpawnRunner(unittest.TestCase): From 201b14bab09ee300233b8d62779482c651496697 Mon Sep 17 00:00:00 2001 From: ForFishes <1422485404@qq.com> Date: Mon, 10 May 2021 23:54:38 +0800 Subject: [PATCH 6/6] fix coverage --- paddle/fluid/imperative/reducer.cc | 34 +++++++++---------- .../unittests/test_parallel_dygraph_mnist.py | 1 + 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 5d4215bc63caa2..0f6676ed48f349 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -563,23 +563,23 @@ void Reducer::PrepareForBackward( find_unused_vars_once_ = false; } - if (find_unused_vars_each_step_) { - if (unused_vars_.empty()) { - LOG_FIRST_N(WARNING, 1) - << "All parameters are involved in the backward pass. " - "It is recommended to set find_unused_parameters to False " - "to improve performance. However, if unused parameters " - "appear in subsequent iterative training, then an error " - "will occur. Please make it clear that in the subsequent " - "training, there will be no parameters that are not used " - "in the backward pass, and then set find_unused_parameters"; - } else if (unused_vars_.size() == vars_.size()) { - LOG_FIRST_N(WARNING, 1) - << "There is no parameter in the device involved " - "in the backward calculation. 
If there are " - "parameters on other devices involved in the " - "backward, then a serious error will occur here."; - } + if (find_unused_vars_each_step_ && unused_vars_.empty()) { + LOG_FIRST_N(WARNING, 1) + << "All parameters are involved in the backward pass. " + "It is recommended to set find_unused_parameters to False " + "to improve performance. However, if unused parameters " + "appear in subsequent iterative training, then an error " + "will occur. Please make it clear that in the subsequent " + "training, there will be no parameters that are not used " + "in the backward pass, and then set find_unused_parameters"; + } + + if (unused_vars_.size() == vars_.size()) { + LOG_FIRST_N(WARNING, 1) + << "There is no parameter in the device involved " + "in the backward calculation. If there are " + "parameters on other devices involved in the " + "backward, then a serious error will occur here."; } } diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py index 782d2304619f2a..0c55e135721ce8 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py @@ -31,6 +31,7 @@ def _setup_config(self): self._sync_mode = False self._nccl2_mode = True self._dygraph = True + self._find_unused_parameters = True def test_mnist(self): if fluid.core.is_compiled_with_cuda():