@@ -297,7 +297,7 @@ Reducer::Reducer(const std::vector<std::shared_ptr<imperative::VarBase>> &vars,
       is_sparse_gradient_(is_sparse_gradient),
       parallel_ctx_(parallel_ctx),
       group_size_limits_(group_size_limits),
-      find_unused_vars_(find_unused_vars) {
+      find_unused_vars_each_step_(find_unused_vars) {
   VLOG(3) << "Start construct the Reducer ...";
   nrings_ = parallel_ctx->GetNRings();
   nranks_ = parallel_ctx->GetNRanks();
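Note on the rename above: find_unused_vars_each_step_ now records only the user-requested per-step detection (the find_unused_parameters switch), while later hunks also consult a one-shot flag, find_unused_vars_once_, whose declaration is not part of this excerpt. A minimal sketch of how the two members presumably look in reducer.h, with an assumed default so the graph is still traversed on the first step:

// Hedged sketch of the assumed member declarations (not shown in this diff):
bool find_unused_vars_each_step_{false};  // user's find_unused_parameters flag
bool find_unused_vars_once_{true};        // forces a single traversal on step 0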
@@ -457,42 +457,8 @@ void Reducer::PrepareDeps(const std::unordered_set<GradOpNode *> &init_nodes) {
   }
 }
 
-// After each batch is calculated, the counter of each group(group.pending_)
-// and allreduce sequence counter(next_group_) will be cleaned up again.
-void Reducer::PrepareForBackward(
+void Reducer::TraverseBackwardGraph(
     const std::vector<std::shared_ptr<imperative::VarBase>> &outputs) {
-  VLOG(3) << "after forward, then reset count for backward.";
-  next_group_ = 0;
-  std::for_each(groups_.begin(), groups_.end(), [](Group &group) {
-    group.pending_ = group.variable_indices_.size();
-    group.sparse_contents_ = nullptr;
-  });
-
-  // reinitialize vars_marked_ready_ for next iteration
-  vars_marked_ready_.clear();
-  vars_marked_ready_.resize(vars_.size(), false);
-
-  PADDLE_ENFORCE_EQ(
-      groups_need_finalize_, false,
-      platform::errors::PreconditionNotMet(
-          "A serious error has occurred here. There may be several reasons: "
-          "1) Please note that all forward outputs derived from the module "
-          "parameters must participate in the calculation of losses and "
-          "subsequent gradient calculations. If not, the wrapper will hang, "
-          "waiting for autograd to generate gradients for these parameters. "
-          "You can use detach or stop_gradient to make the unused parameters "
-          "detached from the autograd graph. "
-          "2) Used multiple forwards and one backward. You may be able to wrap "
-          "multiple forwards in a model."));
-
-  // The first var to trigger the unused parameter
-  has_marked_unused_vars_ = false;
-  unused_vars_.clear();
-
-  if (!find_unused_vars_) {
-    return;
-  }
-
   node_deps_.clear();
   std::queue<std::shared_ptr<GradOpNode>> q;
   std::unordered_set<VariableWrapper *> var_visited;
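The traversal body that the next hunk closes is the pre-existing breadth-first walk over the autograd graph. As a standalone toy illustration of the same idea (an assumption-based simplification, not Paddle code): visit every grad node reachable from the outputs and treat any parameter whose grad node is never reached as unused.

// Toy BFS illustrating how "unused" parameters fall out of reachability.
#include <iostream>
#include <queue>
#include <unordered_set>
#include <vector>

int main() {
  // adjacency list of a tiny backward graph: grad node -> downstream grad nodes
  std::vector<std::vector<int>> succ = {{1, 2}, {3}, {}, {}};
  std::unordered_set<int> visited;
  std::queue<int> q;
  q.push(0);  // seeded from the outputs' grad nodes
  while (!q.empty()) {
    int node = q.front();
    q.pop();
    if (!visited.insert(node).second) continue;
    for (int next : succ[node]) q.push(next);
  }
  for (int param_node : {1, 2, 3, 4}) {  // node 4 has no path from the outputs
    if (visited.count(param_node) == 0) {
      std::cout << "parameter node " << param_node << " is not used\n";
    }
  }
  return 0;
}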
@@ -554,8 +520,50 @@ void Reducer::PrepareForBackward(
               << "] is not used";
     }
   }
+}
 
-  if (unused_vars_.empty()) {
+// After each batch is calculated, the counter of each group(group.pending_)
+// and allreduce sequence counter(next_group_) will be cleaned up again.
+void Reducer::PrepareForBackward(
+    const std::vector<std::shared_ptr<imperative::VarBase>> &outputs) {
+  VLOG(3) << "after forward, then reset count for backward.";
+  next_group_ = 0;
+  std::for_each(groups_.begin(), groups_.end(), [](Group &group) {
+    group.pending_ = group.variable_indices_.size();
+    group.sparse_contents_ = nullptr;
+  });
+
+  // reinitialize vars_marked_ready_ for next iteration
+  vars_marked_ready_.clear();
+  vars_marked_ready_.resize(vars_.size(), false);
+
+  PADDLE_ENFORCE_EQ(
+      groups_need_finalize_, false,
+      platform::errors::PreconditionNotMet(
+          "A serious error has occurred here. Please "
+          "set find_unused_parameters=True to traverse backward graph "
+          "in each step to prepare reduce in advance. If you have "
+          "set, there may be several reasons for this error: "
+          "1) Please note that all forward outputs derived from the module "
+          "parameters must participate in the calculation of losses and "
+          "subsequent gradient calculations. If not, the wrapper will hang, "
+          "waiting for autograd to generate gradients for these parameters. "
+          "You can use detach or stop_gradient to make the unused parameters "
+          "detached from the autograd graph. "
+          "2) Used multiple forwards and one backward. You may be able to wrap "
+          "multiple forwards in a model."));
+
+  // The first var to trigger the unused parameter
+  has_marked_unused_vars_ = false;
+
+  if (find_unused_vars_once_ || find_unused_vars_each_step_) {
+    unused_vars_.clear();
+    TraverseBackwardGraph(outputs);
+    // only check once in first step
+    find_unused_vars_once_ = false;
+  }
+
+  if (find_unused_vars_each_step_ && unused_vars_.empty()) {
     LOG_FIRST_N(WARNING, 1)
         << "All parameters are involved in the backward pass. "
           "It is recommended to set find_unused_parameters to False "
@@ -564,7 +572,9 @@ void Reducer::PrepareForBackward(
           "will occur. Please make it clear that in the subsequent "
           "training, there will be no parameters that are not used "
           "in the backward pass, and then set find_unused_parameters";
-  } else if (unused_vars_.size() == vars_.size()) {
+  }
+
+  if (unused_vars_.size() == vars_.size()) {
     LOG_FIRST_N(WARNING, 1)
         << "There is no parameter in the device involved "
           "in the backward calculation. If there are "
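Taken with the previous hunk, PrepareForBackward now distinguishes a one-shot traversal from per-step detection: the graph is always walked on the first step, and on later steps only when find_unused_parameters=True; otherwise the unused_vars_ computed on step 0 is reused. A standalone restatement of that gating (illustration only, assumed defaults, not the literal Paddle code):

#include <vector>

// Illustrative state mirroring the flags used above (assumed defaults).
struct DetectState {
  bool find_unused_vars_once_ = true;        // flips to false after the first step
  bool find_unused_vars_each_step_ = false;  // user's find_unused_parameters flag
  std::vector<int> unused_vars_;
};

// One PrepareForBackward-style step; unused_from_graph stands in for TraverseBackwardGraph().
void prepare_step(DetectState &s, const std::vector<int> &unused_from_graph) {
  if (s.find_unused_vars_once_ || s.find_unused_vars_each_step_) {
    s.unused_vars_ = unused_from_graph;
    s.find_unused_vars_once_ = false;  // only check once in the first step
  }
  // With per-step detection off, unused_vars_ keeps the result from step 0.
}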
@@ -595,13 +605,13 @@ void Reducer::AddDistHook(size_t var_index) {
 
   local_used_vars_[var_index] = 1;
 
-  // rebuild group when find_unused_vars_ is false
+  // rebuild group when find_unused_vars_each_step_ is false
   if (NeedRebuildGroup()) {
     rebuild_vars_.push_back(vars_[var_index]);
     rebuild_var_indices_.push_back(var_index);
   }
 
-  if (!has_marked_unused_vars_ && find_unused_vars_) {
+  if (!has_marked_unused_vars_) {
     has_marked_unused_vars_ = true;
     for (const auto &unused_index : unused_vars_) {
       MarkVarReady(unused_index, false);
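The updated comment refers to NeedRebuildGroup(), whose definition sits elsewhere in the file and is not shown here. Presumably it now keys off the renamed flag; a hedged guess at its shape:

// Assumed form after the rename (the real definition is not part of this diff and may differ):
bool Reducer::NeedRebuildGroup() {
  return !has_rebuilt_group_ && !find_unused_vars_each_step_;
}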
@@ -622,7 +632,9 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) {
   if (vars_marked_ready_[var_index]) {
     auto error_info = string::Sprintf(
         "Error happened, when parameter[%d][%s] has been ready before. "
-        "There may be several reasons for this error: "
+        "Please set find_unused_parameters=True to traverse backward graph "
+        "in each step to prepare reduce in advance. If you have set, "
+        "there may be several reasons for this error: "
         "1) In multiple reentrant backward phase, some parameters are reused."
         "2) Using model parameters outside of forward function. Please "
         "make sure that model parameters are not shared in concurrent "
@@ -690,10 +702,16 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) {
     }
   } else {
     // process sparse group
-    PADDLE_ENFORCE_EQ(HasGrad(var_index), true,
-                      platform::errors::PreconditionNotMet(
-                          "The sparse parameter[%d][%s] must have a gradient",
-                          var_index, vars_[var_index]->Name()));
+    PADDLE_ENFORCE_EQ(
+        HasGrad(var_index), true,
+        platform::errors::PreconditionNotMet(
+            "The sparse parameter[%d][%s] should have gradient. "
+            "Currently, DataParallel does not support sparse "
+            "parameters without generating gradients during training. "
+            "For example, if is_sparse=True is used in Embedding, "
+            "the current step of this parameter cannot generate gradient "
+            "because of stop_gradient/detach, where error will occur.",
+            var_index, vars_[var_index]->Name()));
     auto var_base = vars_[var_index]->GradVarBase();
     // need to check tensor type
     PADDLE_ENFORCE_EQ(
@@ -943,7 +961,7 @@ void Reducer::FinalizeBackward() {
     InitializeGroups(group_indices_);
   }
 
-  if (find_unused_vars_) {
+  if (find_unused_vars_each_step_) {
 // TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     ProcessUnusedDenseVars();
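ProcessUnusedDenseVars stays tied to the per-step flag: it is only needed when unused parameters can change from step to step, since it presumably reconciles each rank's local usage before gradients are finalized. A toy illustration of that kind of reconciliation (assumption-based sketch, not Paddle code):

#include <cstdio>
#include <vector>

// Toy reconciliation of per-rank usage flags (stands in for the cross-rank step
// ProcessUnusedDenseVars presumably performs; names and shapes are assumptions).
int main() {
  std::vector<std::vector<int>> local_used = {{1, 0, 1}, {1, 0, 0}};  // 2 ranks, 3 params
  std::vector<int> global_used(3, 0);
  for (const auto &rank_flags : local_used) {  // plays the role of an allreduce(sum)
    for (size_t i = 0; i < rank_flags.size(); ++i) global_used[i] += rank_flags[i];
  }
  for (size_t i = 0; i < global_used.size(); ++i) {
    if (global_used[i] == 0) std::printf("parameter %zu produced no gradient on any rank\n", i);
  }
  return 0;
}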