From 31db8e42b44a5809ad992d9800eea97c2bbec96b Mon Sep 17 00:00:00 2001 From: co63oc Date: Wed, 14 Feb 2024 09:15:41 +0800 Subject: [PATCH] Fix --- .../framework/ir/fused_attention_pass.cc | 24 +++++++++---------- .../fluid/framework/ir/fused_attention_pass.h | 10 ++++---- paddle/fluid/imperative/reducer.cc | 2 +- .../auto_parallel/static/operators/common.py | 4 ++-- .../fleet/utils/tensor_fusion_helper.py | 2 +- .../test_fleet_sharding_meta_optimizer.py | 2 +- 6 files changed, 22 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/framework/ir/fused_attention_pass.cc b/paddle/fluid/framework/ir/fused_attention_pass.cc index 2b1700669b5f8d..0de9a34bf28d0f 100644 --- a/paddle/fluid/framework/ir/fused_attention_pass.cc +++ b/paddle/fluid/framework/ir/fused_attention_pass.cc @@ -267,9 +267,9 @@ PDNode* FusedAttentionPattern::operator()(PDNode* x, PDNode* mp_allreduce_out_node{nullptr}; if (use_mp) { - mp_allreduce_out_node = pattern->NewNode(mp_allreudce_sum_out_repr()) + mp_allreduce_out_node = pattern->NewNode(mp_allreduce_sum_out_repr()) ->assert_is_op_output("mp_allreduce_sum"); - auto* mp_allreduce_node = pattern->NewNode(mp_allreudce_sum_op_repr()) + auto* mp_allreduce_node = pattern->NewNode(mp_allreduce_sum_op_repr()) ->assert_is_op("mp_allreduce_sum"); out_linear_ele_add_out_node->assert_is_op_input("mp_allreduce_sum"); mp_allreduce_node->LinksFrom({out_linear_ele_add_out_node}) @@ -460,9 +460,9 @@ PDNode* FusedAttentionGradPattern::operator()(PDNode* x, PDNode* mp_c_identity_out_node{nullptr}; if (use_mp) { - mp_c_identity_out_node = pattern->NewNode(mp_allreudce_sum_grad_out_repr()) + mp_c_identity_out_node = pattern->NewNode(mp_allreduce_sum_grad_out_repr()) ->assert_is_op_output("c_identity", "Out"); - auto* mp_c_identity_node = pattern->NewNode(mp_allreudce_sum_grad_op_repr()) + auto* mp_c_identity_node = pattern->NewNode(mp_allreduce_sum_grad_op_repr()) ->assert_is_op("c_identity"); out_linear_dropout_grad_out_node->assert_is_op_input("c_identity"); 
mp_c_identity_node->LinksFrom({out_linear_dropout_grad_out_node}) @@ -989,13 +989,13 @@ ir::Graph* FusedAttentionsPass::ForwardHandlerHelper( if (use_mp) { GET_IR_NODE_FROM_SUBGRAPH( c_identity_op_node, c_identity_op, fused_attention_pattern); - GET_IR_NODE_FROM_SUBGRAPH(mp_allreudce_sum_op_node, - mp_allreudce_sum_op, + GET_IR_NODE_FROM_SUBGRAPH(mp_allreduce_sum_op_node, + mp_allreduce_sum_op, fused_attention_pattern); remove_nodes.insert(c_identity_op_node); - remove_nodes.insert(mp_allreudce_sum_op_node); + remove_nodes.insert(mp_allreduce_sum_op_node); ring_id = PADDLE_GET_CONST( - int, mp_allreudce_sum_op_node->Op()->GetAttr("ring_id")); + int, mp_allreduce_sum_op_node->Op()->GetAttr("ring_id")); } std::string cache_anchor_name = fuse_qkv_matmul_w_node->Var()->Name(); @@ -1367,16 +1367,16 @@ ir::Graph* FusedAttentionsPass::BackwardHandlerHelper( int ring_id = -1; if (use_mp) { - GET_IR_NODE_FROM_SUBGRAPH(mp_allreudce_sum_grad_op_node, - mp_allreudce_sum_grad_op, + GET_IR_NODE_FROM_SUBGRAPH(mp_allreduce_sum_grad_op_node, + mp_allreduce_sum_grad_op, fused_attention_grad_pattern); GET_IR_NODE_FROM_SUBGRAPH(c_identity_grad_op_node, c_identity_grad_op, fused_attention_grad_pattern); - remove_nodes.insert(mp_allreudce_sum_grad_op_node); + remove_nodes.insert(mp_allreduce_sum_grad_op_node); remove_nodes.insert(c_identity_grad_op_node); ring_id = PADDLE_GET_CONST( - int, mp_allreudce_sum_grad_op_node->Op()->GetAttr("ring_id")); + int, mp_allreduce_sum_grad_op_node->Op()->GetAttr("ring_id")); } OpDesc fused_attention_grad_op_desc( diff --git a/paddle/fluid/framework/ir/fused_attention_pass.h b/paddle/fluid/framework/ir/fused_attention_pass.h index 79d051f6dad6d1..e22b6121f5c3f6 100644 --- a/paddle/fluid/framework/ir/fused_attention_pass.h +++ b/paddle/fluid/framework/ir/fused_attention_pass.h @@ -117,9 +117,9 @@ struct FusedAttentionPattern : public PatternBase { PATTERN_DECL_NODE(out_linear_ele_add_bias); PATTERN_DECL_NODE(out_linear_ele_add_out); - // allreudce for 
mp - PATTERN_DECL_NODE(mp_allreudce_sum_op); - PATTERN_DECL_NODE(mp_allreudce_sum_out); + // allreduce for mp + PATTERN_DECL_NODE(mp_allreduce_sum_op); + PATTERN_DECL_NODE(mp_allreduce_sum_out); PATTERN_DECL_NODE(out_linear_dropout_op); PATTERN_DECL_NODE(out_linear_dropout_out); @@ -174,8 +174,8 @@ struct FusedAttentionGradPattern : public PatternBase { PATTERN_DECL_NODE(out_linear_dropout_grad_out); // c_identity for mp - PATTERN_DECL_NODE(mp_allreudce_sum_grad_op); // c_identity - PATTERN_DECL_NODE(mp_allreudce_sum_grad_out); + PATTERN_DECL_NODE(mp_allreduce_sum_grad_op); // c_identity + PATTERN_DECL_NODE(mp_allreduce_sum_grad_out); PATTERN_DECL_NODE(out_linear_ele_add_grad_op); PATTERN_DECL_NODE(out_linear_ele_add_grad_x); diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 4bbc52662fc96e..d0d5c78b26ea32 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -572,7 +572,7 @@ void Reducer::TraverseBackwardGraph( } // After each batch is calculated, the counter of each group(group.pending_) -// and allreudce sequence counter(next_group_) will be cleaned up again. +// and allreduce sequence counter(next_group_) will be cleaned up again. 
void Reducer::PrepareForBackward( const std::vector<std::shared_ptr<imperative::VarBase>> &outputs) { VLOG(3) << "after forward, then reset count for backward."; diff --git a/python/paddle/distributed/auto_parallel/static/operators/common.py b/python/paddle/distributed/auto_parallel/static/operators/common.py index 75a45a510b0cad..ed081c18c78339 100644 --- a/python/paddle/distributed/auto_parallel/static/operators/common.py +++ b/python/paddle/distributed/auto_parallel/static/operators/common.py @@ -488,7 +488,7 @@ def get_data_parallel_group(dist_ctx, op, act_grad_names, rank): def sync_and_scale_gradients(dist_ctx, op, groups, allreduce_var_names): """ - insert the allreudce and scale ops for gradients of model + insert the allreduce and scale ops for gradients of model parameters for operator in data parallelism. Args: @@ -608,7 +608,7 @@ def gradient_synchronization( dist_ctx, op, act_grad_names, out_grad_names, rank ): """ - conduct the allreudce and scaling for gradients of model + conduct the allreduce and scaling for gradients of model parameters for operator in parallelism train. Args: diff --git a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py index 3c79671e6d9a77..959f9eb49f40ff 100644 --- a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py +++ b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py @@ -359,7 +359,7 @@ def __init__( assert dst != -1 else: raise ValueError( - "The act should be allreudce for dp or reduce for sharding." + "The act should be allreduce for dp or reduce for sharding." 
) self._dst = dst diff --git a/test/collective/fleet/test_fleet_sharding_meta_optimizer.py b/test/collective/fleet/test_fleet_sharding_meta_optimizer.py index 5d4ca28e12dfb2..f89b5a706e7809 100755 --- a/test/collective/fleet/test_fleet_sharding_meta_optimizer.py +++ b/test/collective/fleet/test_fleet_sharding_meta_optimizer.py @@ -880,7 +880,7 @@ def test_sharding_hybrid_dp(self): loss_scale = 1.0 / scale self.assertAlmostEqual(float(op.attr('value')), loss_scale) - # check program (allreudce) + # check program (allreduce) ops = [op.type for op in main_prog_ops] self.assertEqual( ops,