@@ -166,11 +166,11 @@ def __init__(
         # In all other modes (EP, EP+attn-TP, no parallelism) each branch handles
         # its own reduction internally (reduce_results default=True), so we must
         # NOT add an extra all-reduce here.
-        self._pure_tp = self.use_tp and not self.use_ep
+        self.merge_ffn_tp = self.use_tp and not self.use_ep

         self.experts = FusedMoE(
             fd_config,
-            reduce_results=not self._pure_tp,
+            reduce_results=not self.merge_ffn_tp,
             renormalize=self.norm_topk_prob,
             moe_intermediate_size=fd_config.model_config.moe_intermediate_size,
             num_experts=fd_config.model_config.n_routed_experts,
@@ -191,14 +191,14 @@ def __init__(
             intermediate_size=shared_experts_intermediate_size,
             layer_id=layer_id,
             prefix=f"{prefix}.shared_experts",
-            reduce_results=not self._pure_tp,
+            reduce_results=not self.merge_ffn_tp,
         )

     def forward(self, x, forward_meta: ForwardMeta = None):
         out = self.experts(x, self.gate, forward_meta)
         if self.n_shared_experts > 0:
             out = out + self.shared_experts(x)
-        if self.merge_ffn_tp:
+        if self.merge_ffn_tp:
         if self._pure_tp:
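For context, here is a minimal standalone sketch (not part of this diff) of the equivalence the `merge_ffn_tp` path relies on: an all-reduce sums partial results across TP ranks, so it is linear, and reducing each branch separately gives the same result as combining the partial sums first and reducing once. The `all_reduce` helper below is a hypothetical single-process stand-in for `tensor_model_parallel_all_reduce`.

```python
# Sketch only: simulate TP ranks in one process to show that
# all_reduce(a) + all_reduce(b) == all_reduce(a + b).
import numpy as np

def all_reduce(partials):
    # Stand-in for tensor_model_parallel_all_reduce: sum over TP ranks.
    return sum(partials)

rng = np.random.default_rng(0)
tp_size = 4
experts_partials = [rng.standard_normal(8) for _ in range(tp_size)]  # routed experts, per rank
shared_partials = [rng.standard_normal(8) for _ in range(tp_size)]   # shared experts, per rank

# Two collectives: each branch reduces internally (reduce_results=True).
separate = all_reduce(experts_partials) + all_reduce(shared_partials)

# One fused collective: add partial sums per rank, then reduce once.
fused = all_reduce([e + s for e, s in zip(experts_partials, shared_partials)])

assert np.allclose(separate, fused)
```

With `merge_ffn_tp` set, both `FusedMoE` and the shared experts skip their internal reduction (`reduce_results=False`), and the single collective at the end of `forward` replaces two all-reduces per layer.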