Commit 2a85428

small fix

Signed-off-by: MengqingCao <[email protected]>
1 parent 988ab44 commit 2a85428

File tree

1 file changed: +6 −0 lines changed


vllm_ascend/ops/fused_moe.py

Lines changed: 6 additions & 0 deletions
@@ -623,6 +623,7 @@ def apply(
         scoring_func: str = "softmax",
         e_score_correction_bias: Optional[torch.Tensor] = None,
         is_prefill: bool = False,
+        enable_force_load_balance: bool = False,
         **kwargs,
     ):
         # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern
@@ -655,6 +656,11 @@ def apply(
         )

         topk_weights = topk_weights.to(x.dtype)
+        # This is a naive implementation of expert load balancing, used to
+        # avoid accumulating too many tokens on a single rank.
+        # Currently it is only activated during profile runs.
+        if enable_force_load_balance:
+            topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)

         if VLLM_ENABLE_MC2 and not is_prefill:
             return fused_experts_with_mc2(
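
For context, a minimal standalone sketch of the force-load-balance behaviour introduced here: when the flag is set, the router's expert choices are replaced with uniformly random expert ids so that no single rank accumulates a disproportionate share of tokens during a profile run. The helper name and the example shapes below are illustrative assumptions, not part of the vllm-ascend API.

import torch

def force_balance_topk_ids(topk_ids: torch.Tensor,
                           global_num_experts: int) -> torch.Tensor:
    # Illustrative helper (not from vllm-ascend): replace the router's expert
    # choices with uniformly random expert ids, mirroring the
    # torch.randint_like call added in this commit.
    return torch.randint_like(topk_ids, 0, global_num_experts)

# Example: 4 tokens routed top-2 over 8 experts; here the router happens to
# pick experts 0 and 1 for every token, which would overload their rank.
topk_ids = torch.tensor([[0, 1], [0, 1], [0, 1], [0, 1]])
balanced = force_balance_topk_ids(topk_ids, global_num_experts=8)
print(balanced.shape)  # torch.Size([4, 2]); ids are now uniform over 0..7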
