@@ -481,8 +481,16 @@ def forward_cpu(
         e_score_correction_bias: Optional[torch.Tensor] = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
-        **kwargs,
+        enable_eplb: bool = False,
+        expert_load_view: Optional[torch.Tensor] = None,
+        logical_to_physical_map: Optional[torch.Tensor] = None,
+        logical_replica_count: Optional[torch.Tensor] = None,
     ):
+        if enable_eplb is not False or expert_load_view is not None or \
+                logical_to_physical_map is not None or \
+                logical_replica_count is not None:
+            raise NotImplementedError("Expert load balancing is not supported "
+                                      "for CPU.")
         return layer.cpu_fused_moe(
             layer,
             x,
@@ -518,6 +526,10 @@ def forward_tpu(
         e_score_correction_bias: Optional[torch.Tensor] = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: Optional[torch.Tensor] = None,
+        logical_to_physical_map: Optional[torch.Tensor] = None,
+        logical_replica_count: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         assert not use_grouped_topk
         assert num_expert_group is None
@@ -531,6 +543,11 @@ def forward_tpu(
             raise NotImplementedError(
                 "Expert score correction bias is not supported for TPU.")
         assert activation == "silu", f"{activation} is not supported for TPU."
+        if enable_eplb is not False or expert_load_view is not None or \
+                logical_to_physical_map is not None or \
+                logical_replica_count is not None:
+            raise NotImplementedError("Expert load balancing is not supported "
+                                      "for TPU.")
         return fused_moe_pallas(hidden_states=x,
                                 w1=layer.w13_weight,
                                 w2=layer.w2_weight,
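
Both hunks follow the same pattern: the CPU and TPU forward paths accept the new EPLB (expert parallelism load balancing) keyword arguments for interface parity with the other backends, but reject any non-default value up front instead of silently ignoring it. Below is a minimal standalone sketch of that guard; the simplified signature, the name forward_cpu_sketch, and the example call are illustrative only, not the full methods from the diff.

from typing import Optional

import torch


def forward_cpu_sketch(
    x: torch.Tensor,
    enable_eplb: bool = False,
    expert_load_view: Optional[torch.Tensor] = None,
    logical_to_physical_map: Optional[torch.Tensor] = None,
    logical_replica_count: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    # Simplified stand-in for the patched forward_cpu: accept the EPLB
    # arguments so the signature matches the common interface, but fail
    # loudly if a caller actually tries to use them on this backend.
    if enable_eplb is not False or expert_load_view is not None or \
            logical_to_physical_map is not None or \
            logical_replica_count is not None:
        raise NotImplementedError(
            "Expert load balancing is not supported for CPU.")
    return x  # placeholder for the real fused-MoE computation


# Default arguments pass through; enabling EPLB raises immediately.
hidden = torch.randn(4, 8)
forward_cpu_sketch(hidden)
try:
    forward_cpu_sketch(hidden, enable_eplb=True)
except NotImplementedError as err:
    print(err)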