diff --git a/tests/v1/tpu/test_basic.py b/tests/v1/tpu/test_basic.py
index c8cd099a98cf..b9ee9d66a38f 100644
--- a/tests/v1/tpu/test_basic.py
+++ b/tests/v1/tpu/test_basic.py
@@ -18,6 +18,7 @@
 
 MODELS = [
     "Qwen/Qwen2.5-1.5B-Instruct",
+    "Qwen/Qwen1.5-MoE-A2.7B",
     # TODO: Enable this models with v6e
     # "Qwen/Qwen2-7B-Instruct",
     # "meta-llama/Llama-3.1-8B",
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 4a6a3b95ec7f..2a283a6d12b9 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -481,8 +481,16 @@ def forward_cpu(
         e_score_correction_bias: Optional[torch.Tensor] = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
-        **kwargs,
+        enable_eplb: bool = False,
+        expert_load_view: Optional[torch.Tensor] = None,
+        logical_to_physical_map: Optional[torch.Tensor] = None,
+        logical_replica_count: Optional[torch.Tensor] = None,
     ):
+        if enable_eplb is not False or expert_load_view is not None or \
+                logical_to_physical_map is not None or \
+                logical_replica_count is not None:
+            raise NotImplementedError("Expert load balancing is not supported "
+                                      "for CPU.")
         return layer.cpu_fused_moe(
             layer,
             x,
@@ -518,6 +526,10 @@ def forward_tpu(
         e_score_correction_bias: Optional[torch.Tensor] = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: Optional[torch.Tensor] = None,
+        logical_to_physical_map: Optional[torch.Tensor] = None,
+        logical_replica_count: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         assert not use_grouped_topk
         assert num_expert_group is None
@@ -531,6 +543,11 @@ def forward_tpu(
             raise NotImplementedError(
                 "Expert score correction bias is not supported for TPU.")
         assert activation == "silu", f"{activation} is not supported for TPU."
+        if enable_eplb is not False or expert_load_view is not None or \
+                logical_to_physical_map is not None or \
+                logical_replica_count is not None:
+            raise NotImplementedError("Expert load balancing is not supported "
+                                      "for TPU.")
         return fused_moe_pallas(hidden_states=x,
                                 w1=layer.w13_weight,
                                 w2=layer.w2_weight,
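
For reference, a minimal, self-contained sketch of the guard pattern the patch introduces: the new EPLB parameters are accepted so the CPU/TPU paths keep the same signature as other backends, and any non-default value fails loudly with NotImplementedError instead of being silently ignored. This is not part of the diff; the helper name check_eplb_unsupported is hypothetical and used only for illustration.

from typing import Optional

import torch


def check_eplb_unsupported(
    backend: str,
    enable_eplb: bool = False,
    expert_load_view: Optional[torch.Tensor] = None,
    logical_to_physical_map: Optional[torch.Tensor] = None,
    logical_replica_count: Optional[torch.Tensor] = None,
) -> None:
    # Hypothetical helper mirroring the guard added in forward_cpu/forward_tpu:
    # any non-default EPLB argument means the caller expects expert load
    # balancing, which these backends do not implement.
    if (enable_eplb or expert_load_view is not None
            or logical_to_physical_map is not None
            or logical_replica_count is not None):
        raise NotImplementedError(
            f"Expert load balancing is not supported for {backend}.")


check_eplb_unsupported("CPU")  # defaults only: no error
try:
    check_eplb_unsupported("TPU", enable_eplb=True)
except NotImplementedError as exc:
    print(exc)  # Expert load balancing is not supported for TPU.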