Commit fe75274
[Feature][EPLB] Add support for unquantized models
Signed-off-by: hsliu <[email protected]>
1 parent 1bf6513 commit fe75274
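
In short: `UnquantizedFusedMoEMethod.apply` no longer raises NotImplementedError when EPLB (expert parallelism load balancing) is enabled; it now validates the EPLB arguments and threads them through `forward_cuda` into `FusedMoE.select_experts`, and the EPLB support check in `FusedMoE.__init__` accepts unquantized layers alongside FP8. As a toy illustration of what the three EPLB tensors carry (the shapes and values below are assumptions for illustration, not vLLM's actual EPLB state construction):

import torch

# Toy example: four logical experts spread over six physical slots.
logical_to_physical_map = torch.tensor([
    [0, 4],   # logical expert 0 is replicated at physical slots 0 and 4
    [1, -1],  # -1 pads logical experts that have fewer replicas
    [2, 5],
    [3, -1],
])
logical_replica_count = torch.tensor([2, 1, 2, 1])

# Per-physical-expert load counter that routing updates in place.
expert_load_view = torch.zeros(6, dtype=torch.int64)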

1 file changed: 39 additions & 14 deletions

vllm/model_executor/layers/fused_moe/layer.py
@@ -375,8 +375,10 @@ def apply(
         logical_replica_count: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         if enable_eplb:
-            raise NotImplementedError(
-                "EPLB not supported for `UnquantizedFusedMoEMethod` yet.")
+            assert expert_load_view is not None
+            assert logical_to_physical_map is not None
+            assert logical_replica_count is not None
+            assert isinstance(layer, FusedMoE)
 
         return self.forward(
             x=x,
@@ -393,7 +395,12 @@ def apply(
             scoring_func=scoring_func,
             e_score_correction_bias=e_score_correction_bias,
             activation=activation,
-            apply_router_weight_on_input=apply_router_weight_on_input)
+            apply_router_weight_on_input=apply_router_weight_on_input,
+            enable_eplb=enable_eplb,
+            expert_load_view=expert_load_view,
+            logical_to_physical_map=logical_to_physical_map,
+            logical_replica_count=logical_replica_count
+        )
 
     def forward_cuda(
         self,
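
The asserts do double duty: they fail fast if the engine enabled EPLB without wiring up the tensors, and they narrow the Optional types for static checkers before the values are used downstream. A minimal sketch of the pattern (hypothetical helper, not from this diff):

from typing import Optional

import torch

def consume_load_view(expert_load_view: Optional[torch.Tensor]) -> int:
    # Fails fast if EPLB was enabled without wiring up the tensor, and
    # narrows Optional[Tensor] to Tensor for static type checkers.
    assert expert_load_view is not None
    return int(expert_load_view.sum().item())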
@@ -412,7 +419,16 @@ def forward_cuda(
         e_score_correction_bias: Optional[torch.Tensor] = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: Optional[torch.Tensor] = None,
+        logical_to_physical_map: Optional[torch.Tensor] = None,
+        logical_replica_count: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+        if enable_eplb:
+            assert expert_load_view is not None
+            assert logical_to_physical_map is not None
+            assert logical_replica_count is not None
+            assert isinstance(layer, FusedMoE)
 
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
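
Because the new `forward_cuda` parameters default to False/None, pre-existing call sites keep working unchanged; only EPLB-aware callers pass the extra tensors. A toy stand-in (hypothetical names) showing the same signature evolution:

from typing import Optional

import torch

def forward_cuda_like(x: torch.Tensor,
                      enable_eplb: bool = False,
                      expert_load_view: Optional[torch.Tensor] = None,
                      ) -> torch.Tensor:
    # Same guard pattern as the diff: only validate when the feature is on.
    if enable_eplb:
        assert expert_load_view is not None
    return x

forward_cuda_like(torch.ones(2))  # legacy call site, still valid
forward_cuda_like(torch.ones(2), enable_eplb=True,
                  expert_load_view=torch.zeros(4))  # EPLB-aware call site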
@@ -425,7 +441,13 @@ def forward_cuda(
             custom_routing_function=custom_routing_function,
             scoring_func=scoring_func,
             e_score_correction_bias=e_score_correction_bias,
-            indices_type=self.topk_indices_dtype)
+            indices_type=self.topk_indices_dtype,
+            enable_eplb=enable_eplb,
+            expert_map=expert_map,
+            expert_load_view=expert_load_view,
+            logical_to_physical_map=logical_to_physical_map,
+            logical_replica_count=logical_replica_count
+        )
 
         if self.rocm_aiter_moe_enabled:
             return self.rocm_aiter_fused_experts(
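
With the extra arguments, `select_experts` can account for expert replicas during routing. The sketch below is a rough, hypothetical picture of logical-to-physical redirection plus load accounting; it is not vLLM's actual `select_experts` logic, and the uniform-random replica choice is an assumption:

import torch

def route_to_physical(topk_ids: torch.Tensor,
                      logical_to_physical_map: torch.Tensor,
                      logical_replica_count: torch.Tensor,
                      expert_load_view: torch.Tensor) -> torch.Tensor:
    ids = topk_ids.long()
    # Choose one replica per routed token; a uniform random pick is an
    # assumption here, not the documented policy.
    replica = torch.randint(0, 2**31 - 1, ids.shape) % logical_replica_count[ids]
    physical_ids = logical_to_physical_map[ids, replica]
    # Tally how many tokens each physical expert received.
    expert_load_view.scatter_add_(
        0, physical_ids.flatten(),
        torch.ones(physical_ids.numel(), dtype=expert_load_view.dtype))
    return physical_ids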
@@ -730,16 +752,19 @@ def __init__(
         if self.enable_eplb:
             from vllm.model_executor.layers.quantization.fp8 import (
                 Fp8MoEMethod)
-            if not isinstance(quant_method, Fp8MoEMethod):
-                # TODO: Add support for additional quantization methods.
-                # The implementation for other quantization methods does not
-                # contain essential differences, but the current quant API
-                # design causes duplicated work when extending to new
-                # quantization methods, so I'm leaving it for now.
-                # If you plan to add support for more quantization methods,
-                # please refer to the implementation in `Fp8MoEMethod`.
-                raise NotImplementedError("EPLB is only supported for FP8 "
-                                          "quantization for now.")
+
+            # TODO: Add support for additional quantization methods.
+            SUPPORTED_MOE_QUANT_METHODS = {
+                UnquantizedFusedMoEMethod,
+                Fp8MoEMethod,
+            }
+            quant_method_type = type(quant_method)
+
+            if quant_method_type not in SUPPORTED_MOE_QUANT_METHODS:
+                raise NotImplementedError(
+                    "EPLB is only supported for the following quantization "
+                    f"methods: {[cls.__name__ for cls in SUPPORTED_MOE_QUANT_METHODS]}"
+                )
 
         moe_quant_params = {
             "num_experts": self.local_num_experts,
