@@ -119,7 +119,6 @@ def __init__(self, moe: FusedMoEConfig):
         super().__init__()
         self.moe = moe
         self.moe_quant_config: FusedMoEQuantConfig | None = None
-        self.topk_indices_dtype = None
 
     @abstractmethod
     def create_weights(
@@ -244,7 +243,7 @@ def maybe_make_prepare_finalize(self) -> FusedMoEPrepareAndFinalize | None:
         else:
             return None
 
-    def init_prepare_finalize(
+    def maybe_init_modular_kernel(
         self, layer: torch.nn.Module
     ) -> FusedMoEModularKernel | None:
         assert self.moe is not None
@@ -260,8 +259,6 @@ def init_prepare_finalize(
         logger.debug(
             "%s for %s(%s)", prepare_finalize.__class__.__name__, self, id(self)
         )
-        assert self.topk_indices_dtype is None
-        self.topk_indices_dtype = prepare_finalize.topk_indices_dtype()
         experts = self.select_gemm_impl(prepare_finalize, layer)
         return FusedMoEModularKernel(
             prepare_finalize,
@@ -289,6 +286,10 @@ def get_fused_moe_quant_config(
     ) -> FusedMoEQuantConfig | None:
         raise NotImplementedError
 
+    @property
+    def topk_indices_dtype(self) -> torch.dtype | None:
+        return None
+
     @property
     def supports_eplb(self) -> bool:
         return False
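
For reference, a minimal standalone sketch of the pattern this hunk introduces (the class names below are illustrative, not vLLM's): topk_indices_dtype becomes a read-only property with a None default that concrete methods override, instead of an attribute mutated during initialization.

import torch


class BaseMethod:
    # Default: the method imposes no constraint on the top-k index dtype.
    @property
    def topk_indices_dtype(self) -> torch.dtype | None:
        return None


class Int32IndicesMethod(BaseMethod):
    # A concrete method simply overrides the property.
    @property
    def topk_indices_dtype(self) -> torch.dtype | None:
        return torch.int32


assert BaseMethod().topk_indices_dtype is None
assert Int32IndicesMethod().topk_indices_dtype is torch.int32
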
@@ -328,31 +329,33 @@ def apply(
 class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
     def __init__(
         self,
-        old_moe_method: FusedMoEMethodBase,
+        old_quant_method: FusedMoEMethodBase,
         fused_experts: FusedMoEModularKernel,
     ):
-        super().__init__(old_moe_method.moe)
-        # Find better way to copy attributes?
-        # self.__dict__.update(old_moe_method.__dict__)
-
-        self.moe_quant_config = old_moe_method.moe_quant_config
+        super().__init__(old_quant_method.moe)
+        # Find better way to copy attributes? Should we even copy attributes?
+        # self.__dict__.update(old_quant_method.__dict__)
+        self.moe_quant_config = old_quant_method.moe_quant_config
         self.fused_experts = fused_experts
-        self.topk_indices_dtype = old_moe_method.topk_indices_dtype
-        self.disable_expert_map = not fused_experts.supports_expert_map()
-        self.old_method_name = old_moe_method.__class__.__name__
-        self._supports_eplb = old_moe_method.supports_eplb
-        self._allow_inplace = old_moe_method.allow_inplace
-        if isinstance(old_moe_method, torch.nn.Module):
-            self.load_state_dict(old_moe_method.state_dict())
-        logger.debug("Swapping out %s", self.old_method_name)
+        self.disable_expert_map = getattr(
+            old_quant_method,
+            "disable_expert_map",
+            not fused_experts.supports_expert_map(),
+        )
+        self.old_quant_method = old_quant_method
+        logger.debug("Swapping out %s", self.old_quant_method.__class__.__name__)
+
+    @property
+    def topk_indices_dtype(self) -> torch.dtype | None:
+        return self.fused_experts.prepare_finalize.topk_indices_dtype()
 
     @property
     def supports_eplb(self) -> bool:
-        return self._supports_eplb
+        return self.old_quant_method.supports_eplb
 
     @property
     def allow_inplace(self) -> bool:
-        return self._allow_inplace
+        return self.old_quant_method.allow_inplace
 
     def create_weights(
         self,
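
The constructor above stops copying flags off the wrapped method and instead keeps a reference and delegates through properties. A standalone sketch of that delegation pattern, using hypothetical class names rather than the real vLLM types:

class InnerMethod:
    # Stand-in for the wrapped quantization method.
    supports_eplb = True
    allow_inplace = False


class DelegatingMethod:
    def __init__(self, inner: InnerMethod):
        # Keep a reference instead of copying attributes at construction
        # time, so later changes on the inner object remain visible.
        self.inner = inner

    @property
    def supports_eplb(self) -> bool:
        return self.inner.supports_eplb

    @property
    def allow_inplace(self) -> bool:
        return self.inner.allow_inplace


wrapper = DelegatingMethod(InnerMethod())
assert wrapper.supports_eplb and not wrapper.allow_inplace
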
@@ -405,10 +408,11 @@ def apply(
                 assert isinstance(layer, FusedMoE)
             else:
                 raise NotImplementedError(
-                    f"EPLB is not supported for {self.old_method_name}"
+                    "EPLB is not supported for "
+                    f"{self.old_quant_method.__class__.__name__}."
                 )
 
-        select_result = FusedMoE.select_experts(
+        topk_weights, topk_ids, zero_expert_result = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
             use_grouped_topk=use_grouped_topk,
@@ -431,8 +435,6 @@ def apply(
             zero_expert_type=zero_expert_type,
         )
 
-        topk_weights, topk_ids, zero_expert_result = select_result
-
         result = self.fused_experts(
             hidden_states=x,
             w1=layer.w13_weight,
@@ -1421,7 +1423,7 @@ def _get_quant_method() -> FusedMoEMethodBase:
             )
 
             if not isinstance(
-                quant_method, (UnquantizedFusedMoEMethod, ModelOptFp8MoEMethod)
+                self.quant_method, (UnquantizedFusedMoEMethod, ModelOptFp8MoEMethod)
             ):
                 raise NotImplementedError(
                     "is_act_and_mul=False is supported only for unquantized "
@@ -1441,6 +1443,7 @@ def _get_quant_method() -> FusedMoEMethodBase:
                # If you plan to add support for more quantization methods,
                # please refer to the implementation in `Fp8MoEMethod`.
                raise NotImplementedError(
+                    f"EPLB is not supported for {self.quant_method.__class__.__name__}. "
                    "EPLB is only supported for FP8 quantization for now."
                )
 
@@ -1466,12 +1469,12 @@ def _get_quant_method() -> FusedMoEMethodBase:
         self.batched_hidden_states: torch.Tensor | None = None
         self.batched_router_logits: torch.Tensor | None = None
 
-    # Note: init_prepare_finalize should only be called by
+    # Note: maybe_init_modular_kernel should only be called by
     # prepare_communication_buffer_for_model.
     # This is called after all weight loading and post-processing, so it
     # should be safe to swap out the quant_method.
-    def init_prepare_finalize(self) -> None:
-        mk = self.quant_method.init_prepare_finalize(self)
+    def maybe_init_modular_kernel(self) -> None:
+        mk = self.quant_method.maybe_init_modular_kernel(self)
         if mk is not None:
             self.quant_method = FusedMoEModularMethod(self.quant_method, mk)
 