
Commit 8ae1692

Varun Sundar Rabindranath authored
[torch.compile] Unwrap fused_marlin_moe custom op (#26739)
Signed-off-by: Varun Sundar Rabindranath <[email protected]>
Co-authored-by: Varun Sundar Rabindranath <[email protected]>
1 parent 8a0af6a commit 8ae1692

10 files changed: +22 -52 lines
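Every caller change in this commit follows the same pattern: instead of dispatching through the registered custom op torch.ops.vllm.fused_marlin_moe, the quantization backends and tests now import fused_marlin_moe from vllm.model_executor.layers.fused_moe.fused_marlin_moe and call it directly, and the custom-op registration (the fake implementation plus direct_register_custom_op) is deleted from fused_marlin_moe.py. A short illustrative sketch of the removed custom-op pattern follows the fused_marlin_moe.py hunks below.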

tests/kernels/moe/test_moe.py
Lines changed: 3 additions & 2 deletions

@@ -26,6 +26,7 @@
     int4_w4a16_moe_quant_config,
     int8_w8a16_moe_quant_config,
 )
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     fused_topk,
     modular_triton_fused_moe,

@@ -724,7 +725,7 @@ def test_fused_marlin_moe(
     with set_current_vllm_config(vllm_config):
         torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, expert_map=e_map)

-        marlin_output = torch.ops.vllm.fused_marlin_moe(
+        marlin_output = fused_marlin_moe(
             a,
             qweight1,
             qweight2,

@@ -837,7 +838,7 @@ def test_fused_marlin_moe_with_bias(m):
     with set_current_vllm_config(vllm_config):
         torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, b_bias1, b_bias2)

-        marlin_output = torch.ops.vllm.fused_marlin_moe(
+        marlin_output = fused_marlin_moe(
             a,
             qweight1,
             qweight2,

vllm/model_executor/layers/fused_moe/__init__.py
Lines changed: 0 additions & 1 deletion

@@ -51,7 +51,6 @@ def get_config() -> dict[str, Any] | None:

 if HAS_TRITON:
     # import to register the custom ops
-    import vllm.model_executor.layers.fused_moe.fused_marlin_moe  # noqa
     from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
         BatchedDeepGemmExperts,
     )

vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
Lines changed: 0 additions & 39 deletions

@@ -19,7 +19,6 @@
     maybe_warn_marlin_atomic_add,
 )
 from vllm.scalar_type import ScalarType, scalar_types
-from vllm.utils import direct_register_custom_op


 def fused_marlin_moe(

@@ -241,44 +240,6 @@ def fused_marlin_moe(
     return torch.sum(intermediate_cache3.view(-1, topk, K), dim=1, out=output)


-def fused_marlin_moe_fake(
-    hidden_states: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    w1_scale: torch.Tensor,
-    w2_scale: torch.Tensor,
-    gating_output: torch.Tensor | None,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    quant_type_id: int,
-    apply_router_weight_on_input: bool = False,
-    global_num_experts: int = -1,
-    global_scale1: torch.Tensor | None = None,
-    global_scale2: torch.Tensor | None = None,
-    expert_map: torch.Tensor | None = None,
-    g_idx1: torch.Tensor | None = None,
-    g_idx2: torch.Tensor | None = None,
-    sort_indices1: torch.Tensor | None = None,
-    sort_indices2: torch.Tensor | None = None,
-    w1_zeros: torch.Tensor | None = None,
-    w2_zeros: torch.Tensor | None = None,
-    workspace: torch.Tensor | None = None,
-    intermediate_cache13: torch.Tensor | None = None,
-    intermediate_cache2: torch.Tensor | None = None,
-    is_k_full: bool = True,
-    output: torch.Tensor | None = None,
-    inplace: bool = False,
-) -> torch.Tensor:
-    return torch.empty_like(hidden_states)
-
-
-direct_register_custom_op(
-    op_name="fused_marlin_moe",
-    op_func=fused_marlin_moe,
-    fake_impl=fused_marlin_moe_fake,
-)
-
-
 class MarlinExperts(mk.FusedMoEPermuteExpertsUnpermute):
     def __init__(self, quant_config: FusedMoEQuantConfig):
         # TODO (varun) : Enable activation quantization
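For context, the block deleted above is the custom-op registration: fused_marlin_moe_fake is a shape-only "fake" (meta) implementation, and direct_register_custom_op exposed the function as torch.ops.vllm.fused_marlin_moe so graph tracing could treat it as a single opaque op. After this commit, callers invoke the Python function directly, which is presumably friendlier to torch.compile. The following is a minimal sketch of that general pattern using the standard torch.library API (not vLLM's direct_register_custom_op helper); the toy_moe names and shapes are made up for illustration and are not part of vLLM.

import torch


def toy_moe(hidden_states: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    # Stand-in for a real fused kernel.
    return hidden_states @ w


# Registering the function as a custom op makes compiled graphs treat calls to it
# as one opaque node; tracing uses the "fake" impl below instead of running the kernel.
toy_moe_op = torch.library.custom_op("demo::toy_moe", toy_moe, mutates_args=())


@toy_moe_op.register_fake
def _(hidden_states: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    # Describe only the output shape/dtype/device, never touch real data.
    return torch.empty(
        (hidden_states.shape[0], w.shape[1]),
        dtype=hidden_states.dtype,
        device=hidden_states.device,
    )


x = torch.randn(4, 8)
w = torch.randn(8, 16)
out_via_op = torch.ops.demo.toy_moe(x, w)  # old style: dispatch through torch.ops
out_direct = toy_moe(x, w)                 # new style: plain Python call
assert torch.allclose(out_via_op, out_direct)

The commit applies the second style to fused_marlin_moe and drops the registration entirely, so the function no longer appears under torch.ops.vllm.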

vllm/model_executor/layers/quantization/awq_marlin.py
Lines changed: 2 additions & 1 deletion

@@ -14,6 +14,7 @@
     FusedMoEConfig,
     FusedMoEQuantConfig,
 )
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE,
     FusedMoEMethodBase,

@@ -604,7 +605,7 @@ def apply(
             indices_type=self.topk_indices_dtype,
         )

-        return torch.ops.vllm.fused_marlin_moe(
+        return fused_marlin_moe(
             x,
             layer.w13_qweight,
             layer.w2_qweight,

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
Lines changed: 4 additions & 3 deletions

@@ -34,6 +34,7 @@
 from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
     is_valid_flashinfer_cutlass_fused_moe,
 )
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import (  # noqa
     WNA16_SUPPORTED_BITS,
     WNA16_SUPPORTED_TYPES_MAP,

@@ -462,7 +463,7 @@ def apply(
         #
         if self.use_marlin:
             assert self.fused_experts is None
-            return torch.ops.vllm.fused_marlin_moe(
+            return fused_marlin_moe(
                 x,
                 layer.w13_weight,
                 layer.w2_weight,

@@ -1067,7 +1068,7 @@ def apply(
         if self.use_marlin:
             assert activation == "silu", f"{activation} not supported for Marlin MoE."
             assert self.fused_experts is None
-            return torch.ops.vllm.fused_marlin_moe(
+            return fused_marlin_moe(
                 x,
                 layer.w13_weight,
                 layer.w2_weight,

@@ -1654,7 +1655,7 @@ def apply(
             indices_type=self.topk_indices_dtype,
         )

-        return torch.ops.vllm.fused_marlin_moe(
+        return fused_marlin_moe(
             x,
             layer.w13_weight_packed,
             layer.w2_weight_packed,

vllm/model_executor/layers/quantization/fp8.py
Lines changed: 2 additions & 1 deletion

@@ -26,6 +26,7 @@
     FusedMoEQuantConfig,
     fp8_w8a8_moe_quant_config,
 )
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
 from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod
 from vllm.model_executor.layers.linear import (
     LinearBase,

@@ -1196,7 +1197,7 @@ def apply(
         elif self.use_marlin:
             assert activation == "silu", f"{activation} not supported for Marlin MoE."
             assert self.fused_experts is None
-            result = torch.ops.vllm.fused_marlin_moe(
+            result = fused_marlin_moe(
                 x,
                 layer.w13_weight,
                 layer.w2_weight,

vllm/model_executor/layers/quantization/gptq_marlin.py
Lines changed: 2 additions & 1 deletion

@@ -15,6 +15,7 @@
     FusedMoEConfig,
     FusedMoEQuantConfig,
 )
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE,
     FusedMoEMethodBase,

@@ -765,7 +766,7 @@ def apply(
             indices_type=self.topk_indices_dtype,
         )

-        return torch.ops.vllm.fused_marlin_moe(
+        return fused_marlin_moe(
             x,
             layer.w13_qweight,
             layer.w2_qweight,

vllm/model_executor/layers/quantization/modelopt.py
Lines changed: 2 additions & 1 deletion

@@ -21,6 +21,7 @@
 from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
     is_valid_flashinfer_cutlass_fused_moe,
 )
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE,
     FusedMoEMethodBase,

@@ -1701,7 +1702,7 @@ def apply(
         #
         if self.use_marlin:
             assert self.fused_experts is None
-            return torch.ops.vllm.fused_marlin_moe(
+            return fused_marlin_moe(
                 x,
                 layer.w13_weight,
                 layer.w2_weight,

vllm/model_executor/layers/quantization/mxfp4.py
Lines changed: 5 additions & 2 deletions

@@ -21,7 +21,10 @@
     mxfp4_w4a16_moe_quant_config,
     ocp_mx_moe_quant_config,
 )
-from vllm.model_executor.layers.fused_moe.fused_marlin_moe import MarlinExperts
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+    MarlinExperts,
+    fused_marlin_moe,
+)
 from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
     OAITritonExperts,
 )

@@ -947,7 +950,7 @@ def apply(
             e_score_correction_bias=e_score_correction_bias,
         )

-        return torch.ops.vllm.fused_marlin_moe(
+        return fused_marlin_moe(
             x,
             layer.w13_weight,
             layer.w2_weight,

vllm/model_executor/layers/quantization/quark/quark_moe.py
Lines changed: 2 additions & 1 deletion

@@ -20,6 +20,7 @@
     fp8_w8a8_moe_quant_config,
     ocp_mx_moe_quant_config,
 )
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
 from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
     is_rocm_aiter_moe_enabled,
 )

@@ -402,7 +403,7 @@ def apply(
         )
         if self.use_marlin:
             assert activation == "silu", f"{activation} not supported for Marlin MoE."
-            return torch.ops.vllm.fused_marlin_moe(
+            return fused_marlin_moe(
                 x,
                 layer.w13_weight,
                 layer.w2_weight,
