Skip to content

Commit 4ffff1d

Browse files
committed
remove per_channel_quant=True
Signed-off-by: Felix Marty <[email protected]>
1 parent 7334abc commit 4ffff1d

File tree

2 files changed

+0
-4
lines changed

2 files changed

+0
-4
lines changed

vllm/model_executor/layers/fused_moe/utils.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -84,7 +84,6 @@ def _mxfp4_quantize(
8484
block_shape: Optional[list[int]] = None,
8585
) -> tuple[torch.Tensor, None]:
8686
assert block_shape is None
87-
assert per_act_token_quant
8887
if not current_platform.supports_mx():
8988
A = quant_dequant_mxfp4(A)
9089
else:

vllm/model_executor/layers/quantization/quark/quark_moe.py

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -388,8 +388,6 @@ def apply(
388388
scoring_func=scoring_func,
389389
e_score_correction_bias=e_score_correction_bias)
390390

391-
# We pass `per_channel_quant=True` as OCP MXFP4 quantization is a
392-
# per-token quantization scheme (with groups of `OCP_MX_BLOCK_SIZE`).
393391
out = fused_experts(
394392
x,
395393
layer.w13_weight,
@@ -406,7 +404,6 @@ def apply(
406404
a1_scale=None,
407405
a2_scale=None,
408406
block_shape=None,
409-
per_channel_quant=True,
410407
activation=activation,
411408
)
412409
return out

0 commit comments

Comments (0)