
Commit fadffba

BowenBao and fxmarty-amd committed
Move all kernels into Quark (#3)
Co-authored-by: Felix Marty <[email protected]>
Signed-off-by: Felix Marty <[email protected]>
1 parent 09fafb6 commit fadffba

File tree

6 files changed: +35 −1027 lines

vllm/envs.py

Lines changed: 0 additions & 5 deletions
@@ -593,11 +593,6 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
     "VLLM_QUARK_EMU_MEM_OPT":
     lambda: bool(int(os.getenv("VLLM_QUARK_EMU_MEM_OPT", "0"))),

-    # Selects the Q/DQ/QDQ implementation to use with mxfp4.
-    # Available: "hip", "torch", "triton".
-    "VLLM_QUARK_MXFP4_Q_DQ_QDQ_IMPLEM":
-    lambda: os.getenv("VLLM_QUARK_MXFP4_Q_DQ_QDQ_IMPLEM", "hip"),
-
     # Divisor for dynamic query scale factor calculation for FP8 KV Cache
     "Q_SCALE_CONSTANT":
     lambda: int(os.getenv("Q_SCALE_CONSTANT", "200")),

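For reference, a minimal sketch of the lazy environment-variable pattern used in vllm/envs.py, restricted to the two entries visible in the hunk context above: each key maps to a zero-argument lambda that reads and converts the value only when accessed. The dict name follows vllm's convention but should be treated as illustrative here.

import os

environment_variables = {
    # Memory-optimized MX-FP4 emulation toggle (kept by this commit).
    "VLLM_QUARK_EMU_MEM_OPT":
    lambda: bool(int(os.getenv("VLLM_QUARK_EMU_MEM_OPT", "0"))),

    # Divisor for dynamic query scale factor calculation for FP8 KV Cache.
    "Q_SCALE_CONSTANT":
    lambda: int(os.getenv("Q_SCALE_CONSTANT", "200")),
}

# Values are resolved on access, so environment changes made before the
# lookup are still picked up.
assert environment_variables["VLLM_QUARK_EMU_MEM_OPT"]() is False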
vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 8 additions & 7 deletions
@@ -20,7 +20,10 @@
     per_token_group_quant_fp8)
 from vllm.model_executor.layers.quantization.utils.int8_utils import (
     per_token_group_quant_int8, per_token_quant_int8)
-from vllm.model_executor.layers.quantization.utils.mxfp4_utils import OCP_MX_BLOCK_SIZE, per_token_group_quant_mxfp4, per_token_group_dequant_mxfp4
+from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
+    quant_dequant_mxfp4,
+    dequant_mxfp4,
+)
 from vllm.platforms import current_platform
 from vllm.utils import direct_register_custom_op
 
@@ -1232,7 +1235,7 @@ def moe_kernel_prepare_input(
     elif use_mxfp4_w4a4:
         assert block_shape is None
         if not current_platform.supports_mx():
-            A = per_token_group_quant_mxfp4(A, OCP_MX_BLOCK_SIZE)
+            A = quant_dequant_mxfp4(A)
         else:
             raise NotImplementedError()
     else:
@@ -1345,13 +1348,11 @@ def fused_experts_impl(hidden_states: torch.Tensor,
     if use_mxfp4_w4a4 and not current_platform.supports_mx(
     ) and envs.VLLM_QUARK_EMU_MEM_OPT:
         # Weight has to be dequantized for mxfp4 emulation.
-        w1 = per_token_group_dequant_mxfp4(w1, w1_scale, OCP_MX_BLOCK_SIZE,
-                                           hidden_states.dtype)
+        w1 = dequant_mxfp4(w1, w1_scale, hidden_states.dtype)
         w1_scale = None
-        w2 = per_token_group_dequant_mxfp4(w2, w2_scale, OCP_MX_BLOCK_SIZE,
-                                           hidden_states.dtype)
+        w2 = dequant_mxfp4(w2, w2_scale, hidden_states.dtype)
         w2_scale = None
-
+
     for chunk in range((num_tokens // CHUNK_SIZE) + 1):
         begin_chunk_idx, end_chunk_idx = (chunk * CHUNK_SIZE,
                                           min((chunk + 1) * CHUNK_SIZE,

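A minimal usage sketch (not part of the commit) of the emulation flow these hunks wire up: on hardware without native MX support, packed weights are dequantized once with dequant_mxfp4 and activations are fake-quantized with quant_dequant_mxfp4 before a high-precision matmul. The import path and call signatures mirror the diff; the wrapper function itself is illustrative.

import torch

from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
    dequant_mxfp4, quant_dequant_mxfp4)


def emulated_mxfp4_linear(a: torch.Tensor, w_packed: torch.Tensor,
                          w_scale: torch.Tensor) -> torch.Tensor:
    w = dequant_mxfp4(w_packed, w_scale, a.dtype)  # unpack weight to a.dtype
    a = quant_dequant_mxfp4(a)  # simulate mxfp4 rounding on the activation
    return a @ w.t()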
vllm/model_executor/layers/quantization/quark/quark.py

Lines changed: 1 addition & 2 deletions
@@ -177,7 +177,7 @@ def _is_fp8_w8a8(self, weight_quant: Optional[Dict[str, Any]],
         is_static_weight = not weight_quant.get("is_dynamic")
         is_per_tensor_or_channel_weight = (weight_quant.get("qscheme")
                                            in ["per_tensor", "per_channel"])
-
+
         if not (is_fp8_dtype and is_static_weight
                 and is_per_tensor_or_channel_weight):
             return False
@@ -325,7 +325,6 @@ def _get_scheme_from_config(self, config: Dict[str, Any]) -> "QuarkScheme":
                 is_static_input_scheme=True,
                 input_symmetric=input_config.get("symmetric"))
         elif self._is_mx_fp4(weight_config, input_config):
-            logger.info(f"Using VLLM_QUARK_MXFP4_Q_DQ_QDQ_IMPLEM='{envs.VLLM_QUARK_MXFP4_Q_DQ_QDQ_IMPLEM}'.")
             return QuarkW4A4MXFP4(weight_config, input_config)
 
         raise NotImplementedError("No quark compatible scheme was found. "

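For context, a hedged sketch of the dispatch this file implements: _get_scheme_from_config inspects the per-layer weight/input quantization configs and returns QuarkW4A4MXFP4 when they describe MX-FP4 (the commit itself only drops the log line for the removed env var). The predicate below is an illustrative stand-in for the real _is_mx_fp4 check, and the import path is assumed from the repository layout.

from typing import Any, Dict

from vllm.model_executor.layers.quantization.quark.schemes.quark_w4a4_mxfp4 import (
    QuarkW4A4MXFP4)


def get_mxfp4_scheme(weight_config: Dict[str, Any],
                     input_config: Dict[str, Any]) -> QuarkW4A4MXFP4:
    # Stand-in for self._is_mx_fp4(weight_config, input_config): treat an fp4
    # weight dtype with per-group scales as MX-FP4. The real check lives in
    # quark.py and may differ.
    is_mx_fp4 = (weight_config.get("dtype") == "fp4"
                 and weight_config.get("qscheme") == "per_group")
    if is_mx_fp4:
        return QuarkW4A4MXFP4(weight_config, input_config)
    raise NotImplementedError("No quark compatible scheme was found.")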
vllm/model_executor/layers/quantization/quark/quark_moe.py

Lines changed: 4 additions & 42 deletions
@@ -10,13 +10,11 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
                                                   FusedMoeWeightScaleSupported)
-from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
-    OCP_MX_BLOCK_SIZE)
+from vllm.model_executor.layers.quantization.utils.mxfp4_utils import OCP_MX_BLOCK_SIZE, dequant_mxfp4
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
-from vllm.model_executor.layers.quantization.utils.mxfp4_utils import SUPPORTED_IMPLEMS
 
 logger = init_logger(__name__)
 
@@ -263,9 +261,6 @@ def __init__(self, weight_config: Dict[str, Any], input_config: Dict[str,
         self.static_input_scales = not self.input_quant.get("is_dynamic")
         self.emulate = not current_platform.supports_mx()
 
-        if envs.VLLM_QUARK_MXFP4_Q_DQ_QDQ_IMPLEM not in SUPPORTED_IMPLEMS:
-            raise ValueError(f"VLLM_QUARK_MXFP4_Q_DQ_QDQ_IMPLEM='{envs.VLLM_QUARK_MXFP4_Q_DQ_QDQ_IMPLEM}' is not supported, only {SUPPORTED_IMPLEMS} are.")
-
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        hidden_size: int, intermediate_size_per_partition: int,
                        params_dtype: torch.dtype, **extra_weight_attrs):
@@ -327,54 +322,21 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         float_dtype = torch.get_default_dtype()
 
         if self.emulate and not envs.VLLM_QUARK_EMU_MEM_OPT:
-            try:
-                from quark.torch.export.nn.modules import realquantizer
-                from quark.torch.quantization.config.config import (
-                    QuantizationSpec)
-            except ImportError as err:
-                raise ImportError(
-                    "The package `amd-quark` is required to use AMD Quark "
-                    "MX-FP4 models. Please install it with `pip install "
-                    "amd-quark`.") from err
-
-            weight_quant_spec = QuantizationSpec.from_dict(self.weight_quant)
-
             # Unpack and dequantize the weights (the operators are in high-precision, with simulated quantization).
-            w13_quantizer = realquantizer.get_real_quantizer(
-                qspec=weight_quant_spec,
-                quantizer=None,
-                real_quantized=True,
-                reorder=False,  # TODO: load from config
-                float_dtype=float_dtype,
-                scale_shape=layer.w13_weight_scale.shape,
-                zero_point_shape=None,
-            )
-            w13_quantizer.scale.data = layer.w13_weight_scale.data
-
             layer.w13_weight = torch.nn.Parameter(
-                w13_quantizer(layer.w13_weight.data).to(float_dtype),
+                dequant_mxfp4(layer.w13_weight.data, layer.w13_weight_scale.data, float_dtype),
                 requires_grad=False,
             )
             layer.w13_weight_scale = None
 
-            w2_quantizer = realquantizer.get_real_quantizer(
-                qspec=weight_quant_spec,
-                quantizer=None,
-                real_quantized=True,
-                reorder=False,  # TODO: load from config
-                float_dtype=float_dtype,
-                scale_shape=layer.w2_weight_scale.shape,
-                zero_point_shape=None,
-            )
-            w2_quantizer.scale.data = layer.w2_weight_scale.data
-
             layer.w2_weight = torch.nn.Parameter(
-                w2_quantizer(layer.w2_weight.data).to(float_dtype),
+                dequant_mxfp4(layer.w2_weight.data, layer.w2_weight_scale.data, float_dtype),
                 requires_grad=False,
            )
             layer.w2_weight_scale = None
 
             # This call is necessary to release the scales memory.
+            # TODO: is it still?
             torch.cuda.empty_cache()
 
     def apply(

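A minimal sketch of the simplified weight post-processing this hunk lands in quark_moe.py: when MX is only emulated and memory-optimized emulation is off, each packed expert weight is dequantized once with dequant_mxfp4 and its scale released. The helper below is illustrative; it assumes `layer` carries the same attribute names as in the diff (w13_weight / w13_weight_scale, w2_weight / w2_weight_scale).

import torch

from vllm.model_executor.layers.quantization.utils.mxfp4_utils import dequant_mxfp4


def dequantize_expert_weights(layer: torch.nn.Module,
                              float_dtype: torch.dtype) -> None:
    for name in ("w13_weight", "w2_weight"):
        packed = getattr(layer, name)
        scale = getattr(layer, name + "_scale")
        setattr(layer, name,
                torch.nn.Parameter(dequant_mxfp4(packed.data, scale.data,
                                                 float_dtype),
                                   requires_grad=False))
        setattr(layer, name + "_scale", None)  # scale is folded into the weight
    # Release the scale storage (see the TODO retained in the diff).
    torch.cuda.empty_cache()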
vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py

Lines changed: 16 additions & 39 deletions
@@ -7,7 +7,11 @@
 
 import vllm.envs as envs
 from vllm.model_executor.layers.quantization.quark.schemes import QuarkScheme
-from vllm.model_executor.layers.quantization.utils.mxfp4_utils import OCP_MX_BLOCK_SIZE, per_token_group_quant_mxfp4, per_token_group_dequant_mxfp4, SUPPORTED_IMPLEMS
+from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
+    OCP_MX_BLOCK_SIZE,
+    quant_dequant_mxfp4,
+    dequant_mxfp4,
+)
 from vllm.model_executor.parameter import (GroupQuantScaleParameter,
                                            PackedvLLMParameter)
 from vllm.platforms import current_platform
@@ -25,9 +29,6 @@ def __init__(self, weight_quant_spec: Dict[str, Any],
         self.input_quant_spec = input_quant_spec
         self.emulate = not current_platform.supports_mx()
 
-        if envs.VLLM_QUARK_MXFP4_Q_DQ_QDQ_IMPLEM not in SUPPORTED_IMPLEMS:
-            raise ValueError(f"VLLM_QUARK_MXFP4_Q_DQ_QDQ_IMPLEM='{envs.VLLM_QUARK_MXFP4_Q_DQ_QDQ_IMPLEM}' is not supported, only {SUPPORTED_IMPLEMS} are.")
-
     @classmethod
     def get_min_capability(cls) -> int:
         return 70
@@ -38,40 +39,16 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         layer.weight_scale = torch.nn.Parameter(layer.weight_scale.data,
                                                 requires_grad=False)
 
-        if self.emulate:
-            try:
-                from quark.torch.export.nn.modules import realquantizer
-                from quark.torch.quantization.config.config import (
-                    QuantizationSpec)
-            except ImportError as err:
-                raise ImportError(
-                    "The package `amd-quark` is required to use AMD Quark "
-                    "MX-FP4 models. Please install it with `pip install "
-                    "amd-quark`.") from err
-
-            weight_quant_spec = QuantizationSpec.from_dict(
-                self.weight_quant_spec)
-
-            weight_quantizer = realquantizer.get_real_quantizer(
-                qspec=weight_quant_spec,
-                quantizer=None,
-                real_quantized=True,
-                reorder=False,
-                float_dtype=self.out_dtype,
-                scale_shape=layer.weight_scale.shape,
-                zero_point_shape=None,
+        if self.emulate and not envs.VLLM_QUARK_EMU_MEM_OPT:
+            layer.weight = torch.nn.Parameter(
+                dequant_mxfp4(layer.weight.data, layer.weight_scale.data, self.out_dtype),
+                requires_grad=False,
             )
-            weight_quantizer.scale.data = layer.weight_scale.data
+            layer.weight_scale = None
 
-            if not envs.VLLM_QUARK_EMU_MEM_OPT:
-                layer.weight = torch.nn.Parameter(
-                    weight_quantizer(layer.weight.data).to(self.out_dtype),
-                    requires_grad=False,
-                )
-                layer.weight_scale = None
-
-            # This call is necessary to release the scales memory.
-            torch.cuda.empty_cache()
+            # This call is necessary to release the scales memory.
+            # TODO: is it still?
+            torch.cuda.empty_cache()
 
     def create_weights(self, layer: torch.nn.Module,
                        output_partition_sizes: List[int],
@@ -116,11 +93,11 @@ def apply_weights(self,
 
         if self.emulate:
             if envs.VLLM_QUARK_EMU_MEM_OPT:
-                dq_w = per_token_group_dequant_mxfp4(layer.weight, layer.weight_scale, OCP_MX_BLOCK_SIZE, x.dtype)
+                dq_w = dequant_mxfp4(layer.weight, layer.weight_scale, x.dtype)
             else:
                 dq_w = layer.weight
-
-            x = per_token_group_quant_mxfp4(x, OCP_MX_BLOCK_SIZE)
+
+            x = quant_dequant_mxfp4(x)
 
             return F.linear(x, dq_w, bias)
         else:

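To make the emulation path concrete, a self-contained toy illustration of what block-wise MX-FP4 fake quantization ("quant_dequant") does conceptually: values are grouped into blocks of OCP_MX_BLOCK_SIZE (32), each block shares a power-of-two scale, and elements are snapped to an FP4 (E2M1) value. This is an assumption-laden sketch using round-to-nearest, not the hip/torch/triton kernels that now live in Quark.

import torch

OCP_MX_BLOCK_SIZE = 32  # block size from the OCP Microscaling (MX) spec
# Non-negative values representable in FP4 E2M1; sign is handled separately.
FP4_E2M1_VALUES = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])


def toy_quant_dequant_mxfp4(x: torch.Tensor,
                            block_size: int = OCP_MX_BLOCK_SIZE) -> torch.Tensor:
    # Toy only: assumes x.numel() is a multiple of block_size.
    orig_shape, orig_dtype = x.shape, x.dtype
    x = x.reshape(-1, block_size).float()
    # One power-of-two scale per block so the block max lands near the FP4
    # maximum of 6.0 (= 1.5 * 2**2): scale = 2 ** (floor(log2(amax)) - 2).
    amax = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-30)
    scale = torch.exp2(torch.floor(torch.log2(amax)) - 2)
    scaled = x / scale
    # Snap each magnitude to the nearest representable E2M1 value.
    idx = (scaled.abs().unsqueeze(-1) - FP4_E2M1_VALUES).abs().argmin(dim=-1)
    dq = FP4_E2M1_VALUES[idx] * scaled.sign() * scale
    return dq.reshape(orig_shape).to(orig_dtype)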