
Commit 6ed434c

Add env var to control whether weights are dequantized at load time
Signed-off-by: Bowen Bao <[email protected]>
1 parent 697905e commit 6ed434c

File tree: 2 files changed, 25 additions & 12 deletions

vllm/envs.py

Lines changed: 9 additions & 0 deletions
@@ -82,6 +82,7 @@
     VLLM_ROCM_FP8_PADDING: bool = True
     VLLM_ROCM_MOE_PADDING: bool = True
     VLLM_ROCM_CUSTOM_PAGED_ATTN: bool = True
+    VLLM_QUARK_EMU_MEM_OPT: bool = False
     VLLM_ENABLE_V1_MULTIPROCESSING: bool = True
     VLLM_LOG_BATCHSIZE_INTERVAL: float = -1
     VLLM_DISABLE_COMPILE_CACHE: bool = False
@@ -571,6 +572,14 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
     lambda: (os.getenv("VLLM_ROCM_CUSTOM_PAGED_ATTN", "True").lower() in
              ("true", "1")),
 
+    # If set, when running in Quark emulation mode, do not dequantize the
+    # weights at load time. Instead, dequantize weights on-the-fly during
+    # kernel execution.
+    # This allows running larger models at the cost of slower inference.
+    # This flag has no effect when not running in Quark emulation mode.
+    "VLLM_QUARK_EMU_MEM_OPT":
+    lambda: bool(int(os.getenv("VLLM_QUARK_EMU_MEM_OPT", "0"))),
+
     # Divisor for dynamic query scale factor calculation for FP8 KV Cache
     "Q_SCALE_CONSTANT":
     lambda: int(os.getenv("Q_SCALE_CONSTANT", "200")),

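For reference, a minimal sketch (not part of the diff) of how the new flag behaves; the helper name quark_emu_mem_opt_enabled is hypothetical, but the parsing mirrors the lambda registered above, so only integer strings such as "0" or "1" are valid values.

import os

def quark_emu_mem_opt_enabled() -> bool:
    # Mirrors the envs.py lambda: "0" (the default) disables the optimization,
    # any non-zero integer string enables it; non-integer values raise.
    return bool(int(os.getenv("VLLM_QUARK_EMU_MEM_OPT", "0")))

os.environ["VLLM_QUARK_EMU_MEM_OPT"] = "1"  # e.g. export VLLM_QUARK_EMU_MEM_OPT=1
assert quark_emu_mem_opt_enabled()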
vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py

Lines changed: 16 additions & 12 deletions
@@ -5,18 +5,17 @@
 import torch
 import torch.nn.functional as F
 
+import vllm.envs as envs
 from vllm.model_executor.layers.quantization.quark.schemes import QuarkScheme
+from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
+    OCP_MX_BLOCK_SIZE)
 from vllm.model_executor.parameter import (GroupQuantScaleParameter,
                                            PackedvLLMParameter)
 from vllm.platforms import current_platform
 
-from vllm.model_executor.layers.quantization.utils.mxfp4_utils import OCP_MX_BLOCK_SIZE
-
 __all__ = ["QuarkW4A4MXFP4"]
 
 
-
-
 class QuarkW4A4MXFP4(QuarkScheme):
 
     def __init__(self, weight_quant_spec: Dict[str, Any],
@@ -48,7 +47,6 @@ def __init__(self, weight_quant_spec: Dict[str, Any],
             float_dtype=self.out_dtype,
         )
 
-
     @classmethod
     def get_min_capability(cls) -> int:
         # lovelace and up
@@ -74,7 +72,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         weight_quant_spec = QuantizationSpec.from_dict(
             self.weight_quant_spec)
 
-        weight_quantizer = realquantizer.get_real_quantizer(
+        self.weight_quantizer = realquantizer.get_real_quantizer(
             qspec=weight_quant_spec,
             quantizer=None,
             real_quantized=True,
@@ -83,12 +81,14 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             scale_shape=layer.weight_scale.shape,
             zero_point_shape=None,
         )
-        weight_quantizer.scale.data = layer.weight_scale.data
+        self.weight_quantizer.scale.data = layer.weight_scale.data
 
-        layer.weight = torch.nn.Parameter(
-            weight_quantizer(layer.weight.data).to(self.out_dtype),
-            requires_grad=False,
-        )
+        if not envs.VLLM_QUARK_EMU_MEM_OPT:
+            layer.weight = torch.nn.Parameter(
+                self.weight_quantizer(layer.weight.data).to(
+                    self.out_dtype),
+                requires_grad=False,
+            )
 
     def create_weights(self, layer: torch.nn.Module,
                        output_partition_sizes: List[int],
@@ -132,7 +132,11 @@ def apply_weights(self,
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
 
         if self.emulate:
+            if envs.VLLM_QUARK_EMU_MEM_OPT:
+                dq_w = self.weight_quantizer(layer.weight).to(self.out_dtype)
+            else:
+                dq_w = layer.weight
             qdq_x = self.input_quantizer(x)
-            return F.linear(qdq_x, layer.weight, bias)
+            return F.linear(qdq_x, dq_w, bias)
         else:
             raise NotImplementedError()

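The trade-off this scheme change introduces can be illustrated in isolation. The sketch below is a rough stand-in, not the commit's code: FakeQuantizer and mem_opt are hypothetical placeholders for the Quark real quantizer and envs.VLLM_QUARK_EMU_MEM_OPT. With the flag off, the weight is dequantized once in process_weights_after_loading and kept resident in out_dtype; with it on, the packed weight stays in memory and is dequantized inside every apply_weights call, saving memory at the cost of extra work per forward pass.

import torch
import torch.nn.functional as F

class FakeQuantizer:
    # Hypothetical stand-in for the Quark real quantizer; the real scheme
    # expands a packed MXFP4 tensor, here we simply cast to float.
    def __call__(self, w: torch.Tensor) -> torch.Tensor:
        return w.float()

mem_opt = True                       # stands in for envs.VLLM_QUARK_EMU_MEM_OPT
quantizer = FakeQuantizer()
stored_weight = torch.randn(8, 16)   # stands in for layer.weight after loading

if not mem_opt:
    # Load-time path: dequantize once and keep the larger tensor resident.
    stored_weight = quantizer(stored_weight)

def forward(x: torch.Tensor) -> torch.Tensor:
    # On-the-fly path: dequantize on every call, trading speed for memory.
    w = quantizer(stored_weight) if mem_opt else stored_weight
    return F.linear(x, w)

out = forward(torch.randn(4, 16))    # -> shape (4, 8)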