Commit 489501f

Mxfp4 memory leak fixes (#2)
1 parent 6ed434c commit 489501f

File tree

3 files changed: +18 -18 lines changed


vllm/model_executor/layers/quantization/quark/quark_moe.py

Lines changed: 5 additions & 0 deletions
@@ -350,6 +350,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             w13_quantizer(layer.w13_weight.data).to(float_dtype),
             requires_grad=False,
         )
+        layer.w13_weight_scale = None

         w2_quantizer = realquantizer.get_real_quantizer(
             qspec=weight_quant_spec,
@@ -366,6 +367,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             w2_quantizer(layer.w2_weight.data).to(float_dtype),
             requires_grad=False,
         )
+        layer.w2_weight_scale = None
+
+        # This call is necessary to release the scales memory.
+        torch.cuda.empty_cache()

     def apply(
         self,
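
Both hunks above follow the same pattern: once the real quantizer has folded the scales into the dequantized weight, the scale Parameters keep holding GPU memory only because the layer still references them, and PyTorch's caching allocator hands freed blocks back to the device only after torch.cuda.empty_cache(). Below is a minimal sketch of that pattern, using a hypothetical layer layout and a stand-in dequantization callable rather than the vLLM MoE module.

# Minimal sketch of the release pattern used above; `layer` is a hypothetical
# module holding a quantized weight plus a separate scale Parameter, and
# `dequant_fn` stands in for the quark real quantizer.
import torch


def dequantize_and_release(layer: torch.nn.Module, dequant_fn,
                           float_dtype: torch.dtype) -> None:
    # Materialize the dequantized weight; the scales are consumed here.
    layer.weight = torch.nn.Parameter(
        dequant_fn(layer.weight.data).to(float_dtype),
        requires_grad=False,
    )
    # Drop the last reference so the caching allocator can free the scales ...
    layer.weight_scale = None
    # ... and hand the cached blocks back to the GPU.
    torch.cuda.empty_cache()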

vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py

Lines changed: 10 additions & 16 deletions
@@ -7,12 +7,12 @@

 import vllm.envs as envs
 from vllm.model_executor.layers.quantization.quark.schemes import QuarkScheme
-from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
-    OCP_MX_BLOCK_SIZE)
 from vllm.model_executor.parameter import (GroupQuantScaleParameter,
                                            PackedvLLMParameter)
 from vllm.platforms import current_platform

+from vllm.model_executor.layers.quantization.utils.mxfp4_utils import OCP_MX_BLOCK_SIZE, per_token_group_quant_mxfp4
+
 __all__ = ["QuarkW4A4MXFP4"]


@@ -37,16 +37,6 @@ def __init__(self, weight_quant_spec: Dict[str, Any],
                 "MX-FP4 models. Please install it with `pip install "
                 "amd-quark`.") from err

-        input_quant_spec = QuantizationSpec.from_dict(
-            self.input_quant_spec)
-
-        self.input_quantizer = realquantizer.get_real_quantizer(
-            qspec=input_quant_spec,
-            quantizer=None,
-            real_quantized=False,
-            float_dtype=self.out_dtype,
-        )
-
     @classmethod
     def get_min_capability(cls) -> int:
         # lovelace and up
@@ -72,7 +62,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         weight_quant_spec = QuantizationSpec.from_dict(
             self.weight_quant_spec)

-        self.weight_quantizer = realquantizer.get_real_quantizer(
+        weight_quantizer = realquantizer.get_real_quantizer(
             qspec=weight_quant_spec,
             quantizer=None,
             real_quantized=True,
@@ -81,14 +71,18 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             scale_shape=layer.weight_scale.shape,
             zero_point_shape=None,
         )
-        self.weight_quantizer.scale.data = layer.weight_scale.data
+        weight_quantizer.scale.data = layer.weight_scale.data

         if not envs.VLLM_QUARK_EMU_MEM_OPT:
             layer.weight = torch.nn.Parameter(
-                self.weight_quantizer(layer.weight.data).to(
+                weight_quantizer(layer.weight.data).to(
                     self.out_dtype),
                 requires_grad=False,
             )
+        layer.weight_scale = None
+
+        # This call is necessary to release the scales memory.
+        torch.cuda.empty_cache()

     def create_weights(self, layer: torch.nn.Module,
                        output_partition_sizes: List[int],
@@ -136,7 +130,7 @@ def apply_weights(self,
                 dq_w = self.weight_quantizer(layer.weight).to(self.out_dtype)
             else:
                 dq_w = layer.weight
-            qdq_x = self.input_quantizer(x)
+            qdq_x, _ = per_token_group_quant_mxfp4(x, 32)
             return F.linear(qdq_x, dq_w, bias)
         else:
             raise NotImplementedError()
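
With the per-instance input_quantizer gone, activation quant-dequant in apply_weights goes through the stateless helper instead, so no quantizer object (or its buffers) has to stay alive between forward calls. A usage sketch follows, assuming only the signature visible in this commit: per_token_group_quant_mxfp4(x, block_k) returns the fake-quantized activations and their per-group scales; the linear wrapper itself is hypothetical.

# Usage sketch; `dq_w` is assumed to already be a dequantized weight tensor.
from typing import Optional

import torch
import torch.nn.functional as F

from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
    per_token_group_quant_mxfp4)


def emulated_mxfp4_linear(x: torch.Tensor, dq_w: torch.Tensor,
                          bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    # Quant-dequant the activations in groups of 32 elements (the OCP MX block
    # size); the returned scales are not needed for the emulated matmul.
    qdq_x, _ = per_token_group_quant_mxfp4(x, 32)
    return F.linear(qdq_x, dq_w, bias)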

vllm/model_executor/layers/quantization/utils/mxfp4_utils.py

Lines changed: 3 additions & 2 deletions
@@ -20,7 +20,8 @@ def per_token_group_quant_mxfp4(x: torch.Tensor, block_k: int):
     # TODO: there are other rounding strategies supported in quark and in the config.json that we do not check for here!
     scale = even_round(amax, "fp4")

-    x_qdq = scaled_fake_quantize(
+    # Apply dequantize(quantize(x)).
+    x = scaled_fake_quantize(
         "fp4",
         x,
         scale.to(x.device),
@@ -34,4 +35,4 @@ def per_token_group_quant_mxfp4(x: torch.Tensor, block_k: int):
         'None', # must be a string in quark hw_emulation_interface.py, why?
     )

-    return x_qdq, scale
+    return x, scale
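
The helper's job, in other words, is a fake-quantization round trip: per 32-element group, derive a shared power-of-two scale from the block maximum, snap the scaled values to the FP4 (E2M1) grid, and scale back. The sketch below shows that round trip in plain PyTorch; it is not the quark implementation, and it uses nearest-value rounding where quark supports other modes (see the TODO above).

# Self-contained sketch of an MXFP4 quant-dequant round trip (not the quark
# implementation); assumes x.numel() is a multiple of block_k and groups
# contiguous runs along the last dimension.
import torch

# Magnitudes representable in FP4 E2M1.
FP4_VALUES = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])


def mxfp4_fake_quant(x: torch.Tensor, block_k: int = 32) -> torch.Tensor:
    blocks = x.reshape(-1, block_k)
    amax = blocks.abs().amax(dim=-1, keepdim=True).clamp_min(1e-12)
    # Shared power-of-two scale per block; E2M1's largest magnitude is 6 = 1.5 * 2**2.
    scale = torch.exp2(torch.floor(torch.log2(amax)) - 2)
    scaled = blocks / scale
    # Snap each magnitude to the nearest representable E2M1 value, keep the sign.
    grid = FP4_VALUES.to(device=x.device, dtype=x.dtype)
    idx = (scaled.abs().unsqueeze(-1) - grid).abs().argmin(dim=-1)
    qdq = torch.sign(scaled) * grid[idx] * scale
    return qdq.reshape(x.shape)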
