6 | 6 |
7 | 7 | import torch
8 | 8 |
| 9 | +from vllm.logger import init_logger
9 | 10 | from vllm.model_executor.layers.fused_moe import FusedMoE
10 | 11 | from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
11 | 12 |                                                UnquantizedLinearMethod)

15 | 16 | from vllm.model_executor.layers.quantization.quark.quark_moe import (  # noqa: E501
16 | 17 |     QuarkMoEMethod)
17 | 18 | from vllm.model_executor.layers.quantization.quark.schemes import (
18 | | -    QuarkScheme, QuarkW8A8Fp8, QuarkW8A8Int8, QuarkW4A4MXFP4)
| 19 | +    QuarkScheme, QuarkW4A4MXFP4, QuarkW8A8Fp8, QuarkW8A8Int8)
19 | 20 | from vllm.model_executor.layers.quantization.quark.utils import (
20 | 21 |     deep_compare, should_ignore_layer)
21 | 22 | from vllm.platforms import current_platform
22 | | -from vllm.logger import init_logger
23 | 23 |
24 | 24 | __all__ = ["QuarkLinearMethod"]
25 | 25 |
26 | 26 | logger = init_logger(__name__)
27 | 27 |
| 28 | +
28 | 29 | class QuarkConfig(QuantizationConfig):
29 | 30 |
30 | 31 |     def __init__(self,
@@ -201,45 +202,53 @@ def _is_static_tensor_w8a8(self, weight_quant: Optional[Dict[str, Any]], |
201 | 202 |         return is_int8_dtype and is_tensor and is_weight_symmetric and is_static
202 | 203 |
203 | 204 |     def _is_mx_fp4(self, weight_quant: Optional[Dict[str, Any]],
204 | | -                  input_quant: Optional[Dict[str, Any]]) -> bool:
| 205 | +                   input_quant: Optional[Dict[str, Any]]) -> bool:
205 | 206 |         # Confirm weights and input quantized.
206 | 207 |         if weight_quant is None or input_quant is None:
207 | | -            logger.debug("Quark model is not in MX-FP4 format: weight_quant or input_quant not set")
| 208 | +            logger.debug("Quark model is not in MX-FP4 format: "
| 209 | +                         "weight_quant or input_quant not set")
208 | 210 |             return False
209 | 211 |
210 | 212 |         # Input and weight dtype needs to be fp4.
211 | | -        if weight_quant.get("dtype") != "fp4" or input_quant.get("dtype") != "fp4":
| 213 | +        if weight_quant.get("dtype") != "fp4" or input_quant.get(
| 214 | +                "dtype") != "fp4":
212 | 215 |             logger.debug("Quark model is not in MX-FP4 format: dtype not fp4")
213 | 216 |             return False
214 | 217 |
215 | 218 |         # Input and weight qscheme needs to be per group.
216 | | -        if weight_quant.get("qscheme") != "per_group" or input_quant.get("qscheme") != "per_group":
| 219 | +        if weight_quant.get("qscheme") != "per_group" or input_quant.get(
| 220 | +                "qscheme") != "per_group":
217 | 221 |             logger.debug("Quark model is not in MX-FP4 format: not per_group")
218 | 222 |             return False
219 | 223 |
220 | 224 |         # Input and weight group size needs to be 32.
221 | | -        if weight_quant.get("group_size") != 32 or input_quant.get("group_size") != 32:
222 | | -            logger.debug("Quark model is not in MX-FP4 format: not group_size=32")
| 225 | +        if weight_quant.get("group_size") != 32 or input_quant.get(
| 226 | +                "group_size") != 32:
| 227 | +            logger.debug(
| 228 | +                "Quark model is not in MX-FP4 format: not group_size=32")
223 | 229 |             return False
224 | 230 |
225 | 231 |         # Weights need to use static quantization.
226 | 232 |         if weight_quant.get("is_dynamic") is True:
227 | | -            logger.debug("Quark model is not in MX-FP4 format: not weight static")
| 233 | +            logger.debug(
| 234 | +                "Quark model is not in MX-FP4 format: not weight static")
228 | 235 |             return False
229 | 236 |
230 | 237 |         # Activations need to use dynamic quantization.
231 | 238 |         if input_quant.get("is_dynamic") is False:
232 | | -            logger.debug("Quark model is not in MX-FP4 format: not activation dynamic")
| 239 | +            logger.debug(
| 240 | +                "Quark model is not in MX-FP4 format: not activation dynamic")
233 | 241 |             return False
234 | 242 |
235 | 243 |         # Activations and weight scales need to be in e8m0 format.
236 | | -        if weight_quant.get("scale_format") != "e8m0" or input_quant.get("scale_format") != "e8m0":
237 | | -            logger.debug("Quark model is not in MX-FP4 format: not scale_format e8m0")
| 244 | +        if weight_quant.get("scale_format") != "e8m0" or input_quant.get(
| 245 | +                "scale_format") != "e8m0":
| 246 | +            logger.debug(
| 247 | +                "Quark model is not in MX-FP4 format: not scale_format e8m0")
238 | 248 |             return False
239 | 249 |
240 | 250 |         return True
241 | 251 |
242 | | -
243 | 252 |     def _find_matched_config(self, layer_name: str,
244 | 253 |                              module: torch.nn.Module) -> Dict[str, Any]:
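For reference, the conditions that `_is_mx_fp4` enforces can be condensed into a standalone predicate. The sketch below is illustrative only, not the vLLM implementation; the sample config dicts are hypothetical and carry just the keys the method inspects.

```python
# Minimal sketch of the MX-FP4 detection logic above (not vLLM code).
from typing import Any, Dict, Optional


def is_mx_fp4(weight_quant: Optional[Dict[str, Any]],
              input_quant: Optional[Dict[str, Any]]) -> bool:
    """Return True iff both configs describe MX-FP4 quantization."""
    if weight_quant is None or input_quant is None:
        return False
    for cfg in (weight_quant, input_quant):
        # Both sides need fp4 dtype, per-group scheme, group size 32,
        # and e8m0-formatted scales.
        if (cfg.get("dtype") != "fp4" or cfg.get("qscheme") != "per_group"
                or cfg.get("group_size") != 32
                or cfg.get("scale_format") != "e8m0"):
            return False
    # Weights must be statically quantized, activations dynamically.
    return (weight_quant.get("is_dynamic") is not True
            and input_quant.get("is_dynamic") is not False)


# Hypothetical configs that satisfy every check.
mx_fp4 = {"dtype": "fp4", "qscheme": "per_group", "group_size": 32,
          "scale_format": "e8m0"}
weight = {**mx_fp4, "is_dynamic": False}      # static weights
activation = {**mx_fp4, "is_dynamic": True}   # dynamic activations
assert is_mx_fp4(weight, activation)
assert not is_mx_fp4(weight, {**mx_fp4, "is_dynamic": False})
```

A config qualifies only when every check passes; the per-condition `logger.debug` calls in the diff exist so that a near-miss config reports which check rejected it.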