
Commit 56b355a

rebase + fix layer_utils
1 parent 0decaa3 commit 56b355a

File tree

3 files changed, +10 -13 lines changed:

tests/compile/test_full_graph.py
vllm/_custom_ops.py
vllm/model_executor/layers/quantization/utils/layer_utils.py

tests/compile/test_full_graph.py

Lines changed: 3 additions & 3 deletions
@@ -28,13 +28,13 @@
 ]

 # TODO: enable in pytorch 2.5
-if False and is_quant_method_supported("aqlm"):
+if False and is_quant_method_supported("aqlm"):  # noqa: SIM223
     TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
         "quantization": "aqlm"
     }))

 # TODO: enable in pytorch 2.5
-if False and is_quant_method_supported("gguf"):
+if False and is_quant_method_supported("gguf"):  # noqa: SIM223
     TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
         "quantization": "gguf"
     }))

@@ -85,7 +85,7 @@ def test_full_graph(model_info, tp_size, backend):

     # Inductor doesn't support fp8/gptq_marlin_24 yet.
     quantization = model_kwargs.get("quantization")
-    if (quantization == "fp8"
+    if (quantization == "fp8" or quantization == "gptq_marlin"
             or quantization == "gptq_marlin_24") and backend != "eager":
         return
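
The `# noqa: SIM223` markers silence the flake8-simplify/ruff rule that flags `and` expressions containing `False`; the `False and ...` prefix here is an intentional temporary disable (see the TODO), not a bug. The second hunk extends the existing Inductor skip to also cover `gptq_marlin` on non-eager backends. A minimal standalone sketch of the silenced pattern; `is_feature_supported` is a hypothetical stand-in, not part of the test suite:

# `False and ...` keeps the block syntactically valid while guaranteeing it
# never runs; SIM223 flags the always-false expression, so the explicit noqa
# marks the disable as intentional.
def is_feature_supported(name: str) -> bool:  # hypothetical stand-in helper
    return True

if False and is_feature_supported("demo"):  # noqa: SIM223
    print("temporarily disabled branch")
else:
    print("short-circuit: is_feature_supported() is never called")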

vllm/_custom_ops.py

Lines changed: 3 additions & 7 deletions
@@ -416,8 +416,8 @@ def _fp8_marlin_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor,
 @torch.library.register_fake("_C::machete_gemm")
 def machete_gemm_fake(
         a: torch.Tensor,
-        b_q: torch.
-        Tensor,  # Should be the tensor returned by machete_prepack_B
+        # Should be the tensor returned by machete_prepack_B
+        b_q: torch.Tensor,
         b_type: ScalarType,
         b_scales: Optional[torch.Tensor] = None,
         b_zeros: Optional[torch.Tensor] = None,

@@ -613,16 +613,12 @@ def machete_prepack_B(b_q_weight: torch.Tensor,
     return torch.ops._C.machete_prepack_B(b_q_weight, b_type)


-# TODO: has to be a better way to do this
-try:
-    torch.ops._C.permute_cols  # noqa B018
+if hasattr(torch.ops._C, 'permute_cols'):

     @torch.library.register_fake("_C::permute_cols")
     def _permute_cols_fake(a: torch.Tensor,
                            perm: torch.Tensor) -> torch.Tensor:
         return torch.empty_like(a)
-except Exception:
-    pass


 def permute_cols(a: torch.Tensor, perm: torch.Tensor) -> torch.Tensor:
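
The first hunk only moves the `b_q` comment onto its own line so the formatter no longer splits the `torch.Tensor` annotation. The second hunk swaps the try/except probe for the optional `permute_cols` op for a plain `hasattr` check before registering its fake (meta) implementation. A minimal sketch of the two probing styles, using the always-present `torch.ops.aten` namespace and a deliberately nonexistent op name so it runs on stock PyTorch (not vLLM code):

import torch

# New style: hasattr() simply returns False when the op was not compiled in,
# so optional registration can be guarded by a plain `if`.
if hasattr(torch.ops.aten, "nonexistent_demo_op"):
    print("op available: register its fake/meta implementation here")
else:
    print("op missing: skip registration")

# Old style: touch the attribute and swallow the resulting error.
try:
    torch.ops.aten.nonexistent_demo_op  # noqa: B018
    print("op available")
except AttributeError:
    print("op missing (same information, but via exception handling)")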

vllm/model_executor/layers/quantization/utils/layer_utils.py

Lines changed: 4 additions & 3 deletions
@@ -21,13 +21,14 @@ def replace_parameter(mod: torch.nn.Module, name: str,
                       new: Union[torch.Tensor, torch.nn.Parameter]) -> None:

     old = getattr(mod, name)
-    if old.dtype == new.dtype and \
+    if type(old) is type(new) and old.dtype == new.dtype and \
         old.untyped_storage().nbytes() == new.untyped_storage().nbytes():
         # If we can just update in-place to avoid re-registering
         # can be faster if the underlying storage is the same
         update_tensor_inplace(old, new)
     else:
         # Fallback re-register parameter
         if not isinstance(new, torch.nn.Parameter):
-            new = torch.nn.Parameter(new)
-        mod.register_parameter(name, torch.nn.Parameter(new))
+            new = torch.nn.Parameter(new, requires_grad=False)
+        mod.register_parameter(name,
+                               torch.nn.Parameter(new, requires_grad=False))
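
`replace_parameter` now takes the in-place fast path only when old and new are the same class (presumably so an in-place copy never silently mixes a plain tensor with a subclassed parameter type), and the re-registration fallback wraps the tensor with `requires_grad=False`, because `torch.nn.Parameter` defaults to `requires_grad=True`, which is not wanted for inference-only weights. A small standalone illustration of that default, using a plain `torch.nn.Linear` rather than the vLLM helper:

import torch

linear = torch.nn.Linear(4, 4)
new_weight = torch.randn(4, 4)

# Wrapping with the defaults silently turns gradient tracking back on.
print(torch.nn.Parameter(new_weight).requires_grad)   # True

# Passing requires_grad=False matches the inference-only intent of the swap.
linear.register_parameter(
    "weight", torch.nn.Parameter(new_weight, requires_grad=False))
print(linear.weight.requires_grad)                     # False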

0 commit comments
