Commit 79126f9

more clean-up

1 parent bbf575e

4 files changed, 2 additions and 14 deletions

vllm/model_executor/layers/quantization/awq.py

Lines changed: 1 addition & 1 deletion

@@ -169,4 +169,4 @@ def apply(self,
                            pack_factor)
         if bias is not None:
             out.add_(bias)
-            return out.reshape(out_shape)
+        return out.reshape(out_shape)
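This hunk changes only indentation: the return moves out of the `if bias is not None:` block, so the reshaped output is returned whether or not a bias is supplied. A minimal sketch of the resulting epilogue, with `out`, `out_shape`, and `bias` assumed from the visible context:

    from typing import Optional

    import torch

    def awq_epilogue(out: torch.Tensor, out_shape: tuple,
                     bias: Optional[torch.Tensor]) -> torch.Tensor:
        # Sketch of the post-GEMM epilogue after this commit; the helper
        # name is hypothetical, not part of the diff.
        if bias is not None:
            out.add_(bias)  # in-place bias add, only when a bias exists
        return out.reshape(out_shape)  # always reached after the fix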

vllm/model_executor/layers/quantization/gptq_marlin.py

Lines changed: 0 additions & 1 deletion

@@ -509,7 +509,6 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         )
         replace_parameter(layer, "w2_qweight", marlin_w2_qweight)
         # Repack scales
-        # Why does this take the intermediate size for size_k?
         marlin_w13_scales = marlin_moe_permute_scales(
             s=layer.w13_scales,
             size_k=layer.intermediate_size_per_partition,

vllm/model_executor/layers/quantization/utils/marlin_utils.py

Lines changed: 0 additions & 11 deletions

@@ -273,17 +273,6 @@ def moe_awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
     return output


-# Newly generated tensors need to replace existing tensors that are
-# already registered as parameters by vLLM (and won't be freed)
-def replace_tensor(layer: torch.nn.Module, name: str,
-                   new_t: torch.Tensor) -> None:
-    # It is important to use resize_() here since it ensures
-    # the same buffer is reused
-    getattr(layer, name).resize_(new_t.shape)
-    getattr(layer, name).copy_(new_t)
-    del new_t
-
-
 def apply_gptq_marlin_linear(
     input: torch.Tensor,
     weight: torch.Tensor,
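The removed `replace_tensor()` helper swapped newly repacked tensors into attributes that vLLM had already registered as parameters, reusing the existing buffer via `resize_()` plus `copy_()`. The gptq_marlin.py hunk above shows its apparent successor, `replace_parameter()`, whose definition is not part of this diff. A hypothetical sketch of what such a replacement has to do, since plain attribute assignment would leave the old parameter registered and its storage alive:

    import torch

    def replace_parameter_sketch(module: torch.nn.Module, name: str,
                                 new_t: torch.Tensor) -> None:
        # Hypothetical stand-in for replace_parameter(); the real helper
        # is not shown in this diff. Drop the old registered parameter so
        # its storage can be freed, then re-register the repacked tensor
        # under the same name.
        delattr(module, name)
        module.register_parameter(
            name, torch.nn.Parameter(new_t, requires_grad=False))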

vllm/model_executor/model_loader/utils.py

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@ def get_model_architecture(
     # Special handling for quantized Mixtral.
     # FIXME(woosuk): This is a temporary hack.
     mixtral_supported = [
-        "fp8", "compressed-tensors", "gptq_marlin", "awq", "awq_marlin"
+        "fp8", "compressed-tensors", "gptq_marlin", "awq_marlin"
     ]

     if (model_config.quantization is not None
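The hunk cuts off at the start of the condition that consumes `mixtral_supported`. A hedged reconstruction of how that check plausibly continues (names follow the visible context; the exact source may differ). Under this reading, dropping "awq" means plain-AWQ Mixtral checkpoints no longer take the fused `MixtralForCausalLM` path and instead fall back to the quantized variant:

    # Hedged reconstruction, not verbatim source: quantization methods
    # outside mixtral_supported are rerouted to the unfused quantized
    # Mixtral implementation.
    if (model_config.quantization is not None
            and model_config.quantization not in mixtral_supported
            and "MixtralForCausalLM" in architectures):
        architectures = ["QuantMixtralForCausalLM"]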
