Commit 79126f9

more clean-up

1 parent bbf575e

4 files changed, 2 additions and 14 deletions

vllm/model_executor/layers/quantization/awq.py

Lines changed: 1 addition & 1 deletion

@@ -169,4 +169,4 @@ def apply(self,
                            pack_factor)
         if bias is not None:
             out.add_(bias)
-            return out.reshape(out_shape)
+        return out.reshape(out_shape)
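This hunk changes only indentation: the return moves out of the `if bias is not None:` block, so the reshaped output is returned whether or not a bias is supplied. A minimal sketch of the resulting epilogue, with `out`, `out_shape`, and `bias` assumed from the visible context:

    from typing import Optional

    import torch

    def awq_epilogue(out: torch.Tensor, out_shape: tuple,
                     bias: Optional[torch.Tensor]) -> torch.Tensor:
        # Sketch of the post-GEMM epilogue after this commit; the helper
        # name is hypothetical, not part of the diff.
        if bias is not None:
            out.add_(bias)  # in-place bias add, only when a bias exists
        return out.reshape(out_shape)  # always reached after the fix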

vllm/model_executor/layers/quantization/gptq_marlin.py

Lines changed: 0 additions & 1 deletion

@@ -509,7 +509,6 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         )
         replace_parameter(layer, "w2_qweight", marlin_w2_qweight)
         # Repack scales
-        # Why does this take the intermediate size for size_k?
         marlin_w13_scales = marlin_moe_permute_scales(
             s=layer.w13_scales,
             size_k=layer.intermediate_size_per_partition,

vllm/model_executor/layers/quantization/utils/marlin_utils.py

Lines changed: 0 additions & 11 deletions

@@ -273,17 +273,6 @@ def moe_awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
     return output


-# Newly generated tensors need to replace existing tensors that are
-# already registered as parameters by vLLM (and won't be freed)
-def replace_tensor(layer: torch.nn.Module, name: str,
-                   new_t: torch.Tensor) -> None:
-    # It is important to use resize_() here since it ensures
-    # the same buffer is reused
-    getattr(layer, name).resize_(new_t.shape)
-    getattr(layer, name).copy_(new_t)
-    del new_t
-
-
 def apply_gptq_marlin_linear(
     input: torch.Tensor,
     weight: torch.Tensor,
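The removed `replace_tensor()` helper swapped newly repacked tensors into attributes that vLLM had already registered as parameters, reusing the existing buffer via `resize_()` plus `copy_()`. The gptq_marlin.py hunk above shows its apparent successor, `replace_parameter()`, whose definition is not part of this diff. A hypothetical sketch of what such a replacement has to do, since plain attribute assignment would leave the old parameter registered and its storage alive:

    import torch

    def replace_parameter_sketch(module: torch.nn.Module, name: str,
                                 new_t: torch.Tensor) -> None:
        # Hypothetical stand-in for replace_parameter(); the real helper
        # is not shown in this diff. Drop the old registered parameter so
        # its storage can be freed, then re-register the repacked tensor
        # under the same name.
        delattr(module, name)
        module.register_parameter(
            name, torch.nn.Parameter(new_t, requires_grad=False))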

vllm/model_executor/model_loader/utils.py

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@ def get_model_architecture(
     # Special handling for quantized Mixtral.
     # FIXME(woosuk): This is a temporary hack.
     mixtral_supported = [
-        "fp8", "compressed-tensors", "gptq_marlin", "awq", "awq_marlin"
+        "fp8", "compressed-tensors", "gptq_marlin", "awq_marlin"
     ]

     if (model_config.quantization is not None
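The hunk cuts off at the start of the condition that consumes `mixtral_supported`. A hedged reconstruction of how that check plausibly continues (names follow the visible context; the exact source may differ). Under this reading, dropping "awq" means plain-AWQ Mixtral checkpoints no longer take the fused `MixtralForCausalLM` path and instead fall back to the quantized variant:

    # Hedged reconstruction, not verbatim source: quantization methods
    # outside mixtral_supported are rerouted to the unfused quantized
    # Mixtral implementation.
    if (model_config.quantization is not None
            and model_config.quantization not in mixtral_supported
            and "MixtralForCausalLM" in architectures):
        architectures = ["QuantMixtralForCausalLM"]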
