Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
cadence: "commit"
test_type: "regression"
model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
cadence: "commit"
test_type: "regression"
model_stub: "nm-testing/tinyllama-w4a16-compressed"
model_stub: "nm-testing/tinyllama-w4a16-compressed"
empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
cadence: "commit"
test_type: "regression"
model_stub: "nm-testing/tinyllama-w8a16-dense"
model_stub: "nm-testing/tinyllama-w8a16-dense"
empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
cadence: "commit"
test_type: "regression"
model_stub: "nm-testing/tinyllama-w8a8-compressed"
model_stub: "nm-testing/tinyllama-w8a8-compressed"
empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
import unittest

import torch
from compressed_tensors import QUANTIZATION_CONFIG_NAME
from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import QuantizationStatus
from parameterized import parameterized_class
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

from tests.testing_utils import parse_params, requires_gpu, requires_torch

Expand All @@ -16,6 +19,7 @@
@parameterized_class(parse_params(CONFIG_DIR))
class TestQuantizationMatches(unittest.TestCase):
model_stub = None
empty_model = None

@classmethod
def setUpClass(cls):
Expand All @@ -29,25 +33,34 @@ def setUpClass(cls):
device_map="auto",
# run_compressed=True, # TODO: Give option on HFQuantizer
)
# TODO: Use ModelCompressor for decompression until it is supported through
# HFQuantizer or run_compressed can be turned off.
cls.uncompressed_model = AutoModelForCausalLM.from_pretrained(
cls.model_stub,
torch_dtype="auto",
device_map="auto",
# run_compressed=False, # TODO: Give option on HFQuantizer
cls.empty_model,
torch_dtype=cls.compressed_model.dtype,
device_map=cls.compressed_model.device,
)
config = AutoConfig.from_pretrained(cls.model_stub)
compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
cls.compressor = ModelCompressor.from_compression_config(compression_config)
cls.compressor.quantization_config.quantization_status = (
QuantizationStatus.FROZEN
)
cls.compressor.decompress(
model_path=cls.model_stub, model=cls.uncompressed_model
)

cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_stub)
cls.device = cls.compressed_model.device

def test_compressed_matches_uncompressed(self):
SAMPLE_INPUT = [
"I love 4-bit quantization because",
"What is the capital of Paris?",
"What is the capital of France?",
"def fibonacci(n):",
]

inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(
self.device
self.compressed_model.device
)
compressed_output = self.tokenizer.batch_decode(
self.compressed_model.generate(**inputs, max_length=50)
Expand Down
Loading