1 change: 1 addition & 0 deletions docker/peft-gpu/Dockerfile
@@ -53,6 +53,7 @@ RUN source activate peft && \
     "soundfile>=0.12.1" \
     scipy \
     torchao \
+    fbgemm-gpu-genai>=1.2.0 \
     git+https://github.com/huggingface/transformers \
     git+https://github.com/huggingface/accelerate \
     peft[test]@git+https://github.com/huggingface/peft \
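Aside on the Dockerfile change: `fbgemm-gpu-genai` is presumably pinned here because recent torchao int4 code paths can dispatch to FBGEMM GenAI kernels on GPU. A minimal smoke test one could run inside the rebuilt image to confirm the new dependency landed (the distribution names are the ones installed above; the rest is purely illustrative):

```python
# Illustrative smoke test for the rebuilt peft-gpu image (not part of this PR).
import importlib.metadata as md

import torch

# The distribution names below are the ones pinned in the Dockerfile.
print("fbgemm-gpu-genai:", md.version("fbgemm-gpu-genai"))  # expect >= 1.2.0
print("torchao:", md.version("torchao"))
print("CUDA available:", torch.cuda.is_available())
```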
103 changes: 9 additions & 94 deletions tests/test_gpu_examples.py
@@ -4054,14 +4054,13 @@ class PeftTorchaoGPUTests(unittest.TestCase):
     supported_quant_types = [
         "int8_weight_only",
         "int8_dynamic_activation_int8_weight",
-        # int4_weight_only raises an error:
-        # RuntimeError: derivative for aten::_weight_int4pack_mm is not implemented
-        # "int4_weight_only",
+        "int4_weight_only",
     ]

     def setUp(self):
         self.causal_lm_model_id = "facebook/opt-125m"
         self.tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id)
+        self.dtype = torch.bfloat16  # better support in torchao
         # torchao breaks with fp16 and if a previous test uses fp16, transformers will set this env var, which affects
         # subsequent tests, therefore the env var needs to be cleared explicitly
         #
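Re-enabling `int4_weight_only` in `supported_quant_types` means the standard LoRA flow is now expected to work on int4-quantized weights, provided the model is loaded in bf16. A minimal sketch of the pattern these tests exercise, using the same model id and LoRA hyperparameters as the test class and assuming a recent transformers/torchao combination:

```python
# Sketch of the int4 + LoRA path these tests now cover; assumes a recent
# transformers/torchao with training support for int4_weight_only.
import torch
from transformers import AutoModelForCausalLM, TorchAoConfig

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

quantization_config = TorchAoConfig(quant_type="int4_weight_only")
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",
    device_map=0,
    quantization_config=quantization_config,
    dtype=torch.bfloat16,  # bf16 has better torchao support than fp16
)
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
# Previously this raised "TorchaoLoraLinear only supports int8 weights for now"
# for int4_weight_only; the removed tests below asserted exactly that.
model = get_peft_model(model, config)
```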
@@ -4085,7 +4084,7 @@ def test_causal_lm_training_single_gpu_torchao(self, quant_type):
         with tempfile.TemporaryDirectory() as tmp_dir:
             quantization_config = TorchAoConfig(quant_type=quant_type)
             model = AutoModelForCausalLM.from_pretrained(
-                self.causal_lm_model_id, device_map=device, quantization_config=quantization_config
+                self.causal_lm_model_id, device_map=device, quantization_config=quantization_config, dtype=self.dtype
             )
             model = prepare_model_for_kbit_training(model)

@@ -4136,7 +4135,7 @@ def test_causal_lm_training_single_gpu_torchao_dora_int8_weight_only(self):
         with tempfile.TemporaryDirectory() as tmp_dir:
             quantization_config = TorchAoConfig(quant_type="int8_weight_only")
             model = AutoModelForCausalLM.from_pretrained(
-                self.causal_lm_model_id, device_map=device, quantization_config=quantization_config
+                self.causal_lm_model_id, device_map=device, quantization_config=quantization_config, dtype=self.dtype
             )
             model = prepare_model_for_kbit_training(model)

@@ -4187,7 +4186,7 @@ def test_causal_lm_training_single_gpu_torchao_dora_int8_dynamic_activation_int8

         quantization_config = TorchAoConfig(quant_type="int8_dynamic_activation_int8_weight")
         model = AutoModelForCausalLM.from_pretrained(
-            self.causal_lm_model_id, device_map=device, quantization_config=quantization_config
+            self.causal_lm_model_id, device_map=device, quantization_config=quantization_config, dtype=self.dtype
         )
         model = prepare_model_for_kbit_training(model)

@@ -4203,34 +4202,6 @@ def test_causal_lm_training_single_gpu_torchao_dora_int8_dynamic_activation_int8
         with pytest.raises(NotImplementedError):
             get_peft_model(model, config)

-    @pytest.mark.single_gpu_tests
-    def test_causal_lm_training_single_gpu_torchao_int4_raises(self):
-        # int4_weight_only raises an error:
-        # RuntimeError: derivative for aten::_weight_int4pack_mm is not implemented
-        # TODO: Once proper torchao support for int4 is added, remove this test and add int4 to supported_quant_types
-        from transformers import TorchAoConfig
-
-        device = 0
-
-        quantization_config = TorchAoConfig(quant_type="int4_weight_only")
-        model = AutoModelForCausalLM.from_pretrained(
-            self.causal_lm_model_id, device_map=device, quantization_config=quantization_config
-        )
-        model = prepare_model_for_kbit_training(model)
-
-        config = LoraConfig(
-            r=16,
-            lora_alpha=32,
-            target_modules=["q_proj", "v_proj"],
-            lora_dropout=0.05,
-            bias="none",
-            task_type="CAUSAL_LM",
-        )
-
-        msg = re.escape("TorchaoLoraLinear only supports int8 weights for now")
-        with pytest.raises(ValueError, match=msg):
-            get_peft_model(model, config)
-
     @parameterized.expand(supported_quant_types)
     @pytest.mark.multi_gpu_tests
     @require_torch_multi_accelerator
@@ -4264,7 +4235,7 @@ def test_causal_lm_training_multi_accelerator_torchao(self, quant_type):
                 self.causal_lm_model_id,
                 device_map=device_map,
                 quantization_config=quantization_config,
-                dtype=torch.bfloat16,
+                dtype=self.dtype,
             )

             assert set(model.hf_device_map.values()) == set(range(device_count))
@@ -4312,62 +4283,6 @@ def test_causal_lm_training_multi_accelerator_torchao(self, quant_type):
             # assert loss is not None
             assert trainer.state.log_history[-1]["train_loss"] is not None

-    @pytest.mark.multi_gpu_tests
-    @require_torch_multi_accelerator
-    def test_causal_lm_training_multi_accelerator_torchao_int4_raises(self):
-        # int4_weight_only raises an error:
-        # RuntimeError: derivative for aten::_weight_int4pack_mm is not implemented
-        # TODO: Once proper torchao support for int4 is added, remove this test and add int4 to supported_quant_types
-        from transformers import TorchAoConfig
-
-        device_map = {
-            "model.decoder.embed_tokens": 0,
-            "lm_head": 0,
-            "model.decoder.embed_positions": 0,
-            "model.decoder.project_out": 0,
-            "model.decoder.project_in": 0,
-            "model.decoder.layers.0": 0,
-            "model.decoder.layers.1": 0,
-            "model.decoder.layers.2": 0,
-            "model.decoder.layers.3": 0,
-            "model.decoder.layers.4": 0,
-            "model.decoder.layers.5": 0,
-            "model.decoder.layers.6": 1,
-            "model.decoder.layers.7": 1,
-            "model.decoder.layers.8": 1,
-            "model.decoder.layers.9": 1,
-            "model.decoder.layers.10": 1,
-            "model.decoder.layers.11": 1,
-            "model.decoder.final_layer_norm": 1,
-        }
-        quantization_config = TorchAoConfig(quant_type="int4_weight_only")
-        model = AutoModelForCausalLM.from_pretrained(
-            self.causal_lm_model_id,
-            device_map=device_map,
-            quantization_config=quantization_config,
-            dtype=torch.bfloat16,
-        )
-
-        assert set(model.hf_device_map.values()) == set(range(device_count))
-        assert {p.device.index for p in model.parameters()} == set(range(device_count))
-
-        model = prepare_model_for_kbit_training(model)
-        model.model_parallel = True
-        model.is_parallelizable = True
-
-        config = LoraConfig(
-            r=16,
-            lora_alpha=32,
-            target_modules=["q_proj", "v_proj"],
-            lora_dropout=0.05,
-            bias="none",
-            task_type="CAUSAL_LM",
-        )
-
-        msg = re.escape("TorchaoLoraLinear only supports int8 weights for now")
-        with pytest.raises(ValueError, match=msg):
-            get_peft_model(model, config)
-
     @pytest.mark.single_gpu_tests
     def test_torchao_merge_layers_int8_weight_only(self):
         from torchao.dtypes import AffineQuantizedTensor
@@ -4380,7 +4295,7 @@ def test_torchao_merge_layers_int8_weight_only(self):

         quantization_config = TorchAoConfig(quant_type=quant_type)
         model = AutoModelForCausalLM.from_pretrained(
-            self.causal_lm_model_id, device_map=device, quantization_config=quantization_config
+            self.causal_lm_model_id, device_map=device, quantization_config=quantization_config, dtype=self.dtype
         ).eval()
         logits_base = model(dummy_input)[0]

@@ -4400,7 +4315,7 @@ def test_torchao_merge_layers_int8_weight_only(self):

         # sanity check: outputs changed
         # precision is quite low, so we need to use high atol and rtol
-        atol, rtol = 1e-1, 1e-1
+        atol, rtol = 2e-1, 2e-1
         assert not torch.allclose(logits, logits_base, atol=atol, rtol=rtol)

         model.merge_adapter()
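Note on the loosened tolerances: `torch.allclose` treats two tensors as close when `|a - b| <= atol + rtol * |b|` holds elementwise, so raising both values to 2e-1 widens that band, presumably because the newly covered int4 path deviates more than int8. A tiny illustration with made-up numbers:

```python
import torch

a = torch.tensor([1.0])
b = torch.tensor([1.3])  # difference of 0.3

# allclose holds when |a - b| <= atol + rtol * |b| elementwise.
print(torch.allclose(a, b, atol=1e-1, rtol=1e-1))  # False: 0.3 > 0.1 + 0.1 * 1.3 = 0.23
print(torch.allclose(a, b, atol=2e-1, rtol=2e-1))  # True:  0.3 <= 0.2 + 0.2 * 1.3 = 0.46
```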
@@ -4433,7 +4348,7 @@ def test_torchao_merge_layers_int8_dynamic_activation_int8_weight_raises(self):

         quantization_config = TorchAoConfig(quant_type=quant_type)
         model = AutoModelForCausalLM.from_pretrained(
-            self.causal_lm_model_id, device_map=device, quantization_config=quantization_config
+            self.causal_lm_model_id, device_map=device, quantization_config=quantization_config, dtype=self.dtype
         ).eval()

         config = LoraConfig(