68 changes: 46 additions & 22 deletions tests/models/phimoe/test_modeling_phimoe.py
@@ -14,12 +14,14 @@

"""Testing suite for the PyTorch PhiMoE model."""

+import copy
import unittest

from parameterized import parameterized

from transformers import PhimoeConfig, StaticCache, is_torch_available
from transformers.testing_utils import (
+    cleanup,
require_torch,
slow,
torch_device,
@@ -130,31 +132,47 @@ def test_model_rope_scaling_from_config(self, scaling_type):
@slow
@require_torch
class PhimoeIntegrationTest(unittest.TestCase):
-    def test_model_phimoe_instruct_logits(self):
-        input_ids = {
-            "input_ids": torch.tensor(
-                [[1212, 318, 281, 1672, 2643, 290, 428, 318, 257, 1332]], dtype=torch.long, device=torch_device
+    model = None
+
+    @classmethod
+    def get_model(cls):
+        if cls.model is None:
+            cls.model = PhimoeForCausalLM.from_pretrained(
+                "microsoft/Phi-3.5-MoE-instruct", dtype="auto", device_map="auto"
            )
-        }
+        return cls.model

+    @classmethod
+    def tearDownClass(cls):
+        del cls.model
+        cleanup(torch_device, gc_collect=True)
+
+    def setUp(self):
+        cleanup(torch_device, gc_collect=True)
+
+    def tearDown(self):
+        cleanup(torch_device, gc_collect=True)
+
+    def test_model_phimoe_instruct_logits(self):
+        input_ids = {"input_ids": torch.tensor([[1212, 318, 281, 1672]], dtype=torch.long, device=torch_device)}

-        model = PhimoeForCausalLM.from_pretrained("microsoft/Phi-3.5-MoE-instruct").to(torch_device)
+        model = self.get_model()
model.eval()

-        output = model(**input_ids).logits
+        with torch.no_grad():
+            output = model(**input_ids).logits

-        EXPECTED_OUTPUT = torch.tensor([[-3.5312, -2.5000, -1.2734, 0.3555, -0.7578, -0.4727, 0.5977, -0.4316,
-                                         0.2256, -1.2188, -1.6797, 0.9961, 3.7656, 11.3125, -1.3828, -4.8438,
-                                         -5.7500, -1.9375, 0.7227, -0.3438, -0.2100, -0.4277, -0.0444, -0.5352,
-                                         -0.6406, -0.1016, -0.4258, -1.0234, 0.4297, -0.6250],
-                                        [-0.9883, 0.1455, -0.4902, 2.3594, 0.7031, 3.1406, 0.4375, 0.2559,
-                                         0.6172, -2.1094, -1.3359, 2.5938, 4.9062, 10.8125, -0.1094, 1.5781,
-                                         -4.9375, 0.7148, -0.0972, 1.7656, -0.0801, 0.2217, 0.1875, -0.4629,
-                                         1.5781, 0.3535, 0.0874, 0.6836, -0.0518, -1.2969]]).to(torch_device) # fmt: skip
+        EXPECTED_OUTPUT = torch.tensor(
+            [
+                [-3.4844, -2.4531, -1.1719, 0.6055, -0.4922, -0.1001, 0.8086, -0.2422, 0.3477, -1.0078],
+                [-0.9766, 0.1631, -0.5508, 2.3594, 0.7031, 3.1719, 0.4141, 0.2305, 0.6055, -2.1250],
+            ]
+        ).to(device=torch_device, dtype=output.dtype) # fmt: skip

-        torch.testing.assert_close(EXPECTED_OUTPUT, output[0, :2, :30], rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(output[0, :2, :10], EXPECTED_OUTPUT, rtol=1e-4, atol=1e-4)

def test_phimoe_instruct_generation(self):
-        model = PhimoeForCausalLM.from_pretrained("microsoft/Phi-3.5-MoE-instruct")
+        model = self.get_model()
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-MoE-instruct")

messages = [
@@ -166,17 +184,22 @@ def test_phimoe_instruct_generation(self):
]
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")

-        outputs = model.generate(inputs, max_new_tokens=32)
+        outputs = model.generate(inputs, max_new_tokens=10)
output_text = tokenizer.batch_decode(outputs)

EXPECTED_OUTPUT = [
"<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Certainly! Bananas and dragonfruits are both delicious and nutritious fruits that can be combined in various ways to create tast"
"<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Certainly! Bananas and dragonf",
]

self.assertListEqual(output_text, EXPECTED_OUTPUT)

def test_phimoe_instruct_with_static_cache(self):
-        model = PhimoeForCausalLM.from_pretrained("microsoft/Phi-3.5-MoE-instruct")
+        model = self.get_model()
+        # Can't run with the real checkpoint, even if offloaded. Let's just use a tiny dummy one
+        config = copy.deepcopy(model.config)
+        config.num_hidden_layers = 2
+        torch.manual_seed(42)
+        model = type(model)(config)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-MoE-instruct")

messages = [
@@ -188,12 +211,13 @@ def test_phimoe_instruct_with_static_cache(self):
]
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")

-        response_tokens = PhimoeMiniWithStaticCache.generate(model, inputs, 64)
+        response_tokens = PhimoeMiniWithStaticCache.generate(model, inputs, max_seq_len=10)

Contributor:
Isn't it just one (or a few) tokens generated then? Can we increase the output a bit, at least?

Collaborator (author):
We are using a dummy random model (with a fixed seed), so there is no point in generating more tokens and comparing the results here.

Contributor:
I just want to make sure that we can at least generate a few tokens (e.g. 5), even if it is a dummy model. The reason is more that the static cache goes through prefill and decoding for a few steps and doesn't break - not the output itself.

Also, can we use max_new_tokens instead? With max_seq_len, I need to know the prompt token count myself to work out how much output I get.

Collaborator (author):
I can change it to 30 here. It's already 10, which is larger than 5.

Regarding max_seq_len:

class PhimoeMiniWithStaticCache(torch.nn.Module):

    @staticmethod
    def generate(model: PhimoeForCausalLM, prompt_tokens: torch.LongTensor, max_seq_len: int) -> list[int]:
        model = PhimoeMiniWithStaticCache(model, 1, max_seq_len + prompt_tokens.shape[-1])

It behaves as max_new_tokens.

I have no idea why we define PhimoeMiniWithStaticCache in the test file in the first place, but I don't want to spend any more time on this PR.
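(In other words, a sketch only, reusing the names from the test above rather than a verified API: the helper appears to size its static cache to prompt_tokens.shape[-1] + max_seq_len, so max_seq_len bounds the number of newly generated tokens, which is the role max_new_tokens plays in generate().)

    # Sketch: with the helper quoted above, max_seq_len counts *new* tokens, because the
    # cache is allocated as prompt length + max_seq_len (model, inputs, tokenizer as in the test).
    response_tokens = PhimoeMiniWithStaticCache.generate(model, inputs, max_seq_len=10)
    # The returned list holds the prompt tokens followed by at most ~10 newly generated tokens.
    output_text = tokenizer.batch_decode(torch.tensor([response_tokens], dtype=torch.long, device=torch_device))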

Contributor:
Oh damn, I didn't notice it was a separate class... Could it be that the gradients are also calculated, and thus the memory usage shoots up? And thanks for clearing that up; that was a misunderstanding of mine with the new class.

Contributor:
> Could it be that the gradients are also calculated and thus the memory usage shoots up?

Can you check this, with torch.no_grad or similar?
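(For reference, a minimal sketch of that suggestion, assuming the PhimoeMiniWithStaticCache helper defined in this test file and the model/inputs from the test above: torch.no_grad() keeps the forward passes out of autograd, so activations are not retained for backward.)

    import torch

    # Sketch: run the custom static-cache generation without building an autograd graph,
    # so intermediate activations are freed and memory does not balloon during decoding.
    with torch.no_grad():
        response_tokens = PhimoeMiniWithStaticCache.generate(model, inputs, max_seq_len=10)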

Contributor:
> Changed it to 30, but the generation just stops early.

That's on me, I didn't notice it was a separate class 😢 It was fine before as well; I thought it just generated like one token. I'm getting tired, argh.

Collaborator (author):
Oh, good point on the gradient! Let me check - you are so good!

Collaborator (author):
With it, no more GPU OOM, but yeah, it only generates about one token. I will change back to using the real model, but I won't spend more time on this custom class defined in the test file.

Contributor:
Yeah, no worries, no need to invest too deeply there. But that explains the overly high memory usage :D


output_text = tokenizer.batch_decode(torch.tensor([response_tokens], dtype=torch.long, device=torch_device))

+        # This is dummy outputs. We actually check if it could run with static cache, not the output quality.
EXPECTED_OUTPUT = [
"<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Certainly! Bananas and dragonfruits are both delicious and nutritious fruits that can"
"<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|>ington"
]

self.assertListEqual(output_text, EXPECTED_OUTPUT)