From 614960103e0545e040de58bf9f95df50a473dc4b Mon Sep 17 00:00:00 2001
From: ydshieh <ydshieh@users.noreply.github.com>
Date: Fri, 19 Sep 2025 16:20:10 +0200
Subject: [PATCH 1/5] fix

---
 tests/models/phimoe/test_modeling_phimoe.py | 68 ++++++++++++++-------
 1 file changed, 46 insertions(+), 22 deletions(-)

diff --git a/tests/models/phimoe/test_modeling_phimoe.py b/tests/models/phimoe/test_modeling_phimoe.py
index 46714244a14b..cb46a39c5518 100644
--- a/tests/models/phimoe/test_modeling_phimoe.py
+++ b/tests/models/phimoe/test_modeling_phimoe.py
@@ -14,12 +14,14 @@
 
 """Testing suite for the PyTorch PhiMoE model."""
 
+import copy
 import unittest
 
 from parameterized import parameterized
 
 from transformers import PhimoeConfig, StaticCache, is_torch_available
 from transformers.testing_utils import (
+    cleanup,
     require_torch,
     slow,
     torch_device,
@@ -130,31 +132,47 @@ def test_model_rope_scaling_from_config(self, scaling_type):
 @slow
 @require_torch
 class PhimoeIntegrationTest(unittest.TestCase):
-    def test_model_phimoe_instruct_logits(self):
-        input_ids = {
-            "input_ids": torch.tensor(
-                [[1212, 318, 281, 1672, 2643, 290, 428, 318, 257, 1332]], dtype=torch.long, device=torch_device
+    model = None
+
+    @classmethod
+    def get_model(cls):
+        if cls.model is None:
+            cls.model = PhimoeForCausalLM.from_pretrained(
+                "microsoft/Phi-3.5-MoE-instruct", dtype="auto", device_map="auto"
             )
-        }
+        return cls.model
+
+    @classmethod
+    def tearDownClass(cls):
+        del cls.model
+        cleanup(torch_device, gc_collect=True)
+
+    def setUp(self):
+        cleanup(torch_device, gc_collect=True)
+
+    def tearDown(self):
+        cleanup(torch_device, gc_collect=True)
+
+    def test_model_phimoe_instruct_logits(self):
+        input_ids = {"input_ids": torch.tensor([[1212, 318, 281, 1672]], dtype=torch.long, device=torch_device)}
 
-        model = PhimoeForCausalLM.from_pretrained("microsoft/Phi-3.5-MoE-instruct").to(torch_device)
+        model = self.get_model()
         model.eval()
 
-        output = model(**input_ids).logits
+        with torch.no_grad():
+            output = model(**input_ids).logits
 
-        EXPECTED_OUTPUT = torch.tensor([[-3.5312, -2.5000, -1.2734,  0.3555, -0.7578, -0.4727,  0.5977, -0.4316,
-          0.2256, -1.2188, -1.6797,  0.9961,  3.7656, 11.3125, -1.3828, -4.8438,
-         -5.7500, -1.9375,  0.7227, -0.3438, -0.2100, -0.4277, -0.0444, -0.5352,
-         -0.6406, -0.1016, -0.4258, -1.0234,  0.4297, -0.6250],
-        [-0.9883,  0.1455, -0.4902,  2.3594,  0.7031,  3.1406,  0.4375,  0.2559,
-          0.6172, -2.1094, -1.3359,  2.5938,  4.9062, 10.8125, -0.1094,  1.5781,
-         -4.9375,  0.7148, -0.0972,  1.7656, -0.0801,  0.2217,  0.1875, -0.4629,
-          1.5781,  0.3535,  0.0874,  0.6836, -0.0518, -1.2969]]).to(torch_device)  # fmt: skip
+        EXPECTED_OUTPUT = torch.tensor(
+            [
+                    [-3.4844, -2.4531, -1.1719, 0.6055, -0.4922, -0.1001, 0.8086, -0.2422, 0.3477, -1.0078],
+                    [-0.9766, 0.1631, -0.5508, 2.3594, 0.7031, 3.1719, 0.4141, 0.2305, 0.6055, -2.1250],
+            ]
+        ).to(device=torch_device, dtype=output.dtype)  # fmt: skip
 
-        torch.testing.assert_close(EXPECTED_OUTPUT, output[0, :2, :30], rtol=1e-4, atol=1e-4)
+        torch.testing.assert_close(output[0, :2, :10], EXPECTED_OUTPUT, rtol=1e-4, atol=1e-4)
 
     def test_phimoe_instruct_generation(self):
-        model = PhimoeForCausalLM.from_pretrained("microsoft/Phi-3.5-MoE-instruct")
+        model = self.get_model()
         tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-MoE-instruct")
 
         messages = [
@@ -166,17 +184,22 @@ def test_phimoe_instruct_generation(self):
         ]
         inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
 
-        outputs = model.generate(inputs, max_new_tokens=32)
+        outputs = model.generate(inputs, max_new_tokens=10)
         output_text = tokenizer.batch_decode(outputs)
 
         EXPECTED_OUTPUT = [
-            "<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Certainly! Bananas and dragonfruits are both delicious and nutritious fruits that can be combined in various ways to create tast"
+            "<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Certainly! Bananas and dragonf",
         ]
 
         self.assertListEqual(output_text, EXPECTED_OUTPUT)
 
     def test_phimoe_instruct_with_static_cache(self):
-        model = PhimoeForCausalLM.from_pretrained("microsoft/Phi-3.5-MoE-instruct")
+        model = self.get_model()
+        # Can't run with the real checkpoint, even if offloaded. Let's just use a tiny dummy one
+        config = copy.deepcopy(model.config)
+        config.num_hidden_layers = 2
+        torch.manual_seed(42)
+        model = type(model)(config)
         tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-MoE-instruct")
 
         messages = [
@@ -188,12 +211,13 @@ def test_phimoe_instruct_with_static_cache(self):
         ]
         inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
 
-        response_tokens = PhimoeMiniWithStaticCache.generate(model, inputs, 64)
+        response_tokens = PhimoeMiniWithStaticCache.generate(model, inputs, max_seq_len=10)
 
         output_text = tokenizer.batch_decode(torch.tensor([response_tokens], dtype=torch.long, device=torch_device))
 
+        # This is dummy outputs. We actually check if it could run with static cache, not the output quality.
         EXPECTED_OUTPUT = [
-            "<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Certainly! Bananas and dragonfruits are both delicious and nutritious fruits that can"
+            "<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|>ington"
         ]
 
         self.assertListEqual(output_text, EXPECTED_OUTPUT)

From b505efba72475f989adb50f0c2c36cfe2ce997a3 Mon Sep 17 00:00:00 2001
From: ydshieh <ydshieh@users.noreply.github.com>
Date: Fri, 19 Sep 2025 17:21:08 +0200
Subject: [PATCH 2/5] fix

---
 tests/models/phimoe/test_modeling_phimoe.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/models/phimoe/test_modeling_phimoe.py b/tests/models/phimoe/test_modeling_phimoe.py
index cb46a39c5518..ebdf9190ab12 100644
--- a/tests/models/phimoe/test_modeling_phimoe.py
+++ b/tests/models/phimoe/test_modeling_phimoe.py
@@ -184,7 +184,7 @@ def test_phimoe_instruct_generation(self):
         ]
         inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
 
-        outputs = model.generate(inputs, max_new_tokens=10)
+        outputs = model.generate(inputs, max_new_tokens=30)
         output_text = tokenizer.batch_decode(outputs)
 
         EXPECTED_OUTPUT = [
@@ -199,7 +199,8 @@ def test_phimoe_instruct_with_static_cache(self):
         config = copy.deepcopy(model.config)
         config.num_hidden_layers = 2
         torch.manual_seed(42)
-        model = type(model)(config)
+        model = PhimoeForCausalLM(config).to(torch_device)
+        model.eval()
         tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-MoE-instruct")
 
         messages = [

From 3577ec4e07e1f423b0f5b6d750d44480ac5ac5e6 Mon Sep 17 00:00:00 2001
From: ydshieh <ydshieh@users.noreply.github.com>
Date: Fri, 19 Sep 2025 17:26:48 +0200
Subject: [PATCH 3/5] fix

---
 tests/models/phimoe/test_modeling_phimoe.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/models/phimoe/test_modeling_phimoe.py b/tests/models/phimoe/test_modeling_phimoe.py
index ebdf9190ab12..876703394f53 100644
--- a/tests/models/phimoe/test_modeling_phimoe.py
+++ b/tests/models/phimoe/test_modeling_phimoe.py
@@ -188,9 +188,8 @@ def test_phimoe_instruct_generation(self):
         output_text = tokenizer.batch_decode(outputs)
 
         EXPECTED_OUTPUT = [
-            "<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Certainly! Bananas and dragonf",
+            "<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Certainly! Bananas and dragonfruits are both delicious and nutritious fruits that can be combined in various ways to create",
         ]
-
         self.assertListEqual(output_text, EXPECTED_OUTPUT)
 
     def test_phimoe_instruct_with_static_cache(self):

From d2d74fb5f4f8910b9d48a82b43741990b0b005b5 Mon Sep 17 00:00:00 2001
From: ydshieh <ydshieh@users.noreply.github.com>
Date: Fri, 19 Sep 2025 18:03:48 +0200
Subject: [PATCH 4/5] fix

---
 tests/models/phimoe/test_modeling_phimoe.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/tests/models/phimoe/test_modeling_phimoe.py b/tests/models/phimoe/test_modeling_phimoe.py
index 876703394f53..30999dda149e 100644
--- a/tests/models/phimoe/test_modeling_phimoe.py
+++ b/tests/models/phimoe/test_modeling_phimoe.py
@@ -197,6 +197,13 @@ def test_phimoe_instruct_with_static_cache(self):
         # Can't run with the real checkpoint, even if offloaded. Let's just use a tiny dummy one
         config = copy.deepcopy(model.config)
         config.num_hidden_layers = 2
+        # make `head_dim = 128`
+        config.hidden_size = 512
+        config.num_attention_heads = 4
+        config.num_key_value_heads = 1
+        config.intermediate_size = 512
+        config.max_position_embeddinqgs = 64
+        config.num_local_experts = 4
         torch.manual_seed(42)
         model = PhimoeForCausalLM(config).to(torch_device)
         model.eval()
@@ -209,7 +216,9 @@ def test_phimoe_instruct_with_static_cache(self):
             },
             {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
         ]
-        inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
+        inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(
+            torch_device
+        )
 
         response_tokens = PhimoeMiniWithStaticCache.generate(model, inputs, max_seq_len=10)
 
@@ -217,7 +226,7 @@ def test_phimoe_instruct_with_static_cache(self):
 
         # This is dummy outputs. We actually check if it could run with static cache, not the output quality.
         EXPECTED_OUTPUT = [
-            "<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|>ington"
+            "<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> awards"
         ]
 
         self.assertListEqual(output_text, EXPECTED_OUTPUT)

From 391d405a524d5c430b6c163209ed89fe112db998 Mon Sep 17 00:00:00 2001
From: ydshieh <ydshieh@users.noreply.github.com>
Date: Fri, 19 Sep 2025 18:35:06 +0200
Subject: [PATCH 5/5] fix

---
 tests/models/phimoe/test_modeling_phimoe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/phimoe/test_modeling_phimoe.py b/tests/models/phimoe/test_modeling_phimoe.py
index 30999dda149e..ac6fa3c2672a 100644
--- a/tests/models/phimoe/test_modeling_phimoe.py
+++ b/tests/models/phimoe/test_modeling_phimoe.py
@@ -220,7 +220,7 @@ def test_phimoe_instruct_with_static_cache(self):
             torch_device
         )
 
-        response_tokens = PhimoeMiniWithStaticCache.generate(model, inputs, max_seq_len=10)
+        response_tokens = PhimoeMiniWithStaticCache.generate(model, inputs, max_seq_len=30)
 
         output_text = tokenizer.batch_decode(torch.tensor([response_tokens], dtype=torch.long, device=torch_device))