Merged (42 commits)
5447dc5  Replace the block_pool list with the vLLM Block Pool (maxdebayser, Nov 27, 2025)
a43e072  manage padding blocks outside of block pool (maxdebayser, Nov 28, 2025)
6378294  Switch to Single Type KV Cache manager (maxdebayser, Nov 30, 2025)
5f188ac  Add prefix caching (maxdebayser, Nov 30, 2025)
ed7441f  Fix small errors (maxdebayser, Nov 30, 2025)
6983f2a  Fix prefix caching path (maxdebayser, Dec 1, 2025)
f8dd1d2  fix linting problem (maxdebayser, Dec 1, 2025)
78a8b84  Merge branch 'integrate_block_pool' into sched_agnostic_pc (maxdebayser, Dec 1, 2025)
0296747  address review comments (maxdebayser, Dec 1, 2025)
8420160  Merge branch 'main' into integrate_block_pool (maxdebayser, Dec 2, 2025)
fbaf933  Merge branch 'integrate_block_pool' into sched_agnostic_pc (maxdebayser, Dec 2, 2025)
ab2f4d3  fix mispelling (maxdebayser, Dec 2, 2025)
a514310  address some review comments (maxdebayser, Dec 2, 2025)
ec9b1f6  tmp hack: run tests on this branch (yannicks1, Dec 3, 2025)
76059a4  fix: tmp hack to run tests (yannicks1, Dec 3, 2025)
41d3ea9  fix bug when no cache is found (maxdebayser, Dec 3, 2025)
dc92a9b  add first unit test prefix caching (yannicks1, Dec 3, 2025)
041b3fa  disable prefix caching by default and enable tests (maxdebayser, Dec 4, 2025)
a491e7d  Merge branch 'main' into integrate_block_pool (maxdebayser, Dec 4, 2025)
f71df91  address review comments (maxdebayser, Dec 4, 2025)
e14acff  Merge branch 'integrate_block_pool' into sched_agnostic_pc (maxdebayser, Dec 4, 2025)
4808379  reduce test repetition (maxdebayser, Dec 4, 2025)
f201675  revert bad change (maxdebayser, Dec 4, 2025)
4d4a228  Merge branch 'integrate_block_pool' into sched_agnostic_pc (maxdebayser, Dec 4, 2025)
a11a9b5  add test: prefix hit of a seq not part of the batch. (yannicks1, Dec 4, 2025)
fa51002  reset prefixes across tests in cached engine (yannicks1, Dec 4, 2025)
78c9141  Merge branch 'main' into sched_agnostic_pc (maxdebayser, Dec 4, 2025)
5c45428  adding tests: limit number of blocks (yannicks1, Dec 4, 2025)
8c96eec  Merge branch 'main' into sched_agnostic_pc (yannicks1, Dec 4, 2025)
4823a7f  Merge branch 'main' into sched_agnostic_pc (yannicks1, Dec 8, 2025)
fe8d383  Fix test (maxdebayser, Dec 8, 2025)
a1d140b  Merge branch 'sched_agnostic_pc' of github.com:vllm-project/vllm-spyr… (maxdebayser, Dec 8, 2025)
882a530  revert tmp hack (yannicks1, Dec 8, 2025)
cbc2b35  fix isort (yannicks1, Dec 8, 2025)
ddbbeec  add more tests and fix a small bug in the model runner (maxdebayser, Dec 8, 2025)
9559d46  appease linter (maxdebayser, Dec 8, 2025)
9247e92  Merge branch 'sched_agnostic_pc' of github.com:vllm-project/vllm-spyr… (maxdebayser, Dec 8, 2025)
fd7cc90  update hf_cache (maxdebayser, Dec 8, 2025)
2fa71e5  address review comments (maxdebayser, Dec 9, 2025)
5495953  address review comments (maxdebayser, Dec 9, 2025)
d9c4d4a  improve comment (maxdebayser, Dec 9, 2025)
f0de597  Merge branch 'main' into sched_agnostic_pc (maxdebayser, Dec 9, 2025)
12 changes: 9 additions & 3 deletions tests/conftest.py
@@ -104,11 +104,13 @@ def pytest_generate_tests(metafunc):
# markers
if ("mode" in metafunc.fixturenames and "cb" not in existing_markers
and "chunked_prefill" not in existing_markers
and "cp" not in existing_markers and "pc" not in existing_markers
and "mode" not in existing_markers):
metafunc.parametrize("mode", [
"sb",
pytest.param("cb", marks=pytest.mark.cb, id="cb"),
pytest.param("cp", marks=pytest.mark.chunked_prefill, id="cp")
pytest.param("cp", marks=pytest.mark.chunked_prefill, id="cp"),
pytest.param("pc", marks=pytest.mark.prefix_caching, id="pc")
])


@@ -252,7 +254,7 @@ def remote_openai_server(request):
skip_unsupported_tp_size(int(tp_size), backend)
server_args.extend(["--tensor-parallel-size", str(tp_size)])

if "mode" in params and params["mode"] in ["cb", "cp"]:
if "mode" in params and params["mode"] in ["cb", "cp", "pc"]:
max_model_len = params["max_model_len"]
max_num_seqs = params["max_num_seqs"]
env_dict = {
@@ -265,12 +267,16 @@
str(max_model_len)
])
# Chunked prefill extra
if params["mode"] == "cp":
if params["mode"] in ["cp", "pc"]:
env_dict.update({"VLLM_SPYRE_USE_CHUNKED_PREFILL": "1"})
server_args.extend([
"--max_num_batched_tokens",
str(128),
])
if params["mode"] == "pc":
server_args.extend([
"--enable-prefix-caching",
])

else:
warmup_shapes = params['warmup_shapes']
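For context, here is a minimal self-contained sketch (not part of the diff) of the extra server setup the updated fixture applies for the "cp" and "pc" modes: the new "pc" mode inherits the chunked-prefill settings and additionally enables prefix caching. The initial contents of `env_dict` and `server_args` are collapsed in the diff and are therefore left empty here.

```python
# Sketch only, mirroring the fixture logic above; not the fixture itself.
mode = "pc"  # or "cp"
env_dict: dict[str, str] = {}   # other entries are collapsed in the diff
server_args: list[str] = []     # model/length/seq arguments are collapsed in the diff

if mode in ["cp", "pc"]:
    # Prefix caching runs on top of chunked prefill in these tests.
    env_dict.update({"VLLM_SPYRE_USE_CHUNKED_PREFILL": "1"})
    server_args.extend(["--max_num_batched_tokens", str(128)])
if mode == "pc":
    server_args.extend(["--enable-prefix-caching"])

print(env_dict, server_args)
```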
3 changes: 3 additions & 0 deletions tests/e2e/test_logits_processors.py
@@ -3,6 +3,7 @@
import pytest
import torch
from llm_cache import patch_environment
from llm_cache_util import force_engine_shutdown
from spyre_util import ModelInfo
from vllm import LLM, SamplingParams
from vllm.config import VllmConfig
@@ -57,6 +58,7 @@ def apply(self, logits: torch.Tensor) -> torch.Tensor:
params = SamplingParams(max_tokens=5, temperature=0, logprobs=0)

spyre_model.generate(prompt, params)
force_engine_shutdown(spyre_model)

assert has_invoked_logits_processor

@@ -154,6 +156,7 @@ def apply(self, logits: torch.Tensor) -> torch.Tensor:
spy_outputs = {}
params = [params0, params1, params2]
outputs = spyre_model.generate(prompt, params)
force_engine_shutdown(spyre_model)

assert spy_outputs[5] == outputs[0].outputs[0].token_ids
assert spy_outputs[10] == outputs[1].outputs[0].token_ids
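The only functional change in this file is the added `force_engine_shutdown(spyre_model)` call after generation. A rough sketch of the pattern follows, assuming a standalone `vllm.LLM` instance with a placeholder model path; in the tests, `spyre_model` instead comes from the cached-engine fixtures.

```python
from llm_cache_util import force_engine_shutdown  # test helper in this repo
from vllm import LLM, SamplingParams

llm = LLM(model="/path/to/model")  # placeholder; the tests use the cached Spyre model
params = SamplingParams(max_tokens=5, temperature=0, logprobs=0)
outputs = llm.generate("Hello", params)
force_engine_shutdown(llm)  # tear the engine down once the outputs are collected
```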
7 changes: 7 additions & 0 deletions tests/e2e/test_sampling_params.py
@@ -7,6 +7,10 @@

pytestmark = [pytest.mark.full_model, pytest.mark.other_e2e]

sb_mark = pytest.param("sb", marks=pytest.mark.sb, id="sb")
cb_mark = pytest.param("cb", marks=pytest.mark.cb, id="cb")
cp_mark = pytest.param("cp", marks=pytest.mark.chunked_prefill, id="cp")


def test_spyre_batch1_temperature(model: ModelInfo, backend, monkeypatch,
use_llm_cache, warmup_shapes):
@@ -212,6 +216,7 @@ def test_spyre_batch1_top_k(model: ModelInfo, backend, monkeypatch,
assert token_div1 < token_div2


@pytest.mark.parametrize("mode", [sb_mark, cb_mark, cp_mark])
def test_spyre_batch1_logit_bias(model: ModelInfo, backend, monkeypatch,
use_llm_cache, warmup_shapes, max_model_len,
max_num_seqs, mode: str):
@@ -253,6 +258,7 @@ def test_spyre_batch1_logit_bias(model: ModelInfo, backend, monkeypatch,
assert output[0].outputs[0].text != output[1].outputs[0].text


@pytest.mark.parametrize("mode", [sb_mark, cb_mark, cp_mark])
def test_spyre_batch1_min_tokens(model: ModelInfo, backend, monkeypatch,
use_llm_cache, max_model_len, max_num_seqs,
warmup_shapes, mode: str):
@@ -322,6 +328,7 @@ def test_spyre_batch1_ignore_eos(model: ModelInfo, backend, monkeypatch,
assert output2.outputs[0].finish_reason != 'length'


@pytest.mark.parametrize("mode", [sb_mark, cb_mark, cp_mark])
def test_spyre_batch1_min_p(model: ModelInfo, backend, monkeypatch,
use_llm_cache, max_model_len, max_num_seqs,
warmup_shapes, mode: str):
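The `sb_mark`/`cb_mark`/`cp_mark` parameters added at the top of this file make each decorated test run once per execution mode while keeping the cases selectable by marker (for example, `pytest -m chunked_prefill` runs only the cp cases). A small self-contained sketch of the mechanism, with a placeholder test body:

```python
import pytest

sb_mark = pytest.param("sb", marks=pytest.mark.sb, id="sb")
cb_mark = pytest.param("cb", marks=pytest.mark.cb, id="cb")
cp_mark = pytest.param("cp", marks=pytest.mark.chunked_prefill, id="cp")


@pytest.mark.parametrize("mode", [sb_mark, cb_mark, cp_mark])
def test_mode_smoke(mode: str):
    # Placeholder body; the real tests use `mode` to pick static batching,
    # continuous batching, or chunked prefill in the engine/server fixtures.
    assert mode in ("sb", "cb", "cp")
```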
2 changes: 1 addition & 1 deletion tests/e2e/test_spyre_basic.py
@@ -175,7 +175,7 @@ def test_max_model_len_override(model: ModelInfo, backend, warmup_shapes,
"use_cb": True,
"warmup_shapes": None,
"use_chunked_prefill": mode == "cp",
} if mode in ["cb", "cp"] else {
} if mode in ["cb", "cp", "pc"] else {
"use_cb": False,
"warmup_shapes": warmup_shapes,
})
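For reference, a self-contained sketch (not part of the diff) of the engine kwargs that the conditional expression above produces for each mode, with `warmup_shapes` standing in for the fixture value of the same name; the new "pc" mode follows the cb/cp branch.

```python
# Sketch mirroring the conditional in test_max_model_len_override above.
def engine_kwargs(mode: str, warmup_shapes):
    return {
        "use_cb": True,
        "warmup_shapes": None,
        "use_chunked_prefill": mode == "cp",
    } if mode in ["cb", "cp", "pc"] else {
        "use_cb": False,
        "warmup_shapes": warmup_shapes,
    }


assert engine_kwargs("pc", None)["use_cb"] is True
assert engine_kwargs("sb", [(64, 20, 4)]) == {
    "use_cb": False,
    "warmup_shapes": [(64, 20, 4)],
}
```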
5 changes: 5 additions & 0 deletions tests/e2e/test_spyre_max_new_tokens.py
@@ -8,8 +8,13 @@
from spyre_util import DecodeWarmupShapes, ModelInfo, get_chicken_soup_prompts
from vllm import SamplingParams

sb_mark = pytest.param("sb", marks=pytest.mark.sb, id="sb")
cb_mark = pytest.param("cb", marks=pytest.mark.cb, id="cb")
cp_mark = pytest.param("cp", marks=pytest.mark.chunked_prefill, id="cp")


@pytest.mark.parametrize("stop_last", [True, False])
@pytest.mark.parametrize("mode", [sb_mark, cb_mark, cp_mark])
def test_output(model: ModelInfo, stop_last: bool, max_model_len: int,
max_num_seqs: int, warmup_shapes: DecodeWarmupShapes,
backend: str, mode: str, monkeypatch: pytest.MonkeyPatch,