vllm-project · joerunde · Oct 8, 2025 · Oct 8, 2025
@@ -1,7 +1,7 @@
 # Other Tests
 
 !!! note
-    Unless otherwise specified, all the continuous batching tests are running with `max_model_len=256`
+    Unless otherwise specified, all continuous batching tests run with `max_model_len=512`.
 
 ::: tests.e2e.test_spyre_cb
     options:

@@ -1,7 +1,7 @@
 # Output Tests
 
 !!! note
-    Unless otherwise specified, all the continuous batching tests are running with `max_model_len=256`
+    Unless otherwise specified, all continuous batching tests run with `max_model_len=512`.
 
 ::: tests.e2e.test_spyre_basic
     options:

@@ -1,6 +1,6 @@
 # Scheduler Steps Tests
 
 !!! note
-    Unless otherwise specified, all the continuous batching tests are running with `max_model_len=256`
+    Unless otherwise specified, all continuous batching tests run with `max_model_len=512`.
 
 ::: tests.e2e.test_spyre_cb_scheduler_steps
@@ -35,7 +35,7 @@ def mock_get_mask_dtype(mocker: MockerFixture):
 @pytest.mark.spyre
 @pytest.mark.cb
 def test_compare_graphs_cb(model: ModelInfo, max_num_seqs: int,
-                           monkeypatch: pytest.MonkeyPatch,
+                           max_model_len: int, monkeypatch: pytest.MonkeyPatch,
                            mocker: MockerFixture):
     """Test that the spyre worker correctly outputs
     continuous batches of requests by comparing to HF"""
@@ -45,7 +45,6 @@ def test_compare_graphs_cb(model: ModelInfo, max_num_seqs: int,
     if script_dir is None:
         pytest.skip("aiu-fms-testing-utils is required "
                     "and is not installed to run this test")
-    max_model_len = 256
 
     model_path = get_model_path(model)
 

@@ -48,6 +48,7 @@ async def generate(
     "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
 @pytest.mark.asyncio
 async def test_abort(model: ModelInfo, backend: str, cb: int,
+                     max_model_len: int, max_num_seqs: int,
                      warmup_shapes: DecodeWarmupShapes,
                      output_kind: RequestOutputKind,
                      monkeypatch: pytest.MonkeyPatch):
@@ -72,8 +73,8 @@ async def test_abort(model: ModelInfo, backend: str, cb: int,
         engine = AsyncLLM.from_engine_args(
             AsyncEngineArgs(model=model.name,
                             tokenizer=model.name,
-                            max_model_len=256,
-                            max_num_seqs=4,
+                            max_model_len=max_model_len,
+                            max_num_seqs=max_num_seqs,
                             revision=model.revision))
         has_unfinished_requests = \
             engine.output_processor.has_unfinished_requests

@@ -34,7 +34,7 @@ def test_max_prompt_len_and_new_tokens(model: ModelInfo,
 
     llm = get_cached_llm(
         model=model,
-        max_model_len=256,
+        max_model_len=256,  # unused
         tensor_parallel_size=1,
         backend=backend,
         monkeypatch=monkeypatch,