Merged

Commits (24)
df7ae76  ⬆️ make vllm >=0.10.1.1,<=0.10.2  (prashantgupta24, Sep 15, 2025)
1584ba4  ⬆️ lockfile update to vllm >=0.10.1.1,<=0.10.2  (prashantgupta24, Sep 15, 2025)
d043a22  ⬆️ bump aftu to 0.2.3  (prashantgupta24, Sep 15, 2025)
2665cda  🚧 changes needed for 0.10.2  (prashantgupta24, Sep 16, 2025)
1772188  🎨 change the error msg  (prashantgupta24, Sep 16, 2025)
b00b4e5  🚧 test 0.10.2 instead of main  (prashantgupta24, Sep 16, 2025)
db5aea7  🐛 fix pooler stuff  (prashantgupta24, Sep 17, 2025)
703d59d  ⏪ revert change for main  (prashantgupta24, Sep 17, 2025)
0de9b1c  ⬆️ bump lowest to 0.10.1.1  (prashantgupta24, Sep 17, 2025)
6c03694  ⬆️ bump default to 0.10.2  (prashantgupta24, Sep 17, 2025)
5b2f32c  ♻️ make platform.py check simple  (prashantgupta24, Sep 17, 2025)
1c22ec7  🐛 set vllm_config for ClassifierPooler  (prashantgupta24, Sep 17, 2025)
125a9b3  Merge remote-tracking branch 'upstream/main' into upstream-versions  (prashantgupta24, Sep 22, 2025)
d63fb01  🚧 add backward compatibility code  (prashantgupta24, Sep 22, 2025)
d9965a9  ✅ add upstream compat tests  (prashantgupta24, Sep 22, 2025)
e5ab49b  🐛 fix request params  (prashantgupta24, Sep 22, 2025)
7c2199e  🐛 revert pytest-mock import  (prashantgupta24, Sep 22, 2025)
480a177  🚧 not needed?  (prashantgupta24, Sep 22, 2025)
493d727  ⏪ yep need those for pooler models  (prashantgupta24, Sep 22, 2025)
f93eac5  🎨 fix comment  (prashantgupta24, Sep 22, 2025)
25ba89a  🎨 remove extra assert  (prashantgupta24, Sep 22, 2025)
4f1a6a2  🎨 typo  (prashantgupta24, Sep 23, 2025)
1012c08  Merge branch 'main' into upstream-versions  (maxdebayser, Sep 24, 2025)
67cc853  fix pooler adapter  (maxdebayser, Sep 24, 2025)
pyproject.toml: 4 changes (2 additions & 2 deletions)

@@ -13,7 +13,7 @@ license = {text = "Apache 2"}
 dependencies = [
     "fms-model-optimizer[fp8]>=0.6.0",
     "ibm-fms>=1.2.1",
-    "vllm==0.10.1.1",
+    "vllm>=0.10.1.1,<=0.10.2",
     "pytest-mock>=3.15.0",
 ]
 requires-python = ">=3.11"
@@ -164,7 +164,7 @@ dev = [
     "pytest-timeout==2.3.1",
     "requests==2.32.3",
     "sentence-transformers==3.4.1",
-    "aiu-fms-testing-utils>=0.2.1",
+    "aiu-fms-testing-utils>=0.2.3",
 ]
 lint = [
     "clang-format==18.1.5",
tests/e2e/test_spyre_cb.py: 3 changes (1 addition & 2 deletions)

@@ -64,8 +64,7 @@ def test_api_cb_rejects_oversized_request(
     overflow_prompt = " ".join(["hi"] * max_model_len)
     max_tokens = 10

-    with pytest.raises(BadRequestError,
-                       match="This model's maximum context length is"):
+    with pytest.raises(BadRequestError, match="maximum context length is"):

Review comment from prashantgupta24 (Collaborator, Author):
This is a bug in vllm upstream - opened a PR vllm-project/vllm#24995

         client.completions.create(
             model=model.name,
             prompt=overflow_prompt,
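
Side note on the relaxed match: pytest.raises(match=...) applies the pattern with re.search against the string form of the raised exception, so anchoring on the shorter substring "maximum context length is" tolerates upstream rewording of the message prefix. A minimal, self-contained sketch of that behavior (the error string below is made up for illustration, it is not vLLM's):

    import pytest

    def test_match_is_a_regex_search():
        # Passes for either phrasing, old or new, because re.search only
        # needs the pattern to occur somewhere in the exception message.
        with pytest.raises(ValueError, match="maximum context length is"):
            raise ValueError(
                "This model's maximum context length is 2048 tokens")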
tests/spyre_util.py: 39 changes (26 additions & 13 deletions)

@@ -1,3 +1,4 @@
+import inspect
 import math
 import os
 import random
@@ -353,19 +354,31 @@ def create_random_request(
     assert (len(prompt_token_ids) == num_tokens
             ), f"need {num_tokens} but got {len(prompt_token_ids)}"

-    return Request(
-        request_id=str(request_id),
-        prompt_token_ids=prompt_token_ids,
-        multi_modal_hashes=None,
-        multi_modal_placeholders=None,
-        sampling_params=sampling_params,
-        eos_token_id=None,
-        arrival_time=0,
-        lora_request=None,
-        pooling_params=None,
-        cache_salt=None,
-        multi_modal_kwargs=None,
-    )
+    # temporary backward compat code for 0.10.1.1
+    annotations = inspect.getfullargspec(Request).annotations
+    extra_args = {}  # noqa
+    if ('multi_modal_hashes' in annotations):
+        extra_args.update({
+            'multi_modal_hashes': None,
+        })
+    if ('multi_modal_placeholders' in annotations):
+        extra_args.update({
+            'multi_modal_placeholders': None,
+        })
+    if ('multi_modal_kwargs' in annotations):
+        extra_args.update({
+            'multi_modal_kwargs': None,
+        })
+
+    return Request(request_id=str(request_id),
+                   prompt_token_ids=prompt_token_ids,
+                   sampling_params=sampling_params,
+                   eos_token_id=None,
+                   arrival_time=0,
+                   lora_request=None,
+                   pooling_params=None,
+                   cache_salt=None,
+                   **extra_args)


 def skip_unsupported_tp_size(size: int, backend: str):
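
The shim above feature-detects the installed Request signature instead of comparing version strings, so the same call site works on both ends of the supported vLLM range. A minimal, generalized sketch of the pattern; make_request and its parameters are hypothetical names, not vLLM APIs:

    import inspect

    def make_request(request_id: str, new_field=None):  # hypothetical callee
        return {"request_id": request_id, "new_field": new_field}

    def call_with_supported_kwargs(fn, **kwargs):
        # Forward only the keyword arguments the installed version accepts,
        # so callers survive an upstream parameter being added or removed.
        accepted = inspect.getfullargspec(fn).args
        return fn(**{k: v for k, v in kwargs.items() if k in accepted})

    # 'legacy_field' is dropped because make_request no longer accepts it.
    call_with_supported_kwargs(make_request, request_id="0", legacy_field=None)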
tests/utils/test_upstream_compatibility.py: 60 changes (60 additions & 0 deletions)

@@ -21,3 +21,63 @@ def test_init_distributed_environment():
     assert 'timeout' \
         not in annotations, ("we should remove compat code which is now"
                              " part of released vllm version")
+
+
+def test_request():
+
+    from vllm.v1.request import Request
+
+    annotations = inspect.getfullargspec(Request).annotations
+
+    if VLLM_VERSION == "vLLM:main":
+        assert 'multi_modal_kwargs' not in annotations
+        assert 'multi_modal_hashes' not in annotations
+        assert 'multi_modal_placeholders' not in annotations
+    elif VLLM_VERSION == "vLLM:lowest":
+        assert 'multi_modal_hashes' in annotations
+        assert 'multi_modal_placeholders' in annotations
+        assert 'multi_modal_kwargs' in annotations
+    # The compat code introduced in the PR below can now be removed:
+    # https://github.com/vllm-project/vllm-spyre/pull/463
+
+
+def test_model_runner_output():
+
+    from vllm.v1.outputs import ModelRunnerOutput
+
+    annotations = inspect.getfullargspec(ModelRunnerOutput).annotations
+
+    if VLLM_VERSION == "vLLM:main":
+        assert 'spec_token_ids' not in annotations
+    elif VLLM_VERSION == "vLLM:lowest":
+        assert 'spec_token_ids' in annotations
+    # The compat code introduced in the PR below can now be removed:
+    # https://github.com/vllm-project/vllm-spyre/pull/463
+
+
+def test_pooling_metadata():
+
+    from vllm.v1.pool.metadata import PoolingMetadata
+
+    has_build_pooling_cursor = getattr(PoolingMetadata, "build_pooling_cursor",
+                                       False)
+
+    if VLLM_VERSION == "vLLM:main":
+        assert has_build_pooling_cursor
+    elif VLLM_VERSION == "vLLM:lowest":
+        assert not has_build_pooling_cursor
+    # The compat code introduced in the PR below can now be removed:
+    # https://github.com/vllm-project/vllm-spyre/pull/463
+
+
+def test_scheduler_output():
+
+    from vllm.v1.core.sched.output import SchedulerOutput
+    annotations = inspect.getfullargspec(SchedulerOutput).annotations
+
+    if VLLM_VERSION == "vLLM:main":
+        assert 'free_encoder_mm_hashes' in annotations
+    elif VLLM_VERSION == "vLLM:lowest":
+        assert 'free_encoder_mm_hashes' not in annotations
+    # The compat code introduced in the PR below can now be removed:
+    # https://github.com/vllm-project/vllm-spyre/pull/463
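
These tests pin the upstream API shape on both ends of the supported range, so stale compat code fails loudly once the lowest supported vLLM catches up with main. A minimal, self-contained sketch of the probe they rely on, using stand-in dataclasses rather than vLLM's real classes: inspect.getfullargspec on a class reads the generated constructor's annotations, so adding or removing a field upstream flips the check.

    import inspect
    from dataclasses import dataclass

    @dataclass
    class OldOutput:  # stand-in for the 0.10.1.1-era signature
        req_ids: list
        spec_token_ids: list

    @dataclass
    class NewOutput:  # stand-in for the signature on vLLM main
        req_ids: list

    def has_ctor_field(cls, name: str) -> bool:
        # Same probe the tests above apply to Request and ModelRunnerOutput.
        return name in inspect.getfullargspec(cls).annotations

    assert has_ctor_field(OldOutput, "spec_token_ids")
    assert not has_ctor_field(NewOutput, "spec_token_ids")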
Loading