[Tests] Set up pytest markers (#108)

rafvasq · web-flow · commit 95e3f2a51f05 · 2025-04-25T16:31:57.000-04:00
* Add markers and refactor to use pytest params

* Fix utils marker and add worker

* Fix util tests and add backend one

* Skip eager for multi-card tests

* Dynamically mark all tests in tests/e2e 

* Add decoder marker to gptq

* Split GPTQ test for readability

---------

Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com>
diff --git a/.github/workflows/test-spyre.yml b/.github/workflows/test-spyre.yml
@@ -44,13 +44,16 @@ jobs:
             repo: "git+https://github.com/vllm-project/vllm --branch main"
         test_suite:
           - name: "V0-e2e"
-            tests: "V0 and eager"
+            markers: "v0 and cpu and e2e"
             flags: "--timeout=300"
           - name: "V1-e2e"
-            tests: "V1- and eager"
+            markers: "v1 and cpu and e2e"
             flags: "--timeout=300 --forked"
           - name: "V1-worker"
-            tests: "test_sampling_metadata_in_input_batch"
+            markers: "v1 and not e2e"
+            flags: "--timeout=300"
+          - name: "utils"
+            markers: "utils"
             flags: "--timeout=300"
 
     name: "${{ matrix.test_suite.name }} (${{ matrix.vllm_version.name }})"
@@ -145,4 +148,4 @@ jobs:
           # re-install the vllm_sypre package from source
           source .venv/bin/activate
           python3 -m pytest ${{ matrix.test_suite.flags }} \
-            tests -v -k "${{ matrix.test_suite.tests }}"
+            tests -v -m "${{ matrix.test_suite.markers }}"
diff --git a/pyproject.toml b/pyproject.toml
@@ -113,13 +113,17 @@ skip_gitignore = true
 pythonpath = ["."]
 markers = [
     "skip_global_cleanup",
-    "core_model: enable this model test in each PR instead of only nightly",
-    "cpu_model: enable this model test in CPU tests",
-    "quant_model: run this model test under Quantized category",
-    "split: run this test as part of a split",
-    "distributed: run this test only in distributed GPU tests",
-    "skip_v1: do not run this test with v1",
-    "optional: optional tests that are automatically skipped, include --optional to run them",
+    "e2e: Tests using end-to-end engine spin-up",
+    "v0: Tests using vLLM v0 engine",
+    "v1: Tests using vLLM v1 engine",
+    "cpu: Tests using CPU (i.e. eager) backend",
+    "spyre: Tests using Spyre hardware backend",
+    "decoder: Tests for decoder models",
+    "embedding: Tests for embedding models",
+    "quantized: Tests for quantized models",
+    "multi: Tests that require >1 cards",
+    "utils: Tests for utility functions",
+    "worker: Tests for worker logic",
 ]
 
 [tool.pymarkdown]
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -5,6 +5,13 @@
 from vllm.distributed import cleanup_dist_env_and_memory
 
 
+def pytest_collection_modifyitems(config, items):
+    """ Mark all tests in e2e directory"""
+    for item in items:
+        if "tests/e2e" in str(item.nodeid):
+            item.add_marker(pytest.mark.e2e)
+
+
 @pytest.fixture(params=[True, False])
 def run_with_both_engines(request, monkeypatch):
     # Automatically runs tests twice, once with V1 and once without
diff --git a/tests/e2e/test_spyre_basic.py b/tests/e2e/test_spyre_basic.py
@@ -4,7 +4,7 @@
 """
 
 import pytest
-from spyre_util import (compare_results, generate_hf_output,
+from spyre_util import (VLLM_VERSIONS, compare_results, generate_hf_output,
                         generate_spyre_vllm_output, get_spyre_backend_list,
                         get_spyre_model_list)
 from vllm import SamplingParams
@@ -15,24 +15,6 @@
     "user.\n\n### Instruction:\n{}\n\n### Response:")
 
 
-# Basic test to make sure we return the model_list correctly
-def test_get_spyre_model_list(monkeypatch):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_SPYRE_TEST_MODEL_DIR", "models")
-        m.setenv("VLLM_SPYRE_TEST_MODEL_LIST", "llama-194m, " \
-                 "all-roberta-large-v1")
-        assert get_spyre_model_list()[0] == "models/llama-194m"
-        assert get_spyre_model_list()[1] == \
-        "models/all-roberta-large-v1"
-
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_SPYRE_TEST_MODEL_DIR", "")
-        m.setenv("VLLM_SPYRE_TEST_MODEL_LIST", "llama-194m, " \
-            "all-roberta-large-v1")
-        assert get_spyre_model_list()[0] == "llama-194m"
-        assert get_spyre_model_list()[1] == "all-roberta-large-v1"
-
-
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("prompts", [[
     template.format("Provide a list of instructions "
@@ -47,7 +29,7 @@ def test_get_spyre_model_list(monkeypatch):
     "warmup_shape", [(64, 20, 4), (64, 20, 8), (128, 20, 4),
                      (128, 20, 8)])  # (prompt_length/new_tokens/batch_size)
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("vllm_version", ["V0", "V1"])
+@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_output(
     model: str,
     prompts: list[str],
@@ -102,7 +84,7 @@ def test_output(
 
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("vllm_version", ["V0", "V1"])
+@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_batch_handling(
     model: str,
     backend: str,
diff --git a/tests/e2e/test_spyre_embeddings.py b/tests/e2e/test_spyre_embeddings.py
@@ -20,7 +20,10 @@
                          [(64, 4), (64, 8), (128, 4),
                           (128, 8)])  # (prompt_length/new_tokens/batch_size)
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("vllm_version", ["V0"])  # Todo: V1 support
+@pytest.mark.parametrize(
+    "vllm_version",
+    [pytest.param("V0", marks=pytest.mark.v0, id="v0")
+     ])  # TODO: Replace with VLLM_VERSIONS when v1 is supported.
 def test_output(
     model: str,
     prompts: list[str],
diff --git a/tests/e2e/test_spyre_max_new_tokens.py b/tests/e2e/test_spyre_max_new_tokens.py
@@ -4,7 +4,7 @@
 """
 
 import pytest
-from spyre_util import (compare_results, generate_hf_output,
+from spyre_util import (VLLM_VERSIONS, compare_results, generate_hf_output,
                         generate_spyre_vllm_output, get_spyre_backend_list,
                         get_spyre_model_list)
 from vllm import SamplingParams
@@ -27,7 +27,7 @@
 @pytest.mark.parametrize(
     "warmup_shape", [(64, 10, 4)])  # (prompt_length/new_tokens/batch_size)
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("vllm_version", ["V0", "V1"])
+@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_output(
     model: str,
     prompts: list[str],
diff --git a/tests/e2e/test_spyre_online.py b/tests/e2e/test_spyre_online.py
@@ -1,31 +1,18 @@
 import openai
 import pytest
 
-from tests.spyre_util import get_spyre_backend_list, get_spyre_model_list
+from tests.spyre_util import (VLLM_VERSIONS, get_spyre_backend_list,
+                              get_spyre_model_list)
 
 
-def get_test_combinations():
-    combinations = []
-
-    # Base model tests across all backends
-    for backend in get_spyre_backend_list():
-        for model in get_spyre_model_list():
-            combinations.append((model, backend, None))
-
-    # GPTQ model only tests on sendnn_decoder
-    for model in get_spyre_model_list(quantization="gptq"):
-        combinations.append((model, "sendnn_decoder", "gptq"))
-
-    return combinations
-
-
-@pytest.mark.parametrize("model,backend,quantization", get_test_combinations())
+@pytest.mark.parametrize("model", get_spyre_model_list())
+@pytest.mark.parametrize("backend", get_spyre_backend_list())
 @pytest.mark.parametrize("warmup_shape", [[
     (64, 20, 4),
 ]])
-@pytest.mark.parametrize("vllm_version", ["V0", "V1"])
+@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
-                        vllm_version, quantization):
+                        vllm_version):
     """Test online serving using the `vllm serve` CLI"""
 
     client = remote_openai_server.get_client()
@@ -68,3 +55,29 @@ def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
         assert len(completion.choices[0].text) == 0
     except openai.BadRequestError as e:
         assert "warmup" in str(e)
+
+
+@pytest.mark.parametrize("model", get_spyre_model_list(quantization="gptq"))
+@pytest.mark.parametrize("backend", ["sendnn_decoder"])
+@pytest.mark.parametrize("quantization", ["gptq"])
+@pytest.mark.parametrize("warmup_shape", [[(64, 20, 4)]])
+@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
+def test_openai_serving_gptq(remote_openai_server, model, backend,
+                             warmup_shape, vllm_version, quantization):
+    """Test online serving a GPTQ model with the sendnn_decoder backend only"""
+
+    client = remote_openai_server.get_client()
+    completion = client.completions.create(model=model,
+                                           prompt="Hello World!",
+                                           max_tokens=5,
+                                           temperature=0.0)
+    assert len(completion.choices) == 1
+    assert len(completion.choices[0].text) > 0
+
+    completion = client.completions.create(model=model,
+                                           prompt="Hello World!",
+                                           max_tokens=5,
+                                           temperature=1.0,
+                                           n=2)
+    assert len(completion.choices) == 2
+    assert len(completion.choices[0].text) > 0
diff --git a/tests/e2e/test_spyre_online_multi.py b/tests/e2e/test_spyre_online_multi.py
@@ -1,15 +1,18 @@
 import pytest
 
-from tests.spyre_util import get_spyre_backend_list, get_spyre_model_list
+from tests.spyre_util import (VLLM_VERSIONS, get_spyre_backend_list,
+                              get_spyre_model_list)
 
 
+@pytest.mark.multi
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("warmup_shape", [[
     (64, 20, 4),
 ]])
-@pytest.mark.parametrize("backend", get_spyre_backend_list())
+@pytest.mark.parametrize(
+    "backend", [b for b in get_spyre_backend_list() if "eager" not in str(b)])
 @pytest.mark.parametrize("tensor_parallel_size", ["2"])
-@pytest.mark.parametrize("vllm_version", ["V0", "V1"])
+@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_openai_tp_serving(remote_openai_server, model, warmup_shape, backend,
                            vllm_version, tensor_parallel_size):
     """Test online serving with tensor parallelism using the `vllm serve` CLI"""
diff --git a/tests/e2e/test_spyre_seed.py b/tests/e2e/test_spyre_seed.py
@@ -6,8 +6,8 @@
 import math
 
 import pytest
-from spyre_util import (generate_spyre_vllm_output, get_spyre_backend_list,
-                        get_spyre_model_list)
+from spyre_util import (VLLM_VERSIONS, generate_spyre_vllm_output,
+                        get_spyre_backend_list, get_spyre_model_list)
 from vllm import SamplingParams
 
 
@@ -22,7 +22,7 @@
     "warmup_shape", [(64, 20, 4), (64, 20, 8), (128, 20, 4),
                      (128, 20, 8)])  # (prompt_length/new_tokens/batch_size)
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("vllm_version", ["V0", "V1"])
+@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_seed(
     model: str,
     prompt: str,
diff --git a/tests/e2e/test_spyre_static_batching_limits.py b/tests/e2e/test_spyre_static_batching_limits.py
@@ -16,7 +16,8 @@
                      (128, 20, 4)]])  # (prompt_length/new_tokens/batch_size)
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
 @pytest.mark.parametrize("vllm_version",
-                         ["V1"])  # v0 doesn't support multiple shapes
+                         [pytest.param("V1", marks=pytest.mark.v1, id="v1")
+                          ])  # v0 doesn't support multiple shapes
 def test_max_prompt_len_and_new_tokens(model: str,
                                        warmup_shapes: list[tuple[int, int,
                                                                  int]],
diff --git a/tests/e2e/test_spyre_tensor_parallel.py b/tests/e2e/test_spyre_tensor_parallel.py
@@ -4,12 +4,13 @@
 """
 
 import pytest
-from spyre_util import (compare_results, generate_hf_output,
+from spyre_util import (VLLM_VERSIONS, compare_results, generate_hf_output,
                         generate_spyre_vllm_output, get_spyre_backend_list,
                         get_spyre_model_list)
 from vllm import SamplingParams
 
 
+@pytest.mark.multi
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("prompts", [[
     "Provide a list of instructions for preparing"
@@ -21,8 +22,9 @@
     [[(64, 20, 4)]])  #,[(64,20,8)],[(128,20,4)],[(128,20,8)]])
 # (prompt_length/new_tokens/batch_size)
 @pytest.mark.parametrize("tp_size", [2])
-@pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("vllm_version", ["V0", "V1"])
+@pytest.mark.parametrize(
+    "backend", [b for b in get_spyre_backend_list() if "eager" not in str(b)])
+@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_output(
     model: str,
     prompts: list[str],
diff --git a/tests/e2e/test_spyre_warmup_shapes.py b/tests/e2e/test_spyre_warmup_shapes.py
@@ -26,7 +26,8 @@
     "warmup_shapes", [[(64, 20, 8),
                        (128, 20, 4)]])  # (prompt_length/new_tokens/batch_size)
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("vllm_version", ["V1"])
+@pytest.mark.parametrize("vllm_version",
+                         [pytest.param("V1", marks=pytest.mark.v1, id="v1")])
 def test_output(
     model: str,
     prompts: list[str],
diff --git a/tests/spyre_util.py b/tests/spyre_util.py
@@ -8,6 +8,7 @@
 
 import numpy as np
 import openai
+import pytest
 import requests
 from sentence_transformers import SentenceTransformer, util
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -20,6 +21,11 @@
 ISCLOSE_REL_TOL_CPU = 0.1
 ISCLOSE_REL_TOL_SPYRE = 0.35
 
+VLLM_VERSIONS = [
+    pytest.param("V0", marks=pytest.mark.v0, id="v0"),
+    pytest.param("V1", marks=pytest.mark.v1, id="v1"),
+]
+
 
 class RemoteOpenAIServer:
     """Subprocess wrapper that boots a vllm server with `vllm serve` for testing
@@ -426,38 +432,48 @@ def get_spyre_model_dir_path() -> Path:
     return Path(model_dir_path)
 
 
-# get model backend from env, if not set then default to "eager"
-# For multiple values:
-# export SPYRE_TEST_BACKEND_LIST="eager, inductor, sendnn_decoder"
+# get model backends from env or default to all and add pytest markers
 def get_spyre_backend_list():
-    test_backend_list = []
     user_backend_list = os.environ.get("VLLM_SPYRE_TEST_BACKEND_LIST",
                                        "eager,inductor,sendnn_decoder,sendnn")
 
-    for sypre_backend in user_backend_list.split(","):
-        test_backend_list.append(sypre_backend.strip())
-    return test_backend_list
+    backends = []
+    for backend in user_backend_list.split(","):
+        backend = backend.strip()
+        marks = []
+        if backend == "eager":
+            marks = [pytest.mark.cpu]
+        elif backend == "sendnn_decoder":
+            marks = [pytest.mark.spyre]
+
+        backends.append(pytest.param(backend, marks=marks, id=backend))
+    return backends
 
 
 # get model names from env, if not set then default to "llama-194m"
 # For multiple values:
 # export SPYRE_TEST_MODEL_LIST="llama-194m,all-roberta-large-v1"
 def get_spyre_model_list(isEmbeddings=False, quantization=None):
     spyre_model_dir_path = get_spyre_model_dir_path()
-    test_model_list = []
 
     if isEmbeddings:
         user_test_model_list = os.environ.get("VLLM_SPYRE_TEST_MODEL_LIST",
                                               "all-roberta-large-v1")
+        marks = [pytest.mark.embedding]
     elif quantization == "gptq":
         user_test_model_list = os.environ.get("VLLM_SPYRE_TEST_MODEL_LIST",
                                               "granite-3.0-8b-instruct-gptq")
+        marks = [pytest.mark.decoder, pytest.mark.quantized, pytest.mark.spyre]
     else:
         user_test_model_list = os.environ.get("VLLM_SPYRE_TEST_MODEL_LIST",
                                               "llama-194m")
+        marks = [pytest.mark.decoder]
 
+    test_model_list = []
     for model in user_test_model_list.split(","):
-        test_model_list.append(str(spyre_model_dir_path / model.strip()))
+        model_path = str(spyre_model_dir_path / model.strip())
+        test_model_list.append(
+            pytest.param(model_path, marks=marks, id=model.strip()))
     return test_model_list
 
 
diff --git a/tests/utils/test_spyre_backend_list.py b/tests/utils/test_spyre_backend_list.py
diff --git a/tests/utils/test_spyre_model_list.py b/tests/utils/test_spyre_model_list.py
diff --git a/tests/v1/worker/test_spyre_input_batch.py b/tests/v1/worker/test_spyre_input_batch.py