Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,23 +42,23 @@ jobs:
repo: "git+https://github.com/vllm-project/vllm --branch main"
test_suite:
- name: "static batching"
markers: "cpu and decoder and not cb and not other_e2e"
markers: "cpu and decoder and not cb and not other_e2e and not quantized"
flags: "--timeout=300"
hf_models: "JackFram/llama-160m"
- name: "fp8"
markers: "cpu and quantized and multi"
flags: "--timeout=600 -k 'basic and test_output' --durations=0"
hf_models: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
- name: "embedding"
markers: "cpu and embedding"
markers: "cpu and embedding and not quantized"
flags: "--timeout=300"
hf_models: "sentence-transformers/all-roberta-large-v1"
- name: "scoring"
markers: "cpu and scoring"
flags: "--timeout=300"
hf_models: "cross-encoder/stsb-roberta-large"
- name: "continuous batching"
markers: "cpu and cb"
markers: "cpu and cb and not quantized"
flags: "--timeout=300 --durations=0 -s"
- name: "worker and utils"
markers: "not e2e"
Expand All @@ -67,7 +67,7 @@ jobs:
markers: "compat"
flags: "--timeout=300"
- name: "other e2e"
markers: "cpu and other_e2e"
markers: "cpu and other_e2e and not quantized"
flags: "--timeout=300"
- name: "precompilation"
markers: "precompilation"
Expand Down
36 changes: 0 additions & 36 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,6 @@ def pytest_collection_modifyitems(config, items):
""" Modify tests at collection time """
_mark_all_e2e(items)

_skip_quantized_by_default(config, items)

_xfail_fp8_on_spyre(items)

_skip_unsupported_compiler_tests(config, items)

sort_tests_for_llm_caching(items)
Expand All @@ -154,38 +150,6 @@ def _mark_all_e2e(items):
item.add_marker(pytest.mark.e2e)


def _skip_quantized_by_default(config, items):
"""Skip tests marked with `quantized` unless the `-m` flag includes it
Ref: https://stackoverflow.com/questions/56374588/how-can-i-ensure-tests-with-a-marker-are-only-run-if-explicitly-asked-in-pytest

This will skip the quantized tests at runtime, but they will still show up
as collected when running pytest --collect-only.
"""
markexpr = config.option.markexpr
if "quantized" in markexpr:
return # let pytest handle the collection logic

skip_mymarker = pytest.mark.skip(reason='quantized not selected')
for item in items:
if "quantized" in item.keywords:
item.add_marker(skip_mymarker)


def _xfail_fp8_on_spyre(items):
"""Set an xfail marker on all tests that run quantized models on Spyre
hardware.

TODO: Relax this to only "spyre and cb" once static batching is supported
on spyre.
"""

xfail_marker = pytest.mark.xfail(
reason="fp8 is not yet supported on Spyre")
for item in items:
if "quantized" in item.keywords and "spyre" in item.keywords:
item.add_marker(xfail_marker)


def _skip_unsupported_compiler_tests(config, items):
"""Skip all tests that need compiler changes to run.
This can be relaxed once the compiler changes are in place
Expand Down
2 changes: 2 additions & 0 deletions tests/e2e/test_sampling_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def test_spyre_batch1_max_tokens(model: ModelInfo, backend, monkeypatch,
assert len(output2.outputs[0].token_ids) > 15


@pytest.mark.xfail(reason="Failing currently because of output mismatch")
def test_spyre_batch1_stop_sequence(model: ModelInfo, backend, monkeypatch,
use_llm_cache, warmup_shapes):
spyre_model = get_cached_llm(
Expand Down Expand Up @@ -328,6 +329,7 @@ def test_spyre_batch1_min_p(model: ModelInfo, backend, monkeypatch,
assert token_div1 < token_div2


@pytest.mark.xfail(reason="Failing currently because of output mismatch")
def test_spyre_batch1_bad_words(model: ModelInfo, backend, monkeypatch,
use_llm_cache, warmup_shapes):
spyre_model = get_cached_llm(
Expand Down
1 change: 1 addition & 0 deletions tests/e2e/test_spyre_seed.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from vllm import SamplingParams


@pytest.mark.xfail(reason="Failing currently because of output mismatch")
@pytest.mark.parametrize("temperature", [0.1, 1.0])
@pytest.mark.parametrize("seed", [42])
def test_seed(model: ModelInfo, temperature: float, seed: int,
Expand Down
1 change: 1 addition & 0 deletions tests/e2e/test_spyre_warmup_shapes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from vllm import SamplingParams


@pytest.mark.xfail(reason="Failing currently because of output mismatch")
@pytest.mark.parametrize(
"warmup_shapes", [[(64, 20, 4),
(128, 20, 2)]]) # (prompt_length/new_tokens/batch_size)
Expand Down
Loading