|
1 | 1 | import openai |
2 | 2 | import pytest |
3 | 3 |
|
4 | | -from tests.spyre_util import get_spyre_backend_list, get_spyre_model_list |
| 4 | +from tests.spyre_util import (VLLM_VERSIONS, get_spyre_backend_list, |
| 5 | + get_spyre_model_list) |
5 | 6 |
|
6 | 7 |
|
7 | | -def get_test_combinations(): |
8 | | - combinations = [] |
9 | | - |
10 | | - # Base model tests across all backends |
11 | | - for backend in get_spyre_backend_list(): |
12 | | - for model in get_spyre_model_list(): |
13 | | - combinations.append((model, backend, None)) |
14 | | - |
15 | | - # GPTQ model only tests on sendnn_decoder |
16 | | - for model in get_spyre_model_list(quantization="gptq"): |
17 | | - combinations.append((model, "sendnn_decoder", "gptq")) |
18 | | - |
19 | | - return combinations |
20 | | - |
21 | | - |
22 | | -@pytest.mark.parametrize("model,backend,quantization", get_test_combinations()) |
| 8 | +@pytest.mark.parametrize("model", get_spyre_model_list()) |
| 9 | +@pytest.mark.parametrize("backend", get_spyre_backend_list()) |
23 | 10 | @pytest.mark.parametrize("warmup_shape", [[ |
24 | 11 | (64, 20, 4), |
25 | 12 | ]]) |
26 | | -@pytest.mark.parametrize("vllm_version", ["V0", "V1"]) |
| 13 | +@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS) |
27 | 14 | def test_openai_serving(remote_openai_server, model, warmup_shape, backend, |
28 | | - vllm_version, quantization): |
| 15 | + vllm_version): |
29 | 16 | """Test online serving using the `vllm serve` CLI""" |
30 | 17 |
|
31 | 18 | client = remote_openai_server.get_client() |
@@ -68,3 +55,29 @@ def test_openai_serving(remote_openai_server, model, warmup_shape, backend, |
68 | 55 | assert len(completion.choices[0].text) == 0 |
69 | 56 | except openai.BadRequestError as e: |
70 | 57 | assert "warmup" in str(e) |
| 58 | + |
| 59 | + |
@pytest.mark.parametrize("model", get_spyre_model_list(quantization="gptq"))
@pytest.mark.parametrize("backend", ["sendnn_decoder"])
@pytest.mark.parametrize("quantization", ["gptq"])
@pytest.mark.parametrize("warmup_shape", [[(64, 20, 4)]])
@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
def test_openai_serving_gptq(remote_openai_server, model, backend,
                             warmup_shape, vllm_version, quantization):
    """Check completions served for a GPTQ model on `sendnn_decoder`.

    Two requests are sent with the same prompt and token budget:
    a greedy one (temperature 0.0) that must yield exactly one choice, and
    a sampled one (temperature 1.0, n=2) that must yield exactly two. In
    both cases the first choice's text must be non-empty.

    `backend`, `warmup_shape`, `vllm_version` and `quantization` are not
    referenced in the body; presumably they are consumed by the
    `remote_openai_server` fixture -- TODO confirm against the fixture.
    """
    # Arguments shared by both completion requests.
    request = {"model": model, "prompt": "Hello World!", "max_tokens": 5}

    api = remote_openai_server.get_client()

    # Greedy request: a single choice is expected.
    greedy = api.completions.create(**request, temperature=0.0)
    assert len(greedy.choices) == 1
    assert len(greedy.choices[0].text) > 0

    # Sampled request asking for two completions.
    sampled = api.completions.create(**request, temperature=1.0, n=2)
    assert len(sampled.choices) == 2
    assert len(sampled.choices[0].text) > 0
0 commit comments