|
1 | | -from typing import Type |
| 1 | +import pathlib |
| 2 | +from typing import List, Optional, Type |
2 | 3 |
|
3 | 4 | import pytest |
4 | 5 |
|
5 | | -from ..conftest import HfRunner, VllmRunner |
| 6 | +from vllm.multimodal.utils import rescale_image_size |
| 7 | + |
| 8 | +from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets |
6 | 9 | from .utils import check_logprobs_close |
7 | 10 |
|
8 | | -models = ["qwen/qwen-vl"] |
| 11 | +pytestmark = pytest.mark.vlm |
9 | 12 |
|
| 13 | +text_only_models = [ |
| 14 | + "Qwen/Qwen-7B-Chat" # Has no visual component |
| 15 | +] |
10 | 16 |
|
11 | | -@pytest.mark.parametrize("dtype", ["half"]) |
12 | | -@pytest.mark.parametrize("max_tokens", [32]) |
13 | | -@pytest.mark.parametrize("num_logprobs", [5]) |
14 | | -@pytest.mark.parametrize("model", models) |
15 | | -def test_text_only_qwen_model( |
| 17 | +multimodal_models = ["Qwen/Qwen-VL"] |
| 18 | + |
| 19 | +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ |
| 20 | + "stop_sign": |
| 21 | + "Picture 1: <img></img>\nWhat's the content of the image?: ", |
| 22 | + "cherry_blossom": |
| 23 | + "Picture 1: <img></img>\nWhat is the season?: ", |
| 24 | +}) |
| 25 | + |
| 26 | + |
| 27 | +### Tests for multimodal Qwen models |
| 28 | +def run_test( |
| 29 | + tmp_path: pathlib.PosixPath, |
16 | 30 | hf_runner: Type[HfRunner], |
17 | 31 | vllm_runner: Type[VllmRunner], |
18 | | - example_prompts, |
| 32 | + image_assets: _ImageAssets, |
19 | 33 | model: str, |
20 | 34 | *, |
| 35 | + size_factors: List[float], |
21 | 36 | dtype: str, |
22 | 37 | max_tokens: int, |
23 | 38 | num_logprobs: int, |
| 39 | + tensor_parallel_size: int, |
| 40 | + distributed_executor_backend: Optional[str] = None, |
24 | 41 | ): |
25 | | - # This test checks language inputs only, since the visual component |
26 | | - # for qwen-vl is still unsupported in VLLM. In the near-future, the |
27 | | - # implementation and this test will be extended to consider |
28 | | - # visual inputs as well. |
|  42 | +    """Inference results should be the same between HF and vLLM.
| 43 | +
|
|  44 | +    All the image fixtures for the test are under tests/images.
|  45 | +    For the HuggingFace runner, we provide PIL images as input.
|  46 | +    For the vLLM runner, we provide MultiModalDataDict objects
|  47 | +    and the corresponding MultiModalConfig as input.
|  48 | +    Note that the text input is also adjusted to abide by the vLLM contract.
|  49 | +    The text output is sanitized so that it can be compared with HF.
| 50 | + """ |
| 51 | + images = [asset.pil_image for asset in image_assets] |
| 52 | + |
|  53 | +    # Export the images to a tempdir and substitute their paths into the HF
|  54 | +    # prompts; the contents between <img>/</img> are ignored by vLLM, but the
|  55 | +    # transformers implementation of the visual transformer parses them and
|  56 | +    # reloads the image in the forward call; the contents are treated as a URL
|  57 | +    # or a local path.
| 58 | + for idx, asset in enumerate(image_assets): |
| 59 | + image_tmp_path = tmp_path / f"{asset.name}.jpg" |
| 60 | + asset.pil_image.save(image_tmp_path) |
| 61 | + HF_IMAGE_PROMPTS[idx] = HF_IMAGE_PROMPTS[idx].replace( |
| 62 | + "<img></img>", f"<img>{image_tmp_path}</img>") |
| 63 | + |
| 64 | + inputs_per_image = [( |
| 65 | + [prompt for _ in size_factors], |
| 66 | + [rescale_image_size(image, factor) for factor in size_factors], |
| 67 | + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] |
| 68 | + |
|  69 | +    # NOTE: the order matters here: run vLLM first, and only then run HF.
|  70 | +    # vLLM needs a fresh process in which CUDA has not yet been initialized;
|  71 | +    # if HF runs first, CUDA gets initialized, which breaks the
|  72 | +    # multiprocessing backend when using the fork start method (the default).
| 73 | + |
|  74 | +    # max_model_len should be greater than image_feature_size;
|  75 | +    # Qwen encodes each image into a fixed context of 256 tokens.
| 76 | + with vllm_runner(model, |
| 77 | + max_model_len=300, |
| 78 | + max_num_seqs=1, |
| 79 | + dtype=dtype, |
| 80 | + tensor_parallel_size=tensor_parallel_size, |
| 81 | + distributed_executor_backend=distributed_executor_backend, |
| 82 | + enforce_eager=True) as vllm_model: |
| 83 | + vllm_outputs_per_image = [ |
| 84 | + vllm_model.generate_greedy_logprobs(prompts, |
| 85 | + max_tokens, |
| 86 | + num_logprobs=num_logprobs, |
| 87 | + images=images) |
| 88 | + for prompts, images in inputs_per_image |
| 89 | + ] |
| 90 | + |
29 | 91 | with hf_runner(model, dtype=dtype) as hf_model: |
30 | | - hf_outputs = hf_model.generate_greedy_logprobs_limit( |
31 | | - example_prompts, |
32 | | - max_tokens, |
33 | | - num_logprobs=num_logprobs, |
| 92 | + hf_outputs_per_image = [ |
| 93 | + hf_model.generate_greedy_logprobs_limit(prompts, |
| 94 | + max_tokens, |
| 95 | + num_logprobs=num_logprobs, |
| 96 | + images=images) |
| 97 | + for prompts, images in inputs_per_image |
| 98 | + ] |
| 99 | + |
| 100 | + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, |
| 101 | + vllm_outputs_per_image): |
| 102 | + |
| 103 | + check_logprobs_close( |
| 104 | + outputs_0_lst=hf_outputs, |
| 105 | + outputs_1_lst=vllm_outputs, |
| 106 | + name_0="hf", |
| 107 | + name_1="vllm", |
34 | 108 | ) |
35 | 109 |
|
| 110 | + |
| 111 | +@pytest.mark.parametrize("model", multimodal_models) |
| 112 | +@pytest.mark.parametrize( |
| 113 | + "size_factors", |
| 114 | + [ |
| 115 | + # No image |
| 116 | + [], |
| 117 | + # Single-scale |
| 118 | + [1.0], |
| 119 | + # Single-scale, batched |
| 120 | + [1.0, 1.0, 1.0], |
| 121 | + # Multi-scale |
| 122 | + [0.25, 0.5, 1.0], |
| 123 | + ], |
| 124 | +) |
| 125 | +@pytest.mark.parametrize("dtype", ["bfloat16"]) |
| 126 | +@pytest.mark.parametrize("max_tokens", [8]) |
| 127 | +@pytest.mark.parametrize("num_logprobs", [5]) |
| 128 | +def test_multimodal_models(tmp_path, hf_runner, vllm_runner, image_assets, |
| 129 | + model, size_factors, dtype, max_tokens, |
| 130 | + num_logprobs) -> None: |
| 131 | + run_test( |
| 132 | + tmp_path, |
| 133 | + hf_runner, |
| 134 | + vllm_runner, |
| 135 | + image_assets, |
| 136 | + model, |
| 137 | + size_factors=size_factors, |
| 138 | + dtype=dtype, |
| 139 | + max_tokens=max_tokens, |
| 140 | + num_logprobs=num_logprobs, |
| 141 | + tensor_parallel_size=1, |
| 142 | + ) |
| 143 | + |
| 144 | + |
| 145 | +# Ensure that a text-only Qwen model can still be loaded and
| 146 | +# used for inference in vLLM without throwing.
| 147 | +@pytest.mark.parametrize("model", text_only_models) |
| 148 | +@pytest.mark.parametrize("dtype", ["bfloat16"]) |
| 149 | +@pytest.mark.parametrize("max_tokens", [32]) |
| 150 | +@pytest.mark.parametrize("num_logprobs", [5]) |
| 151 | +def test_text_only_qwen_model_can_be_loaded_and_run( |
| 152 | + vllm_runner: Type[VllmRunner], |
| 153 | + example_prompts, |
| 154 | + model: str, |
| 155 | + *, |
| 156 | + dtype: str, |
| 157 | + max_tokens: int, |
| 158 | + num_logprobs: int, |
| 159 | +): |
36 | 160 | with vllm_runner(model, dtype=dtype) as vllm_model: |
37 | | - vllm_outputs = vllm_model.generate_greedy_logprobs( |
| 161 | + vllm_model.generate_greedy_logprobs( |
38 | 162 | example_prompts, |
39 | 163 | max_tokens, |
40 | 164 | num_logprobs=num_logprobs, |
41 | 165 | ) |
42 | | - |
43 | | - check_logprobs_close( |
44 | | - outputs_0_lst=hf_outputs, |
45 | | - outputs_1_lst=vllm_outputs, |
46 | | - name_0="hf", |
47 | | - name_1="vllm", |
48 | | - ) |
|
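For concreteness, the tempdir substitution performed in run_test rewrites each HF prompt so that the <img></img> tags wrap a real file path. The first prompt then looks roughly like the string below; the temporary path shown is hypothetical:

    "Picture 1: <img>/tmp/pytest-xyz/stop_sign.jpg</img>\nWhat's the content of the image?: "

vLLM ignores the path and receives the image through the images= argument, while the HuggingFace Qwen-VL implementation re-reads the file from that path during its forward pass, as noted in the comments above.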
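As a reference for how run_test assembles its multi-scale inputs, here is a minimal, self-contained sketch. It assumes rescale_image_size(image, factor) from vllm.multimodal.utils behaves like a plain PIL resize by that factor; the helper name build_scaled_inputs is hypothetical and used only for illustration.

from typing import List, Tuple

from PIL import Image


def build_scaled_inputs(
        prompt: str, image: Image.Image,
        size_factors: List[float]) -> Tuple[List[str], List[Image.Image]]:
    # One copy of the prompt per size factor, paired with the image rescaled
    # by that factor (assumed to be roughly equivalent to rescale_image_size).
    prompts = [prompt for _ in size_factors]
    images = [
        image.resize((round(image.width * factor),
                      round(image.height * factor)))
        for factor in size_factors
    ]
    return prompts, images

For example, size_factors=[0.25, 0.5, 1.0] pairs three identical prompts with the image at quarter, half, and full resolution, matching the "Multi-scale" parametrization in the test above.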