From 597cb3528657528ea7015303f0b0f9d32c0fe4cc Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 13 Jun 2024 01:17:31 +0000 Subject: [PATCH 01/20] Enable LLaVA test in CPU --- .buildkite/run-cpu-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 6a86bc0ebfb6..8d1570c17e16 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -21,4 +21,4 @@ docker exec cpu-test bash -c "cd tests; pip install pytest Pillow protobuf bash ../.buildkite/download-images.sh cd ../ - pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py" + pytest -v -s tests/models --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py" From 845b465a1400f8a7a5a0533878085a872a3e077d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 13 Jun 2024 13:21:33 +0000 Subject: [PATCH 02/20] Fix failing test on CPU due to unsupported dtype --- tests/models/test_llava.py | 9 ++++++++- tests/models/test_llava_next.py | 9 ++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index a1f0cff1cc0e..744923618699 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -1,6 +1,7 @@ from typing import List, Tuple import pytest +import torch from transformers import AutoTokenizer from vllm.config import VisionLanguageConfig @@ -65,9 +66,15 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str], return hf_input_ids, hf_output_str +# TODO: remove this after CPU float16 support ready +target_dtype = "float" +if torch.cuda.is_available(): + target_dtype = "half" + + # TODO: Add test for `tensor_parallel_size` [ref: PR #3883] @pytest.mark.parametrize("model_and_config", model_and_vl_config) -@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("dtype", [target_dtype]) @pytest.mark.parametrize("max_tokens", [128]) def test_models(hf_runner, vllm_runner, hf_images, vllm_images, model_and_config, dtype: str, max_tokens: int) -> None: diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py index aa6ee268ae58..95d1711e8a1c 100644 --- a/tests/models/test_llava_next.py +++ b/tests/models/test_llava_next.py @@ -1,6 +1,7 @@ from typing import List, Tuple import pytest +import torch from transformers import AutoTokenizer from vllm.config import VisionLanguageConfig @@ -72,11 +73,17 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str], return hf_input_ids, hf_output_str +# TODO: remove this after CPU float16 support ready +target_dtype = "float" +if torch.cuda.is_available(): + target_dtype = "half" + + @pytest.mark.xfail( reason="Inconsistent image processor being used due to lack " "of support for dynamic image token replacement") @pytest.mark.parametrize("model_and_config", model_and_vl_config) -@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("dtype", [target_dtype]) @pytest.mark.parametrize("max_tokens", [128]) def test_models(hf_runner, vllm_runner, hf_images, vllm_images, model_and_config, dtype: str, max_tokens: int) -> None: From 783cb769e03fea1151923b0edd7f61c1c59137d5 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 21 Jun 2024 15:01:26 +0000 Subject: [PATCH 03/20] Install torchvision --- .buildkite/run-cpu-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 11836cfc19c0..149b23e760e2 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -21,7 +21,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c "cd tests; - pip install pytest Pillow protobuf + pip install pytest Pillow protobuf torchvision bash ../.buildkite/download-images.sh cd ../ pytest -v -s tests/models --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py" From e177bf8a311d17308c354762153b142d873c81b1 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 21 Jun 2024 15:46:56 +0000 Subject: [PATCH 04/20] Use CPU pypi index for torchvision --- .buildkite/run-cpu-test.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 149b23e760e2..c912a8b41140 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -21,7 +21,8 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c "cd tests; - pip install pytest Pillow protobuf torchvision + pip install pytest Pillow protobuf + pip install torchvision --index-url https://download.pytorch.org/whl/cpu bash ../.buildkite/download-images.sh cd ../ pytest -v -s tests/models --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py" From d9260828d07a1b92f707f2b8b629776967cf3f00 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 31 Oct 2024 16:27:43 +0000 Subject: [PATCH 05/20] format --- tests/models/decoder_only/audio_language/test_ultravox.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index 8f3fcc9dc43b..b4154be164a8 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -158,6 +158,7 @@ def run_multi_audio_test( # just assert that some tokens were generated. assert all(tokens for tokens, *_ in vllm_outputs) + # TODO: remove this after CPU float16 support ready target_dtype = "float" if current_platform.is_cpu() else "half" From fe0ef62c0527044c60b78ab02b465b57c70fea2f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 31 Oct 2024 16:40:56 +0000 Subject: [PATCH 06/20] Use bfloat16 --- tests/models/decoder_only/audio_language/test_ultravox.py | 2 +- tests/models/decoder_only/language/test_big_models.py | 2 +- tests/models/decoder_only/vision_language/test_models.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index b4154be164a8..2b7117484252 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -160,7 +160,7 @@ def run_multi_audio_test( # TODO: remove this after CPU float16 support ready -target_dtype = "float" if current_platform.is_cpu() else "half" +target_dtype = "bfloat16" if current_platform.is_cpu() else "half" @pytest.mark.core_model diff --git a/tests/models/decoder_only/language/test_big_models.py b/tests/models/decoder_only/language/test_big_models.py index fcfc159e4f5a..f0c61e06a931 100644 --- a/tests/models/decoder_only/language/test_big_models.py +++ b/tests/models/decoder_only/language/test_big_models.py @@ -29,7 +29,7 @@ ] # TODO: remove this after CPU float16 support ready -target_dtype = "float" if current_platform.is_cpu() else "half" +target_dtype = "bfloat16" if current_platform.is_cpu() else "half" @pytest.mark.parametrize("model", MODELS) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 7ed8f8598221..20588906c80c 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -95,7 +95,7 @@ limit_mm_per_prompt={"image": 4}, )], # TODO: remove this after CPU float16 support ready - dtype="float" if current_platform.is_cpu() else "half", + dtype="bfloat16" if current_platform.is_cpu() else "half", marks=[pytest.mark.core_model], ), "paligemma": VLMTestInfo( @@ -134,7 +134,7 @@ auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, # TODO: remove this after CPU float16 support ready - dtype="float" if current_platform.is_cpu() else "half", + dtype="bfloat16" if current_platform.is_cpu() else "half", marks=[pytest.mark.core_model], image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], ), From f12d39fd26e7d44ecadff3f7b0d751db42309158 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 31 Oct 2024 16:41:31 +0000 Subject: [PATCH 07/20] Update --- tests/models/decoder_only/vision_language/test_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 20588906c80c..9be3379dd4fa 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -114,7 +114,7 @@ ), vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output, # TODO: update this after CPU float16 support ready - dtype=("float" if current_platform.is_cpu() + dtype=("bfloat16" if current_platform.is_cpu() else "half" if current_platform.is_rocm() else ("half", "float")), marks=[pytest.mark.core_model], From 656a49998df77ec83b53002de89d95a16bada3aa Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 31 Oct 2024 17:40:00 +0000 Subject: [PATCH 08/20] Update test dependencies --- .buildkite/run-cpu-test-ppc64le.sh | 2 +- .buildkite/run-cpu-test.sh | 2 +- .buildkite/test-pipeline.yaml | 1 - requirements-test.in | 5 ----- 4 files changed, 2 insertions(+), 8 deletions(-) diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index ae9c0733a104..508ff63c76fd 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -18,7 +18,7 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg # Run basic model test docker exec cpu-test bash -c " pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator - pip install Pillow + pip install Pillow librosa pip install torchvision --index-url https://download.pytorch.org/whl/cpu pytest -v -s tests/models/embedding/language pytest -v -s tests/models/encoder_decoder/language diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index d3f0d6a28ab8..9d8c3acdf397 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -23,7 +23,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c " pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator - pip install Pillow + pip install Pillow librosa pip install torchvision --index-url https://download.pytorch.org/whl/cpu pytest -v -s tests/models/embedding/language pytest -v -s tests/models/encoder_decoder/language diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 85dbe1e0cff3..6cd97887a9c4 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -268,7 +268,6 @@ steps: source_file_dependencies: - benchmarks/ commands: - - pip install aiohttp - bash run-benchmarks.sh - label: Quantization Test # 33min diff --git a/requirements-test.in b/requirements-test.in index 3881f2566b55..e117829ccf31 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -11,9 +11,7 @@ awscli einops # required for MPT, qwen-vl and Mamba httpx librosa # required for audio tests -opencv-python # required for video tests peft -requests ray[adag]==2.35 sentence-transformers # required for embedding soundfile # required for audio test @@ -27,9 +25,6 @@ lm-eval[api]==0.4.4 # required for model evaluation test # TODO: Add this after fully implementing llava(mantis) # git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test -# Benchmarking -aiohttp - # quantization bitsandbytes>=0.44.0 buildkite-test-collector==0.1.8 From 8e33605c4d76ef6f59b616e46273c12327c0f427 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 7 Nov 2024 05:14:42 +0000 Subject: [PATCH 09/20] Remove unnecessary `is_cpu()` checks Signed-off-by: DarkLight1337 --- .../vision_language/test_h2ovl.py | 129 ++++++++++++++++++ .../vision_language/test_models.py | 4 +- .../vision_language/test_phi3v.py | 2 - tests/models/utils.py | 3 +- 4 files changed, 132 insertions(+), 6 deletions(-) create mode 100644 tests/models/decoder_only/vision_language/test_h2ovl.py diff --git a/tests/models/decoder_only/vision_language/test_h2ovl.py b/tests/models/decoder_only/vision_language/test_h2ovl.py new file mode 100644 index 000000000000..45a736520440 --- /dev/null +++ b/tests/models/decoder_only/vision_language/test_h2ovl.py @@ -0,0 +1,129 @@ +from typing import Optional, Tuple + +import pytest +import torch +from PIL.Image import Image +from transformers import AutoConfig + +# Import the functions to test +from vllm.model_executor.models.h2ovl import (calculate_num_blocks, + image_to_pixel_values_wrapper) +from vllm.multimodal.utils import rescale_image_size + +models = [ + "h2oai/h2ovl-mississippi-800m", # Replace with your actual model names + "h2oai/h2ovl-mississippi-2b", +] + + +def run_preprocessing_test( + image: Image, + config, + max_dynamic_patch: Optional[int] = None, +) -> Tuple[torch.Tensor, int]: + """Test the image preprocessing and calculate expected blocks.""" + + if max_dynamic_patch is None: + max_dynamic_patch = config.max_dynamic_patch + + width, height = image.size + use_MSAC = config.use_msac + + # Create the mapper function with the provided configuration + mapper = image_to_pixel_values_wrapper(config, max_dynamic_patch, use_MSAC) + pixel_values = mapper(image) + + # Calculate the expected number of blocks + if use_MSAC: + # First pass + blocks1, _, _, aspect_ratio = calculate_num_blocks( + width, + height, + config.min_dynamic_patch, + max_dynamic_patch, + config.vision_config.image_size, + use_thumbnail=False, # Thumbnail is handled separately + prior_aspect_ratio=None, + ) + + # Second pass + blocks2, _, _, _ = calculate_num_blocks( + width, + height, + config.min_dynamic_patch, + max_dynamic_patch, + config.vision_config.image_size, + use_thumbnail=False, + prior_aspect_ratio=aspect_ratio, + ) + + # Add thumbnail if use_thumbnail is True and total_blocks > 1 + if config.use_thumbnail: + blocks1 += 1 if blocks1 > 1 else 0 + blocks2 += 1 if blocks2 > 1 else 0 + + # Total blocks is the sum of blocks from both passes minus overlapping + total_blocks = blocks1 + blocks2 - 1 + + expected_blocks = total_blocks + + else: + blocks, _, _, _ = calculate_num_blocks( + width, + height, + config.min_dynamic_patch, + max_dynamic_patch, + config.vision_config.image_size, + use_thumbnail=False, + prior_aspect_ratio=None, + ) + expected_blocks = blocks + + if config.use_thumbnail and expected_blocks > 1: + expected_blocks += 1 + + return pixel_values, expected_blocks + + +@pytest.mark.parametrize("model_name", models) +@pytest.mark.parametrize( + "size_factors", + [ + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("max_dynamic_patch", [None, 2, 4, 8]) +def test_image_preprocessing(image_assets, model_name, size_factors, + max_dynamic_patch): + """Test image preprocessing pipeline with different configurations.""" + # Load the configuration from the model + config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + + for asset in image_assets: + image = asset.pil_image + for factor in size_factors: + scaled_image = rescale_image_size(image, factor) + + # Test preprocessing and get expected number of blocks + pixel_values, expected_blocks = run_preprocessing_test( + scaled_image, config, max_dynamic_patch) + + # Verify output shapes and properties + actual_blocks = pixel_values.shape[0] + assert actual_blocks == expected_blocks, ( + f"Expected {expected_blocks} blocks, got {actual_blocks}") + + # Check image dimensions + expected_size = ( + 3, # Number of channels (C, H, W) + config.vision_config.image_size, + config.vision_config.image_size, + ) + for img in pixel_values: + assert img.shape == expected_size, ( + f"Expected image size {expected_size}, got {img.shape}") diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 81612d3f6b66..6922f2948eb9 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -179,7 +179,7 @@ use_tokenizer_eos=True, vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output, num_logprobs=10, - dtype="bfloat16" if current_platform.is_cpu() else "half", + dtype="half", image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], ), "glm4": VLMTestInfo( @@ -378,7 +378,7 @@ prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 test_type=VLMTestType.CUSTOM_INPUTS, max_model_len=4096, - dtype="bfloat16" if current_platform.is_cpu() else "half", + dtype="half", use_tokenizer_eos=True, patch_hf_runner=model_utils.internvl_patch_hf_runner, custom_test_opts=[ diff --git a/tests/models/decoder_only/vision_language/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py index b9c20ddb2d74..82eae0705c9b 100644 --- a/tests/models/decoder_only/vision_language/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/test_phi3v.py @@ -44,8 +44,6 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, target_dtype = "half" -if current_platform.is_cpu(): - target_dtype = "bfloat16" # ROCm Triton FA can run into shared memory issues with these models, # use other backends in the meantime diff --git a/tests/models/utils.py b/tests/models/utils.py index f7802d98ad67..0eb3f61f1f04 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -5,7 +5,6 @@ from vllm.config import ModelConfig, TaskOption from vllm.inputs import InputContext -from vllm.platforms import current_platform from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs TokensText = Tuple[List[int], str] @@ -270,7 +269,7 @@ def build_model_context(model_name: str, if tokenizer_name is None: tokenizer_name = model_name if dtype is None: - dtype = "bfloat16" if current_platform.is_cpu() else "half" + dtype = "half" model_config = ModelConfig( model_name, From 1e63f857a884a85af2413cd55481cadc6f7658b9 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 7 Nov 2024 05:17:34 +0000 Subject: [PATCH 10/20] Update Signed-off-by: DarkLight1337 --- tests/models/decoder_only/audio_language/test_ultravox.py | 7 +------ tests/models/decoder_only/vision_language/test_models.py | 6 ++---- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index aafadae39884..3848a83c62bd 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -5,7 +5,6 @@ import pytest_asyncio from transformers import AutoModel, AutoTokenizer, BatchEncoding -from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE @@ -182,12 +181,8 @@ def run_multi_audio_test( assert all(tokens for tokens, *_ in vllm_outputs) -# TODO: remove this after CPU float16 support ready -target_dtype = "bfloat16" if current_platform.is_cpu() else "half" - - @pytest.mark.core_model -@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS]) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index cdbb33ae15a7..a3640c862598 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -94,8 +94,7 @@ ), limit_mm_per_prompt={"image": 4}, )], - # TODO: remove this after CPU float16 support ready - dtype="bfloat16" if current_platform.is_cpu() else "half", + dtype="half", marks=[pytest.mark.core_model], ), "paligemma": VLMTestInfo( @@ -133,8 +132,7 @@ max_num_seqs=2, auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, - # TODO: remove this after CPU float16 support ready - dtype="bfloat16" if current_platform.is_cpu() else "half", + dtype="half", marks=[pytest.mark.core_model], image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], ), From c09b14025afe9abf8e96bc446f59a178b47b2b06 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 7 Nov 2024 05:18:36 +0000 Subject: [PATCH 11/20] Remove unnecessary args Signed-off-by: DarkLight1337 --- tests/models/decoder_only/vision_language/test_models.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index a3640c862598..90ddd598d0a8 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -94,7 +94,6 @@ ), limit_mm_per_prompt={"image": 4}, )], - dtype="half", marks=[pytest.mark.core_model], ), "paligemma": VLMTestInfo( @@ -132,7 +131,6 @@ max_num_seqs=2, auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, - dtype="half", marks=[pytest.mark.core_model], image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], ), @@ -177,7 +175,6 @@ use_tokenizer_eos=True, vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output, num_logprobs=10, - dtype="half", image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], ), "glm4": VLMTestInfo( @@ -250,7 +247,6 @@ models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"], test_type=VLMTestType.CUSTOM_INPUTS, prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 - dtype="half", num_video_frames=16, max_model_len=16384, postprocess_inputs=model_utils.get_key_type_post_processor( @@ -409,7 +405,6 @@ prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 test_type=VLMTestType.CUSTOM_INPUTS, max_model_len=4096, - dtype="half", use_tokenizer_eos=True, patch_hf_runner=model_utils.internvl_patch_hf_runner, custom_test_opts=[ @@ -424,7 +419,6 @@ test_type=VLMTestType.CUSTOM_INPUTS, max_model_len=16384, max_num_seqs=2, - dtype="half", postprocess_inputs=model_utils.get_key_type_post_processor( "pixel_values" ), From 6e6b8387a0131d0a3134444fbf2b6db773972ef6 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 7 Nov 2024 05:19:19 +0000 Subject: [PATCH 12/20] Update Signed-off-by: DarkLight1337 --- tests/models/decoder_only/vision_language/test_models.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 90ddd598d0a8..2df0b0e19748 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -111,9 +111,7 @@ "pixel_values" ), vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output, - # TODO: update this after CPU float16 support ready - dtype=("bfloat16" if current_platform.is_cpu() - else "half" if current_platform.is_rocm() + dtype=("half" if current_platform.is_cpu() or current_platform.is_rocm() else ("half", "float")), marks=[pytest.mark.core_model], ), From e41db032150dd6819ea9c7e5f400d536de786523 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 7 Nov 2024 11:18:56 +0000 Subject: [PATCH 13/20] Fix missing library Signed-off-by: DarkLight1337 --- .buildkite/run-cpu-test-ppc64le.sh | 2 +- .buildkite/run-cpu-test.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index 508ff63c76fd..66f62183d870 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -17,7 +17,7 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg # Run basic model test docker exec cpu-test bash -c " - pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator + pip install pytest pytest-asyncio matplotlib einops transformers_stream_generator datamodel_code_generator pip install Pillow librosa pip install torchvision --index-url https://download.pytorch.org/whl/cpu pytest -v -s tests/models/embedding/language diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index c7bb9822a828..f82647c3b5f4 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -22,7 +22,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c " - pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator + pip install pytest pytest-asyncio matplotlib einops transformers_stream_generator datamodel_code_generator pip install Pillow librosa pip install torchvision --index-url https://download.pytorch.org/whl/cpu pytest -v -s tests/models/embedding/language From 8e3cf443a2f06baa78113d1b404f1955c9cb8a99 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 7 Nov 2024 13:03:20 +0000 Subject: [PATCH 14/20] Fix loading image embeds on CPU Signed-off-by: DarkLight1337 --- vllm/assets/image.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/assets/image.py b/vllm/assets/image.py index 5eec78c32890..389ecd5c869b 100644 --- a/vllm/assets/image.py +++ b/vllm/assets/image.py @@ -27,4 +27,4 @@ def image_embeds(self) -> torch.Tensor: """ image_path = get_vllm_public_assets(filename=f"{self.name}.pt", s3_prefix=VLM_IMAGES_DIR) - return torch.load(image_path) + return torch.load(image_path, map_location="cpu") From cd1cd155d0cd4f548d0f191c014269600a056514 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 7 Nov 2024 17:12:25 +0000 Subject: [PATCH 15/20] Fix errors not being propagated to CI Signed-off-by: DarkLight1337 --- .buildkite/run-cpu-test-ppc64le.sh | 2 ++ .buildkite/run-cpu-test.sh | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index 66f62183d870..57d67dec0eab 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -17,6 +17,7 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg # Run basic model test docker exec cpu-test bash -c " + set -e pip install pytest pytest-asyncio matplotlib einops transformers_stream_generator datamodel_code_generator pip install Pillow librosa pip install torchvision --index-url https://download.pytorch.org/whl/cpu @@ -28,6 +29,7 @@ docker exec cpu-test bash -c " # online inference docker exec cpu-test bash -c " + set -e python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 python3 benchmarks/benchmark_serving.py \ diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index f82647c3b5f4..961165bd436c 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -18,10 +18,13 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2 # offline inference -docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" +docker exec cpu-test-avx2 bash -c " + set -e + python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c " + set -e pip install pytest pytest-asyncio matplotlib einops transformers_stream_generator datamodel_code_generator pip install Pillow librosa pip install torchvision --index-url https://download.pytorch.org/whl/cpu @@ -33,17 +36,20 @@ docker exec cpu-test bash -c " # Run compressed-tensor test docker exec cpu-test bash -c " + set -e pytest -s -v \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" # Run AWQ test docker exec cpu-test bash -c " + set -e pytest -s -v \ tests/quantization/test_ipex_quant.py" # online inference docker exec cpu-test bash -c " + set -e export VLLM_CPU_KVCACHE_SPACE=10 export VLLM_CPU_OMP_THREADS_BIND=48-92 python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & From b401cb91ca36d7bd5a52ca6fb0cd2bb6c1ff00e7 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 7 Nov 2024 17:20:09 +0000 Subject: [PATCH 16/20] Fix missing libraries Signed-off-by: DarkLight1337 --- .buildkite/run-cpu-test-ppc64le.sh | 5 +++-- .buildkite/run-cpu-test.sh | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index 57d67dec0eab..51f549c8da15 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -18,8 +18,9 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg # Run basic model test docker exec cpu-test bash -c " set -e - pip install pytest pytest-asyncio matplotlib einops transformers_stream_generator datamodel_code_generator - pip install Pillow librosa + pip install pytest pytest-asyncio \ + einops librosa peft Pillow sentence-transformers soundfile \ + transformers_stream_generator matplotlib datamodel_code_generator pip install torchvision --index-url https://download.pytorch.org/whl/cpu pytest -v -s tests/models/embedding/language pytest -v -s tests/models/encoder_decoder/language diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 961165bd436c..3a355e01ff20 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -25,8 +25,9 @@ docker exec cpu-test-avx2 bash -c " # Run basic model test docker exec cpu-test bash -c " set -e - pip install pytest pytest-asyncio matplotlib einops transformers_stream_generator datamodel_code_generator - pip install Pillow librosa + pip install pytest pytest-asyncio \ + einops librosa peft Pillow sentence-transformers soundfile \ + transformers_stream_generator matplotlib datamodel_code_generator pip install torchvision --index-url https://download.pytorch.org/whl/cpu pytest -v -s tests/models/embedding/language pytest -v -s tests/models/encoder_decoder/language From 431a5c8cd13827f36fc398bd70f06d3347bdf558 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 8 Nov 2024 04:46:27 +0000 Subject: [PATCH 17/20] Embedding models are not supported for CPU backend Signed-off-by: DarkLight1337 --- .buildkite/run-cpu-test-ppc64le.sh | 3 ++- .buildkite/run-cpu-test.sh | 3 ++- vllm/worker/cpu_worker.py | 6 +++++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index 51f549c8da15..4abf4eeadf9e 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -22,7 +22,8 @@ docker exec cpu-test bash -c " einops librosa peft Pillow sentence-transformers soundfile \ transformers_stream_generator matplotlib datamodel_code_generator pip install torchvision --index-url https://download.pytorch.org/whl/cpu - pytest -v -s tests/models/embedding/language + # Embedding models are not supported on CPU yet + # pytest -v -s tests/models/embedding/language pytest -v -s tests/models/encoder_decoder/language pytest -v -s tests/models/decoder_only/language/test_models.py pytest -v -s tests/models/decoder_only/audio_language -m core_model diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 3a355e01ff20..391205df7549 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -29,7 +29,8 @@ docker exec cpu-test bash -c " einops librosa peft Pillow sentence-transformers soundfile \ transformers_stream_generator matplotlib datamodel_code_generator pip install torchvision --index-url https://download.pytorch.org/whl/cpu - pytest -v -s tests/models/embedding/language + # Embedding models are not supported on CPU yet + # pytest -v -s tests/models/embedding/language pytest -v -s tests/models/encoder_decoder/language pytest -v -s tests/models/decoder_only/language/test_models.py pytest -v -s tests/models/decoder_only/audio_language -m core_model diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 2914f520d823..162e1e4be873 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -151,7 +151,11 @@ def __init__( self.local_omp_cpuid = omp_cpuids.split("|")[rank] ModelRunnerClass: Type[CPUModelRunner] = CPUModelRunner - if self.model_config.is_encoder_decoder: + if self.model_config.task == "embedding": + raise NotImplementedError( + "Embedding models are not supported for CPU backend") + # ModelRunnerClass = CPUEmbeddingModelRunner + elif self.model_config.is_encoder_decoder: ModelRunnerClass = CPUEncoderDecoderModelRunner self.model_runner: CPUModelRunner = ModelRunnerClass( vllm_config=vllm_config, From 8c817e43a2f2fb4705efbf052b34d2af8c96ca30 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 8 Nov 2024 06:14:45 +0000 Subject: [PATCH 18/20] Chunked prefill not supported for CPU Signed-off-by: DarkLight1337 --- .buildkite/run-cpu-test-ppc64le.sh | 5 +++-- .buildkite/run-cpu-test.sh | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index 50d446aaa174..cc5c4f480dce 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -24,11 +24,12 @@ docker exec cpu-test bash -c " einops librosa peft Pillow sentence-transformers soundfile \ transformers_stream_generator matplotlib datamodel_code_generator pip install torchvision --index-url https://download.pytorch.org/whl/cpu - # Embedding models are not supported on CPU yet + # Embedding models are not supported for CPU yet # pytest -v -s tests/models/embedding/language pytest -v -s tests/models/encoder_decoder/language pytest -v -s tests/models/decoder_only/language/test_models.py - pytest -v -s tests/models/decoder_only/audio_language -m core_model + # Chunked prefill not supported for CPU yet + # pytest -v -s tests/models/decoder_only/audio_language -m core_model pytest -v -s tests/models/decoder_only/vision_language -m core_model" # online inference diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 16a5a25c8b60..40859492086e 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -31,11 +31,12 @@ docker exec cpu-test bash -c " einops librosa peft Pillow sentence-transformers soundfile \ transformers_stream_generator matplotlib datamodel_code_generator pip install torchvision --index-url https://download.pytorch.org/whl/cpu - # Embedding models are not supported on CPU yet + # Embedding models are not supported for CPU yet # pytest -v -s tests/models/embedding/language pytest -v -s tests/models/encoder_decoder/language pytest -v -s tests/models/decoder_only/language/test_models.py - pytest -v -s tests/models/decoder_only/audio_language -m core_model + # Chunked prefill not supported for CPU yet + # pytest -v -s tests/models/decoder_only/audio_language -m core_model pytest -v -s tests/models/decoder_only/vision_language -m core_model" # Run compressed-tensor test From 4c39939434fa4383b7f0bbbdff61c57ec2ec4126 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 8 Nov 2024 08:35:37 +0000 Subject: [PATCH 19/20] Fix installation Signed-off-by: DarkLight1337 --- .buildkite/run-cpu-test-ppc64le.sh | 2 +- .buildkite/run-cpu-test.sh | 2 +- vllm/model_executor/models/ultravox.py | 4 ++-- vllm/multimodal/utils.py | 8 ++++---- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index cc5c4f480dce..8d472fa358cd 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -21,7 +21,7 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg docker exec cpu-test bash -c " set -e pip install pytest pytest-asyncio \ - einops librosa peft Pillow sentence-transformers soundfile \ + decord einops librosa peft Pillow sentence-transformers soundfile \ transformers_stream_generator matplotlib datamodel_code_generator pip install torchvision --index-url https://download.pytorch.org/whl/cpu # Embedding models are not supported for CPU yet diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 40859492086e..39053014f490 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -28,7 +28,7 @@ docker exec cpu-test-avx2 bash -c " docker exec cpu-test bash -c " set -e pip install pytest pytest-asyncio \ - einops librosa peft Pillow sentence-transformers soundfile \ + decord einops librosa peft Pillow sentence-transformers soundfile \ transformers_stream_generator matplotlib datamodel_code_generator pip install torchvision --index-url https://download.pytorch.org/whl/cpu # Embedding models are not supported for CPU yet diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 3a343986a934..411584b1a6c3 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -134,9 +134,9 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object): if sr != feature_extractor.sampling_rate: try: import librosa - except ImportError: + except ImportError as exc: raise ImportError( - "Please install vllm[audio] for audio support.") from None + "Please install vllm[audio] for audio support.") from exc audio = librosa.resample(audio, orig_sr=sr, target_sr=feature_extractor.sampling_rate) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 0c666b8cc2e6..bee3c25dbd8d 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -206,9 +206,9 @@ def try_import_audio_packages() -> Tuple[Any, Any]: try: import librosa import soundfile - except ImportError: + except ImportError as exc: raise ImportError( - "Please install vllm[audio] for audio support.") from None + "Please install vllm[audio] for audio support.") from exc return librosa, soundfile @@ -344,9 +344,9 @@ def try_import_video_packages() -> Any: try: import cv2 import decord - except ImportError: + except ImportError as exc: raise ImportError( - "Please install vllm[video] for video support.") from None + "Please install vllm[video] for video support.") from exc return cv2, decord From 9ef98fa8129c7c5c7dfde0abf45340aefadd44fd Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 8 Nov 2024 13:40:47 +0000 Subject: [PATCH 20/20] Add `cpu_model` mark Signed-off-by: DarkLight1337 --- .buildkite/run-cpu-test-ppc64le.sh | 4 ++-- .buildkite/run-cpu-test.sh | 4 ++-- pyproject.toml | 3 ++- .../decoder_only/audio_language/test_ultravox.py | 15 ++++++++++++--- .../decoder_only/vision_language/test_models.py | 4 ++-- 5 files changed, 20 insertions(+), 10 deletions(-) diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index 8d472fa358cd..5add7ff0c15c 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -29,8 +29,8 @@ docker exec cpu-test bash -c " pytest -v -s tests/models/encoder_decoder/language pytest -v -s tests/models/decoder_only/language/test_models.py # Chunked prefill not supported for CPU yet - # pytest -v -s tests/models/decoder_only/audio_language -m core_model - pytest -v -s tests/models/decoder_only/vision_language -m core_model" + # pytest -v -s tests/models/decoder_only/audio_language -m cpu_model + pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" # online inference docker exec cpu-test bash -c " diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 39053014f490..25a448e63be2 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -36,8 +36,8 @@ docker exec cpu-test bash -c " pytest -v -s tests/models/encoder_decoder/language pytest -v -s tests/models/decoder_only/language/test_models.py # Chunked prefill not supported for CPU yet - # pytest -v -s tests/models/decoder_only/audio_language -m core_model - pytest -v -s tests/models/decoder_only/vision_language -m core_model" + # pytest -v -s tests/models/decoder_only/audio_language -m cpu_model + pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" # Run compressed-tensor test docker exec cpu-test bash -c " diff --git a/pyproject.toml b/pyproject.toml index bae8645502de..1385a15d0787 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,7 +93,8 @@ skip_gitignore = true [tool.pytest.ini_options] markers = [ "skip_global_cleanup", - "core_model: run this model test in each PR instead of just daily", + "core_model: enable this model test in each PR instead of only nightly", + "cpu_model: enable this model test in CPU tests", "distributed_2_gpus: run this test only in distributed tests for 2 GPUs", "skip_v1: do not run this test with v1", ] diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index 3848a83c62bd..e100c6b9bb90 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -39,7 +39,10 @@ def audio(request): return AudioAsset(request.param) -@pytest.fixture(params=({}, CHUNKED_PREFILL_KWARGS)) +@pytest.fixture(params=[ + pytest.param({}, marks=pytest.mark.cpu_model), + pytest.param(CHUNKED_PREFILL_KWARGS), +]) def server(request, audio_assets): args = [ "--dtype=bfloat16", "--max-model-len=4096", "--enforce-eager", @@ -185,7 +188,10 @@ def run_multi_audio_test( @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS]) +@pytest.mark.parametrize("vllm_kwargs", [ + pytest.param({}, marks=pytest.mark.cpu_model), + pytest.param(CHUNKED_PREFILL_KWARGS), +]) def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int, num_logprobs: int, vllm_kwargs: dict) -> None: @@ -207,7 +213,10 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int, @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS]) +@pytest.mark.parametrize("vllm_kwargs", [ + pytest.param({}, marks=pytest.mark.cpu_model), + pytest.param(CHUNKED_PREFILL_KWARGS), +]) def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str, max_tokens: int, num_logprobs: int, vllm_kwargs: dict) -> None: diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 2df0b0e19748..163752e9fe06 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -94,7 +94,7 @@ ), limit_mm_per_prompt={"image": 4}, )], - marks=[pytest.mark.core_model], + marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), "paligemma": VLMTestInfo( models=["google/paligemma-3b-mix-224"], @@ -129,7 +129,7 @@ max_num_seqs=2, auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, - marks=[pytest.mark.core_model], + marks=[pytest.mark.core_model, pytest.mark.cpu_model], image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], ), #### Extended model tests