From 597cb3528657528ea7015303f0b0f9d32c0fe4cc Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 13 Jun 2024 01:17:31 +0000
Subject: [PATCH 01/20] Enable LLaVA test on CPU

---
 .buildkite/run-cpu-test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 6a86bc0ebfb6..8d1570c17e16 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -21,4 +21,4 @@ docker exec cpu-test bash -c "cd tests;
   pip install pytest Pillow protobuf
   bash ../.buildkite/download-images.sh
   cd ../
-  pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
+  pytest -v -s tests/models --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"

From 845b465a1400f8a7a5a0533878085a872a3e077d Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 13 Jun 2024 13:21:33 +0000
Subject: [PATCH 02/20] Fix failing test on CPU due to unsupported dtype

---
 tests/models/test_llava.py      | 9 ++++++++-
 tests/models/test_llava_next.py | 9 ++++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index a1f0cff1cc0e..744923618699 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -1,6 +1,7 @@
 from typing import List, Tuple
 
 import pytest
+import torch
 from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
@@ -65,9 +66,15 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
     return hf_input_ids, hf_output_str
 
 
+# TODO: remove this after CPU float16 support ready
+target_dtype = "float"
+if torch.cuda.is_available():
+    target_dtype = "half"
+
+
 # TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
 def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
                 model_and_config, dtype: str, max_tokens: int) -> None:
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index aa6ee268ae58..95d1711e8a1c 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -1,6 +1,7 @@
 from typing import List, Tuple
 
 import pytest
+import torch
 from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
@@ -72,11 +73,17 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
     return hf_input_ids, hf_output_str
 
 
+# TODO: remove this after CPU float16 support ready
+target_dtype = "float"
+if torch.cuda.is_available():
+    target_dtype = "half"
+
+
 @pytest.mark.xfail(
     reason="Inconsistent image processor being used due to lack "
     "of support for dynamic image token replacement")
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
 def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
                 model_and_config, dtype: str, max_tokens: int) -> None:

From 783cb769e03fea1151923b0edd7f61c1c59137d5 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Fri, 21 Jun 2024 15:01:26 +0000
Subject: [PATCH 03/20] Install torchvision

---
 .buildkite/run-cpu-test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 11836cfc19c0..149b23e760e2 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -21,7 +21,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
 
 # Run basic model test
 docker exec cpu-test bash -c "cd tests;
-  pip install pytest Pillow protobuf
+  pip install pytest Pillow protobuf torchvision
   bash ../.buildkite/download-images.sh
   cd ../
   pytest -v -s tests/models --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"

From e177bf8a311d17308c354762153b142d873c81b1 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Fri, 21 Jun 2024 15:46:56 +0000
Subject: [PATCH 04/20] Use CPU pypi index for torchvision

---
 .buildkite/run-cpu-test.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 149b23e760e2..c912a8b41140 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -21,7 +21,8 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
 
 # Run basic model test
 docker exec cpu-test bash -c "cd tests;
-  pip install pytest Pillow protobuf torchvision
+  pip install pytest Pillow protobuf
+  pip install torchvision --index-url https://download.pytorch.org/whl/cpu
   bash ../.buildkite/download-images.sh
   cd ../
   pytest -v -s tests/models --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"

From d9260828d07a1b92f707f2b8b629776967cf3f00 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 31 Oct 2024 16:27:43 +0000
Subject: [PATCH 05/20] Fix blank-line formatting in test_ultravox.py

---
 tests/models/decoder_only/audio_language/test_ultravox.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py
index 8f3fcc9dc43b..b4154be164a8 100644
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -158,6 +158,7 @@ def run_multi_audio_test(
     # just assert that some tokens were generated.
     assert all(tokens for tokens, *_ in vllm_outputs)
 
+
 # TODO: remove this after CPU float16 support ready
 target_dtype = "float" if current_platform.is_cpu() else "half"
 

From fe0ef62c0527044c60b78ab02b465b57c70fea2f Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 31 Oct 2024 16:40:56 +0000
Subject: [PATCH 06/20] Use bfloat16

---
 tests/models/decoder_only/audio_language/test_ultravox.py | 2 +-
 tests/models/decoder_only/language/test_big_models.py     | 2 +-
 tests/models/decoder_only/vision_language/test_models.py  | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py
index b4154be164a8..2b7117484252 100644
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -160,7 +160,7 @@ def run_multi_audio_test(
 
 
 # TODO: remove this after CPU float16 support ready
-target_dtype = "float" if current_platform.is_cpu() else "half"
+target_dtype = "bfloat16" if current_platform.is_cpu() else "half"
 
 
 @pytest.mark.core_model
diff --git a/tests/models/decoder_only/language/test_big_models.py b/tests/models/decoder_only/language/test_big_models.py
index fcfc159e4f5a..f0c61e06a931 100644
--- a/tests/models/decoder_only/language/test_big_models.py
+++ b/tests/models/decoder_only/language/test_big_models.py
@@ -29,7 +29,7 @@
     ]
 
 # TODO: remove this after CPU float16 support ready
-target_dtype = "float" if current_platform.is_cpu() else "half"
+target_dtype = "bfloat16" if current_platform.is_cpu() else "half"
 
 
 @pytest.mark.parametrize("model", MODELS)
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 7ed8f8598221..20588906c80c 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -95,7 +95,7 @@
             limit_mm_per_prompt={"image": 4},
         )],
         # TODO: remove this after CPU float16 support ready
-        dtype="float" if current_platform.is_cpu() else "half",
+        dtype="bfloat16" if current_platform.is_cpu() else "half",
         marks=[pytest.mark.core_model],
     ),
     "paligemma": VLMTestInfo(
@@ -134,7 +134,7 @@
         auto_cls=AutoModelForVision2Seq,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
         # TODO: remove this after CPU float16 support ready
-        dtype="float" if current_platform.is_cpu() else "half",
+        dtype="bfloat16" if current_platform.is_cpu() else "half",
         marks=[pytest.mark.core_model],
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
     ),

From f12d39fd26e7d44ecadff3f7b0d751db42309158 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 31 Oct 2024 16:41:31 +0000
Subject: [PATCH 07/20] Use bfloat16 for PaliGemma test on CPU

---
 tests/models/decoder_only/vision_language/test_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 20588906c80c..9be3379dd4fa 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -114,7 +114,7 @@
         ),
         vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
         # TODO: update this after CPU float16 support ready
-        dtype=("float" if current_platform.is_cpu()
+        dtype=("bfloat16" if current_platform.is_cpu()
                else "half" if current_platform.is_rocm()
                else ("half", "float")),
         marks=[pytest.mark.core_model],

From 656a49998df77ec83b53002de89d95a16bada3aa Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 31 Oct 2024 17:40:00 +0000
Subject: [PATCH 08/20] Update test dependencies

---
 .buildkite/run-cpu-test-ppc64le.sh | 2 +-
 .buildkite/run-cpu-test.sh         | 2 +-
 .buildkite/test-pipeline.yaml      | 1 -
 requirements-test.in               | 5 -----
 4 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh
index ae9c0733a104..508ff63c76fd 100755
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@@ -18,7 +18,7 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
 # Run basic model test
 docker exec cpu-test bash -c "
   pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
-  pip install Pillow
+  pip install Pillow librosa
   pip install torchvision --index-url https://download.pytorch.org/whl/cpu
   pytest -v -s tests/models/embedding/language
   pytest -v -s tests/models/encoder_decoder/language
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index d3f0d6a28ab8..9d8c3acdf397 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -23,7 +23,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
 # Run basic model test
 docker exec cpu-test bash -c "
   pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
-  pip install Pillow
+  pip install Pillow librosa
   pip install torchvision --index-url https://download.pytorch.org/whl/cpu
   pytest -v -s tests/models/embedding/language
   pytest -v -s tests/models/encoder_decoder/language
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 85dbe1e0cff3..6cd97887a9c4 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -268,7 +268,6 @@ steps:
   source_file_dependencies:
   - benchmarks/
   commands:
-  - pip install aiohttp
   - bash run-benchmarks.sh
 
 - label: Quantization Test # 33min
diff --git a/requirements-test.in b/requirements-test.in
index 3881f2566b55..e117829ccf31 100644
--- a/requirements-test.in
+++ b/requirements-test.in
@@ -11,9 +11,7 @@ awscli
 einops # required for MPT, qwen-vl and Mamba
 httpx
 librosa # required for audio tests
-opencv-python # required for video tests
 peft
-requests
 ray[adag]==2.35
 sentence-transformers # required for embedding
 soundfile # required for audio test
@@ -27,9 +25,6 @@ lm-eval[api]==0.4.4 # required for model evaluation test
 # TODO: Add this after fully implementing llava(mantis)
 # git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test
 
-# Benchmarking
-aiohttp
-
 # quantization
 bitsandbytes>=0.44.0
 buildkite-test-collector==0.1.8

From 8e33605c4d76ef6f59b616e46273c12327c0f427 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 7 Nov 2024 05:14:42 +0000
Subject: [PATCH 09/20] Remove unnecessary `is_cpu()` checks

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .../vision_language/test_h2ovl.py             | 129 ++++++++++++++++++
 .../vision_language/test_models.py            |   4 +-
 .../vision_language/test_phi3v.py             |   2 -
 tests/models/utils.py                         |   3 +-
 4 files changed, 132 insertions(+), 6 deletions(-)
 create mode 100644 tests/models/decoder_only/vision_language/test_h2ovl.py

diff --git a/tests/models/decoder_only/vision_language/test_h2ovl.py b/tests/models/decoder_only/vision_language/test_h2ovl.py
new file mode 100644
index 000000000000..45a736520440
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/test_h2ovl.py
@@ -0,0 +1,129 @@
+from typing import Optional, Tuple
+
+import pytest
+import torch
+from PIL.Image import Image
+from transformers import AutoConfig
+
+# Import the functions to test
+from vllm.model_executor.models.h2ovl import (calculate_num_blocks,
+                                              image_to_pixel_values_wrapper)
+from vllm.multimodal.utils import rescale_image_size
+
+models = [
+    "h2oai/h2ovl-mississippi-800m",  # Replace with your actual model names
+    "h2oai/h2ovl-mississippi-2b",
+]
+
+
+def run_preprocessing_test(
+    image: Image,
+    config,
+    max_dynamic_patch: Optional[int] = None,
+) -> Tuple[torch.Tensor, int]:
+    """Test the image preprocessing and calculate expected blocks."""
+
+    if max_dynamic_patch is None:
+        max_dynamic_patch = config.max_dynamic_patch
+
+    width, height = image.size
+    use_MSAC = config.use_msac
+
+    # Create the mapper function with the provided configuration
+    mapper = image_to_pixel_values_wrapper(config, max_dynamic_patch, use_MSAC)
+    pixel_values = mapper(image)
+
+    # Calculate the expected number of blocks
+    if use_MSAC:
+        # First pass
+        blocks1, _, _, aspect_ratio = calculate_num_blocks(
+            width,
+            height,
+            config.min_dynamic_patch,
+            max_dynamic_patch,
+            config.vision_config.image_size,
+            use_thumbnail=False,  # Thumbnail is handled separately
+            prior_aspect_ratio=None,
+        )
+
+        # Second pass
+        blocks2, _, _, _ = calculate_num_blocks(
+            width,
+            height,
+            config.min_dynamic_patch,
+            max_dynamic_patch,
+            config.vision_config.image_size,
+            use_thumbnail=False,
+            prior_aspect_ratio=aspect_ratio,
+        )
+
+        # Each pass with more than one block gains one thumbnail block
+        if config.use_thumbnail:
+            blocks1 += 1 if blocks1 > 1 else 0
+            blocks2 += 1 if blocks2 > 1 else 0
+
+        # Total blocks is the sum of blocks from both passes minus overlapping
+        total_blocks = blocks1 + blocks2 - 1
+
+        expected_blocks = total_blocks
+
+    else:
+        blocks, _, _, _ = calculate_num_blocks(
+            width,
+            height,
+            config.min_dynamic_patch,
+            max_dynamic_patch,
+            config.vision_config.image_size,
+            use_thumbnail=False,
+            prior_aspect_ratio=None,
+        )
+        expected_blocks = blocks
+
+        if config.use_thumbnail and expected_blocks > 1:
+            expected_blocks += 1
+
+    return pixel_values, expected_blocks
+
+
+@pytest.mark.parametrize("model_name", models)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
+@pytest.mark.parametrize("max_dynamic_patch", [None, 2, 4, 8])
+def test_image_preprocessing(image_assets, model_name, size_factors,
+                             max_dynamic_patch):
+    """Test image preprocessing pipeline with different configurations."""
+    # Load the configuration from the model
+    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+
+    for asset in image_assets:
+        image = asset.pil_image
+        for factor in size_factors:
+            scaled_image = rescale_image_size(image, factor)
+
+            # Test preprocessing and get expected number of blocks
+            pixel_values, expected_blocks = run_preprocessing_test(
+                scaled_image, config, max_dynamic_patch)
+
+            # Verify output shapes and properties
+            actual_blocks = pixel_values.shape[0]
+            assert actual_blocks == expected_blocks, (
+                f"Expected {expected_blocks} blocks, got {actual_blocks}")
+
+            # Check image dimensions
+            expected_size = (
+                3,  # Number of channels (C, H, W)
+                config.vision_config.image_size,
+                config.vision_config.image_size,
+            )
+            for img in pixel_values:
+                assert img.shape == expected_size, (
+                    f"Expected image size {expected_size}, got {img.shape}")
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 81612d3f6b66..6922f2948eb9 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -179,7 +179,7 @@
         use_tokenizer_eos=True,
         vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
         num_logprobs=10,
-        dtype="bfloat16" if current_platform.is_cpu() else "half",
+        dtype="half",
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
     ),
     "glm4": VLMTestInfo(
@@ -378,7 +378,7 @@
         prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
         test_type=VLMTestType.CUSTOM_INPUTS,
         max_model_len=4096,
-        dtype="bfloat16" if current_platform.is_cpu() else "half",
+        dtype="half",
         use_tokenizer_eos=True,
         patch_hf_runner=model_utils.internvl_patch_hf_runner,
         custom_test_opts=[
diff --git a/tests/models/decoder_only/vision_language/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py
index b9c20ddb2d74..82eae0705c9b 100644
--- a/tests/models/decoder_only/vision_language/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/test_phi3v.py
@@ -44,8 +44,6 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
 
 
 target_dtype = "half"
-if current_platform.is_cpu():
-    target_dtype = "bfloat16"
 
 # ROCm Triton FA can run into shared memory issues with these models,
 # use other backends in the meantime
diff --git a/tests/models/utils.py b/tests/models/utils.py
index f7802d98ad67..0eb3f61f1f04 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -5,7 +5,6 @@
 
 from vllm.config import ModelConfig, TaskOption
 from vllm.inputs import InputContext
-from vllm.platforms import current_platform
 from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
 
 TokensText = Tuple[List[int], str]
@@ -270,7 +269,7 @@ def build_model_context(model_name: str,
     if tokenizer_name is None:
         tokenizer_name = model_name
     if dtype is None:
-        dtype = "bfloat16" if current_platform.is_cpu() else "half"
+        dtype = "half"
 
     model_config = ModelConfig(
         model_name,

From 1e63f857a884a85af2413cd55481cadc6f7658b9 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 7 Nov 2024 05:17:34 +0000
Subject: [PATCH 10/20] Remove CPU bfloat16 fallbacks from ultravox and VLM tests

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/models/decoder_only/audio_language/test_ultravox.py | 7 +------
 tests/models/decoder_only/vision_language/test_models.py  | 6 ++----
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py
index aafadae39884..3848a83c62bd 100644
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -5,7 +5,6 @@
 import pytest_asyncio
 from transformers import AutoModel, AutoTokenizer, BatchEncoding
 
-from vllm.platforms import current_platform
 from vllm.sequence import SampleLogprobs
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
 
@@ -182,12 +181,8 @@ def run_multi_audio_test(
     assert all(tokens for tokens, *_ in vllm_outputs)
 
 
-# TODO: remove this after CPU float16 support ready
-target_dtype = "bfloat16" if current_platform.is_cpu() else "half"
-
-
 @pytest.mark.core_model
-@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index cdbb33ae15a7..a3640c862598 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -94,8 +94,7 @@
             ),
             limit_mm_per_prompt={"image": 4},
         )],
-        # TODO: remove this after CPU float16 support ready
-        dtype="bfloat16" if current_platform.is_cpu() else "half",
+        dtype="half",
         marks=[pytest.mark.core_model],
     ),
     "paligemma": VLMTestInfo(
@@ -133,8 +132,7 @@
         max_num_seqs=2,
         auto_cls=AutoModelForVision2Seq,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
-        # TODO: remove this after CPU float16 support ready
-        dtype="bfloat16" if current_platform.is_cpu() else "half",
+        dtype="half",
         marks=[pytest.mark.core_model],
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
     ),

From c09b14025afe9abf8e96bc446f59a178b47b2b06 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 7 Nov 2024 05:18:36 +0000
Subject: [PATCH 11/20] Remove unnecessary args

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/models/decoder_only/vision_language/test_models.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index a3640c862598..90ddd598d0a8 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -94,7 +94,6 @@
             ),
             limit_mm_per_prompt={"image": 4},
         )],
-        dtype="half",
         marks=[pytest.mark.core_model],
     ),
     "paligemma": VLMTestInfo(
@@ -132,7 +131,6 @@
         max_num_seqs=2,
         auto_cls=AutoModelForVision2Seq,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
-        dtype="half",
         marks=[pytest.mark.core_model],
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
     ),
@@ -177,7 +175,6 @@
         use_tokenizer_eos=True,
         vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
         num_logprobs=10,
-        dtype="half",
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
     ),
     "glm4": VLMTestInfo(
@@ -250,7 +247,6 @@
         models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
         test_type=VLMTestType.CUSTOM_INPUTS,
         prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
-        dtype="half",
         num_video_frames=16,
         max_model_len=16384,
         postprocess_inputs=model_utils.get_key_type_post_processor(
@@ -409,7 +405,6 @@
         prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
         test_type=VLMTestType.CUSTOM_INPUTS,
         max_model_len=4096,
-        dtype="half",
         use_tokenizer_eos=True,
         patch_hf_runner=model_utils.internvl_patch_hf_runner,
         custom_test_opts=[
@@ -424,7 +419,6 @@
         test_type=VLMTestType.CUSTOM_INPUTS,
         max_model_len=16384,
         max_num_seqs=2,
-        dtype="half",
         postprocess_inputs=model_utils.get_key_type_post_processor(
             "pixel_values"
         ),

From 6e6b8387a0131d0a3134444fbf2b6db773972ef6 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 7 Nov 2024 05:19:19 +0000
Subject: [PATCH 12/20] Use half for PaliGemma test on CPU

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/models/decoder_only/vision_language/test_models.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 90ddd598d0a8..2df0b0e19748 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -111,9 +111,7 @@
             "pixel_values"
         ),
         vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
-        # TODO: update this after CPU float16 support ready
-        dtype=("bfloat16" if current_platform.is_cpu()
-               else "half" if current_platform.is_rocm()
+        dtype=("half" if current_platform.is_cpu() or current_platform.is_rocm()
                else ("half", "float")),
         marks=[pytest.mark.core_model],
     ),

From e41db032150dd6819ea9c7e5f400d536de786523 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 7 Nov 2024 11:18:56 +0000
Subject: [PATCH 13/20] Fix missing library

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .buildkite/run-cpu-test-ppc64le.sh | 2 +-
 .buildkite/run-cpu-test.sh         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh
index 508ff63c76fd..66f62183d870 100755
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@@ -17,7 +17,7 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
 
 # Run basic model test
 docker exec cpu-test bash -c "
-  pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
+  pip install pytest pytest-asyncio matplotlib einops transformers_stream_generator datamodel_code_generator
   pip install Pillow librosa
   pip install torchvision --index-url https://download.pytorch.org/whl/cpu
   pytest -v -s tests/models/embedding/language
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index c7bb9822a828..f82647c3b5f4 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -22,7 +22,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
 
 # Run basic model test
 docker exec cpu-test bash -c "
-  pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
+  pip install pytest pytest-asyncio matplotlib einops transformers_stream_generator datamodel_code_generator
   pip install Pillow librosa
   pip install torchvision --index-url https://download.pytorch.org/whl/cpu
   pytest -v -s tests/models/embedding/language

From 8e3cf443a2f06baa78113d1b404f1955c9cb8a99 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 7 Nov 2024 13:03:20 +0000
Subject: [PATCH 14/20] Fix loading image embeds on CPU

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/assets/image.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/assets/image.py b/vllm/assets/image.py
index 5eec78c32890..389ecd5c869b 100644
--- a/vllm/assets/image.py
+++ b/vllm/assets/image.py
@@ -27,4 +27,4 @@ def image_embeds(self) -> torch.Tensor:
         """
         image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
                                             s3_prefix=VLM_IMAGES_DIR)
-        return torch.load(image_path)
+        return torch.load(image_path, map_location="cpu")

From cd1cd155d0cd4f548d0f191c014269600a056514 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 7 Nov 2024 17:12:25 +0000
Subject: [PATCH 15/20] Fix errors not being propagated to CI

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .buildkite/run-cpu-test-ppc64le.sh | 2 ++
 .buildkite/run-cpu-test.sh         | 8 +++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh
index 66f62183d870..57d67dec0eab 100755
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@@ -17,6 +17,7 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
 
 # Run basic model test
 docker exec cpu-test bash -c "
+  set -e
   pip install pytest pytest-asyncio matplotlib einops transformers_stream_generator datamodel_code_generator
   pip install Pillow librosa
   pip install torchvision --index-url https://download.pytorch.org/whl/cpu
@@ -28,6 +29,7 @@ docker exec cpu-test bash -c "
 
 # online inference
 docker exec cpu-test bash -c "
+  set -e
   python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & 
   timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
   python3 benchmarks/benchmark_serving.py \
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index f82647c3b5f4..961165bd436c 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -18,10 +18,13 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
  --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
 
 # offline inference
-docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
+docker exec cpu-test-avx2 bash -c "
+  set -e
+  python3 examples/offline_inference.py"
 
 # Run basic model test
 docker exec cpu-test bash -c "
+  set -e
   pip install pytest pytest-asyncio matplotlib einops transformers_stream_generator datamodel_code_generator
   pip install Pillow librosa
   pip install torchvision --index-url https://download.pytorch.org/whl/cpu
@@ -33,17 +36,20 @@ docker exec cpu-test bash -c "
 
 # Run compressed-tensor test
 docker exec cpu-test bash -c "
+  set -e
   pytest -s -v \
   tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
   tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
 
 # Run AWQ test
 docker exec cpu-test bash -c "
+  set -e
   pytest -s -v \
   tests/quantization/test_ipex_quant.py"
 
 # online inference
 docker exec cpu-test bash -c "
+  set -e
   export VLLM_CPU_KVCACHE_SPACE=10 
   export VLLM_CPU_OMP_THREADS_BIND=48-92 
   python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & 

From b401cb91ca36d7bd5a52ca6fb0cd2bb6c1ff00e7 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 7 Nov 2024 17:20:09 +0000
Subject: [PATCH 16/20] Fix missing libraries

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .buildkite/run-cpu-test-ppc64le.sh | 5 +++--
 .buildkite/run-cpu-test.sh         | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh
index 57d67dec0eab..51f549c8da15 100755
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@@ -18,8 +18,9 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
 # Run basic model test
 docker exec cpu-test bash -c "
   set -e
-  pip install pytest pytest-asyncio matplotlib einops transformers_stream_generator datamodel_code_generator
-  pip install Pillow librosa
+  pip install pytest pytest-asyncio \
+    einops librosa peft Pillow sentence-transformers soundfile \
+    transformers_stream_generator matplotlib datamodel_code_generator
   pip install torchvision --index-url https://download.pytorch.org/whl/cpu
   pytest -v -s tests/models/embedding/language
   pytest -v -s tests/models/encoder_decoder/language
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 961165bd436c..3a355e01ff20 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -25,8 +25,9 @@ docker exec cpu-test-avx2 bash -c "
 # Run basic model test
 docker exec cpu-test bash -c "
   set -e
-  pip install pytest pytest-asyncio matplotlib einops transformers_stream_generator datamodel_code_generator
-  pip install Pillow librosa
+  pip install pytest pytest-asyncio \
+    einops librosa peft Pillow sentence-transformers soundfile \
+    transformers_stream_generator matplotlib datamodel_code_generator
   pip install torchvision --index-url https://download.pytorch.org/whl/cpu
   pytest -v -s tests/models/embedding/language
   pytest -v -s tests/models/encoder_decoder/language

From 431a5c8cd13827f36fc398bd70f06d3347bdf558 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Fri, 8 Nov 2024 04:46:27 +0000
Subject: [PATCH 17/20] Embedding models are not supported for CPU backend

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .buildkite/run-cpu-test-ppc64le.sh | 3 ++-
 .buildkite/run-cpu-test.sh         | 3 ++-
 vllm/worker/cpu_worker.py          | 6 +++++-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh
index 51f549c8da15..4abf4eeadf9e 100755
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@@ -22,7 +22,8 @@ docker exec cpu-test bash -c "
     einops librosa peft Pillow sentence-transformers soundfile \
     transformers_stream_generator matplotlib datamodel_code_generator
   pip install torchvision --index-url https://download.pytorch.org/whl/cpu
-  pytest -v -s tests/models/embedding/language
+  # Embedding models are not supported on CPU yet
+  # pytest -v -s tests/models/embedding/language
   pytest -v -s tests/models/encoder_decoder/language
   pytest -v -s tests/models/decoder_only/language/test_models.py
   pytest -v -s tests/models/decoder_only/audio_language -m core_model
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 3a355e01ff20..391205df7549 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -29,7 +29,8 @@ docker exec cpu-test bash -c "
     einops librosa peft Pillow sentence-transformers soundfile \
     transformers_stream_generator matplotlib datamodel_code_generator
   pip install torchvision --index-url https://download.pytorch.org/whl/cpu
-  pytest -v -s tests/models/embedding/language
+  # Embedding models are not supported on CPU yet
+  # pytest -v -s tests/models/embedding/language
   pytest -v -s tests/models/encoder_decoder/language
   pytest -v -s tests/models/decoder_only/language/test_models.py
   pytest -v -s tests/models/decoder_only/audio_language -m core_model
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 2914f520d823..162e1e4be873 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -151,7 +151,11 @@ def __init__(
             self.local_omp_cpuid = omp_cpuids.split("|")[rank]
 
         ModelRunnerClass: Type[CPUModelRunner] = CPUModelRunner
-        if self.model_config.is_encoder_decoder:
+        if self.model_config.task == "embedding":
+            raise NotImplementedError(
+                "Embedding models are not supported for CPU backend")
+            # TODO: Use CPUEmbeddingModelRunner once supported on CPU
+        elif self.model_config.is_encoder_decoder:
             ModelRunnerClass = CPUEncoderDecoderModelRunner
         self.model_runner: CPUModelRunner = ModelRunnerClass(
             vllm_config=vllm_config,

From 8c817e43a2f2fb4705efbf052b34d2af8c96ca30 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Fri, 8 Nov 2024 06:14:45 +0000
Subject: [PATCH 18/20] Chunked prefill not supported for CPU

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .buildkite/run-cpu-test-ppc64le.sh | 5 +++--
 .buildkite/run-cpu-test.sh         | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh
index 50d446aaa174..cc5c4f480dce 100755
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@@ -24,11 +24,12 @@ docker exec cpu-test bash -c "
     einops librosa peft Pillow sentence-transformers soundfile \
     transformers_stream_generator matplotlib datamodel_code_generator
   pip install torchvision --index-url https://download.pytorch.org/whl/cpu
-  # Embedding models are not supported on CPU yet
+  # Embedding models are not supported for CPU yet
   # pytest -v -s tests/models/embedding/language
   pytest -v -s tests/models/encoder_decoder/language
   pytest -v -s tests/models/decoder_only/language/test_models.py
-  pytest -v -s tests/models/decoder_only/audio_language -m core_model
+  # Chunked prefill not supported for CPU yet
+  # pytest -v -s tests/models/decoder_only/audio_language -m core_model
   pytest -v -s tests/models/decoder_only/vision_language -m core_model"
 
 # online inference
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 16a5a25c8b60..40859492086e 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -31,11 +31,12 @@ docker exec cpu-test bash -c "
     einops librosa peft Pillow sentence-transformers soundfile \
     transformers_stream_generator matplotlib datamodel_code_generator
   pip install torchvision --index-url https://download.pytorch.org/whl/cpu
-  # Embedding models are not supported on CPU yet
+  # Embedding models are not supported for CPU yet
   # pytest -v -s tests/models/embedding/language
   pytest -v -s tests/models/encoder_decoder/language
   pytest -v -s tests/models/decoder_only/language/test_models.py
-  pytest -v -s tests/models/decoder_only/audio_language -m core_model
+  # Chunked prefill not supported for CPU yet
+  # pytest -v -s tests/models/decoder_only/audio_language -m core_model
   pytest -v -s tests/models/decoder_only/vision_language -m core_model"
 
 # Run compressed-tensor test

From 4c39939434fa4383b7f0bbbdff61c57ec2ec4126 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Fri, 8 Nov 2024 08:35:37 +0000
Subject: [PATCH 19/20] Fix installation

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .buildkite/run-cpu-test-ppc64le.sh     | 2 +-
 .buildkite/run-cpu-test.sh             | 2 +-
 vllm/model_executor/models/ultravox.py | 4 ++--
 vllm/multimodal/utils.py               | 8 ++++----
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh
index cc5c4f480dce..8d472fa358cd 100755
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@@ -21,7 +21,7 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
 docker exec cpu-test bash -c "
   set -e
   pip install pytest pytest-asyncio \
-    einops librosa peft Pillow sentence-transformers soundfile \
+    decord einops librosa peft Pillow sentence-transformers soundfile \
     transformers_stream_generator matplotlib datamodel_code_generator
   pip install torchvision --index-url https://download.pytorch.org/whl/cpu
   # Embedding models are not supported for CPU yet
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 40859492086e..39053014f490 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -28,7 +28,7 @@ docker exec cpu-test-avx2 bash -c "
 docker exec cpu-test bash -c "
   set -e
   pip install pytest pytest-asyncio \
-    einops librosa peft Pillow sentence-transformers soundfile \
+    decord einops librosa peft Pillow sentence-transformers soundfile \
     transformers_stream_generator matplotlib datamodel_code_generator
   pip install torchvision --index-url https://download.pytorch.org/whl/cpu
   # Embedding models are not supported for CPU yet
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 3a343986a934..411584b1a6c3 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -134,9 +134,9 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
         if sr != feature_extractor.sampling_rate:
             try:
                 import librosa
-            except ImportError:
+            except ImportError as exc:
                 raise ImportError(
-                    "Please install vllm[audio] for audio support.") from None
+                    "Please install vllm[audio] for audio support.") from exc
             audio = librosa.resample(audio,
                                      orig_sr=sr,
                                      target_sr=feature_extractor.sampling_rate)
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 0c666b8cc2e6..bee3c25dbd8d 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -206,9 +206,9 @@ def try_import_audio_packages() -> Tuple[Any, Any]:
     try:
         import librosa
         import soundfile
-    except ImportError:
+    except ImportError as exc:
         raise ImportError(
-            "Please install vllm[audio] for audio support.") from None
+            "Please install vllm[audio] for audio support.") from exc
     return librosa, soundfile
 
 
@@ -344,9 +344,9 @@ def try_import_video_packages() -> Any:
     try:
         import cv2
         import decord
-    except ImportError:
+    except ImportError as exc:
         raise ImportError(
-            "Please install vllm[video] for video support.") from None
+            "Please install vllm[video] for video support.") from exc
     return cv2, decord
 
 

From 9ef98fa8129c7c5c7dfde0abf45340aefadd44fd Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Fri, 8 Nov 2024 13:40:47 +0000
Subject: [PATCH 20/20] Add `cpu_model` mark

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .buildkite/run-cpu-test-ppc64le.sh                |  4 ++--
 .buildkite/run-cpu-test.sh                        |  4 ++--
 pyproject.toml                                    |  3 ++-
 .../decoder_only/audio_language/test_ultravox.py  | 15 ++++++++++++---
 .../decoder_only/vision_language/test_models.py   |  4 ++--
 5 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh
index 8d472fa358cd..5add7ff0c15c 100755
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@@ -29,8 +29,8 @@ docker exec cpu-test bash -c "
   pytest -v -s tests/models/encoder_decoder/language
   pytest -v -s tests/models/decoder_only/language/test_models.py
   # Chunked prefill not supported for CPU yet
-  # pytest -v -s tests/models/decoder_only/audio_language -m core_model
-  pytest -v -s tests/models/decoder_only/vision_language -m core_model"
+  # pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+  pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
 
 # online inference
 docker exec cpu-test bash -c "
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 39053014f490..25a448e63be2 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -36,8 +36,8 @@ docker exec cpu-test bash -c "
   pytest -v -s tests/models/encoder_decoder/language
   pytest -v -s tests/models/decoder_only/language/test_models.py
   # Chunked prefill not supported for CPU yet
-  # pytest -v -s tests/models/decoder_only/audio_language -m core_model
-  pytest -v -s tests/models/decoder_only/vision_language -m core_model"
+  # pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+  pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
 
 # Run compressed-tensor test
 docker exec cpu-test bash -c "
diff --git a/pyproject.toml b/pyproject.toml
index bae8645502de..1385a15d0787 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -93,7 +93,8 @@ skip_gitignore = true
 [tool.pytest.ini_options]
 markers = [
     "skip_global_cleanup",
-    "core_model: run this model test in each PR instead of just daily",
+    "core_model: enable this model test in each PR instead of only nightly",
+    "cpu_model: enable this model test in CPU tests",
     "distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
     "skip_v1: do not run this test with v1",
 ]
diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py
index 3848a83c62bd..e100c6b9bb90 100644
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -39,7 +39,10 @@ def audio(request):
     return AudioAsset(request.param)
 
 
-@pytest.fixture(params=({}, CHUNKED_PREFILL_KWARGS))
+@pytest.fixture(params=[
+    pytest.param({}, marks=pytest.mark.cpu_model),
+    pytest.param(CHUNKED_PREFILL_KWARGS),
+])
 def server(request, audio_assets):
     args = [
         "--dtype=bfloat16", "--max-model-len=4096", "--enforce-eager",
@@ -185,7 +188,10 @@ def run_multi_audio_test(
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
+@pytest.mark.parametrize("vllm_kwargs", [
+    pytest.param({}, marks=pytest.mark.cpu_model),
+    pytest.param(CHUNKED_PREFILL_KWARGS),
+])
 def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
                 num_logprobs: int, vllm_kwargs: dict) -> None:
 
@@ -207,7 +213,10 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
+@pytest.mark.parametrize("vllm_kwargs", [
+    pytest.param({}, marks=pytest.mark.cpu_model),
+    pytest.param(CHUNKED_PREFILL_KWARGS),
+])
 def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
                                      max_tokens: int, num_logprobs: int,
                                      vllm_kwargs: dict) -> None:
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 2df0b0e19748..163752e9fe06 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -94,7 +94,7 @@
             ),
             limit_mm_per_prompt={"image": 4},
         )],
-        marks=[pytest.mark.core_model],
+        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
     ),
     "paligemma": VLMTestInfo(
         models=["google/paligemma-3b-mix-224"],
@@ -129,7 +129,7 @@
         max_num_seqs=2,
         auto_cls=AutoModelForVision2Seq,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
-        marks=[pytest.mark.core_model],
+        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
     ),
     #### Extended model tests