diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 880d1bd1dc4e..84a5260ad9a0 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -254,13 +254,21 @@
     "glm4v": VLMTestInfo(
         models=["THUDM/glm-4v-9b"],
         test_type=VLMTestType.IMAGE,
-        prompt_formatter=identity,
-        img_idx_to_prompt=lambda idx: "",
+        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",  # noqa: E501
+        single_image_prompts=IMAGE_ASSETS.prompts({
+            "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?",  # noqa: E501
+            "cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?",  # noqa: E501
+        }),
         max_model_len=2048,
         max_num_seqs=2,
         dtype="bfloat16",
         get_stop_token_ids=lambda tok: [151329, 151336, 151338],
-        patch_hf_runner=model_utils.glm_patch_hf_runner,
+        patch_hf_runner=model_utils.glm4v_patch_hf_runner,
+        # The image embeddings match with HF but the outputs of the language
+        # decoder are only consistent up to 2 decimal places.
+        # So, we need to reduce the number of tokens for the test to pass.
+        max_tokens=8,
+        num_logprobs=10,
         marks=[large_gpu_mark(min_gb=32)],
     ),
     "h2ovl": VLMTestInfo(
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/core.py b/tests/models/decoder_only/vision_language/vlm_utils/core.py
index aaad584c9cd5..31f0209b102d 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/core.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py
@@ -61,7 +61,9 @@ def run_test(
     # if we run HF first, the cuda initialization will be done and it
     # will hurt multiprocessing backend with fork method (the default method).
-    vllm_runner_kwargs_: dict[str, Any] = {}
+    vllm_runner_kwargs_: dict[str, Any] = {
+        "disable_mm_preprocessor_cache": True,
+    }
     if model_info.tokenizer:
         vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer
     if model_info.tokenizer_mode:
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
index 5e1fcfd8f082..3b4d1237c37a 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -316,8 +316,8 @@ def processor(*args, **kwargs):
     return hf_model
 
 
-def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
-    """Patches and returns an instance of the HfRunner to use for GLM4."""
+def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patches and returns an instance of the HfRunner to use for GLM4V."""
     hf_processor = hf_model.processor
     patch_padding_side(hf_processor)
 
@@ -325,12 +325,20 @@ def processor(*args, text="", images=None, **kwargs):
         if images is None:
             return hf_processor(*args, **kwargs)
 
+        images = [images] if isinstance(images, Image) else images
+
+        contents = re.findall(
+            r"<\|begin_of_image\|><\|endoftext\|><\|end_of_image\|>(.*?)<\|assistant\|>",
+            text,
+        )
+        assert len(contents) == len(images)
+
         return hf_processor.apply_chat_template(
             [{
                 "role": "user",
-                "image": images,
-                "content": text
-            }],
+                "image": image,
+                "content": content
+            } for image, content in zip(images, contents)],
             add_generation_prompt=True,
             tokenize=True,
             return_dict=True,
diff --git a/vllm/config.py b/vllm/config.py
index 3ac7ceabd8d3..35411ca73ad2 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -286,14 +286,18 @@ def __init__(
         if rope_scaling is not None:
             hf_override: dict[str, Any] = {"rope_scaling": rope_scaling}
             hf_overrides_kw.update(hf_override)
-            msg = ("`--rope-scaling` will be removed in a future release. "
-                   f"'Please instead use `--hf-overrides '{hf_override!r}'`")
+            hf_overrides_str = json.dumps(hf_overrides)
+            msg = (
+                "`--rope-scaling` will be removed in a future release. "
+                f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
             warnings.warn(DeprecationWarning(msg), stacklevel=2)
         if rope_theta is not None:
             hf_override = {"rope_theta": rope_theta}
             hf_overrides_kw.update(hf_override)
-            msg = ("`--rope-theta` will be removed in a future release. "
-                   f"'Please instead use `--hf-overrides '{hf_override!r}'`")
+            hf_overrides_str = json.dumps(hf_overrides)
+            msg = (
+                "`--rope-theta` will be removed in a future release. "
+                f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
             warnings.warn(DeprecationWarning(msg), stacklevel=2)
 
         self.maybe_pull_model_tokenizer_for_s3(model, tokenizer)
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 61f21482f707..4ce4fa897cc9 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -403,7 +403,9 @@ def _placeholder_str(self, modality: ModalityStr,
         hf_config = self._model_config.hf_config
         model_type = hf_config.model_type
 
-        if modality in ["image", "image_embeds"]:
+        if modality in ("image", "image_embeds"):
+            if model_type == "chatglm":
+                return "<|begin_of_image|><|endoftext|><|end_of_image|>"
             if model_type == "phi3_v":
                 # Workaround since this token is not defined in the tokenizer
                 return f"<|image_{current_count}|>"
@@ -411,8 +413,7 @@
                 return "<|endoftext10|>"  # 200010 (see vocab.json in hf model)
             if model_type in ("minicpmo", "minicpmv"):
                 return "(./)"
-            if model_type in ("blip-2", "chatglm", "fuyu", "paligemma",
-                              "pixtral"):
+            if model_type in ("blip-2", "fuyu", "paligemma", "pixtral"):
                 # These models do not use image tokens in the prompt
                 return None
             if model_type == "qwen":
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index 6eca25212ee6..14dca23b3934 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -2,6 +2,7 @@
 # Adapted from
 # https://github.com/THUDM/ChatGLM2-6B
 """Inference-only ChatGLM model compatible with THUDM weights."""
+import json
 from typing import Iterable, Optional, Set, Tuple, Union
 
 import torch
@@ -463,7 +464,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 "The configuration of this model indicates that it supports "
                 "vision inputs, but you instantiated the text-only version "
                 "of this model. Please use the vision model by setting "
-                f"`--hf-overrides {hf_overrides!r}`")
+                f"`--hf-overrides '{json.dumps(hf_overrides)}'`")
         super().__init__(vllm_config=vllm_config, prefix=prefix)
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 96abfb9d1096..a33739a8eef9 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -5,7 +5,7 @@
 # Copyright (c) Alibaba Cloud.
 # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
 """Inference-only QWen model compatible with HuggingFace weights."""
-
+import json
 from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union
 
 import torch
@@ -354,7 +354,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 "The configuration of this model indicates that it supports "
                 "vision inputs, but you instantiated the text-only version "
                 "of this model. Please use the vision model by setting "
-                f"`--hf-overrides {hf_overrides!r}`")
+                f"`--hf-overrides '{json.dumps(hf_overrides)}'`")
        super().__init__(vllm_config=vllm_config, prefix=prefix)