From 6d078c329c64704c00898437c05d077ff02e2992 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 23 Jan 2025 10:06:40 +0000 Subject: [PATCH 01/16] Init Signed-off-by: Jee Jee Li --- vllm/model_executor/models/chatglm.py | 151 +++++++++++++++++++++++++- 1 file changed, 145 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index d5f9b4d19e5c..def2501352c4 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -3,7 +3,7 @@ """Inference-only CogAgent model compatible with THUDM weights.""" from argparse import Namespace from array import array -from typing import (Dict, Iterable, List, Mapping, Optional, Set, Tuple, +from typing import (Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, TypedDict) import torch @@ -35,6 +35,13 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (ModalityData, MultiModalKwargs, NestedTensors) +from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, + ImageSize, MultiModalDataItems) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, BatchFeature, + MultiModalFieldConfig, ProcessorMixin, + PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, SequenceData) @@ -223,6 +230,133 @@ def input_processor_for_glmv(ctx: InputContext, inputs: DecoderOnlyInputs): ) +class GLM4VProcessingInfo(BaseProcessingInfo): + pass + + def __init__(self, ctx): + super().__init__(ctx) + self._pre_calculate() + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + + return {"image": self.image_tokens} + + def _pre_calculate(self): + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + self.image_tokens = (vision_config["image_size"] // + vision_config["patch_size"] // 2)**2 + self.image_szie = vision_config["image_size"] + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + processor: Optional[ProcessorMixin], + ) -> int: + return self.image_tokens + + def get_image_size(self) -> ImageSize: + + return ImageSize(height=self.image_szie, width=self.image_szie) + + +class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + assert num_images == 1 + + target_width, target_height = self.info.get_image_size() + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + hf_config = self.info.get_hf_config() + image_placeholder_length=self.info.image_tokens + # image + token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [hf_config.boi_token_id] + + [0] * image_placeholder_length + + [hf_config.eoi_token_id]) + token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, + [0] * (seq_len - image_placeholder_length - 2)) + + return ProcessorInputs( + prompt_text="".join(token_ids[:num_images]), + mm_data=mm_data, + ) + + +class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]): + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, 
MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_sizes=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, Any], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + image_tokens: list[str] = hf_processor.img_tokens # type: ignore + + tokenizer = self.info.get_tokenizer() + bos_token_id = tokenizer.bos_token_id + assert isinstance(bos_token_id, int) + + def get_replacement_phi3v(item_idx: int): + images = mm_items.get_items( + "image", (ImageEmbeddingItems, ImageProcessorItems)) + + if isinstance(images, ImageEmbeddingItems): + num_image_tokens = images.get_feature_size(item_idx) + else: + image_size = images.get_image_size(item_idx) + num_image_tokens = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + processor=hf_processor, + ) + + image_tokens = [_IMAGE_TOKEN_ID] * num_image_tokens + + return PromptReplacementDetails( + full=image_tokens + [bos_token_id], + features=image_tokens, + ) + + num_images = mm_items.get_count("image", strict=False) + + return [ + PromptReplacement( + modality="image", + target=image_token, + replacement=get_replacement_phi3v, + ) for image_token in image_tokens[:num_images] + ] + + class GLMAttention(nn.Module): def __init__( @@ -756,10 +890,15 @@ def get_mm_mapping(self) -> MultiModelKeys: tower_model="transformer.vision.transformer") -@MULTIMODAL_REGISTRY.register_image_input_mapper(mm_input_mapper_for_glmv) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_glmv_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_glmv) -@INPUT_REGISTRY.register_input_processor(input_processor_for_glmv) +# @MULTIMODAL_REGISTRY.register_image_input_mapper(mm_input_mapper_for_glmv) +# @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_glmv_image_tokens) +# @INPUT_REGISTRY.register_dummy_data(dummy_data_for_glmv) +# @INPUT_REGISTRY.register_input_processor(input_processor_for_glmv) + + +@MULTIMODAL_REGISTRY.register_processor(GLM4VMultiModalProcessor, + info=GLM4VProcessingInfo, + dummy_inputs=GLM4VDummyInputsBuilder) class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, SupportsMultiModal): # Ensure that the LoRA support check passes when the class is not @@ -780,4 +919,4 @@ def __new__( return ChatGLMV(vllm_config=vllm_config, prefix=prefix) # Initialize LLM else: - return ChatGLM(vllm_config=vllm_config, prefix=prefix) \ No newline at end of file + return ChatGLM(vllm_config=vllm_config, prefix=prefix) From d4c635f4e7bb3e0723997884cb2b2519822f7b0d Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 23 Jan 2025 16:43:17 +0000 Subject: [PATCH 02/16] Backup Signed-off-by: Jee Jee Li --- vllm/model_executor/models/chatglm.py | 76 +++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index def2501352c4..946f5cfe94c4 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -238,16 +238,16 @@ def __init__(self, ctx): self._pre_calculate() def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None} + return {"image": 1} def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - return {"image": 
self.image_tokens} + return {"image": self.image_token_num} def _pre_calculate(self): hf_config = self.get_hf_config() vision_config = hf_config.vision_config - self.image_tokens = (vision_config["image_size"] // + self.image_token_num = (vision_config["image_size"] // vision_config["patch_size"] // 2)**2 self.image_szie = vision_config["image_size"] @@ -285,7 +285,7 @@ def get_dummy_processor_inputs( } hf_config = self.info.get_hf_config() - image_placeholder_length=self.info.image_tokens + image_placeholder_length=self.info.image_token_num # image token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [hf_config.boi_token_id] + [0] * image_placeholder_length + @@ -294,7 +294,7 @@ def get_dummy_processor_inputs( [0] * (seq_len - image_placeholder_length - 2)) return ProcessorInputs( - prompt_text="".join(token_ids[:num_images]), + prompt_text=VLLM_TOKEN_ID_ARRAY_TYPE, mm_data=mm_data, ) @@ -312,6 +312,72 @@ def _get_mm_fields_config( image_embeds=MultiModalFieldConfig.batched("image"), ) + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + + hf_config = self.info.get_hf_config() + vision_config = getattr(hf_config, "vision_config", None) + + if vision_config is None: + return prompt + + + try: + + tokenizer = self.info.get_tokenizer() + raw_batch_data = tokenizer.apply_chat_template( + conversation=[ + { + "role": "user", + "image": mm_data.get("image",None), + "content": prompt, + } + ], + add_generation_prompt=True, + tokenize=True, + return_tensors="pt", + return_dict=True, + ).data + except Exception: + logger.error("Failed to process content (%s)", prompt) + raise + input_ids = raw_batch_data["input_ids"][0].tolist() + + boi_token_id = hf_config.boi_token_id + eoi_token_id = hf_config.eoi_token_id + boi_positions = find_all_positions(input_ids, boi_token_id) + eoi_positions = find_all_positions(input_ids, eoi_token_id) + + assert len(boi_positions) == len(eoi_positions) + + new_input_ids = [] + final_processed_position = 0 + + for boi_position, eoi_position in zip(boi_positions, eoi_positions): + assert boi_position < eoi_position + new_input_ids.extend( + input_ids[final_processed_position : boi_position + 1] + ) + new_input_ids.extend( + [input_ids[boi_position + 1]] * self.info.image_token_num + ) + final_processed_position = eoi_position + + new_input_ids.extend(input_ids[final_processed_position:]) + + if prompt is None: + prompt = tokenizer.decode(new_input_ids) + + return token_inputs( + prompt_token_ids=new_input_ids, + prompt=prompt, + multi_modal_data=mm_data, + ) + def _get_prompt_replacements( self, mm_items: MultiModalDataItems, From dc12b461a277055586fb55420c2c34a033f8e573 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 24 Jan 2025 03:22:44 +0000 Subject: [PATCH 03/16] Backup Signed-off-by: Jee Jee Li --- vllm/model_executor/models/chatglm.py | 34 ++++++++++++++++----------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 946f5cfe94c4..c9c88faf79c3 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -266,7 +266,6 @@ def get_image_size(self) -> ImageSize: class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]): - def get_dummy_processor_inputs( self, seq_len: int, @@ -278,23 +277,30 @@ def get_dummy_processor_inputs( target_width, target_height = self.info.get_image_size() mm_data = { - "image": - 
self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) + "image": self._get_dummy_images( + width=target_width, height=target_height, num_images=num_images + ) } - + hf_config = self.info.get_hf_config() - image_placeholder_length=self.info.image_token_num - # image - token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [hf_config.boi_token_id] + - [0] * image_placeholder_length + - [hf_config.eoi_token_id]) - token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, - [0] * (seq_len - image_placeholder_length - 2)) + image_placeholder_length = self.info.image_token_num + + token_ids = array( + VLLM_TOKEN_ID_ARRAY_TYPE, + [hf_config.boi_token_id] + + [0] * image_placeholder_length + + [hf_config.eoi_token_id], + ) + token_ids += array( + VLLM_TOKEN_ID_ARRAY_TYPE, + [0] * (seq_len - image_placeholder_length - 2), + ) + + tokenizer = self.info.get_tokenizer() + text = tokenizer.decode(token_ids) return ProcessorInputs( - prompt_text=VLLM_TOKEN_ID_ARRAY_TYPE, + prompt_text=text, mm_data=mm_data, ) From 962c61d705a437331d9ad482cc79a4e9e1e5770c Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 26 Jan 2025 15:40:32 +0000 Subject: [PATCH 04/16] Backup Signed-off-by: Jee Jee Li --- vllm/model_executor/models/chatglm.py | 165 +++++++++++--------------- 1 file changed, 70 insertions(+), 95 deletions(-) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index c9c88faf79c3..b38028f8ee05 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -4,7 +4,7 @@ from argparse import Namespace from array import array from typing import (Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, - TypedDict) + TypedDict,Union) import torch from PIL import Image @@ -258,7 +258,7 @@ def get_num_image_tokens( image_height: int, processor: Optional[ProcessorMixin], ) -> int: - return self.image_tokens + return self.image_token_num def get_image_size(self) -> ImageSize: @@ -272,8 +272,6 @@ def get_dummy_processor_inputs( mm_counts: Mapping[str, int], ) -> ProcessorInputs: num_images = mm_counts.get("image", 0) - assert num_images == 1 - target_width, target_height = self.info.get_image_size() mm_data = { @@ -281,24 +279,7 @@ def get_dummy_processor_inputs( width=target_width, height=target_height, num_images=num_images ) } - - hf_config = self.info.get_hf_config() - image_placeholder_length = self.info.image_token_num - - token_ids = array( - VLLM_TOKEN_ID_ARRAY_TYPE, - [hf_config.boi_token_id] - + [0] * image_placeholder_length - + [hf_config.eoi_token_id], - ) - token_ids += array( - VLLM_TOKEN_ID_ARRAY_TYPE, - [0] * (seq_len - image_placeholder_length - 2), - ) - - tokenizer = self.info.get_tokenizer() - text = tokenizer.decode(token_ids) - + text="<|endoftext|>" return ProcessorInputs( prompt_text=text, mm_data=mm_data, @@ -313,33 +294,69 @@ def _get_mm_fields_config( hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - image_sizes=MultiModalFieldConfig.batched("image"), - image_embeds=MultiModalFieldConfig.batched("image"), + images=MultiModalFieldConfig.batched("image"), + + ) + + + + def _apply_hf_processor_main( + self, + prompt: Union[str, list[int]], + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + *, + enable_hf_prompt_replacement: bool, + ) -> tuple[list[int], MultiModalKwargs]: + """ + Apply the HF processor on the prompt text and multi-modal data. 
+ + Note: + If :code:`enable_hf_prompt_replacement=False`, the prompt should + correspond to the multi-modal items. + """ + if isinstance(prompt, str): + if enable_hf_prompt_replacement: + return self._apply_hf_processor_text_mm( + prompt_text=prompt, + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) + + prompt_ids = self._apply_hf_processor_text_only(prompt) + else: + prompt_ids = self._apply_hf_processor_tokens_only(prompt) + + mm_missing_kwargs = self._apply_hf_processor_mm_only( + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, ) + return prompt_ids, mm_missing_kwargs + + def _call_hf_processor( self, prompt: str, mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], ) -> BatchFeature: - - hf_config = self.info.get_hf_config() - vision_config = getattr(hf_config, "vision_config", None) - - if vision_config is None: - return prompt + if not mm_data: + tokenizer = self.info.get_tokenizer() + prefix = "<|begin_of_image|><|endoftext|><|end_of_image|>" + prompt_ids = tokenizer.encode(prompt + prefix) + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") try: tokenizer = self.info.get_tokenizer() + img=mm_data.get("images",None)[0] raw_batch_data = tokenizer.apply_chat_template( conversation=[ { "role": "user", - "image": mm_data.get("image",None), + "image":img, "content": prompt, } ], @@ -350,83 +367,41 @@ def _call_hf_processor( ).data except Exception: logger.error("Failed to process content (%s)", prompt) - raise - input_ids = raw_batch_data["input_ids"][0].tolist() - - boi_token_id = hf_config.boi_token_id - eoi_token_id = hf_config.eoi_token_id - boi_positions = find_all_positions(input_ids, boi_token_id) - eoi_positions = find_all_positions(input_ids, eoi_token_id) - assert len(boi_positions) == len(eoi_positions) - - new_input_ids = [] - final_processed_position = 0 - - for boi_position, eoi_position in zip(boi_positions, eoi_positions): - assert boi_position < eoi_position - new_input_ids.extend( - input_ids[final_processed_position : boi_position + 1] - ) - new_input_ids.extend( - [input_ids[boi_position + 1]] * self.info.image_token_num + return BatchFeature( + dict( + input_ids=raw_batch_data["input_ids"], + images=[raw_batch_data["images"][0]] if mm_data else None, ) - final_processed_position = eoi_position - - new_input_ids.extend(input_ids[final_processed_position:]) - - if prompt is None: - prompt = tokenizer.decode(new_input_ids) - - return token_inputs( - prompt_token_ids=new_input_ids, - prompt=prompt, - multi_modal_data=mm_data, ) + # return token_inputs( + # prompt_token_ids=new_input_ids, + # prompt=prompt, + # multi_modal_data=mm_data, + # ) + def _get_prompt_replacements( self, mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, Any], + hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - image_tokens: list[str] = hf_processor.img_tokens # type: ignore - - tokenizer = self.info.get_tokenizer() - bos_token_id = tokenizer.bos_token_id - assert isinstance(bos_token_id, int) - - def get_replacement_phi3v(item_idx: int): - images = mm_items.get_items( - "image", (ImageEmbeddingItems, ImageProcessorItems)) - - if isinstance(images, ImageEmbeddingItems): - num_image_tokens = images.get_feature_size(item_idx) - else: - image_size = images.get_image_size(item_idx) - num_image_tokens = self.info.get_num_image_tokens( - image_width=image_size.width, - 
image_height=image_size.height, - processor=hf_processor, - ) - - image_tokens = [_IMAGE_TOKEN_ID] * num_image_tokens - - return PromptReplacementDetails( - full=image_tokens + [bos_token_id], - features=image_tokens, - ) - - num_images = mm_items.get_count("image", strict=False) + tokenizer=self.info.get_tokenizer() + image_token_str = "<|endoftext|>" + image_token_id = tokenizer.convert_tokens_to_ids(image_token_str) + def get_replacement(item_idx: int): + num_image_tokens=self.info.get_num_image_tokens(image_height=1120,image_width=1120,processor=None) + return [image_token_id] * num_image_tokens return [ PromptReplacement( modality="image", - target=image_token, - replacement=get_replacement_phi3v, - ) for image_token in image_tokens[:num_images] + target=[image_token_id], + replacement=get_replacement, + ), ] + class GLMAttention(nn.Module): From 1388412d39b637c1a3984ca0987d7255bef8ff86 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 6 Feb 2025 16:51:23 +0000 Subject: [PATCH 05/16] Support V1 Signed-off-by: Jee Jee Li --- vllm/model_executor/models/chatglm.py | 262 ++++++-------------------- 1 file changed, 62 insertions(+), 200 deletions(-) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 09ba1363db9f..31cd0bbd71e5 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -4,20 +4,16 @@ # https://github.com/THUDM/CogAgent """Inference-only CogAgent model compatible with THUDM weights.""" from argparse import Namespace -from array import array -from typing import (Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from typing import (Iterable, List, Mapping, Optional, Sequence, Set, Tuple, + TypedDict) import torch -from PIL import Image from torch import nn from torch.nn import LayerNorm from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm @@ -35,25 +31,24 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (ModalityData, MultiModalKwargs, - NestedTensors) -from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, - ImageSize, MultiModalDataItems) +from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors +from vllm.multimodal.parse import ImageSize, MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, BatchFeature, - MultiModalFieldConfig, ProcessorMixin, - PromptReplacement) + BoundPromptReplacement, + MultiModalFieldConfig, + PlaceholderFeaturesInfo, + ProcessorMixin, PromptReplacement) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs -from vllm.multimodal.utils import cached_get_tokenizer -from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, - SequenceData) +from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import ChatGLMConfig from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, 
WeightsMapper, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix) + maybe_prefix, merge_multimodal_embeddings) +IMAGE_TOKEN_ID = 151329 logger = init_logger(__name__) @@ -61,179 +56,12 @@ def calculate_image_placeholder(vision_config): return (vision_config["image_size"] // vision_config["patch_size"] // 2)**2 -def mm_input_mapper_for_glmv( - ctx: InputContext, - data: ModalityData[object], -) -> Dict: - model_config = ctx.model_config - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - if tokenizer is None: - raise RuntimeError("No HuggingFace processor is available " - "to process the image object") - try: - raw_batch_data = tokenizer.apply_chat_template( - conversation=[{ - "role": "user", - "image": data - }], - add_generation_prompt=True, - tokenize=True, - return_tensors="pt", - return_dict=True).data - except Exception: - logger.error("Failed to process image (%s)", data) - raise - pixel_values = raw_batch_data['images'] - - return MultiModalKwargs({'pixel_values': pixel_values}) - - -def merge_glm_vision_embeddings( - input_ids: torch.Tensor, - inputs_embeds: torch.Tensor, - vision_embeddings: torch.Tensor, - boi_token_id: int, - eoi_token_id: int, -) -> torch.Tensor: - - boi_positions = (input_ids == boi_token_id).nonzero(as_tuple=True)[0] - eoi_positions = (input_ids == eoi_token_id).nonzero(as_tuple=True)[0] - - mask = torch.zeros_like(input_ids, dtype=torch.bool) - - for boi_pos, eoi_pos in zip(boi_positions, eoi_positions): - assert boi_pos < eoi_pos - mask[boi_pos:eoi_pos + 1] = True - inputs_embeds[mask] = vision_embeddings.view(-1, - vision_embeddings.shape[-1]) - return inputs_embeds - - class GLMImagePixelInputs(TypedDict): pixel_values: torch.Tensor """Shape: `(batch_size, num_channels, height, width)`""" -def get_max_glmv_image_tokens(ctx: InputContext): - hf_config = ctx.get_hf_config(ChatGLMConfig) - - vision_config = getattr(hf_config, 'vision_config', None) - if vision_config is None: - return 1 - elif isinstance(vision_config, dict): - return calculate_image_placeholder(vision_config) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - -def dummy_data_for_glmv(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]) -> DummyData: - hf_config = ctx.get_hf_config(ChatGLMConfig) - vision_config = getattr(hf_config, 'vision_config', None) - - if vision_config is None: - token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * seq_len) - seq_data = SequenceData(token_ids) - return DummyData(seq_data, None) - elif isinstance(vision_config, dict): - image_size = vision_config["image_size"] - image_placeholder_length = calculate_image_placeholder(vision_config) - token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [hf_config.boi_token_id] + - [0] * image_placeholder_length + - [hf_config.eoi_token_id]) - token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, - [0] * (seq_len - image_placeholder_length - 2)) - seq_data = SequenceData(token_ids) - - mm_data = { - "image": Image.new("RGB", (image_size, image_size), color=0) - } - - return DummyData(seq_data, mm_data) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - -def find_all_positions(input_ids: List[int], target: int) -> List[int]: - return [index for index, value in enumerate(input_ids) if value == target] - - -def input_processor_for_glmv(ctx: InputContext, inputs: DecoderOnlyInputs): - multi_modal_data = 
inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - hf_config = ctx.get_hf_config(ChatGLMConfig) - vision_config = getattr(hf_config, 'vision_config', None) - - if vision_config is None: - return inputs - elif isinstance(vision_config, dict): - image_placeholder_length = calculate_image_placeholder(vision_config) - else: - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - input_ids = inputs["prompt_token_ids"] - - tokenizer = cached_get_tokenizer( - ctx.model_config.model, - trust_remote_code=ctx.model_config.trust_remote_code) - - try: - raw_batch_data = tokenizer.apply_chat_template( - conversation=[{ - "role": "user", - "image": multi_modal_data["image"], - "content": inputs['prompt'], - }], - add_generation_prompt=True, - tokenize=True, - return_tensors="pt", - return_dict=True, - ).data - except Exception: - logger.error("Failed to process content (%s)", inputs['prompt']) - raise - input_ids = raw_batch_data['input_ids'][0].tolist() - - boi_token_id = hf_config.boi_token_id - eoi_token_id = hf_config.eoi_token_id - boi_positions = find_all_positions(input_ids, boi_token_id) - eoi_positions = find_all_positions(input_ids, eoi_token_id) - - assert len(boi_positions) == len(eoi_positions) - - new_input_ids = [] - final_processed_position = 0 - - for boi_position, eoi_position in zip(boi_positions, eoi_positions): - assert boi_position < eoi_position - new_input_ids.extend(input_ids[final_processed_position:boi_position + - 1]) - new_input_ids.extend([input_ids[boi_position + 1]] * - image_placeholder_length) - final_processed_position = eoi_position - - new_input_ids.extend(input_ids[final_processed_position:]) - - prompt = inputs.get("prompt") - if prompt is None: - prompt = tokenizer.decode(new_input_ids) - - return token_inputs( - prompt_token_ids=new_input_ids, - prompt=prompt, - multi_modal_data=multi_modal_data, - ) - - class GLM4VProcessingInfo(BaseProcessingInfo): - pass def __init__(self, ctx): super().__init__(ctx) @@ -248,7 +76,7 @@ def get_mm_max_tokens_per_item( mm_counts: Mapping[str, int], ) -> Mapping[str, int]: - return {"image": self.image_token_num} + return {"image": self.image_token_num + 2} def _pre_calculate(self): hf_config = self.get_hf_config() @@ -361,21 +189,48 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - image_token_id = [151329] def get_replacement(item_idx: int): num_image_tokens = self.info.get_num_image_tokens( image_height=1120, image_width=1120, processor=None) - return [image_token_id] * num_image_tokens + return [IMAGE_TOKEN_ID] * num_image_tokens return [ PromptReplacement( modality="image", - target=[image_token_id], + target=[IMAGE_TOKEN_ID], replacement=get_replacement, ), ] + def _apply_prompt_replacements( + self, + token_ids: list[int], + mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], + mm_item_counts: Mapping[str, int], + ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]: + token_ids, text, placeholders = super()._apply_prompt_replacements( + token_ids=token_ids, + mm_prompt_repls=mm_prompt_repls, + mm_item_counts=mm_item_counts, + ) + hf_config = self.info.get_hf_config() + boi_token_id = hf_config.boi_token_id + eoi_token_id = hf_config.eoi_token_id + placeholders = { + modality: [ + PlaceholderFeaturesInfo( + modality=p.modality, + item_idx=p.item_idx, + start_idx=p.start_idx - 1, + 
tokens=[boi_token_id] + p.tokens + [eoi_token_id], + ) for p in ps + ] + for modality, ps in placeholders.items() + } + + return token_ids, text, placeholders + class GLMAttention(nn.Module): @@ -724,12 +579,16 @@ def get_input_embeddings( ) -> torch.Tensor: inputs_embeds = self.embedding(input_ids) if multimodal_embeddings is not None: - inputs_embeds = merge_glm_vision_embeddings( + inputs_embeds = merge_multimodal_embeddings( input_ids=input_ids, inputs_embeds=inputs_embeds, - vision_embeddings=multimodal_embeddings, - boi_token_id=self.config.boi_token_id, - eoi_token_id=self.config.eoi_token_id) + multimodal_embeddings=multimodal_embeddings, + placeholder_token_id=[ + self.config.boi_token_id, + IMAGE_TOKEN_ID, + self.config.eoi_token_id, + ], + ) return inputs_embeds def forward( @@ -745,14 +604,12 @@ def forward( # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. - if intermediate_tensors is None and inputs_embeds is None: + if intermediate_tensors is not None: + inputs_embeds = intermediate_tensors["hidden_states"] + elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) inputs_embeds = self.get_input_embeddings(input_ids, vision_embeddings) - input_ids = None - else: - inputs_embeds = intermediate_tensors["hidden_states"] - # Run encoder. hidden_states = self.encoder( hidden_states=inputs_embeds, @@ -915,11 +772,16 @@ def get_mm_mapping(self) -> MultiModelKeys: connector="transformer.vision.linear_proj", tower_model="transformer.vision.transformer") + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + return self.transformer.get_multimodal_embeddings(**kwargs) -# @MULTIMODAL_REGISTRY.register_image_input_mapper(mm_input_mapper_for_glmv) -# @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_glmv_image_tokens) -# @INPUT_REGISTRY.register_dummy_data(dummy_data_for_glmv) -# @INPUT_REGISTRY.register_input_processor(input_processor_for_glmv) + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + return self.transformer.get_input_embeddings(input_ids, + multimodal_embeddings) @MULTIMODAL_REGISTRY.register_processor(GLM4VMultiModalProcessor, From 487b3ae3b1cb06e0521b839a5956d4aa5ec76c35 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 7 Feb 2025 05:35:26 +0000 Subject: [PATCH 06/16] Done Signed-off-by: Jee Jee Li --- vllm/model_executor/models/chatglm.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 31cd0bbd71e5..88e8e5ecac07 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -81,8 +81,7 @@ def get_mm_max_tokens_per_item( def _pre_calculate(self): hf_config = self.get_hf_config() vision_config = hf_config.vision_config - self.image_token_num = (vision_config["image_size"] // - vision_config["patch_size"] // 2)**2 + self.image_token_num = calculate_image_placeholder(vision_config) self.image_szie = vision_config["image_size"] def get_num_image_tokens( @@ -158,7 +157,6 @@ def _call_hf_processor( mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], ) -> BatchFeature: - try: tokenizer = self.info.get_tokenizer() img = mm_data.get("images", None)[0] if mm_data else None From cc0225e4f6a86169a037b12c3acd7302b5c15a8f Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 7 Feb 2025 05:39:52 +0000 Subject: [PATCH 07/16] Add docs 
Signed-off-by: Jee Jee Li --- docs/source/models/supported_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 32f3e9deff67..3239463ffb59 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -719,7 +719,7 @@ See [this page](#generative-models) for more information on how to use generativ * `THUDM/glm-4v-9b` etc. * ✅︎ * ✅︎ - * + * ✅︎ - * `H2OVLChatModel` * H2OVL * T + IE+ From eb5492bf7a661e6ba64489ff06d2e9c9f8d0af18 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 7 Feb 2025 08:43:52 +0000 Subject: [PATCH 08/16] Add test Signed-off-by: Jee Jee Li --- .../multimodal/processing/test_glm4v.py | 141 ++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 tests/models/multimodal/processing/test_glm4v.py diff --git a/tests/models/multimodal/processing/test_glm4v.py b/tests/models/multimodal/processing/test_glm4v.py new file mode 100644 index 000000000000..d15f33e2c7a6 --- /dev/null +++ b/tests/models/multimodal/processing/test_glm4v.py @@ -0,0 +1,141 @@ +# SPDX-License-Identifier: Apache-2.0 + +from functools import partial + +import numpy as np +import pytest +from PIL import Image + +from vllm.config import ModelConfig +from vllm.inputs import InputProcessingContext +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.processing import ProcessingCache +from vllm.multimodal.utils import cached_get_tokenizer + +from ....multimodal.utils import random_audio, random_image, random_video +from ...registry import HF_EXAMPLE_MODELS + + +def _test_processing_correctness( + model_id: str, + hit_rate: float, + num_batches: int, + simplify_rate: float, +): + model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id) + model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") + + model_config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=model_info.trust_remote_code, + seed=0, + dtype="float16", + revision=None, + hf_overrides=model_info.hf_overrides, + ) + + model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) + factories = MULTIMODAL_REGISTRY._processor_factories[model_cls] + ctx = InputProcessingContext( + model_config, + tokenizer=cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_info.trust_remote_code, + ), + ) + # Ensure that it can fit all of the data + cache = ProcessingCache(capacity=1 << 30) + + processing_info = factories.info(ctx) + supported_mm_limits = processing_info.get_supported_mm_limits() + limit_mm_per_prompt = { + modality: 3 if limit is None else limit + for modality, limit in supported_mm_limits.items() + } + + model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt + + baseline_processor = factories.build_processor(ctx, cache=None) + cached_processor = factories.build_processor(ctx, cache=cache) + dummy_inputs = baseline_processor.dummy_inputs + rng = np.random.RandomState(0) + + input_to_hit = { + "image": Image.new("RGB", size=(128, 128)), + "video": np.zeros((4, 128, 128, 3), dtype=np.uint8), + "audio": (np.zeros((512, )), 16000), + } + input_factory = { + "image": + partial(random_image, rng, min_wh=128, max_wh=256), + "video": + partial(random_video, + rng, + min_frames=2, + max_frames=8, + min_wh=128, + max_wh=256), + "audio": + partial(random_audio, rng, min_len=512, max_len=1024, sr=16000), + } + + for batch_idx in 
range(num_batches): + mm_data = { + k: + [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]()) + for _ in range(limit)] + for k, limit in limit_mm_per_prompt.items() + } + + mm_counts = {k: len(vs) for k, vs in mm_data.items()} + prompt = dummy_inputs.get_dummy_processor_inputs( + model_config.max_model_len, + mm_counts, + ).prompt_text + + # Drop unnecessary keys and test single -> multi conversion + if rng.rand() < simplify_rate: + for k in list(mm_data.keys()): + if not mm_data[k]: + del mm_data[k] + elif len(mm_data[k]) == 1: + mm_data[k] = mm_data[k][0] + + baseline_result = baseline_processor.apply( + prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + cached_result = cached_processor.apply( + prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + assert baseline_result == cached_result, ( + f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") + +# yapf: disable +@pytest.mark.parametrize("model_id", ["THUDM/glm-4v-9b"]) +@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) +@pytest.mark.parametrize("num_batches", [32]) +@pytest.mark.parametrize("simplify_rate", [1.0]) +# yapf: enable +def test_processing_correctness( + model_id: str, + hit_rate: float, + num_batches: int, + simplify_rate: float, +): + _test_processing_correctness( + model_id, + hit_rate=hit_rate, + num_batches=num_batches, + simplify_rate=simplify_rate, + ) + + From 42764b5ec798dcf349eb1a15797ec88c4151c65b Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 7 Feb 2025 08:46:23 +0000 Subject: [PATCH 09/16] format Signed-off-by: Jee Jee Li --- tests/models/multimodal/processing/test_glm4v.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/models/multimodal/processing/test_glm4v.py b/tests/models/multimodal/processing/test_glm4v.py index d15f33e2c7a6..a5f8e9e2708f 100644 --- a/tests/models/multimodal/processing/test_glm4v.py +++ b/tests/models/multimodal/processing/test_glm4v.py @@ -119,6 +119,7 @@ def _test_processing_correctness( assert baseline_result == cached_result, ( f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") + # yapf: disable @pytest.mark.parametrize("model_id", ["THUDM/glm-4v-9b"]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @@ -137,5 +138,3 @@ def test_processing_correctness( num_batches=num_batches, simplify_rate=simplify_rate, ) - - From 37a135e3d35ffa93751e5fe7cc7005ec2aa10f7d Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 7 Feb 2025 10:56:31 +0000 Subject: [PATCH 10/16] Add hf processor Signed-off-by: Jee Jee Li --- examples/offline_inference/vision_language.py | 2 +- .../multimodal/processing/test_common.py | 1 + .../multimodal/processing/test_glm4v.py | 140 ----------------- vllm/model_executor/models/chatglm.py | 147 +++++++++++------- 4 files changed, 96 insertions(+), 194 deletions(-) delete mode 100644 tests/models/multimodal/processing/test_glm4v.py diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 436c36570599..43ce5e07d48f 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -106,7 +106,7 @@ def run_glm4v(question: str, modality: str): trust_remote_code=True, enforce_eager=True, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - prompt = question + prompt = "<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>{question}<|assistant|>" stop_token_ids = [151329, 151336, 151338] return llm, prompt, stop_token_ids diff --git a/tests/models/multimodal/processing/test_common.py 
b/tests/models/multimodal/processing/test_common.py index 77cf3442df90..4052670bbfe0 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -164,6 +164,7 @@ def _test_processing_correctness( "Qwen/Qwen2.5-VL-3B-Instruct", "Qwen/Qwen2-Audio-7B-Instruct", "fixie-ai/ultravox-v0_3", + "THUDM/glm-4v-9b" ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) diff --git a/tests/models/multimodal/processing/test_glm4v.py b/tests/models/multimodal/processing/test_glm4v.py deleted file mode 100644 index a5f8e9e2708f..000000000000 --- a/tests/models/multimodal/processing/test_glm4v.py +++ /dev/null @@ -1,140 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from functools import partial - -import numpy as np -import pytest -from PIL import Image - -from vllm.config import ModelConfig -from vllm.inputs import InputProcessingContext -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.processing import ProcessingCache -from vllm.multimodal.utils import cached_get_tokenizer - -from ....multimodal.utils import random_audio, random_image, random_video -from ...registry import HF_EXAMPLE_MODELS - - -def _test_processing_correctness( - model_id: str, - hit_rate: float, - num_batches: int, - simplify_rate: float, -): - model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id) - model_info.check_available_online(on_fail="skip") - model_info.check_transformers_version(on_fail="skip") - - model_config = ModelConfig( - model_id, - task="auto", - tokenizer=model_id, - tokenizer_mode="auto", - trust_remote_code=model_info.trust_remote_code, - seed=0, - dtype="float16", - revision=None, - hf_overrides=model_info.hf_overrides, - ) - - model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) - factories = MULTIMODAL_REGISTRY._processor_factories[model_cls] - ctx = InputProcessingContext( - model_config, - tokenizer=cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_info.trust_remote_code, - ), - ) - # Ensure that it can fit all of the data - cache = ProcessingCache(capacity=1 << 30) - - processing_info = factories.info(ctx) - supported_mm_limits = processing_info.get_supported_mm_limits() - limit_mm_per_prompt = { - modality: 3 if limit is None else limit - for modality, limit in supported_mm_limits.items() - } - - model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt - - baseline_processor = factories.build_processor(ctx, cache=None) - cached_processor = factories.build_processor(ctx, cache=cache) - dummy_inputs = baseline_processor.dummy_inputs - rng = np.random.RandomState(0) - - input_to_hit = { - "image": Image.new("RGB", size=(128, 128)), - "video": np.zeros((4, 128, 128, 3), dtype=np.uint8), - "audio": (np.zeros((512, )), 16000), - } - input_factory = { - "image": - partial(random_image, rng, min_wh=128, max_wh=256), - "video": - partial(random_video, - rng, - min_frames=2, - max_frames=8, - min_wh=128, - max_wh=256), - "audio": - partial(random_audio, rng, min_len=512, max_len=1024, sr=16000), - } - - for batch_idx in range(num_batches): - mm_data = { - k: - [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]()) - for _ in range(limit)] - for k, limit in limit_mm_per_prompt.items() - } - - mm_counts = {k: len(vs) for k, vs in mm_data.items()} - prompt = dummy_inputs.get_dummy_processor_inputs( - model_config.max_model_len, - mm_counts, - ).prompt_text - - # Drop unnecessary keys and test single -> multi conversion - if 
rng.rand() < simplify_rate: - for k in list(mm_data.keys()): - if not mm_data[k]: - del mm_data[k] - elif len(mm_data[k]) == 1: - mm_data[k] = mm_data[k][0] - - baseline_result = baseline_processor.apply( - prompt, - mm_data=mm_data, - hf_processor_mm_kwargs={}, - ) - cached_result = cached_processor.apply( - prompt, - mm_data=mm_data, - hf_processor_mm_kwargs={}, - ) - - assert baseline_result == cached_result, ( - f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") - - -# yapf: disable -@pytest.mark.parametrize("model_id", ["THUDM/glm-4v-9b"]) -@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) -@pytest.mark.parametrize("num_batches", [32]) -@pytest.mark.parametrize("simplify_rate", [1.0]) -# yapf: enable -def test_processing_correctness( - model_id: str, - hit_rate: float, - num_batches: int, - simplify_rate: float, -): - _test_processing_correctness( - model_id, - hit_rate=hit_rate, - num_batches=num_batches, - simplify_rate=simplify_rate, - ) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 88e8e5ecac07..7db803e73a5b 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -5,11 +5,16 @@ """Inference-only CogAgent model compatible with THUDM weights.""" from argparse import Namespace from typing import (Iterable, List, Mapping, Optional, Sequence, Set, Tuple, - TypedDict) + TypedDict, Union) import torch from torch import nn from torch.nn import LayerNorm +from torchvision import transforms +from torchvision.transforms import InterpolationMode +from transformers import PreTrainedTokenizer, TensorType +from transformers.image_utils import ImageInput +from transformers.tokenization_utils_base import TextInput from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, VllmConfig @@ -52,6 +57,31 @@ logger = init_logger(__name__) +def build_normalization_transform(image_size: int) -> transforms.Compose: + """ + Build a normalization transform which can be applied to one or + more input images from which we want to extract visual features. + + Args: + image_size: size of the image to be processed for visual embeddings. + + Returns: + Callable transform for normalizing and resizing one RGB image. + """ + + return transforms.Compose([ + transforms.Resize( + (image_size, image_size), + interpolation=InterpolationMode.BICUBIC, + ), + transforms.ToTensor(), + transforms.Normalize( + (0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711), + ), + ]) + + def calculate_image_placeholder(vision_config): return (vision_config["image_size"] // vision_config["patch_size"] // 2)**2 @@ -61,6 +91,62 @@ class GLMImagePixelInputs(TypedDict): """Shape: `(batch_size, num_channels, height, width)`""" +class GLM4VProcessor: + """ + This model doesn't define its own HF processor, + so we implement our own one here. 
+ + """ + + def __init__( + self, + config: ChatGLMConfig, + tokenizer: PreTrainedTokenizer, + ) -> None: + super().__init__() + + self.config = config + self.tokenizer = tokenizer + + if hasattr(self.config, "vision_config"): + self.image_transform = build_normalization_transform( + config.vision_config["image_size"]) + else: + self.image_transform = None + + def __call__( + self, + text: Optional[Union[TextInput, list[TextInput]]] = None, + images: Optional[Union[ImageInput, list[ImageInput]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + ) -> BatchFeature: + if text is None: + text = [] + if not isinstance(text, list): + text = [text] + if images is None: + images = [] + if not isinstance(images, list): + images = [images] + text_inputs = self.tokenizer(text) + if len(images) == 0: + image_inputs = {} + else: + if self.image_transform is None: + raise ValueError("This model does not support image inputs") + + pixel_values = [self.image_transform(image) for image in images] + image_inputs = {"pixel_values": torch.stack(pixel_values)} + + return BatchFeature( + { + **text_inputs, + **image_inputs, + }, + tensor_type=return_tensors, + ) + + class GLM4VProcessingInfo(BaseProcessingInfo): def __init__(self, ctx): @@ -97,6 +183,12 @@ def get_image_size(self) -> ImageSize: return ImageSize(height=self.image_szie, width=self.image_szie) + def get_hf_processor(self) -> GLM4VProcessor: + return GLM4VProcessor( + self.get_hf_config(), + self.get_tokenizer(), + ) + class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]): @@ -114,7 +206,7 @@ def get_dummy_processor_inputs( height=target_height, num_images=num_images) } - text = "" + text = "<|begin_of_image|><|endoftext|><|end_of_image|>" return ProcessorInputs( prompt_text=text, mm_data=mm_data, @@ -130,57 +222,6 @@ def _get_mm_fields_config( ) -> Mapping[str, MultiModalFieldConfig]: return dict(pixel_values=MultiModalFieldConfig.batched("image"), ) - def _apply_hf_processor_text_only(self, prompt_text: str) -> list[int]: - """ - Apply the HF processor on the prompt text only. - - Since HF processor requires that text and multi-modal items - correspond to each other, we create dummy multi-modal items - to go along with the text. 
- """ - mm_counts = self.info.get_supported_mm_limits() - dummy_inputs = self.dummy_inputs.get_dummy_processor_inputs( - self.info.ctx.model_config.max_model_len, - mm_counts, - ) - prompt_ids, _ = self._apply_hf_processor_text_mm( - prompt_text=prompt_text, - mm_items=self._to_mm_items(dummy_inputs.mm_data), - hf_processor_mm_kwargs={}, - ) - - return prompt_ids - - def _call_hf_processor( - self, - prompt: str, - mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object], - ) -> BatchFeature: - try: - tokenizer = self.info.get_tokenizer() - img = mm_data.get("images", None)[0] if mm_data else None - raw_batch_data = tokenizer.apply_chat_template( - conversation=[{ - "role": "user", - "image": img, - "content": prompt, - }], - add_generation_prompt=True, - tokenize=True, - return_tensors="pt", - return_dict=True, - ).data - except Exception: - logger.error("Failed to process content (%s)", prompt) - - return BatchFeature( - dict( - input_ids=raw_batch_data["input_ids"], - pixel_values=[raw_batch_data["images"][0]] - if mm_data else None, - )) - def _get_prompt_replacements( self, mm_items: MultiModalDataItems, From 12e3dc379d4a7ceae563427cb03710e587fe2059 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 7 Feb 2025 10:57:59 +0000 Subject: [PATCH 11/16] Fix typo Signed-off-by: Jee Jee Li --- vllm/model_executor/models/chatglm.py | 51 +++++++++++++++------------ 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 7db803e73a5b..07926859968a 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -5,17 +5,12 @@ """Inference-only CogAgent model compatible with THUDM weights.""" from argparse import Namespace from typing import (Iterable, List, Mapping, Optional, Sequence, Set, Tuple, - TypedDict, Union) + TypedDict,Union) import torch from torch import nn from torch.nn import LayerNorm -from torchvision import transforms -from torchvision.transforms import InterpolationMode -from transformers import PreTrainedTokenizer, TensorType from transformers.image_utils import ImageInput -from transformers.tokenization_utils_base import TextInput - from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size @@ -47,7 +42,12 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import ChatGLMConfig +from transformers.tokenization_utils_base import TextInput +from transformers import (PreTrainedTokenizer, + TensorType) +from torchvision import transforms +from torchvision.transforms import InterpolationMode from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, @@ -57,6 +57,7 @@ logger = init_logger(__name__) + def build_normalization_transform(image_size: int) -> transforms.Compose: """ Build a normalization transform which can be applied to one or @@ -69,28 +70,35 @@ def build_normalization_transform(image_size: int) -> transforms.Compose: Callable transform for normalizing and resizing one RGB image. 
""" - return transforms.Compose([ - transforms.Resize( - (image_size, image_size), - interpolation=InterpolationMode.BICUBIC, - ), - transforms.ToTensor(), - transforms.Normalize( - (0.48145466, 0.4578275, 0.40821073), - (0.26862954, 0.26130258, 0.27577711), - ), - ]) + return transforms.Compose( + [ + transforms.Resize( + (image_size, image_size), + interpolation=InterpolationMode.BICUBIC, + ), + transforms.ToTensor(), + transforms.Normalize( + (0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711), + ), + ] + ) def calculate_image_placeholder(vision_config): return (vision_config["image_size"] // vision_config["patch_size"] // 2)**2 + + + class GLMImagePixelInputs(TypedDict): pixel_values: torch.Tensor """Shape: `(batch_size, num_channels, height, width)`""" + + class GLM4VProcessor: """ This model doesn't define its own HF processor, @@ -107,7 +115,7 @@ def __init__( self.config = config self.tokenizer = tokenizer - + if hasattr(self.config, "vision_config"): self.image_transform = build_normalization_transform( config.vision_config["image_size"]) @@ -146,7 +154,6 @@ def __call__( tensor_type=return_tensors, ) - class GLM4VProcessingInfo(BaseProcessingInfo): def __init__(self, ctx): @@ -168,7 +175,7 @@ def _pre_calculate(self): hf_config = self.get_hf_config() vision_config = hf_config.vision_config self.image_token_num = calculate_image_placeholder(vision_config) - self.image_szie = vision_config["image_size"] + self.image_size = vision_config["image_size"] def get_num_image_tokens( self, @@ -181,7 +188,7 @@ def get_num_image_tokens( def get_image_size(self) -> ImageSize: - return ImageSize(height=self.image_szie, width=self.image_szie) + return ImageSize(height=self.image_size, width=self.image_size) def get_hf_processor(self) -> GLM4VProcessor: return GLM4VProcessor( @@ -222,6 +229,7 @@ def _get_mm_fields_config( ) -> Mapping[str, MultiModalFieldConfig]: return dict(pixel_values=MultiModalFieldConfig.batched("image"), ) + def _get_prompt_replacements( self, mm_items: MultiModalDataItems, @@ -267,7 +275,6 @@ def _apply_prompt_replacements( ] for modality, ps in placeholders.items() } - return token_ids, text, placeholders From fda84fe56b0fc6b83abb8da97f9801f4c13507b7 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 7 Feb 2025 13:44:16 +0000 Subject: [PATCH 12/16] Fix format Signed-off-by: Jee Jee Li --- examples/offline_inference/vision_language.py | 4 +- vllm/model_executor/models/chatglm.py | 46 ++++++++----------- 2 files changed, 22 insertions(+), 28 deletions(-) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 43ce5e07d48f..9a4183106cff 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -106,7 +106,9 @@ def run_glm4v(question: str, modality: str): trust_remote_code=True, enforce_eager=True, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - prompt = "<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>{question}<|assistant|>" + prompt = f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\ + {question}<|assistant|>" + stop_token_ids = [151329, 151336, 151338] return llm, prompt, stop_token_ids diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 07926859968a..10cf0b606d57 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -5,12 +5,17 @@ """Inference-only CogAgent model compatible with THUDM weights.""" from 
argparse import Namespace from typing import (Iterable, List, Mapping, Optional, Sequence, Set, Tuple, - TypedDict,Union) + TypedDict, Union) import torch from torch import nn from torch.nn import LayerNorm +from torchvision import transforms +from torchvision.transforms import InterpolationMode +from transformers import PreTrainedTokenizer, TensorType from transformers.image_utils import ImageInput +from transformers.tokenization_utils_base import TextInput + from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size @@ -42,12 +47,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import ChatGLMConfig -from transformers.tokenization_utils_base import TextInput -from transformers import (PreTrainedTokenizer, - TensorType) -from torchvision import transforms -from torchvision.transforms import InterpolationMode from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, @@ -57,7 +57,6 @@ logger = init_logger(__name__) - def build_normalization_transform(image_size: int) -> transforms.Compose: """ Build a normalization transform which can be applied to one or @@ -70,35 +69,28 @@ def build_normalization_transform(image_size: int) -> transforms.Compose: Callable transform for normalizing and resizing one RGB image. """ - return transforms.Compose( - [ - transforms.Resize( - (image_size, image_size), - interpolation=InterpolationMode.BICUBIC, - ), - transforms.ToTensor(), - transforms.Normalize( - (0.48145466, 0.4578275, 0.40821073), - (0.26862954, 0.26130258, 0.27577711), - ), - ] - ) + return transforms.Compose([ + transforms.Resize( + (image_size, image_size), + interpolation=InterpolationMode.BICUBIC, + ), + transforms.ToTensor(), + transforms.Normalize( + (0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711), + ), + ]) def calculate_image_placeholder(vision_config): return (vision_config["image_size"] // vision_config["patch_size"] // 2)**2 - - - class GLMImagePixelInputs(TypedDict): pixel_values: torch.Tensor """Shape: `(batch_size, num_channels, height, width)`""" - - class GLM4VProcessor: """ This model doesn't define its own HF processor, @@ -115,7 +107,7 @@ def __init__( self.config = config self.tokenizer = tokenizer - + if hasattr(self.config, "vision_config"): self.image_transform = build_normalization_transform( config.vision_config["image_size"]) @@ -154,6 +146,7 @@ def __call__( tensor_type=return_tensors, ) + class GLM4VProcessingInfo(BaseProcessingInfo): def __init__(self, ctx): @@ -229,7 +222,6 @@ def _get_mm_fields_config( ) -> Mapping[str, MultiModalFieldConfig]: return dict(pixel_values=MultiModalFieldConfig.batched("image"), ) - def _get_prompt_replacements( self, mm_items: MultiModalDataItems, From 4b8f461d92912ec223189cfe71df346c2b1903b6 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 7 Feb 2025 14:00:02 +0000 Subject: [PATCH 13/16] Fix format Signed-off-by: Jee Jee Li --- vllm/model_executor/models/chatglm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 10cf0b606d57..d44dd1ccdd5b 100644 --- a/vllm/model_executor/models/chatglm.py +++ 
b/vllm/model_executor/models/chatglm.py @@ -53,9 +53,10 @@ make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, merge_multimodal_embeddings) -IMAGE_TOKEN_ID = 151329 logger = init_logger(__name__) +IMAGE_TOKEN_ID = 151329 + def build_normalization_transform(image_size: int) -> transforms.Compose: """ @@ -220,7 +221,7 @@ def _get_mm_fields_config( hf_inputs: BatchFeature, hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: - return dict(pixel_values=MultiModalFieldConfig.batched("image"), ) + return dict(pixel_values=MultiModalFieldConfig.batched("image")) def _get_prompt_replacements( self, From 4d5be0065a70747a770b60b1adfd08f8f3a4cfe3 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 7 Feb 2025 14:49:10 +0000 Subject: [PATCH 14/16] Done Signed-off-by: Jee Jee Li --- vllm/model_executor/models/chatglm.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index d44dd1ccdd5b..fe3df65e793e 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -43,7 +43,7 @@ BoundPromptReplacement, MultiModalFieldConfig, PlaceholderFeaturesInfo, - ProcessorMixin, PromptReplacement) + PromptReplacement) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import ChatGLMConfig @@ -171,13 +171,7 @@ def _pre_calculate(self): self.image_token_num = calculate_image_placeholder(vision_config) self.image_size = vision_config["image_size"] - def get_num_image_tokens( - self, - *, - image_width: int, - image_height: int, - processor: Optional[ProcessorMixin], - ) -> int: + def get_num_image_tokens(self) -> int: return self.image_token_num def get_image_size(self) -> ImageSize: @@ -231,8 +225,7 @@ def _get_prompt_replacements( ) -> list[PromptReplacement]: def get_replacement(item_idx: int): - num_image_tokens = self.info.get_num_image_tokens( - image_height=1120, image_width=1120, processor=None) + num_image_tokens = self.info.get_num_image_tokens() return [IMAGE_TOKEN_ID] * num_image_tokens return [ From c5aba0739c0c7b8bbd74dcc43f3c2bab071a4555 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 7 Feb 2025 15:22:34 +0000 Subject: [PATCH 15/16] Done Signed-off-by: Jee Jee Li --- .../multimodal/processing/test_common.py | 2 +- vllm/model_executor/models/chatglm.py | 59 +++++-------------- 2 files changed, 17 insertions(+), 44 deletions(-) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 4052670bbfe0..8658e60bc5b2 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -147,6 +147,7 @@ def _test_processing_correctness( "facebook/chameleon-7b", "deepseek-ai/deepseek-vl2-tiny", "adept/fuyu-8b", + "THUDM/glm-4v-9b", "h2oai/h2ovl-mississippi-800m", "OpenGVLab/InternVL2-1B", "HuggingFaceM4/Idefics3-8B-Llama3", @@ -164,7 +165,6 @@ def _test_processing_correctness( "Qwen/Qwen2.5-VL-3B-Instruct", "Qwen/Qwen2-Audio-7B-Instruct", "fixie-ai/ultravox-v0_3", - "THUDM/glm-4v-9b" ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index fe3df65e793e..afadf6872a46 100644 --- a/vllm/model_executor/models/chatglm.py +++ 
b/vllm/model_executor/models/chatglm.py @@ -4,8 +4,8 @@ # https://github.com/THUDM/CogAgent """Inference-only CogAgent model compatible with THUDM weights.""" from argparse import Namespace -from typing import (Iterable, List, Mapping, Optional, Sequence, Set, Tuple, - TypedDict, Union) +from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, + Union) import torch from torch import nn @@ -40,10 +40,9 @@ from vllm.multimodal.parse import ImageSize, MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, BatchFeature, - BoundPromptReplacement, MultiModalFieldConfig, - PlaceholderFeaturesInfo, - PromptReplacement) + PromptReplacement, + PromptReplacementDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import ChatGLMConfig @@ -163,7 +162,7 @@ def get_mm_max_tokens_per_item( mm_counts: Mapping[str, int], ) -> Mapping[str, int]: - return {"image": self.image_token_num + 2} + return {"image": self.image_token_num} def _pre_calculate(self): hf_config = self.get_hf_config() @@ -224,45 +223,23 @@ def _get_prompt_replacements( out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - def get_replacement(item_idx: int): - num_image_tokens = self.info.get_num_image_tokens() - return [IMAGE_TOKEN_ID] * num_image_tokens + hf_config = self.info.get_hf_config() + boi_token_id = hf_config.boi_token_id + eoi_token_id = hf_config.eoi_token_id + num_image_tokens = self.info.get_num_image_tokens() + image_tokens = [IMAGE_TOKEN_ID] * num_image_tokens return [ PromptReplacement( modality="image", target=[IMAGE_TOKEN_ID], - replacement=get_replacement, - ), + replacement=PromptReplacementDetails( + full=([boi_token_id] + image_tokens + [eoi_token_id]), + features=image_tokens, + ), + ) ] - def _apply_prompt_replacements( - self, - token_ids: list[int], - mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], - mm_item_counts: Mapping[str, int], - ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]: - token_ids, text, placeholders = super()._apply_prompt_replacements( - token_ids=token_ids, - mm_prompt_repls=mm_prompt_repls, - mm_item_counts=mm_item_counts, - ) - hf_config = self.info.get_hf_config() - boi_token_id = hf_config.boi_token_id - eoi_token_id = hf_config.eoi_token_id - placeholders = { - modality: [ - PlaceholderFeaturesInfo( - modality=p.modality, - item_idx=p.item_idx, - start_idx=p.start_idx - 1, - tokens=[boi_token_id] + p.tokens + [eoi_token_id], - ) for p in ps - ] - for modality, ps in placeholders.items() - } - return token_ids, text, placeholders - class GLMAttention(nn.Module): @@ -615,11 +592,7 @@ def get_input_embeddings( input_ids=input_ids, inputs_embeds=inputs_embeds, multimodal_embeddings=multimodal_embeddings, - placeholder_token_id=[ - self.config.boi_token_id, - IMAGE_TOKEN_ID, - self.config.eoi_token_id, - ], + placeholder_token_id=IMAGE_TOKEN_ID, ) return inputs_embeds From 3428eb481c16b894cd4a8b585f4c4844ed19089b Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 8 Feb 2025 16:47:57 +0000 Subject: [PATCH 16/16] Fix v0 Signed-off-by: Jee Jee Li --- vllm/model_executor/models/chatglm.py | 62 +++++++++++++++++++-------- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index afadf6872a46..9ee9e9ca8009 100644 --- a/vllm/model_executor/models/chatglm.py 
+++ b/vllm/model_executor/models/chatglm.py @@ -4,8 +4,8 @@ # https://github.com/THUDM/CogAgent """Inference-only CogAgent model compatible with THUDM weights.""" from argparse import Namespace -from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, - Union) +from typing import (Iterable, List, Mapping, Optional, Sequence, Set, Tuple, + TypedDict, Union) import torch from torch import nn @@ -40,9 +40,10 @@ from vllm.multimodal.parse import ImageSize, MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, BatchFeature, + BoundPromptReplacement, MultiModalFieldConfig, - PromptReplacement, - PromptReplacementDetails) + PlaceholderFeaturesInfo, + PromptReplacement) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import ChatGLMConfig @@ -162,7 +163,7 @@ def get_mm_max_tokens_per_item( mm_counts: Mapping[str, int], ) -> Mapping[str, int]: - return {"image": self.image_token_num} + return {"image": self.image_token_num + 2} def _pre_calculate(self): hf_config = self.get_hf_config() @@ -171,7 +172,7 @@ def _pre_calculate(self): self.image_size = vision_config["image_size"] def get_num_image_tokens(self) -> int: - return self.image_token_num + return self.image_token_num + 2 def get_image_size(self) -> ImageSize: @@ -223,23 +224,46 @@ def _get_prompt_replacements( out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self.info.get_hf_config() - boi_token_id = hf_config.boi_token_id - eoi_token_id = hf_config.eoi_token_id - num_image_tokens = self.info.get_num_image_tokens() - image_tokens = [IMAGE_TOKEN_ID] * num_image_tokens + def get_replacement(item_idx: int): + image_tokens = self.info.image_token_num + return [IMAGE_TOKEN_ID] * image_tokens return [ PromptReplacement( modality="image", target=[IMAGE_TOKEN_ID], - replacement=PromptReplacementDetails( - full=([boi_token_id] + image_tokens + [eoi_token_id]), - features=image_tokens, - ), - ) + replacement=get_replacement, + ), ] + def _apply_prompt_replacements( + self, + token_ids: list[int], + mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], + mm_item_counts: Mapping[str, int], + ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]: + token_ids, text, placeholders = super()._apply_prompt_replacements( + token_ids=token_ids, + mm_prompt_repls=mm_prompt_repls, + mm_item_counts=mm_item_counts, + ) + hf_config = self.info.get_hf_config() + boi_token_id = hf_config.boi_token_id + eoi_token_id = hf_config.eoi_token_id + placeholders = { + modality: [ + PlaceholderFeaturesInfo( + modality=p.modality, + item_idx=p.item_idx, + start_idx=p.start_idx - 1, + tokens=[boi_token_id] + p.tokens + [eoi_token_id], + ) for p in ps + ] + for modality, ps in placeholders.items() + } + + return token_ids, text, placeholders + class GLMAttention(nn.Module): @@ -592,7 +616,11 @@ def get_input_embeddings( input_ids=input_ids, inputs_embeds=inputs_embeds, multimodal_embeddings=multimodal_embeddings, - placeholder_token_id=IMAGE_TOKEN_ID, + placeholder_token_id=[ + self.config.boi_token_id, + IMAGE_TOKEN_ID, + self.config.eoi_token_id, + ], ) return inputs_embeds
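
A quick sanity check on the placeholder accounting introduced above, as a self-contained sketch in plain Python (no vLLM internals). The vision config values and the boi/eoi ids below are assumptions for illustration only; at runtime they come from the model's HF config:

    # Sketch only: reproduces the placeholder arithmetic from this series with
    # assumed config values; the real numbers come from the model's HF config.

    IMAGE_TOKEN_ID = 151329   # placeholder id used by this series
    BOI_TOKEN_ID = -1         # stand-in for hf_config.boi_token_id (assumed)
    EOI_TOKEN_ID = -2         # stand-in for hf_config.eoi_token_id (assumed)


    def calculate_image_placeholder(vision_config: dict) -> int:
        # Same formula as in the patch: (image_size / patch_size / 2) ** 2
        return (vision_config["image_size"] //
                vision_config["patch_size"] // 2) ** 2


    vision_config = {"image_size": 1120, "patch_size": 14}        # assumed values
    image_token_num = calculate_image_placeholder(vision_config)  # (1120//14//2)**2 = 1600

    # _get_prompt_replacements expands the single IMAGE_TOKEN_ID in the prompt
    # into `image_token_num` copies; _apply_prompt_replacements then widens each
    # placeholder by one boi/eoi token on either side, which is why
    # get_mm_max_tokens_per_item reports image_token_num + 2.
    prompt = [BOI_TOKEN_ID, IMAGE_TOKEN_ID, EOI_TOKEN_ID]
    expanded = []
    for tok in prompt:
        expanded.extend([IMAGE_TOKEN_ID] * image_token_num
                        if tok == IMAGE_TOKEN_ID else [tok])

    assert len(expanded) == image_token_num + 2
    print(image_token_num, len(expanded))  # 1600 1602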
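
For completeness, a minimal offline-inference sketch following the updated run_glm4v example. The model name, prompt template, stop token ids, and LLM keyword arguments mirror the files touched in this series; the image path, question, and sampling settings are placeholders:

    # Illustrative only: follows the run_glm4v example; image path, question and
    # sampling settings are placeholders, not part of this series.
    from PIL import Image

    from vllm import LLM, SamplingParams

    llm = LLM(model="THUDM/glm-4v-9b",
              trust_remote_code=True,
              enforce_eager=True)

    question = "What is shown in this image?"
    prompt = ("<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>"
              f"{question}<|assistant|>")

    image = Image.open("example.jpg").convert("RGB")  # placeholder path
    outputs = llm.generate(
        {
            "prompt": prompt,
            "multi_modal_data": {"image": image},
        },
        SamplingParams(max_tokens=64, stop_token_ids=[151329, 151336, 151338]),
    )
    print(outputs[0].outputs[0].text)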