From 21379173089701aad9d50ebc6f0d665ce9694c7c Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 6 Aug 2024 11:38:13 +0800 Subject: [PATCH 01/25] init --- vllm/lora/models.py | 7 ++++ vllm/model_executor/models/minicpmv.py | 48 ++++++++++++++++++++------ vllm/worker/model_runner.py | 6 ++-- 3 files changed, 47 insertions(+), 14 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 017a1002bb9a..fe33f2544e9a 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -439,6 +439,13 @@ def _create_lora_modules(self): self.model, module_name, from_layer(module, self.lora_slots, self.lora_config, packed_moduled_lst, self.model.config)) + # In some models, especially multimodal ones, layers with the same + # name may have different types, such as nn.Linear and + # ReplicatedLinear. The nn.Linear layers cannot be replaced with + # LoRA layers, leading to assertion errors. The following check + # aims to prevent this issue + if not isinstance(new_module, BaseLayerWithLoRA): + continue # LinearScalingRotaryEmbeddingWithLora is used to handle # long context lora. Register relevant metadata. if isinstance(new_module, LinearScalingRotaryEmbeddingWithLora): diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 038825959562..55f68811db3d 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -37,7 +37,7 @@ from transformers.configuration_utils import PretrainedConfig from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig +from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs from vllm.logger import init_logger from vllm.model_executor.layers.linear import ReplicatedLinear @@ -59,6 +59,7 @@ from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData from .idefics2_vision_model import Idefics2VisionTransformer +from .interfaces import SupportsLoRA logger = init_logger(__name__) @@ -808,7 +809,26 @@ def is_default_weight_loading(self, name: str) -> bool: return "resampler" in name or "vpm" in name -class MiniCPMV2_5(MiniCPMVBaseModel): +class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "fc1", "fc2", + "out_proj", "kv_proj" + ] + embedding_modules = {} + embedding_padding_modules = [] def __init__( self, @@ -816,6 +836,7 @@ def __init__( multimodal_config: MultiModalConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, ): super().__init__(config, multimodal_config, cache_config, quant_config) assert self.version == (2, 5) @@ -993,20 +1014,25 @@ def is_default_weight_loading(self, name: str) -> bool: @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_minicpmv_image_tokens) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_minicpmv) @INPUT_REGISTRY.register_input_processor(input_processor_for_minicpmv) -class MiniCPMV(MiniCPMVBaseModel): +class MiniCPMV(MiniCPMVBaseModel, SupportsLoRA): """ Different versions of MiniCPMV use different visual encoders and LLMs, which is not conducive to the current integration logic of LoRA and bitsandbytes in vLLM. Therefore, it is necessary to separate them. 
""" - - def __new__( - cls, - config: PretrainedConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - ): + packed_modules_mapping = {} + + # LoRA specific attributes + supported_lora_modules = [] + embedding_modules = {} + embedding_padding_modules = [] + + def __new__(cls, + config: PretrainedConfig, + multimodal_config: MultiModalConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None): if not hasattr(config, "version"): if config.hidden_size == 2304 and config.query_num == 64: version = (2, 0) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index f9c26e0c318b..7ddbda7ee8ba 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -734,9 +734,9 @@ def load_model(self) -> None: if self.lora_config: assert supports_lora(self.model), "Model does not support LoRA" - assert not supports_vision( - self.model - ), "To be tested: vision language model with LoRA settings." + # assert not supports_vision( + # self.model + # ), "To be tested: vision language model with LoRA settings." self.lora_manager = LRUCacheWorkerLoRAManager( self.scheduler_config.max_num_seqs, From 5edda37733143f378ee5a65e509c2f6c9590f6eb Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 6 Aug 2024 18:08:31 +0800 Subject: [PATCH 02/25] optimize minicpmv implementation --- vllm/lora/models.py | 4 +- vllm/model_executor/models/minicpmv.py | 53 ++++++++++++++++++++------ 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index fe33f2544e9a..310b9a2e267e 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -442,8 +442,8 @@ def _create_lora_modules(self): # In some models, especially multimodal ones, layers with the same # name may have different types, such as nn.Linear and # ReplicatedLinear. The nn.Linear layers cannot be replaced with - # LoRA layers, leading to assertion errors. The following check - # aims to prevent this issue + # LoRA layers, leading to assertion error. The following check + # aims to prevent this error if not isinstance(new_module, BaseLayerWithLoRA): continue # LinearScalingRotaryEmbeddingWithLora is used to handle diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 55f68811db3d..7ccdbbac3638 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -65,7 +65,6 @@ _KEYS_TO_MODIFY_MAPPING = { "llm.lm_head": "lm_head", - "llm.model": "llm", } @@ -483,6 +482,21 @@ def get_placeholder(image_size: Tuple[int, int], num_image: int): return llm_inputs +class LLMWrapper(nn.Module): + """ + To align with the key names of LoRA trained with PEFT, we need to add an + additional layer to the llm's implementation. 
+ """ + + def __init__(self, llm: nn.Module, name: str) -> None: + super().__init__() + self.model_name = name + setattr(self, name, llm) + + def forward(self, *args, **kwargs) -> Any: + return getattr(self, self.model_name)(*args, **kwargs) + + class MiniCPMVBaseModel(nn.Module, SupportsVision): """ The abstract class of MiniCPMV can only be inherited, but cannot be @@ -521,7 +535,7 @@ def get_embedding( input_ids: torch.Tensor, image_inputs: Optional[MiniCPMVImageInputs], ) -> Tuple[torch.Tensor, torch.Tensor]: - vlm_embedding: torch.Tensor = self.llm.embed_tokens(input_ids) + vlm_embedding: torch.Tensor = self.get_llm_embedding(input_ids) if hasattr(self.config, "scale_emb"): vlm_embedding *= self.config.scale_emb @@ -710,6 +724,9 @@ def get_vision_embedding( ) -> torch.Tensor: raise NotImplementedError + def get_llm_embedding(self, input_ids: torch.Tensor) -> torch.Tensor: + raise NotImplementedError + def get_vision_hidden_states(self, data: MiniCPMVImageInputs) -> torch.Tensor: raise NotImplementedError @@ -736,9 +753,11 @@ def init_llm( cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ) -> nn.Module: - return MiniCPMModel(config, - cache_config=cache_config, - quant_config=quant_config) + + return LLMWrapper(MiniCPMModel(config, + cache_config=cache_config, + quant_config=quant_config), + name="model") def init_vision_module(self) -> nn.Module: # TODO :refactor this vision model @@ -764,6 +783,9 @@ def init_vision_module(self) -> nn.Module: return model + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.embed_tokens(input_ids) + def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module: with set_default_torch_dtype(torch.float16): resampler = Resampler2( @@ -799,6 +821,9 @@ def get_vision_embedding( res.append(self.resampler(vision_embedding, tgt_size)) return torch.vstack(res) + def get_llm_embedding(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.llm.embed_tokens(input_ids) + def get_vision_hidden_states(self, data: MiniCPMVImageInputs) -> torch.Tensor: pixel_values = data["pixel_values"] @@ -847,9 +872,10 @@ def init_llm( cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ) -> nn.Module: - return LlamaModel(config, - cache_config=cache_config, - quant_config=quant_config) + return LLMWrapper(LlamaModel(config, + cache_config=cache_config, + quant_config=quant_config), + name="model") def init_vision_module(self) -> nn.Module: model = Idefics2VisionTransformer(self.config.vision_config) @@ -878,6 +904,9 @@ def get_vision_embedding( vision_embedding = self.resampler(vision_embedding, tgt_sizes) return vision_embedding + def get_llm_embedding(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.llm.model.embed_tokens(input_ids) + def get_vision_hidden_states(self, data: MiniCPMVImageInputs) -> torch.Tensor: pixel_values = data["pixel_values"] @@ -957,7 +986,6 @@ def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module: num_heads=embed_dim // 128, kv_dim=vision_dim, ) - return resampler def get_vision_embedding( @@ -973,6 +1001,9 @@ def get_vision_embedding( ).last_hidden_state return vision_embedding + def get_llm_embedding(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.llm.embed_tokens(input_ids) + def get_vision_hidden_states(self, data: MiniCPMVImageInputs) -> torch.Tensor: pixel_values = data["pixel_values"] @@ -1020,9 +1051,9 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsLoRA): which 
is not conducive to the current integration logic of LoRA and bitsandbytes in vLLM. Therefore, it is necessary to separate them. """ + # Ensure that the LoRA support check passes when the class is not + # initialized,but set all these attributes to empty packed_modules_mapping = {} - - # LoRA specific attributes supported_lora_modules = [] embedding_modules = {} embedding_padding_modules = [] From 2ea5006cfc2691a081e5a04059355c65e5c23b97 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 6 Aug 2024 18:10:50 +0800 Subject: [PATCH 03/25] delete comment --- vllm/worker/model_runner.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 7ddbda7ee8ba..f0bf981a3746 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -734,10 +734,6 @@ def load_model(self) -> None: if self.lora_config: assert supports_lora(self.model), "Model does not support LoRA" - # assert not supports_vision( - # self.model - # ), "To be tested: vision language model with LoRA settings." - self.lora_manager = LRUCacheWorkerLoRAManager( self.scheduler_config.max_num_seqs, self.scheduler_config.max_num_batched_tokens, From 42e846a48d1b1a3a7a8d5af82fe8be1389039094 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 7 Aug 2024 14:32:24 +0800 Subject: [PATCH 04/25] Trigger LoRA test From 9eed2354fb16c442908fb2be3a8802e03a2f75bc Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 25 Sep 2024 14:28:45 +0800 Subject: [PATCH 05/25] Modify code --- vllm/model_executor/models/minicpmv.py | 4 +--- vllm/worker/model_runner.py | 5 ++--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index b802202b802c..1d40aad8c82d 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -389,7 +389,6 @@ def input_mapper_for_minicpmv(ctx: InputContext, data: object): return MultiModalInputs(batch_data) - class LLMWrapper(nn.Module): """ To align with the key names of LoRA trained with PEFT, we need to add an @@ -405,7 +404,6 @@ def forward(self, *args, **kwargs) -> Any: return getattr(self, self.model_name)(*args, **kwargs) - class MiniCPMVBaseModel(nn.Module, SupportsMultiModal): """ The abstract class of MiniCPMV can only be inherited, but cannot be @@ -448,7 +446,7 @@ def get_embedding( input_ids: torch.Tensor, image_inputs: Optional[MiniCPMVImagePixelInputs], ) -> Tuple[torch.Tensor, torch.Tensor]: - vlm_embedding: torch.Tensor = self.get_llm_embedding(input_ids) + vlm_embedding: torch.Tensor = self.llm.embed_tokens(input_ids) if hasattr(self.config, "scale_emb"): vlm_embedding *= self.config.scale_emb diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index dd20954e8882..dc1020e3ea8b 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1027,9 +1027,8 @@ def load_model(self) -> None: if self.lora_config: assert supports_lora(self.model), "Model does not support LoRA" - # assert not supports_multimodal( - # self.model - # ), "To be tested: Multi-modal model with LoRA settings." 
+ if supports_multimodal(self.model): + logger.warning("todo:add warning info") self.lora_manager = LRUCacheWorkerLoRAManager( self.scheduler_config.max_num_seqs, From e4e3f46536e96f300697a0ca42f3cc329b7501ba Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 25 Sep 2024 18:49:51 +0800 Subject: [PATCH 06/25] Complete VL supports lora --- vllm/lora/models.py | 35 ++++++++++-- vllm/model_executor/models/minicpmv.py | 18 +++++- vllm/model_executor/models/module_mapping.py | 59 ++++++++++++++++++++ vllm/worker/model_runner.py | 3 +- 4 files changed, 108 insertions(+), 7 deletions(-) create mode 100644 vllm/model_executor/models/module_mapping.py diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 8d33fc4e2f29..d726a36b83c2 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -24,7 +24,9 @@ from vllm.lora.punica import PunicaWrapper from vllm.lora.utils import (from_layer, from_layer_logits_processor, parse_fine_tuned_lora_name, replace_submodule) -from vllm.model_executor.models.interfaces import SupportsLoRA +from vllm.model_executor.models.interfaces import (SupportsLoRA, + supports_multimodal) +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.utils import PPMissingLayer from vllm.utils import is_pin_memory_available @@ -332,6 +334,8 @@ def __init__( self.supported_lora_modules.append("rotary_emb") self.packed_modules_mapping = copy.deepcopy( self.model.packed_modules_mapping) + # Used to indicate whether the model is a multimodal model + self.supports_mm: bool = supports_multimodal(self.model) self.packed_modules: Dict[str, List[str]] = {} self.modules: Dict[str, "BaseLayerWithLoRA"] = {} # Dict instead of a Set for compatibility with LRUCache. @@ -437,6 +441,15 @@ def _create_lora_modules(self): continue if not self._match_target_modules(module_name): continue + # A temporary approach for multimodal models to support LoRA + # TODO: Remove this restriction + if self._filter_unsupported_modules(module_name): + logger.warning( + "Regarding multimodal models, vLLM currently only supports " + "adding LoRA to language models, %s will be ignored.", + module_name, + ) + continue parts = module_name.split(".")[-1] packed_moduled_lst = self.packed_modules_mapping.get(parts, []) new_module = replace_submodule( @@ -485,9 +498,10 @@ def create_dummy_lora( """Create zero-initialized LoRAModel for warmup.""" model = LoRAModel(lora_id, rank, {}, scaling_factor) for module_name, module in self.model.named_modules(): - if not self._match_target_modules(module_name) or not isinstance( - module, BaseLayerWithLoRA) or isinstance( - module, LinearScalingRotaryEmbeddingWithLora): + if (not self._match_target_modules(module_name) + or not isinstance(module, BaseLayerWithLoRA) + or isinstance(module, LinearScalingRotaryEmbeddingWithLora) + or self._filter_unsupported_modules(module_name)): continue parts = module_name.split(".") if module_name not in self.packed_modules: @@ -548,6 +562,19 @@ def _match_target_modules(self, module_name: str): module_name) or target_module == module_name for target_module in self.supported_lora_modules) + def _filter_unsupported_modules(self, module_name: str) -> bool: + """ + Regarding multimodal models, vLLM currently only supports adding LoRA to + language model. LoRA for other modules, such as the vision tower, will + be filtered out. 
+ """ + if self.supports_mm: + prefix = module_name.split(".")[0] + module_mapping: MultiModelKeys = self.model.get_mm_mapping() + return (prefix in module_mapping.connector + or prefix in module_mapping.vision_tower) + return False + def _register_packed_modules(self, module_full_name: str) -> None: parts = module_full_name.split(".") module_name = parts[-1] diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 1d40aad8c82d..26adb04cc07c 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -50,6 +50,7 @@ from vllm.model_executor.models.interfaces import SupportsMultiModal from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.models.minicpm import MiniCPMModel +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY @@ -403,6 +404,9 @@ def __init__(self, llm: nn.Module, name: str) -> None: def forward(self, *args, **kwargs) -> Any: return getattr(self, self.model_name)(*args, **kwargs) + def embed_tokens(self, *args, **kwargs): + return getattr(self, self.model_name).embed_tokens(*args, **kwargs) + class MiniCPMVBaseModel(nn.Module, SupportsMultiModal): """ @@ -636,6 +640,14 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): default_weight_loader) weight_loader(param, loaded_weight) + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys(language_model="llm", + connector="resampler", + vision_tower="vpm") + def init_llm( self, config: PretrainedConfig, @@ -778,8 +790,10 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA): # LoRA specific attributes supported_lora_modules = [ - "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "fc1", "fc2", - "out_proj", "kv_proj" + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", ] embedding_modules = {} embedding_padding_modules = [] diff --git a/vllm/model_executor/models/module_mapping.py b/vllm/model_executor/models/module_mapping.py new file mode 100644 index 000000000000..7e13896acbe9 --- /dev/null +++ b/vllm/model_executor/models/module_mapping.py @@ -0,0 +1,59 @@ + +#Copied code from: https://github.com/modelscope/ms-swift/blob/v2.4.2/swift/utils/module_mapping.py + +from dataclasses import dataclass, field +from typing import List, Union + + +@dataclass +class ModelKeys: + model_type: str = None + + module_list: str = None + + embedding: str = None + + mlp: str = None + + down_proj: str = None + + attention: str = None + + o_proj: str = None + + q_proj: str = None + + k_proj: str = None + + v_proj: str = None + + qkv_proj: str = None + + qk_proj: str = None + + qa_proj: str = None + + qb_proj: str = None + + kva_proj: str = None + + kvb_proj: str = None + + output: str = None + + +@dataclass +class MultiModelKeys(ModelKeys): + language_model: Union[List[str], str] = field(default_factory=list) + connector: Union[List[str], str] = field(default_factory=list) + vision_tower: Union[List[str], str] = field(default_factory=list) + generator: Union[List[str], str] = field(default_factory=list) + + def __post_init__(self): + # compat + for key in ["language_model", "connector", "vision_tower", "generator"]: + v = getattr(self, key) + if isinstance(v, str): + setattr(self, key, [v]) + if v is None: + setattr(self, key, []) diff --git 
a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index dc1020e3ea8b..970b42b23135 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1028,7 +1028,8 @@ def load_model(self) -> None: if self.lora_config: assert supports_lora(self.model), "Model does not support LoRA" if supports_multimodal(self.model): - logger.warning("todo:add warning info") + logger.warning("Regarding multimodal models, vLLM currently " + "only supports adding LoRA to language model.") self.lora_manager = LRUCacheWorkerLoRAManager( self.scheduler_config.max_num_seqs, From 65b5b08fe3497046812470b25334b9b6b022e7f4 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 25 Sep 2024 23:12:18 +0800 Subject: [PATCH 07/25] Format code --- vllm/model_executor/models/module_mapping.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/module_mapping.py b/vllm/model_executor/models/module_mapping.py index 7e13896acbe9..10ee06fde0e1 100644 --- a/vllm/model_executor/models/module_mapping.py +++ b/vllm/model_executor/models/module_mapping.py @@ -1,5 +1,5 @@ - -#Copied code from: https://github.com/modelscope/ms-swift/blob/v2.4.2/swift/utils/module_mapping.py +# Copied code from +# https://github.com/modelscope/ms-swift/blob/v2.4.2/swift/utils/module_mapping.py from dataclasses import dataclass, field from typing import List, Union @@ -51,7 +51,9 @@ class MultiModelKeys(ModelKeys): def __post_init__(self): # compat - for key in ["language_model", "connector", "vision_tower", "generator"]: + for key in [ + "language_model", "connector", "vision_tower", "generator" + ]: v = getattr(self, key) if isinstance(v, str): setattr(self, key, [v]) From 9bf92d5deb03cbd2b33fb2dc7209e9cf60ed1437 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 25 Sep 2024 23:36:26 +0800 Subject: [PATCH 08/25] Clean code --- vllm/lora/models.py | 2 +- vllm/model_executor/models/minicpmv.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index d726a36b83c2..b2110e9188be 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -446,7 +446,7 @@ def _create_lora_modules(self): if self._filter_unsupported_modules(module_name): logger.warning( "Regarding multimodal models, vLLM currently only supports " - "adding LoRA to language models, %s will be ignored.", + "adding LoRA to language model, %s will be ignored.", module_name, ) continue diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 26adb04cc07c..063c698db91f 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -787,13 +787,19 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA): "up_proj", ], } - # LoRA specific attributes supported_lora_modules = [ - "qkv_proj", + # vision encoder + "fc1", + "fc2", + "out_proj", + # language model + "qkv_proj", # same name with vision encoder "o_proj", "gate_up_proj", "down_proj", + # resampler + "kv_proj", ] embedding_modules = {} embedding_padding_modules = [] From 561b4b758be556c7e57583572b6d27566c5745a2 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 26 Sep 2024 16:05:21 +0800 Subject: [PATCH 09/25] Clean code --- vllm/lora/models.py | 8 ++++---- vllm/model_executor/models/minicpmv.py | 6 +++--- vllm/model_executor/models/module_mapping.py | 8 +++----- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 
b2110e9188be..c0b7ff8258a8 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -443,7 +443,7 @@ def _create_lora_modules(self): continue # A temporary approach for multimodal models to support LoRA # TODO: Remove this restriction - if self._filter_unsupported_modules(module_name): + if self._filter_unsupported_module(module_name): logger.warning( "Regarding multimodal models, vLLM currently only supports " "adding LoRA to language model, %s will be ignored.", @@ -501,7 +501,7 @@ def create_dummy_lora( if (not self._match_target_modules(module_name) or not isinstance(module, BaseLayerWithLoRA) or isinstance(module, LinearScalingRotaryEmbeddingWithLora) - or self._filter_unsupported_modules(module_name)): + or self._filter_unsupported_module(module_name)): continue parts = module_name.split(".") if module_name not in self.packed_modules: @@ -562,7 +562,7 @@ def _match_target_modules(self, module_name: str): module_name) or target_module == module_name for target_module in self.supported_lora_modules) - def _filter_unsupported_modules(self, module_name: str) -> bool: + def _filter_unsupported_module(self, module_name: str) -> bool: """ Regarding multimodal models, vLLM currently only supports adding LoRA to language model. LoRA for other modules, such as the vision tower, will @@ -572,7 +572,7 @@ def _filter_unsupported_modules(self, module_name: str) -> bool: prefix = module_name.split(".")[0] module_mapping: MultiModelKeys = self.model.get_mm_mapping() return (prefix in module_mapping.connector - or prefix in module_mapping.vision_tower) + or prefix in module_mapping.tower_model) return False def _register_packed_modules(self, module_full_name: str) -> None: diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 063c698db91f..2be49dc18b76 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -404,7 +404,7 @@ def __init__(self, llm: nn.Module, name: str) -> None: def forward(self, *args, **kwargs) -> Any: return getattr(self, self.model_name)(*args, **kwargs) - def embed_tokens(self, *args, **kwargs): + def embed_tokens(self, *args, **kwargs) -> Any: return getattr(self, self.model_name).embed_tokens(*args, **kwargs) @@ -646,7 +646,7 @@ def get_mm_mapping(self) -> MultiModelKeys: """ return MultiModelKeys(language_model="llm", connector="resampler", - vision_tower="vpm") + tower_model="vpm") def init_llm( self, @@ -1001,7 +1001,7 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsLoRA): bitsandbytes in vLLM. Therefore, it is necessary to separate them. """ # Ensure that the LoRA support check passes when the class is not - # initialized,but set all these attributes to empty + # initialized, but set all these attributes to empty. 
packed_modules_mapping = {} supported_lora_modules = [] embedding_modules = {} diff --git a/vllm/model_executor/models/module_mapping.py b/vllm/model_executor/models/module_mapping.py index 10ee06fde0e1..221be8bb6133 100644 --- a/vllm/model_executor/models/module_mapping.py +++ b/vllm/model_executor/models/module_mapping.py @@ -46,14 +46,12 @@ class ModelKeys: class MultiModelKeys(ModelKeys): language_model: Union[List[str], str] = field(default_factory=list) connector: Union[List[str], str] = field(default_factory=list) - vision_tower: Union[List[str], str] = field(default_factory=list) + # such vision tower and audio tower + tower_model: Union[List[str], str] = field(default_factory=list) generator: Union[List[str], str] = field(default_factory=list) def __post_init__(self): - # compat - for key in [ - "language_model", "connector", "vision_tower", "generator" - ]: + for key in ["language_model", "connector", "tower_model", "generator"]: v = getattr(self, key) if isinstance(v, str): setattr(self, key, [v]) From 578deba51fbdb67f21d88870f80c5b3bd24234ed Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 26 Sep 2024 16:13:27 +0800 Subject: [PATCH 10/25] Clean code --- vllm/model_executor/models/module_mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/module_mapping.py b/vllm/model_executor/models/module_mapping.py index 221be8bb6133..66f5427f8f30 100644 --- a/vllm/model_executor/models/module_mapping.py +++ b/vllm/model_executor/models/module_mapping.py @@ -46,7 +46,7 @@ class ModelKeys: class MultiModelKeys(ModelKeys): language_model: Union[List[str], str] = field(default_factory=list) connector: Union[List[str], str] = field(default_factory=list) - # such vision tower and audio tower + # vision tower and audio tower tower_model: Union[List[str], str] = field(default_factory=list) generator: Union[List[str], str] = field(default_factory=list) From 99dacdf223d3148ebe4169984012fffc02dd953e Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 27 Sep 2024 14:12:26 +0800 Subject: [PATCH 11/25] Add unit test for minicpmv25 --- tests/lora/conftest.py | 5 ++ tests/lora/test_minicpmv.py | 99 ++++++++++++++++++++++++++ vllm/model_executor/models/minicpmv.py | 11 ++- vllm/worker/model_runner.py | 4 +- 4 files changed, 115 insertions(+), 4 deletions(-) create mode 100644 tests/lora/test_minicpmv.py diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 4834a9d35a3e..7f6f60f38b5d 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -194,6 +194,11 @@ def baichuan_zero_lora_files(): return snapshot_download(repo_id="jeeejeee/baichuan7b-zero-init") +@pytest.fixture(scope="session") +def minicpmv_lora_files(): + return snapshot_download(repo_id="jeeejeee/minicpmv25-lora-pokemon") + + @pytest.fixture(scope="session") def tinyllama_lora_files(): return snapshot_download(repo_id="jashing/tinyllama-colorist-lora") diff --git a/tests/lora/test_minicpmv.py b/tests/lora/test_minicpmv.py new file mode 100644 index 000000000000..d279513dabaa --- /dev/null +++ b/tests/lora/test_minicpmv.py @@ -0,0 +1,99 @@ +from typing import List + +import pytest + +import vllm +from vllm.lora.request import LoRARequest +from vllm.assets.image import ImageAsset + + +MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5" + +PROMPT_TEMPLATE = ( + "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + "(./)\nWhat is in the image?<|eot_id|>" + "<|start_header_id|>assistant<|end_header_id|>\n\n" +) + +IMAGE_ASSETS = [ + ImageAsset("stop_sign"), + 
ImageAsset("cherry_blossom"), +] + + +# After fine-tuning with LoRA, all generated content should start begin `A`. +EXPECTED_OUTPUT = [ + "A red and white stop sign with a Chinese archway in the background featuring red lanterns and gold accents.", # noqa: E501 + "A pink cherry blossom tree with a blue sky in the background.", +] + + +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: + sampling_params = vllm.SamplingParams( + temperature=0, + max_tokens=256, + stop_token_ids=[128001, 128009], # eos_id, eot_id + ) + + inputs = [ + { + "prompt": PROMPT_TEMPLATE, + "multi_modal_data": {"image": asset.pil_image}, + } + for asset in IMAGE_ASSETS + ] + + outputs = llm.generate( + inputs, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id + else None, + ) + # Print the outputs. + generated_texts: List[str] = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text.strip() + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + return generated_texts + + +def test_minicpmv_lora(minicpmv_lora_files): + llm = vllm.LLM( + MODEL_PATH, + max_model_len=1024, + enable_lora=True, + max_loras=4, + max_lora_rank=64, + trust_remote_code=True, + ) + + output1 = do_sample(llm, minicpmv_lora_files, lora_id=1) + for i in range(len(EXPECTED_OUTPUT)): + assert output1[i] == EXPECTED_OUTPUT[i] + output2 = do_sample(llm, minicpmv_lora_files, lora_id=2) + for i in range(len(EXPECTED_OUTPUT)): + assert output2[i] == EXPECTED_OUTPUT[i] + + +# @pytest.mark.skip("Requires multiple GPUs") +@pytest.mark.parametrize("fully_sharded", [True, False]) +@pytest.mark.parametrize("tp", [2, 4]) +def test_minicpmv_tensor_parallel(minicpmv_lora_files, fully_sharded, tp): + llm = vllm.LLM( + MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + max_lora_rank=64, + tensor_parallel_size=tp, + trust_remote_code=True, + fully_sharded_loras=fully_sharded, + ) + output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) + + for i in range(len(EXPECTED_OUTPUT)): + assert output_tp[i] == EXPECTED_OUTPUT[i] + diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 2be49dc18b76..d793cfa83e01 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -904,9 +904,14 @@ def init_llm( cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ) -> nn.Module: - return Qwen2Model(config, - cache_config=cache_config, - quant_config=quant_config) + # return Qwen2Model(config, + # cache_config=cache_config, + # quant_config=quant_config) + + return LLMWrapper(Qwen2Model(config, + cache_config=cache_config, + quant_config=quant_config), + name="model") def init_vision_module(self) -> nn.Module: # A custom version of SiglipVisionTransformer, won't work with TP diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 970b42b23135..b2107862aa04 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1026,7 +1026,9 @@ def load_model(self) -> None: self.model_memory_usage / float(2**30)) if self.lora_config: - assert supports_lora(self.model), "Model does not support LoRA" + assert supports_lora( + self.model + ), f"{self.model.__class__.__name__} does not support LoRA yet." 
if supports_multimodal(self.model): logger.warning("Regarding multimodal models, vLLM currently " "only supports adding LoRA to language model.") From 9b85373a54bf8eb0ee1d0932ba48de1419ce6425 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 27 Sep 2024 14:18:47 +0800 Subject: [PATCH 12/25] Format code --- tests/lora/test_minicpmv.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/tests/lora/test_minicpmv.py b/tests/lora/test_minicpmv.py index d279513dabaa..d0bfdf076a3b 100644 --- a/tests/lora/test_minicpmv.py +++ b/tests/lora/test_minicpmv.py @@ -3,24 +3,21 @@ import pytest import vllm -from vllm.lora.request import LoRARequest from vllm.assets.image import ImageAsset - +from vllm.lora.request import LoRARequest MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5" PROMPT_TEMPLATE = ( "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" "(./)\nWhat is in the image?<|eot_id|>" - "<|start_header_id|>assistant<|end_header_id|>\n\n" -) + "<|start_header_id|>assistant<|end_header_id|>\n\n") IMAGE_ASSETS = [ ImageAsset("stop_sign"), ImageAsset("cherry_blossom"), ] - # After fine-tuning with LoRA, all generated content should start begin `A`. EXPECTED_OUTPUT = [ "A red and white stop sign with a Chinese archway in the background featuring red lanterns and gold accents.", # noqa: E501 @@ -35,20 +32,18 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: stop_token_ids=[128001, 128009], # eos_id, eot_id ) - inputs = [ - { - "prompt": PROMPT_TEMPLATE, - "multi_modal_data": {"image": asset.pil_image}, - } - for asset in IMAGE_ASSETS - ] + inputs = [{ + "prompt": PROMPT_TEMPLATE, + "multi_modal_data": { + "image": asset.pil_image + }, + } for asset in IMAGE_ASSETS] outputs = llm.generate( inputs, sampling_params, lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id - else None, + if lora_id else None, ) # Print the outputs. 
generated_texts: List[str] = [] @@ -96,4 +91,3 @@ def test_minicpmv_tensor_parallel(minicpmv_lora_files, fully_sharded, tp): for i in range(len(EXPECTED_OUTPUT)): assert output_tp[i] == EXPECTED_OUTPUT[i] - From bf4ee9d6d13c9fa876ec39e5b09d35bde712660b Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 27 Sep 2024 16:47:30 +0800 Subject: [PATCH 13/25] Modify code --- tests/lora/test_minicpmv.py | 4 +++- vllm/lora/models.py | 6 +++--- vllm/model_executor/models/minicpmv.py | 22 +--------------------- vllm/model_executor/models/utils.py | 22 ++++++++++++++++++++-- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/tests/lora/test_minicpmv.py b/tests/lora/test_minicpmv.py index d0bfdf076a3b..92cd155f5625 100644 --- a/tests/lora/test_minicpmv.py +++ b/tests/lora/test_minicpmv.py @@ -6,6 +6,8 @@ from vllm.assets.image import ImageAsset from vllm.lora.request import LoRARequest +from ..utils import multi_gpu_test + MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5" PROMPT_TEMPLATE = ( @@ -73,7 +75,7 @@ def test_minicpmv_lora(minicpmv_lora_files): assert output2[i] == EXPECTED_OUTPUT[i] -# @pytest.mark.skip("Requires multiple GPUs") +@multi_gpu_test(num_gpus=4) @pytest.mark.parametrize("fully_sharded", [True, False]) @pytest.mark.parametrize("tp", [2, 4]) def test_minicpmv_tensor_parallel(minicpmv_lora_files, fully_sharded, tp): diff --git a/vllm/lora/models.py b/vllm/lora/models.py index c0b7ff8258a8..03c019b7f90a 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -443,7 +443,7 @@ def _create_lora_modules(self): continue # A temporary approach for multimodal models to support LoRA # TODO: Remove this restriction - if self._filter_unsupported_module(module_name): + if self._filter_unsupported_mm_module(module_name): logger.warning( "Regarding multimodal models, vLLM currently only supports " "adding LoRA to language model, %s will be ignored.", @@ -501,7 +501,7 @@ def create_dummy_lora( if (not self._match_target_modules(module_name) or not isinstance(module, BaseLayerWithLoRA) or isinstance(module, LinearScalingRotaryEmbeddingWithLora) - or self._filter_unsupported_module(module_name)): + or self._filter_unsupported_mm_module(module_name)): continue parts = module_name.split(".") if module_name not in self.packed_modules: @@ -562,7 +562,7 @@ def _match_target_modules(self, module_name: str): module_name) or target_module == module_name for target_module in self.supported_lora_modules) - def _filter_unsupported_module(self, module_name: str) -> bool: + def _filter_unsupported_mm_module(self, module_name: str) -> bool: """ Regarding multimodal models, vLLM currently only supports adding LoRA to language model. 
LoRA for other modules, such as the vision tower, will diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index d793cfa83e01..a56559c35e9b 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -52,6 +52,7 @@ from vllm.model_executor.models.minicpm import MiniCPMModel from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.qwen2 import Qwen2Model +from vllm.model_executor.models.utils import LLMWrapper from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.base import MultiModalInputs @@ -390,24 +391,6 @@ def input_mapper_for_minicpmv(ctx: InputContext, data: object): return MultiModalInputs(batch_data) -class LLMWrapper(nn.Module): - """ - To align with the key names of LoRA trained with PEFT, we need to add an - additional layer to the llm's implementation. - """ - - def __init__(self, llm: nn.Module, name: str) -> None: - super().__init__() - self.model_name = name - setattr(self, name, llm) - - def forward(self, *args, **kwargs) -> Any: - return getattr(self, self.model_name)(*args, **kwargs) - - def embed_tokens(self, *args, **kwargs) -> Any: - return getattr(self, self.model_name).embed_tokens(*args, **kwargs) - - class MiniCPMVBaseModel(nn.Module, SupportsMultiModal): """ The abstract class of MiniCPMV can only be inherited, but cannot be @@ -904,9 +887,6 @@ def init_llm( cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ) -> nn.Module: - # return Qwen2Model(config, - # cache_config=cache_config, - # quant_config=quant_config) return LLMWrapper(Qwen2Model(config, cache_config=cache_config, diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 38d6a4653ebd..f6218bad4ef1 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -1,7 +1,7 @@ import itertools from collections import UserDict -from typing import (Dict, Iterable, List, Literal, Optional, Protocol, Tuple, - Union, overload) +from typing import (Any, Dict, Iterable, List, Literal, Optional, Protocol, + Tuple, Union, overload) import torch import torch.nn as nn @@ -329,3 +329,21 @@ def make_empty_intermediate_tensors( }) return make_empty_intermediate_tensors + + +class LLMWrapper(nn.Module): + """ + To align with the key names of LoRA trained with PEFT, we need to add an + additional layer to the llm's implementation. 
+ """ + + def __init__(self, llm: nn.Module, name: str) -> None: + super().__init__() + self.model_name = name + setattr(self, name, llm) + + def forward(self, *args, **kwargs) -> Any: + return getattr(self, self.model_name)(*args, **kwargs) + + def embed_tokens(self, *args, **kwargs) -> Any: + return getattr(self, self.model_name).embed_tokens(*args, **kwargs) From a9e724c1a06c7f46b176f61356062f22f6a7a927 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 28 Sep 2024 00:31:00 +0800 Subject: [PATCH 14/25] Modify module_mapping logic --- vllm/model_executor/models/minicpmv.py | 6 ++-- vllm/model_executor/models/module_mapping.py | 36 +++++++++++++------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index a56559c35e9b..89cdfbcc6afa 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -627,9 +627,9 @@ def get_mm_mapping(self) -> MultiModelKeys: """ Get the module prefix in multimodal models """ - return MultiModelKeys(language_model="llm", - connector="resampler", - tower_model="vpm") + return MultiModelKeys.from_string_field(language_model="llm", + connector="resampler", + tower_model="vpm") def init_llm( self, diff --git a/vllm/model_executor/models/module_mapping.py b/vllm/model_executor/models/module_mapping.py index 66f5427f8f30..a9102a6073a2 100644 --- a/vllm/model_executor/models/module_mapping.py +++ b/vllm/model_executor/models/module_mapping.py @@ -1,4 +1,4 @@ -# Copied code from +# Adapted from # https://github.com/modelscope/ms-swift/blob/v2.4.2/swift/utils/module_mapping.py from dataclasses import dataclass, field @@ -44,16 +44,26 @@ class ModelKeys: @dataclass class MultiModelKeys(ModelKeys): - language_model: Union[List[str], str] = field(default_factory=list) - connector: Union[List[str], str] = field(default_factory=list) + language_model: List[str] = field(default_factory=list) + connector: List[str] = field(default_factory=list) # vision tower and audio tower - tower_model: Union[List[str], str] = field(default_factory=list) - generator: Union[List[str], str] = field(default_factory=list) - - def __post_init__(self): - for key in ["language_model", "connector", "tower_model", "generator"]: - v = getattr(self, key) - if isinstance(v, str): - setattr(self, key, [v]) - if v is None: - setattr(self, key, []) + tower_model: List[str] = field(default_factory=list) + generator: List[str] = field(default_factory=list) + + @staticmethod + def from_string_field(language_model: Union[str, List[str]] = None, + connector: Union[str, List[str]] = None, + tower_model: Union[str, List[str]] = None, + generator: Union[str, List[str]] = None, + **kwargs) -> 'MultiModelKeys': + + def to_list(value): + if value is None: + return [] + return [value] if isinstance(value, str) else list(value) + + return MultiModelKeys(language_model=to_list(language_model), + connector=to_list(connector), + tower_model=to_list(tower_model), + generator=to_list(generator), + **kwargs) From be6c92860fd29beca6d07af4c3205c1db1109f90 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 28 Sep 2024 00:43:13 +0800 Subject: [PATCH 15/25] Add unit test --- tests/lora/test_minicpmv.py | 24 --------- tests/lora/test_minicpmv_tp.py | 99 ++++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 24 deletions(-) create mode 100644 tests/lora/test_minicpmv_tp.py diff --git a/tests/lora/test_minicpmv.py b/tests/lora/test_minicpmv.py index 
92cd155f5625..4860b72fc1f6 100644 --- a/tests/lora/test_minicpmv.py +++ b/tests/lora/test_minicpmv.py @@ -1,13 +1,9 @@ from typing import List -import pytest - import vllm from vllm.assets.image import ImageAsset from vllm.lora.request import LoRARequest -from ..utils import multi_gpu_test - MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5" PROMPT_TEMPLATE = ( @@ -73,23 +69,3 @@ def test_minicpmv_lora(minicpmv_lora_files): output2 = do_sample(llm, minicpmv_lora_files, lora_id=2) for i in range(len(EXPECTED_OUTPUT)): assert output2[i] == EXPECTED_OUTPUT[i] - - -@multi_gpu_test(num_gpus=4) -@pytest.mark.parametrize("fully_sharded", [True, False]) -@pytest.mark.parametrize("tp", [2, 4]) -def test_minicpmv_tensor_parallel(minicpmv_lora_files, fully_sharded, tp): - llm = vllm.LLM( - MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - max_lora_rank=64, - tensor_parallel_size=tp, - trust_remote_code=True, - fully_sharded_loras=fully_sharded, - ) - output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) - - for i in range(len(EXPECTED_OUTPUT)): - assert output_tp[i] == EXPECTED_OUTPUT[i] diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py new file mode 100644 index 000000000000..40b1aa2701ac --- /dev/null +++ b/tests/lora/test_minicpmv_tp.py @@ -0,0 +1,99 @@ +from typing import List + +import pytest + +import vllm +from vllm.assets.image import ImageAsset +from vllm.lora.request import LoRARequest + +from ..utils import multi_gpu_test + +MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5" + +PROMPT_TEMPLATE = ( + "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + "(./)\nWhat is in the image?<|eot_id|>" + "<|start_header_id|>assistant<|end_header_id|>\n\n" +) + +IMAGE_ASSETS = [ + ImageAsset("stop_sign"), + ImageAsset("cherry_blossom"), +] + +# After fine-tuning with LoRA, all generated content should start begin `A`. +EXPECTED_OUTPUT = [ + "A red and white stop sign with a Chinese archway in the background featuring red lanterns and gold accents.", # noqa: E501 + "A pink cherry blossom tree with a blue sky in the background.", +] + + +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: + sampling_params = vllm.SamplingParams( + temperature=0, + max_tokens=256, + stop_token_ids=[128001, 128009], # eos_id, eot_id + ) + + inputs = [ + { + "prompt": PROMPT_TEMPLATE, + "multi_modal_data": {"image": asset.pil_image}, + } + for asset in IMAGE_ASSETS + ] + + outputs = llm.generate( + inputs, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id + else None, + ) + # Print the outputs. 
+ generated_texts: List[str] = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text.strip() + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + return generated_texts + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize("fully_sharded", [True, False]) +def test_minicpmv_tp2(minicpmv_lora_files, fully_sharded): + llm = vllm.LLM( + MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + max_lora_rank=64, + tensor_parallel_size=2, + trust_remote_code=True, + fully_sharded_loras=fully_sharded, + ) + + output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) + + for i in range(len(EXPECTED_OUTPUT)): + assert output_tp[i] == EXPECTED_OUTPUT[i] + + +@multi_gpu_test(num_gpus=4) +@pytest.mark.parametrize("fully_sharded", [True, False]) +def test_minicpmv_tp4(minicpmv_lora_files, fully_sharded): + llm = vllm.LLM( + MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + max_lora_rank=64, + tensor_parallel_size=4, + trust_remote_code=True, + fully_sharded_loras=fully_sharded, + ) + + output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) + + for i in range(len(EXPECTED_OUTPUT)): + assert output_tp[i] == EXPECTED_OUTPUT[i] From c9db73e1f4d3f4e8c170346b44daef5a4bde2d1e Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 28 Sep 2024 22:40:08 +0800 Subject: [PATCH 16/25] Modify unit test --- tests/lora/test_minicpmv.py | 2 +- tests/lora/test_minicpmv_tp.py | 24 +++++++++++------------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/tests/lora/test_minicpmv.py b/tests/lora/test_minicpmv.py index 4860b72fc1f6..4e1b5468641e 100644 --- a/tests/lora/test_minicpmv.py +++ b/tests/lora/test_minicpmv.py @@ -56,7 +56,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: def test_minicpmv_lora(minicpmv_lora_files): llm = vllm.LLM( MODEL_PATH, - max_model_len=1024, + max_num_seqs=2, enable_lora=True, max_loras=4, max_lora_rank=64, diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py index 40b1aa2701ac..260530627597 100644 --- a/tests/lora/test_minicpmv_tp.py +++ b/tests/lora/test_minicpmv_tp.py @@ -13,8 +13,7 @@ PROMPT_TEMPLATE = ( "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" "(./)\nWhat is in the image?<|eot_id|>" - "<|start_header_id|>assistant<|end_header_id|>\n\n" -) + "<|start_header_id|>assistant<|end_header_id|>\n\n") IMAGE_ASSETS = [ ImageAsset("stop_sign"), @@ -35,20 +34,18 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: stop_token_ids=[128001, 128009], # eos_id, eot_id ) - inputs = [ - { - "prompt": PROMPT_TEMPLATE, - "multi_modal_data": {"image": asset.pil_image}, - } - for asset in IMAGE_ASSETS - ] + inputs = [{ + "prompt": PROMPT_TEMPLATE, + "multi_modal_data": { + "image": asset.pil_image + }, + } for asset in IMAGE_ASSETS] outputs = llm.generate( inputs, sampling_params, lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id - else None, + if lora_id else None, ) # Print the outputs. 
generated_texts: List[str] = [] @@ -59,13 +56,14 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") return generated_texts + @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("fully_sharded", [True, False]) def test_minicpmv_tp2(minicpmv_lora_files, fully_sharded): llm = vllm.LLM( MODEL_PATH, enable_lora=True, - max_num_seqs=16, + max_num_seqs=2, max_loras=4, max_lora_rank=64, tensor_parallel_size=2, @@ -85,7 +83,7 @@ def test_minicpmv_tp4(minicpmv_lora_files, fully_sharded): llm = vllm.LLM( MODEL_PATH, enable_lora=True, - max_num_seqs=16, + max_num_seqs=2, max_loras=4, max_lora_rank=64, tensor_parallel_size=4, From bbfd3e0b8032d504edd52339f542e89fca8a6765 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 29 Sep 2024 10:42:47 +0800 Subject: [PATCH 17/25] Delete mincpmv25 distributed test --- tests/lora/test_minicpmv_tp.py | 97 ---------------------------------- 1 file changed, 97 deletions(-) delete mode 100644 tests/lora/test_minicpmv_tp.py diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py deleted file mode 100644 index 260530627597..000000000000 --- a/tests/lora/test_minicpmv_tp.py +++ /dev/null @@ -1,97 +0,0 @@ -from typing import List - -import pytest - -import vllm -from vllm.assets.image import ImageAsset -from vllm.lora.request import LoRARequest - -from ..utils import multi_gpu_test - -MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5" - -PROMPT_TEMPLATE = ( - "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" - "(./)\nWhat is in the image?<|eot_id|>" - "<|start_header_id|>assistant<|end_header_id|>\n\n") - -IMAGE_ASSETS = [ - ImageAsset("stop_sign"), - ImageAsset("cherry_blossom"), -] - -# After fine-tuning with LoRA, all generated content should start begin `A`. -EXPECTED_OUTPUT = [ - "A red and white stop sign with a Chinese archway in the background featuring red lanterns and gold accents.", # noqa: E501 - "A pink cherry blossom tree with a blue sky in the background.", -] - - -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: - sampling_params = vllm.SamplingParams( - temperature=0, - max_tokens=256, - stop_token_ids=[128001, 128009], # eos_id, eot_id - ) - - inputs = [{ - "prompt": PROMPT_TEMPLATE, - "multi_modal_data": { - "image": asset.pil_image - }, - } for asset in IMAGE_ASSETS] - - outputs = llm.generate( - inputs, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None, - ) - # Print the outputs. 
- generated_texts: List[str] = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text.strip() - generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - return generated_texts - - -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize("fully_sharded", [True, False]) -def test_minicpmv_tp2(minicpmv_lora_files, fully_sharded): - llm = vllm.LLM( - MODEL_PATH, - enable_lora=True, - max_num_seqs=2, - max_loras=4, - max_lora_rank=64, - tensor_parallel_size=2, - trust_remote_code=True, - fully_sharded_loras=fully_sharded, - ) - - output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) - - for i in range(len(EXPECTED_OUTPUT)): - assert output_tp[i] == EXPECTED_OUTPUT[i] - - -@multi_gpu_test(num_gpus=4) -@pytest.mark.parametrize("fully_sharded", [True, False]) -def test_minicpmv_tp4(minicpmv_lora_files, fully_sharded): - llm = vllm.LLM( - MODEL_PATH, - enable_lora=True, - max_num_seqs=2, - max_loras=4, - max_lora_rank=64, - tensor_parallel_size=4, - trust_remote_code=True, - fully_sharded_loras=fully_sharded, - ) - - output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) - - for i in range(len(EXPECTED_OUTPUT)): - assert output_tp[i] == EXPECTED_OUTPUT[i] From acc836a6a375a82749e6fc394819108b2886f2c3 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 29 Sep 2024 12:44:52 +0800 Subject: [PATCH 18/25] Fix lora bug and modify minicpmv lora tests --- tests/lora/test_minicpmv.py | 6 +-- tests/lora/test_minicpmv_tp.py | 95 ++++++++++++++++++++++++++++++++++ vllm/lora/models.py | 17 +++--- 3 files changed, 108 insertions(+), 10 deletions(-) create mode 100644 tests/lora/test_minicpmv_tp.py diff --git a/tests/lora/test_minicpmv.py b/tests/lora/test_minicpmv.py index 4e1b5468641e..81b8188e638c 100644 --- a/tests/lora/test_minicpmv.py +++ b/tests/lora/test_minicpmv.py @@ -26,7 +26,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: sampling_params = vllm.SamplingParams( temperature=0, - max_tokens=256, + max_tokens=5, stop_token_ids=[128001, 128009], # eos_id, eot_id ) @@ -65,7 +65,7 @@ def test_minicpmv_lora(minicpmv_lora_files): output1 = do_sample(llm, minicpmv_lora_files, lora_id=1) for i in range(len(EXPECTED_OUTPUT)): - assert output1[i] == EXPECTED_OUTPUT[i] + assert EXPECTED_OUTPUT[i].startswith(output1[i]) output2 = do_sample(llm, minicpmv_lora_files, lora_id=2) for i in range(len(EXPECTED_OUTPUT)): - assert output2[i] == EXPECTED_OUTPUT[i] + assert EXPECTED_OUTPUT[i].startswith(output2[i]) diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py new file mode 100644 index 000000000000..ba29e562e58e --- /dev/null +++ b/tests/lora/test_minicpmv_tp.py @@ -0,0 +1,95 @@ +from typing import List + +import pytest + +import vllm +from vllm.assets.image import ImageAsset +from vllm.lora.request import LoRARequest + +from ..utils import multi_gpu_test + +MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5" + +PROMPT_TEMPLATE = ( + "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + "(./)\nWhat is in the image?<|eot_id|>" + "<|start_header_id|>assistant<|end_header_id|>\n\n") + +IMAGE_ASSETS = [ + ImageAsset("stop_sign"), + ImageAsset("cherry_blossom"), +] + +# After fine-tuning with LoRA, all generated content should start begin `A`. 
+EXPECTED_OUTPUT = [ + "A red and white stop sign with a Chinese archway in the background featuring red lanterns and gold accents.", # noqa: E501 + "A pink cherry blossom tree with a blue sky in the background.", +] + + +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: + sampling_params = vllm.SamplingParams( + temperature=0, + max_tokens=5, + stop_token_ids=[128001, 128009], # eos_id, eot_id + ) + + inputs = [{ + "prompt": PROMPT_TEMPLATE, + "multi_modal_data": { + "image": asset.pil_image + }, + } for asset in IMAGE_ASSETS] + + outputs = llm.generate( + inputs, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None, + ) + # Print the outputs. + generated_texts: List[str] = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text.strip() + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + return generated_texts + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize("fully_sharded", [True, False]) +def test_minicpmv_tp2(minicpmv_lora_files, fully_sharded): + llm = vllm.LLM( + MODEL_PATH, + enable_lora=True, + max_num_seqs=2, + max_loras=4, + max_lora_rank=64, + tensor_parallel_size=2, + trust_remote_code=True, + fully_sharded_loras=fully_sharded, + ) + + output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) + + for i in range(len(EXPECTED_OUTPUT)): + assert EXPECTED_OUTPUT[i].startswith(output_tp[i]) + + +@multi_gpu_test(num_gpus=4) +@pytest.mark.parametrize("fully_sharded", [True, False]) +def test_minicpmv_tp4(minicpmv_lora_files, fully_sharded): + llm = vllm.LLM( + MODEL_PATH, + enable_lora=True, + max_num_seqs=2, + max_loras=4, + max_lora_rank=64, + tensor_parallel_size=4, + trust_remote_code=True, + fully_sharded_loras=fully_sharded, + ) + output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) + for i in range(len(EXPECTED_OUTPUT)): + assert EXPECTED_OUTPUT[i].startswith(output_tp[i]) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 03c019b7f90a..1f80c716bc48 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -456,13 +456,7 @@ def _create_lora_modules(self): self.model, module_name, from_layer(module, self.lora_slots, self.lora_config, packed_moduled_lst, self.model.config)) - # In some models, especially multimodal ones, layers with the same - # name may have different types, such as nn.Linear and - # ReplicatedLinear. The nn.Linear layers cannot be replaced with - # LoRA layers, leading to assertion error. The following check - # aims to prevent this error - if not isinstance(new_module, BaseLayerWithLoRA): - continue + # LinearScalingRotaryEmbeddingWithLora is used to handle # long context lora. Register relevant metadata. if isinstance(new_module, LinearScalingRotaryEmbeddingWithLora): @@ -480,6 +474,15 @@ def _create_lora_modules(self): module, self.lora_slots, self.lora_config, self.model.config)) + + # In some models, especially multimodal ones, layers with the same + # name may have different types, such as nn.Linear and + # ReplicatedLinear. The nn.Linear layers cannot be replaced with + # LoRA layers, leading to assertion error. The following check + # aims to prevent this error + if self.supports_mm and not isinstance(new_module, + BaseLayerWithLoRA): + continue self.register_module(module_name, new_module) self._register_packed_modules(module_name) # All lora layers share the same punica_wrapper based on reference. 
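
Taken together, the patches up to this point define a reusable recipe for giving a multimodal model LoRA support: inherit `SupportsLoRA`, declare the LoRA class attributes, wrap the language model in `LLMWrapper` so parameter names line up with PEFT-style adapter keys, and expose the module prefixes through `get_mm_mapping()` so the LoRA manager can skip the vision tower and connector. The sketch below restates that recipe on a hypothetical `MyVLModel` class; it is an illustration assembled from the code above, not part of the patch series, and the prefix names (`llm`, `resampler`, `vpm`) are simply the ones MiniCPM-V 2.5 uses.

```python
# Illustrative sketch only (not part of the patch series): the pattern the
# MiniCPM-V 2.5 patches above establish for multimodal LoRA support.
# `MyVLModel` and its constructor arguments are hypothetical.
import torch.nn as nn

from vllm.model_executor.models.interfaces import (SupportsLoRA,
                                                   SupportsMultiModal)
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.utils import LLMWrapper


class MyVLModel(nn.Module, SupportsMultiModal, SupportsLoRA):
    # Fused projections are mapped to their sub-modules so q/k/v (and
    # gate/up) LoRA weights can be packed into the corresponding vLLM layers.
    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    }
    # LoRA specific attributes (mirroring MiniCPM-V 2.5 above). Vision and
    # resampler entries may be listed, but the LoRA manager filters out any
    # module whose top-level prefix belongs to the connector or tower model
    # (see _filter_unsupported_mm_module in vllm/lora/models.py).
    supported_lora_modules = [
        "fc1", "fc2", "out_proj",      # vision encoder
        "qkv_proj", "o_proj",          # language model
        "gate_up_proj", "down_proj",
        "kv_proj",                     # resampler
    ]
    embedding_modules = {}
    embedding_padding_modules = []

    def __init__(self, language_model: nn.Module, vision_tower: nn.Module,
                 resampler: nn.Module) -> None:
        super().__init__()
        # LLMWrapper adds one extra naming level ("model") so that parameter
        # names match the keys of a PEFT-trained LoRA checkpoint.
        self.llm = LLMWrapper(language_model, name="model")
        self.vpm = vision_tower
        self.resampler = resampler
        # ... multimodal forward/embedding plumbing omitted ...

    def get_mm_mapping(self) -> MultiModelKeys:
        # Tells the LoRA manager which top-level prefixes are the language
        # model, the connector, and the vision/audio tower.
        return MultiModelKeys.from_string_field(language_model="llm",
                                                connector="resampler",
                                                tower_model="vpm")
```

The next patch ([PATCH 19/25]) applies the same attributes to `MiniCPMV2_6`.
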
From 27a7be46948427c33bdf35f63b24e715a074070d Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 29 Sep 2024 14:17:29 +0800 Subject: [PATCH 19/25] Minicpmv26 support LoRA done --- vllm/model_executor/models/minicpmv.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 89cdfbcc6afa..493a9ef05332 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -869,7 +869,28 @@ def is_default_weight_loading(self, name: str) -> bool: return "resampler" in name -class MiniCPMV2_6(MiniCPMVBaseModel): +class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + ] + embedding_modules = {} + embedding_padding_modules = [] def __init__( self, From 114c4e01c7bfc2b86e08a23182a80e44cc382949 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 29 Sep 2024 17:11:39 +0800 Subject: [PATCH 20/25] Update minicpmv26 vpm --- .../model_executor/models/idefics2_vision_model.py | 14 +++++++++++--- vllm/model_executor/models/minicpmv.py | 8 ++++---- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index cc448ed28d2d..76b360a4e231 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -69,6 +69,7 @@ def forward( self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor, + tgt_sizes: Optional[torch.IntTensor] = None ) -> torch.Tensor: batch_size, _, max_im_h, max_im_w = pixel_values.shape patch_embeds = self.patch_embedding(pixel_values) @@ -84,8 +85,13 @@ def forward( fill_value=0) for batch_idx, p_attn_mask in enumerate(patch_attention_mask): - nb_patches_h = p_attn_mask[:, 0].sum() - nb_patches_w = p_attn_mask[0].sum() + + if tgt_sizes is not None: + nb_patches_h = tgt_sizes[batch_idx][0] + nb_patches_w = tgt_sizes[batch_idx][1] + else: + nb_patches_h = p_attn_mask[:, 0].sum() + nb_patches_w = p_attn_mask[0].sum() fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h) fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w) bucket_coords_h = torch.bucketize(fractional_coords_h, @@ -287,10 +293,12 @@ def forward( self, pixel_values, patch_attention_mask: Optional[torch.BoolTensor] = None, + tgt_sizes: Optional[torch.IntTensor] = None, ) -> torch.tensor: hidden_states = self.embeddings( pixel_values=pixel_values, - patch_attention_mask=patch_attention_mask) + patch_attention_mask=patch_attention_mask, + tgt_sizes=tgt_sizes) encoder_outputs = self.encoder(hidden_states) last_hidden_state = self.post_layernorm(encoder_outputs) return last_hidden_state diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 493a9ef05332..d7eee629cb43 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -916,14 +916,14 @@ def init_llm( def init_vision_module(self) -> nn.Module: # A custom version of SiglipVisionTransformer, won't work with TP - from vllm.model_executor.models.na_vit import SiglipVisionTransformer + # from vllm.model_executor.models.na_vit import SiglipVisionTransformer if 
self.config._attn_implementation == "flash_attention_2": self.config.vision_config._attn_implementation = "flash_attention_2" else: # not support sdpa self.config.vision_config._attn_implementation = "eager" - model = SiglipVisionTransformer(self.config.vision_config) + model = Idefics2VisionTransformer(self.config.vision_config) if self.config.drop_vision_last_layer: model.encoder.layers = model.encoder.layers[:-1] return model @@ -981,12 +981,12 @@ def get_vision_hidden_states( all_pixel_values.type(dtype), patch_attention_mask=patch_attn_mask, tgt_sizes=tgt_sizes, - ).last_hidden_state + )#.last_hidden_state return self.resampler(vision_embedding, tgt_sizes) def is_default_weight_loading(self, name: str) -> bool: - return "resampler" in name or "vpm" in name + return "resampler" in name #or "vpm" in name _SUPPORT_VERSION = { From 1b7b0ec47764e4d2c5c481438a69cec4008cd9f4 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 29 Sep 2024 18:27:25 +0800 Subject: [PATCH 21/25] Done --- .../models/idefics2_vision_model.py | 10 +- vllm/model_executor/models/minicpmv.py | 24 +- vllm/model_executor/models/na_vit.py | 804 ------------------ 3 files changed, 16 insertions(+), 822 deletions(-) delete mode 100644 vllm/model_executor/models/na_vit.py diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 76b360a4e231..b069089abf97 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -65,12 +65,10 @@ def __init__(self, config: Idefics2VisionConfig): self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - def forward( - self, - pixel_values: torch.FloatTensor, - patch_attention_mask: torch.BoolTensor, - tgt_sizes: Optional[torch.IntTensor] = None - ) -> torch.Tensor: + def forward(self, + pixel_values: torch.FloatTensor, + patch_attention_mask: torch.BoolTensor, + tgt_sizes: Optional[torch.IntTensor] = None) -> torch.Tensor: batch_size, _, max_im_h, max_im_w = pixel_values.shape patch_embeds = self.patch_embedding(pixel_values) embeddings = patch_embeds.flatten(2).transpose(1, 2) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index d7eee629cb43..b7d067d9381d 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -881,14 +881,21 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): "up_proj", ], } - # LoRA specific attributes supported_lora_modules = [ - "qkv_proj", + # vision encoder + "fc1", + "fc2", + "out_proj", + # language model + "qkv_proj", # same name with vision encoder "o_proj", "gate_up_proj", "down_proj", + # resampler + "kv_proj", ] + embedding_modules = {} embedding_padding_modules = [] @@ -915,14 +922,7 @@ def init_llm( name="model") def init_vision_module(self) -> nn.Module: - # A custom version of SiglipVisionTransformer, won't work with TP - # from vllm.model_executor.models.na_vit import SiglipVisionTransformer - if self.config._attn_implementation == "flash_attention_2": - self.config.vision_config._attn_implementation = "flash_attention_2" - else: - # not support sdpa - self.config.vision_config._attn_implementation = "eager" model = Idefics2VisionTransformer(self.config.vision_config) if self.config.drop_vision_last_layer: model.encoder.layers = model.encoder.layers[:-1] @@ -949,7 +949,7 @@ def get_vision_embedding( pixel_values, patch_attention_mask=patch_attn_mask, tgt_sizes=tgt_sizes, - ).last_hidden_state + ) return 
vision_embedding def get_vision_hidden_states( @@ -981,12 +981,12 @@ def get_vision_hidden_states( all_pixel_values.type(dtype), patch_attention_mask=patch_attn_mask, tgt_sizes=tgt_sizes, - )#.last_hidden_state + ) return self.resampler(vision_embedding, tgt_sizes) def is_default_weight_loading(self, name: str) -> bool: - return "resampler" in name #or "vpm" in name + return "resampler" in name #or "vpm" in name _SUPPORT_VERSION = { diff --git a/vllm/model_executor/models/na_vit.py b/vllm/model_executor/models/na_vit.py deleted file mode 100644 index 1d6f26f0d4fb..000000000000 --- a/vllm/model_executor/models/na_vit.py +++ /dev/null @@ -1,804 +0,0 @@ -import logging -import math -import os -import warnings -from typing import Optional, Tuple, Union - -import numpy as np -import torch -import torch.nn.functional as F -from torch import nn -from torch.nn.init import _calculate_fan_in_and_fan_out -from transformers.activations import ACT2FN -from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask -from transformers.modeling_outputs import (BaseModelOutput, - BaseModelOutputWithPooling) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import (ModelOutput, is_flash_attn_2_available, - replace_return_docstrings) - -logger = logging.getLogger("vllm") - - -# For Siglip: copied from -# HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit and add tgt_sizes -# Remove hints as there's little possibility to change these code. -class SiglipVisionConfig(PretrainedConfig): - - model_type = "siglip_vision_model" - - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=224, - patch_size=16, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, - os.PathLike], - **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict( - pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from SiglipConfig - if config_dict.get("model_type") == "siglip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr( - cls, - "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - "You are using a model of type %s to " - "instantiate a model of type %s. 
" - "This is not supported for all configurations" - "of models and can yield errors.", config_dict['model_type'], - cls.model_type) - - return cls.from_dict(config_dict, **kwargs) - - -_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224" - -SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "google/siglip-base-patch16-224", - # See all SigLIP models at https://huggingface.co/models?filter=siglip -] - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import pad_input # noqa - from flash_attn.bert_padding import index_first_axis, unpad_input - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad( - torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -def _trunc_normal_(tensor, mean, std, a, b): - - def norm_cdf(x): - # Computes standard normal cumulative distribution function - return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 - - if (mean < a - 2 * std) or (mean > b + 2 * std): - warnings.warn( - "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " - "The distribution of values may be incorrect.", - stacklevel=2, - ) - - # Values are generated by using a truncated uniform distribution and - # then using the inverse CDF for the normal distribution. - # Get upper and lower cdf values - l_ = norm_cdf((a - mean) / std) - u = norm_cdf((b - mean) / std) - - # Uniformly fill tensor with values from [l, u], then translate to - # [2l-1, 2u-1]. - tensor.uniform_(2 * l_ - 1, 2 * u - 1) - - # Use inverse cdf transform for normal distribution to get truncated - # standard normal - if tensor.dtype in [torch.float16, torch.bfloat16]: - # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu - og_dtype = tensor.dtype - tensor = tensor.to(torch.float32) - tensor.erfinv_() - tensor = tensor.to(og_dtype) - else: - tensor.erfinv_() - - # Transform to proper mean, std - tensor.mul_(std * math.sqrt(2.0)) - tensor.add_(mean) - - # Clamp to ensure it's in the proper range - if tensor.dtype == torch.float16: - # The `clamp_` op is not (yet?) 
defined in float16+cpu - tensor = tensor.to(torch.float32) - tensor.clamp_(min=a, max=b) - tensor = tensor.to(torch.float16) - else: - tensor.clamp_(min=a, max=b) - - -def trunc_normal_tf_(tensor: torch.Tensor, - mean: float = 0.0, - std: float = 1.0, - a: float = -2.0, - b: float = 2.0) -> torch.Tensor: - with torch.no_grad(): - _trunc_normal_(tensor, 0, 1.0, a, b) - tensor.mul_(std).add_(mean) - - -def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"): - fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) - if mode == "fan_in": - denom = fan_in - elif mode == "fan_out": - denom = fan_out - elif mode == "fan_avg": - denom = (fan_in + fan_out) / 2 - - variance = scale / denom - - if distribution == "truncated_normal": - # constant is stddev of standard normal truncated to (-2, 2) - trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978) - elif distribution == "normal": - with torch.no_grad(): - tensor.normal_(std=math.sqrt(variance)) - elif distribution == "uniform": - bound = math.sqrt(3 * variance) - with torch.no_grad(): - tensor.uniform_(-bound, bound) - else: - raise ValueError(f"invalid distribution {distribution}") - - -def lecun_normal_(tensor): - variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal") - - -def default_flax_embed_init(tensor): - variance_scaling_(tensor, mode="fan_in", distribution="normal") - - -class SiglipVisionModelOutput(ModelOutput): - image_embeds: Optional[torch.FloatTensor] = None - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -class SiglipVisionEmbeddings(nn.Module): - - def __init__(self, config: SiglipVisionConfig): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.image_size = config.image_size - self.patch_size = config.patch_size - - self.patch_embedding = nn.Conv2d( - in_channels=config.num_channels, - out_channels=self.embed_dim, - kernel_size=self.patch_size, - stride=self.patch_size, - padding="valid", - ) - - self.num_patches_per_side = self.image_size // self.patch_size - self.num_patches = self.num_patches_per_side**2 - self.num_positions = self.num_patches - self.position_embedding = nn.Embedding(self.num_positions, - self.embed_dim) - - def forward(self, - pixel_values: torch.FloatTensor, - patch_attention_mask: torch.BoolTensor, - tgt_sizes: Optional[torch.IntTensor] = None) -> torch.Tensor: - batch_size = pixel_values.size(0) - - patch_embeds = self.patch_embedding(pixel_values) - embeddings = patch_embeds.flatten(2).transpose(1, 2) - - max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3) - max_nb_patches_h, max_nb_patches_w = (max_im_h // self.patch_size, - max_im_w // self.patch_size) - boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, - 1 / self.num_patches_per_side) - position_ids = torch.full( - size=( - batch_size, - max_nb_patches_h * max_nb_patches_w, - ), - fill_value=0, - ) - - for batch_idx, p_attn_mask in enumerate(patch_attention_mask): - if tgt_sizes is not None: - nb_patches_h = tgt_sizes[batch_idx][0] - nb_patches_w = tgt_sizes[batch_idx][1] - else: - nb_patches_h = p_attn_mask[:, 0].sum() - nb_patches_w = p_attn_mask[0].sum() - - fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h) - fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w) - - bucket_coords_h = torch.bucketize(fractional_coords_h, - boundaries, - right=True) - bucket_coords_w = 
torch.bucketize(fractional_coords_w, - boundaries, - right=True) - - pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + - bucket_coords_w).flatten() - position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids - - position_ids = position_ids.to(self.position_embedding.weight.device) - - embeddings = embeddings + self.position_embedding(position_ids) - return embeddings - - -class SiglipAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__ - def __init__(self, config): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - "embed_dim must be divisible by num_heads (got `embed_dim`: " - f"{self.embed_dim} and `num_heads`:" - f" {self.num_heads}).") - self.scale = self.head_dim**-0.5 - self.dropout = config.attention_dropout - - self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - batch_size, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - key_states = key_states.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - value_states = value_states.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - - k_v_seq_len = key_states.shape[-2] - attn_weights = torch.matmul(query_states, key_states.transpose( - 2, 3)) * self.scale - - if attn_weights.size() != (batch_size, self.num_heads, q_len, - k_v_seq_len): - raise ValueError( - "Attention weights should be of size " - f"{(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" - f" {attn_weights.size()}") - - if attention_mask is not None: - if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len): - raise ValueError( - "Attention mask should be of size " - f"{(batch_size, 1, q_len, k_v_seq_len)}", - f"but is {attention_mask.size()}") - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, - dim=-1, - dtype=torch.float32).to( - query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, - p=self.dropout, - training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (batch_size, self.num_heads, q_len, - self.head_dim): - raise ValueError( - "`attn_output` should be of size " - f"{(batch_size, self.num_heads, q_len, self.head_dim)}, " - "but is" - f" {attn_output.size()}") - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights - - -class SiglipFlashAttention2(SiglipAttention): - - def __init__(self, *args, 
**kwargs): - super().__init__(*args, **kwargs) - self.is_causal = False # Hack to make sure we don't use a causal mask - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - output_attentions = False - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length( - kv_seq_len, self.layer_idx) - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - dropout_rate = self.dropout if self.training else 0.0 - - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning( - "The input hidden states seems to be " - "silently casted in float32, " - "this might be related to the fact " - "you have upcasted embedding or layer norm layers in float32. 
" - "We will cast back the input in" - " %s.", target_dtype) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - attn_output = self._flash_attention_forward(query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate) - - attn_output = attn_output.reshape(bsz, q_len, - self.embed_dim).contiguous() - attn_output = self.out_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights - - def _flash_attention_forward(self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None): - causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - (query_states, key_states, value_states, indices_q, cu_seq_lens, - max_seq_lens) = self._upad_input(query_states, key_states, - value_states, attention_mask, - query_length) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, - query_length) - else: - attn_output = flash_attn_func(query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, - query_length): - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data( - attention_mask) - batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape - - key_layer = index_first_axis( - key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, - head_dim), indices_k) - value_layer = index_first_axis( - value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, - head_dim), indices_k) - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, self.num_heads, - head_dim), indices_k) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. 
- attention_mask = attention_mask[:, -query_length:] - (query_layer, indices_q, cu_seqlens_q, - max_seqlen_in_batch_q) = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip -class SiglipMLP(nn.Module): - - def __init__(self, config): - super().__init__() - self.config = config - self.activation_fn = ACT2FN[config.hidden_act] - self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) - self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.fc1(hidden_states) - hidden_states = self.activation_fn(hidden_states) - hidden_states = self.fc2(hidden_states) - return hidden_states - - -# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer -# with CLIP->Siglip -class SiglipEncoderLayer(nn.Module): - - def __init__(self, config: SiglipVisionConfig): - super().__init__() - self.embed_dim = config.hidden_size - self._use_flash_attention_2 = ( - config._attn_implementation == "flash_attention_2") - self.self_attn = (SiglipAttention(config) - if not self._use_flash_attention_2 else - SiglipFlashAttention2(config)) - self.layer_norm1 = nn.LayerNorm(self.embed_dim, - eps=config.layer_norm_eps) - self.mlp = SiglipMLP(config) - self.layer_norm2 = nn.LayerNorm(self.embed_dim, - eps=config.layer_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: torch.Tensor, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor]: - residual = hidden_states - - hidden_states = self.layer_norm1(hidden_states) - hidden_states, attn_weights = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - output_attentions=output_attentions, - ) - hidden_states = residual + hidden_states - - residual = hidden_states - hidden_states = self.layer_norm2(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states, ) - - if output_attentions: - outputs += (attn_weights, ) - - return outputs - - -class SiglipPreTrainedModel(PreTrainedModel): - config_class = SiglipVisionConfig - base_model_prefix = "siglip" - supports_gradient_checkpointing = True - - def _init_weights(self, module): - """Initialize the weights""" - - if isinstance(module, SiglipVisionEmbeddings): - width = self.config.hidden_size - nn.init.normal_(module.position_embedding.weight, - std=1 / np.sqrt(width)) - elif isinstance(module, nn.Embedding): - default_flax_embed_init(module.weight) - elif isinstance(module, SiglipAttention): - nn.init.normal_(module.q_proj.weight) - nn.init.normal_(module.k_proj.weight) - nn.init.normal_(module.v_proj.weight) - nn.init.normal_(module.out_proj.weight) - nn.init.zeros_(module.q_proj.bias) - nn.init.zeros_(module.k_proj.bias) - nn.init.zeros_(module.v_proj.bias) - nn.init.zeros_(module.out_proj.bias) - elif isinstance(module, SiglipMLP): - nn.init.normal_(module.fc1.weight) - nn.init.normal_(module.fc2.weight) - nn.init.normal_(module.fc1.bias, std=1e-6) - nn.init.normal_(module.fc2.bias, std=1e-6) - elif isinstance(module, (nn.Linear, nn.Conv2d)): - lecun_normal_(module.weight) - if module.bias is not None: - nn.init.zeros_(module.bias) - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - -# 
Copied from transformers.models.clip.modeling_clip.CLIPEncoder -# with CLIP->Siglip -class SiglipEncoder(nn.Module): - - def __init__(self, config: SiglipVisionConfig): - super().__init__() - self.config = config - self.layers = nn.ModuleList([ - SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers) - ]) - self.gradient_checkpointing = False - - # Ignore copy - def forward( - self, - inputs_embeds, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: - output_attentions = output_attentions if output_attentions is not None \ - else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None \ - else self.config.use_return_dict - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - - hidden_states = inputs_embeds - for encoder_layer in self.layers: - if output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - encoder_layer.__call__, - hidden_states, - attention_mask, - output_attentions, - ) - else: - layer_outputs = encoder_layer( - hidden_states, - attention_mask, - output_attentions=output_attentions, - ) - - hidden_states = layer_outputs[0] - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1], ) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) - - if not return_dict: - return tuple( - v for v in [hidden_states, encoder_states, all_attentions] - if v is not None) - return BaseModelOutput(last_hidden_state=hidden_states, - hidden_states=encoder_states, - attentions=all_attentions) - - -class SiglipVisionTransformer(SiglipPreTrainedModel): - config_class = SiglipVisionConfig - main_input_name = "pixel_values" - _supports_flash_attn_2 = True - - def __init__(self, config: SiglipVisionConfig): - super().__init__(config) - self.config = config - embed_dim = config.hidden_size - - self.embeddings = SiglipVisionEmbeddings(config) - self.encoder = SiglipEncoder(config) - self.post_layernorm = nn.LayerNorm(embed_dim, - eps=config.layer_norm_eps) - self._use_flash_attention_2 = ( - config._attn_implementation == "flash_attention_2") - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self) -> nn.Module: - return self.embeddings.patch_embedding - - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, - config_class=SiglipVisionConfig) - def forward( - self, - pixel_values, - patch_attention_mask: Optional[torch.BoolTensor] = None, - tgt_sizes: Optional[torch.IntTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - """ - output_attentions = output_attentions if output_attentions is not None \ - else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None \ - else self.config.use_return_dict - - batch_size = pixel_values.size(0) - if patch_attention_mask is None: - 
patch_attention_mask = torch.ones( - size=( - batch_size, - pixel_values.size(2) // self.config.patch_size, - pixel_values.size(3) // self.config.patch_size, - ), - dtype=torch.bool, - device=pixel_values.device, - ) - - hidden_states = self.embeddings( - pixel_values=pixel_values, - patch_attention_mask=patch_attention_mask, - tgt_sizes=tgt_sizes) - - patch_attention_mask = patch_attention_mask.view(batch_size, -1) - # The call to `_upad_input` in `_flash_attention_forward` is expensive - # So when the `patch_attention_mask` is full of 1s - # (i.e. attending to the whole sequence), - # avoiding passing the attention_mask, - # which is equivalent to attending to the full sequence - if not torch.any(~patch_attention_mask): - attention_mask = None - else: - attention_mask = (_prepare_4d_attention_mask( - patch_attention_mask, hidden_states.dtype) - if not self._use_flash_attention_2 else - patch_attention_mask) - - encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - last_hidden_state = encoder_outputs[0] - last_hidden_state = self.post_layernorm(last_hidden_state) - - if not return_dict: - return (last_hidden_state, None) + encoder_outputs[1:] - - return BaseModelOutputWithPooling( - last_hidden_state=last_hidden_state, - pooler_output=None, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) From f7f317293697761e8bece7ce05c493618a705103 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 30 Sep 2024 02:10:40 +0000 Subject: [PATCH 22/25] Use the common `BaseResampler` --- vllm/model_executor/models/minicpmv.py | 55 +------------------------- 1 file changed, 1 insertion(+), 54 deletions(-) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index b7d067d9381d..ce357313e576 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -31,14 +31,13 @@ import torch.types from PIL import Image from torch import nn -from torch.nn.init import trunc_normal_ from transformers import PretrainedConfig from typing_extensions import NotRequired from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs -from vllm.model_executor.layers.linear import ReplicatedLinear +from vllm.model_executor.layers.resampler import BaseResampler from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.resampler import (Resampler2, @@ -106,58 +105,6 @@ class MiniCPMVImagePixelInputs(TypedDict): DEFAULT_LN = partial(nn.LayerNorm, eps=1e-6) -class BaseResampler(nn.Module): - """ - A 2D perceiver-resampler network with one cross attention layers by - (grid_size**2) learnable queries and 2d sincos pos_emb - Outputs: - A tensor with the shape of (grid_size**2, embed_dim) - """ - - def __init__( - self, - num_queries: int, - embed_dim: int, - num_heads: int, - kv_dim: Optional[int] = None, - norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN, - ) -> None: - super().__init__() - - self.num_queries = num_queries - self.embed_dim = embed_dim - self.num_heads = num_heads - - self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim)) - trunc_normal_(self.query, std=0.02) - if kv_dim is not None and kv_dim != 
embed_dim: - self.kv_proj = ReplicatedLinear(kv_dim, embed_dim, bias=False) - else: - # Maintain the same return value with ReplicatedLinear.forward - self.kv_proj = lambda *args, **kwargs: ( - nn.Identity()(*args, **kwargs), - None, - ) - self.attn = nn.MultiheadAttention(embed_dim, num_heads) - self.ln_q = norm_layer(embed_dim) - self.ln_kv = norm_layer(embed_dim) - self.ln_post = norm_layer(embed_dim) - self.proj = nn.Parameter( - (embed_dim**-0.5) * torch.randn(embed_dim, embed_dim)) - - def _init_weights(self, m: nn.Module) -> None: - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - def _repeat(self, query, N: int): - return query.unsqueeze(1).repeat(1, N, 1) - - class Resampler2_5(BaseResampler): def __init__( From 9ec1c655cb33e4b53ba1c69a333aa76ff3925b39 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 30 Sep 2024 02:10:46 +0000 Subject: [PATCH 23/25] Fix type annotation --- vllm/model_executor/models/idefics2_vision_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index b069089abf97..3b0b6febaa48 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -292,7 +292,7 @@ def forward( pixel_values, patch_attention_mask: Optional[torch.BoolTensor] = None, tgt_sizes: Optional[torch.IntTensor] = None, - ) -> torch.tensor: + ) -> torch.Tensor: hidden_states = self.embeddings( pixel_values=pixel_values, patch_attention_mask=patch_attention_mask, From 28a86539143ea7adbd60c9755bb44f73956a1608 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 30 Sep 2024 02:11:19 +0000 Subject: [PATCH 24/25] Remove comment --- vllm/model_executor/models/minicpmv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index ce357313e576..d14ada246e09 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -933,7 +933,7 @@ def get_vision_hidden_states( return self.resampler(vision_embedding, tgt_sizes) def is_default_weight_loading(self, name: str) -> bool: - return "resampler" in name #or "vpm" in name + return "resampler" in name _SUPPORT_VERSION = { From 6f4cfd71a2db52a001bc5c97ea69511449c078a1 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 30 Sep 2024 02:12:55 +0000 Subject: [PATCH 25/25] format --- vllm/model_executor/models/minicpmv.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index d14ada246e09..aaae4397c01d 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -37,10 +37,9 @@ from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs -from vllm.model_executor.layers.resampler import BaseResampler from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.resampler import (Resampler2, +from vllm.model_executor.layers.resampler import (BaseResampler, Resampler2, 
get_2d_sincos_pos_embed) from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
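With the whole series applied, a fine-tuned MiniCPM-V adapter can be exercised through the same offline-inference path the new tests use. The sketch below is an illustration rather than part of the series: the adapter path is a placeholder, max_tokens and max_loras are arbitrary choices, and the prompt assumes the standard MiniCPM-Llama3-V-2_5 chat template with its image placeholder, following tests/lora/test_minicpmv_tp.py.

import vllm
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest

MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
LORA_PATH = "/path/to/minicpmv-lora-adapter"  # placeholder, supply a real adapter

# Llama-3 style chat template with the MiniCPM-V image placeholder.
PROMPT = ("<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
          "(<image>./</image>)\nWhat is in the image?<|eot_id|>"
          "<|start_header_id|>assistant<|end_header_id|>\n\n")

llm = vllm.LLM(
    MODEL_PATH,
    enable_lora=True,
    max_num_seqs=2,
    max_loras=4,
    max_lora_rank=64,
    trust_remote_code=True,
)

outputs = llm.generate(
    [{
        "prompt": PROMPT,
        "multi_modal_data": {
            "image": ImageAsset("stop_sign").pil_image
        },
    }],
    vllm.SamplingParams(temperature=0,
                        max_tokens=64,
                        stop_token_ids=[128001, 128009]),  # eos_id, eot_id
    lora_request=LoRARequest("minicpmv-lora", 1, LORA_PATH),
)
print(outputs[0].outputs[0].text.strip())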