Skip to content

Commit c79babd

Browse files
litianjian authored and FerdinandZhong committed
[Bugfix] Disable the post_norm layer of the vision encoder for LLaVA models (vllm-project#9653)
Signed-off-by: qishuai <[email protected]>
1 parent cc9c32a commit c79babd

File tree

4 files changed

+8
-4
lines changed

4 files changed

+8
-4
lines changed

vllm/model_executor/models/llava.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,8 @@ def __init__(self,
273273
config.projector_hidden_act = "gelu"
274274

275275
# TODO: Optionally initializes this for supporting embeddings.
276-
self.vision_tower = init_vision_tower_for_llava(config, quant_config)
276+
self.vision_tower = init_vision_tower_for_llava(
277+
config, quant_config, require_post_norm=False)
277278
self.multi_modal_projector = LlavaMultiModalProjector(
278279
vision_hidden_size=config.vision_config.hidden_size,
279280
text_hidden_size=config.text_config.hidden_size,

vllm/model_executor/models/llava_next.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,8 @@ def __init__(self,
277277
self.multimodal_config = multimodal_config
278278

279279
# TODO: Optionally initializes this for supporting embeddings.
280-
self.vision_tower = init_vision_tower_for_llava(config, quant_config)
280+
self.vision_tower = init_vision_tower_for_llava(
281+
config, quant_config, require_post_norm=False)
281282
self.image_newline = nn.Parameter(
282283
torch.empty(config.text_config.hidden_size))
283284
self.multi_modal_projector = LlavaMultiModalProjector(

vllm/model_executor/models/llava_next_video.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,8 @@ def __init__(self,
256256
self.multimodal_config = multimodal_config
257257

258258
# Initialize the vision tower only up to the required feature layer
259-
self.vision_tower = init_vision_tower_for_llava(config, quant_config)
259+
self.vision_tower = init_vision_tower_for_llava(
260+
config, quant_config, require_post_norm=False)
260261
self.vision_resampler = LlavaNextVideoPooler(config)
261262
self.multi_modal_projector = LlavaNextMultiModalProjector(
262263
vision_hidden_size=config.vision_config.hidden_size,

vllm/model_executor/models/llava_onevision.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -400,7 +400,8 @@ def __init__(self,
400400
self.multimodal_config = multimodal_config
401401

402402
# Initialize the vision tower only up to the required feature layer
403-
self.vision_tower = init_vision_tower_for_llava(config, quant_config)
403+
self.vision_tower = init_vision_tower_for_llava(
404+
config, quant_config, require_post_norm=False)
404405
self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config)
405406
self.language_model = init_vllm_registered_model(
406407
config.text_config, cache_config, quant_config)

0 commit comments

Comments (0)