Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 61 additions & 1 deletion optimum/exporters/openvino/model_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@
PhiMoEModelPatcher,
Qwen2_5_VLVisionEmbMergerPatcher,
Qwen2MoEPatcher,
Qwen2VLEmbeddingPatcher,
Qwen2VLLanguageModelPatcher,
Qwen2VLVisionEmbMergerPatcher,
Qwen3MoeModelPatcher,
Expand Down Expand Up @@ -250,6 +251,10 @@ def init_model_configs():
"transformers",
"Qwen2VLForConditionalGeneration",
)
TasksManager._CUSTOM_CLASSES[("pt", "qwen2_vl", "image-text-to-embedding")] = (
"transformers",
"Qwen2VLForConditionalGeneration",
)
TasksManager._CUSTOM_CLASSES[("pt", "qwen2_5_vl", "image-text-to-text")] = (
"transformers",
"AutoModelForImageTextToText",
Expand Down Expand Up @@ -1725,6 +1730,51 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
return dummy_inputs


class LMEmbeddingConfigHelper(LMInputEmbedsConfigHelper):
    """Export-config wrapper for running a VLM language model as a stateless embedder.

    Unlike the text-generation helper it derives from, this variant exposes no
    KV-cache: past-key-value inputs are filtered out and the only output is the
    final hidden state.
    """

    def patch_model_for_export(
        self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None
    ) -> ModelPatcher:
        # Stateless embedding export: deliberately do NOT inject use_cache=True
        # into model_kwargs (the generation helper does; we must not).
        kwargs = model_kwargs if model_kwargs is not None else {}
        if self.patcher_cls is None:
            return self.orig_export_config.patch_model_for_export(model, model_kwargs=kwargs)
        return self.patcher_cls(self, model, model_kwargs=kwargs)

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        # Drop every KV-cache entry inherited from the decoder-style config.
        filtered: Dict[str, Dict[int, str]] = {}
        for name, axes in super().inputs.items():
            if name.startswith("past_key_values"):
                continue
            filtered[name] = axes
        return filtered

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        # Single hidden-state tensor instead of logits + present KV pairs.
        return {"last_hidden_state": {0: "batch_size", 1: "sequence_length"}}

    def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
        dummy_inputs = super().generate_dummy_inputs(framework, **kwargs)
        # The parent generator may still emit KV-cache dummies; discard them.
        dummy_inputs.pop("past_key_values", None)
        return dummy_inputs


def get_vlm_embedding_lm_config(
    model_type,
    model_config,
    int_dtype,
    float_dtype,
    model_patcher=None,
    dummy_input_generator=None,
    inputs_update=None,
):
    """Build an export config for a VLM language model used as a stateless embedder.

    Wraps the internal text-generation config of ``model_type`` in a
    :class:`LMEmbeddingConfigHelper`, which strips the KV-cache inputs/outputs.

    Args:
        model_type: internal model-type key passed to the base config lookup.
        model_config: the original model configuration object.
        int_dtype / float_dtype: dtypes forwarded to the base export config.
        model_patcher: optional patcher class applied at export time.
        dummy_input_generator: optional override for dummy-input generation.
        inputs_update: optional dict of extra/overriding dynamic-axis specs.
    """
    base_config = get_vlm_internal_text_generation_config(model_type, model_config, int_dtype, float_dtype)
    helper = LMEmbeddingConfigHelper(
        base_config,
        patcher_cls=model_patcher,
        dummy_input_generator=dummy_input_generator,
        inputs_update=inputs_update,
    )
    # Propagate the normalized config so downstream consumers see the same shapes.
    helper._normalized_config = base_config._normalized_config
    return helper


class InputEmbedOpenvVINOConfig(TextDecoderOnnxConfig):
NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
_MODEL_PATCHER = InputEmbeddingPatcher
Expand Down Expand Up @@ -3639,7 +3689,7 @@ class QwenVLConfigBehavior(str, enum.Enum):
VISION_EMBEDDINGS_POS = "vision_embeddings_pos"


@register_in_tasks_manager("qwen2_vl", *["image-text-to-text"], library_name="transformers")
@register_in_tasks_manager("qwen2_vl", *["image-text-to-text", "image-text-to-embedding"], library_name="transformers")
class Qwen2VLOpenVINOConfig(BaseVLMOpenVINOConfig):
SUPPORTED_BEHAVIORS = [model_type.value for model_type in QwenVLConfigBehavior]
NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
Expand Down Expand Up @@ -3715,6 +3765,16 @@ def with_behavior(
return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype)

if behavior == QwenVLConfigBehavior.LANGUAGE:
if self.task == "image-text-to-embedding":
return get_vlm_embedding_lm_config(
"qwen2",
self._orig_config,
self.int_dtype,
self.float_dtype,
model_patcher=Qwen2VLEmbeddingPatcher,
dummy_input_generator=DummyQwen2VLLMInputGenerator,
inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}},
)
return get_vlm_text_generation_config(
"qwen2",
self._orig_config,
Expand Down
30 changes: 30 additions & 0 deletions optimum/exporters/openvino/model_patcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -4090,6 +4090,36 @@ def __exit__(self, exc_type, exc_value, traceback):
self._model.forward = self._model.__orig_forward


class Qwen2VLEmbeddingPatcher(OVDecoderModelPatcher):
    """Patch Qwen2-VL so its language model exports as a stateless embedder.

    Temporarily replaces ``model.forward`` with a thin wrapper that runs the
    inner text model (``model.model``) without any KV-cache arguments and
    exposes only ``last_hidden_state``. The original forward is restored when
    the patcher context exits.
    """

    def __init__(
        self,
        config: "OnnxConfig",
        model: "PreTrainedModel",
        model_kwargs: Dict[str, Any] = None,
    ):
        # Stash the original forward on the model so __exit__ can restore it
        # (same class, so the name-mangled attribute matches).
        model.__orig_forward = model.forward

        def stateless_forward(self, attention_mask, position_ids=None, inputs_embeds=None):
            # ``self`` is the patched model; delegate to the inner text model.
            result = self.model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                position_ids=position_ids,
            )
            return {"last_hidden_state": result.last_hidden_state}

        model.forward = types.MethodType(stateless_forward, model)
        super().__init__(config, model, model_kwargs)

    def __exit__(self, exc_type, exc_value, traceback):
        super().__exit__(exc_type, exc_value, traceback)
        # Undo the monkey-patch after the parent patcher has unwound its own.
        self._model.forward = self._model.__orig_forward


class Qwen3VLLanguageModelPatcher(OVDecoderModelPatcher):
def __init__(
self,
Expand Down
3 changes: 3 additions & 0 deletions optimum/intel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@
"OVModelForCustomTasks",
"OVModelForFeatureExtraction",
"OVModelForImageClassification",
"OVModelForImageTextToEmbedding",
"OVModelForMaskedLM",
"OVModelForPix2Struct",
"OVModelForQuestionAnswering",
Expand Down Expand Up @@ -125,6 +126,7 @@
"OVModelForCustomTasks",
"OVModelForFeatureExtraction",
"OVModelForImageClassification",
"OVModelForImageTextToEmbedding",
"OVModelForMaskedLM",
"OVModelForPix2Struct",
"OVModelForQuestionAnswering",
Expand Down Expand Up @@ -412,6 +414,7 @@
OVModelForMaskedLM,
OVModelForQuestionAnswering,
OVModelForSeq2SeqLM,
OVModelForImageTextToEmbedding,
OVModelForSequenceClassification,
OVModelForSpeechSeq2Seq,
OVModelForTextToSpeechSeq2Seq,
Expand Down
2 changes: 1 addition & 1 deletion optimum/intel/openvino/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
from .modeling_sam import OVSamModel
from .modeling_seq2seq import OVModelForPix2Struct, OVModelForSeq2SeqLM, OVModelForSpeechSeq2Seq, OVModelForVision2Seq
from .modeling_text2speech import OVModelForTextToSpeechSeq2Seq
from .modeling_visual_language import OVModelForVisualCausalLM
from .modeling_visual_language import OVModelForImageTextToEmbedding, OVModelForVisualCausalLM


if is_diffusers_available():
Expand Down
Loading