Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 61 additions & 1 deletion optimum/exporters/openvino/model_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@
PhiMoEModelPatcher,
Qwen2_5_VLVisionEmbMergerPatcher,
Qwen2MoEPatcher,
Qwen2VLEmbeddingPatcher,
Qwen2VLLanguageModelPatcher,
Qwen2VLVisionEmbMergerPatcher,
Qwen3MoeModelPatcher,
Expand Down Expand Up @@ -250,6 +251,10 @@ def init_model_configs():
"transformers",
"Qwen2VLForConditionalGeneration",
)
TasksManager._CUSTOM_CLASSES[("pt", "qwen2_vl", "image-text-to-embedding")] = (
"transformers",
"Qwen2VLForConditionalGeneration",
)
TasksManager._CUSTOM_CLASSES[("pt", "qwen2_5_vl", "image-text-to-text")] = (
"transformers",
"AutoModelForImageTextToText",
Expand Down Expand Up @@ -1725,6 +1730,51 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
return dummy_inputs


class LMEmbeddingConfigHelper(LMInputEmbedsConfigHelper):
    """Export-config wrapper for running a VLM language model as a stateless embedder.

    Unlike the text-generation helper it derives from, this variant exposes no
    KV-cache: past-key-value inputs are filtered out and the only output is the
    final hidden state.
    """

    def patch_model_for_export(
        self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None
    ) -> ModelPatcher:
        # Stateless embedding export: deliberately do NOT inject use_cache=True
        # into model_kwargs (the generation helper does; we must not).
        kwargs = model_kwargs if model_kwargs is not None else {}
        if self.patcher_cls is None:
            return self.orig_export_config.patch_model_for_export(model, model_kwargs=kwargs)
        return self.patcher_cls(self, model, model_kwargs=kwargs)

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        # Drop every KV-cache entry inherited from the decoder-style config.
        filtered: Dict[str, Dict[int, str]] = {}
        for name, axes in super().inputs.items():
            if name.startswith("past_key_values"):
                continue
            filtered[name] = axes
        return filtered

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        # Single hidden-state tensor instead of logits + present KV pairs.
        return {"last_hidden_state": {0: "batch_size", 1: "sequence_length"}}

    def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
        dummy_inputs = super().generate_dummy_inputs(framework, **kwargs)
        # The parent generator may still emit KV-cache dummies; discard them.
        dummy_inputs.pop("past_key_values", None)
        return dummy_inputs


def get_vlm_embedding_lm_config(
    model_type,
    model_config,
    int_dtype,
    float_dtype,
    model_patcher=None,
    dummy_input_generator=None,
    inputs_update=None,
):
    """Build an export config for a VLM language model used as a stateless embedder.

    Wraps the internal text-generation config of ``model_type`` in a
    :class:`LMEmbeddingConfigHelper`, which strips the KV-cache inputs/outputs.

    Args:
        model_type: internal model-type key passed to the base config lookup.
        model_config: the original model configuration object.
        int_dtype / float_dtype: dtypes forwarded to the base export config.
        model_patcher: optional patcher class applied at export time.
        dummy_input_generator: optional override for dummy-input generation.
        inputs_update: optional dict of extra/overriding dynamic-axis specs.
    """
    base_config = get_vlm_internal_text_generation_config(model_type, model_config, int_dtype, float_dtype)
    helper = LMEmbeddingConfigHelper(
        base_config,
        patcher_cls=model_patcher,
        dummy_input_generator=dummy_input_generator,
        inputs_update=inputs_update,
    )
    # Propagate the normalized config so downstream consumers see the same shapes.
    helper._normalized_config = base_config._normalized_config
    return helper


class InputEmbedOpenvVINOConfig(TextDecoderOnnxConfig):
NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
_MODEL_PATCHER = InputEmbeddingPatcher
Expand Down Expand Up @@ -3639,7 +3689,7 @@ class QwenVLConfigBehavior(str, enum.Enum):
VISION_EMBEDDINGS_POS = "vision_embeddings_pos"


@register_in_tasks_manager("qwen2_vl", *["image-text-to-text"], library_name="transformers")
@register_in_tasks_manager("qwen2_vl", *["image-text-to-text", "image-text-to-embedding"], library_name="transformers")
class Qwen2VLOpenVINOConfig(BaseVLMOpenVINOConfig):
SUPPORTED_BEHAVIORS = [model_type.value for model_type in QwenVLConfigBehavior]
NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
Expand Down Expand Up @@ -3715,6 +3765,16 @@ def with_behavior(
return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype)

if behavior == QwenVLConfigBehavior.LANGUAGE:
if self.task == "image-text-to-embedding":
return get_vlm_embedding_lm_config(
"qwen2",
self._orig_config,
self.int_dtype,
self.float_dtype,
model_patcher=Qwen2VLEmbeddingPatcher,
dummy_input_generator=DummyQwen2VLLMInputGenerator,
inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}},
)
return get_vlm_text_generation_config(
"qwen2",
self._orig_config,
Expand Down
30 changes: 30 additions & 0 deletions optimum/exporters/openvino/model_patcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -4090,6 +4090,36 @@ def __exit__(self, exc_type, exc_value, traceback):
self._model.forward = self._model.__orig_forward


class Qwen2VLEmbeddingPatcher(OVDecoderModelPatcher):
    """Patch Qwen2-VL so its language model exports as a stateless embedder.

    Temporarily replaces ``model.forward`` with a thin wrapper that runs the
    inner text model (``model.model``) without any KV-cache arguments and
    exposes only ``last_hidden_state``. The original forward is restored when
    the patcher context exits.
    """

    def __init__(
        self,
        config: "OnnxConfig",
        model: "PreTrainedModel",
        model_kwargs: Dict[str, Any] = None,
    ):
        # Stash the original forward on the model so __exit__ can restore it
        # (same class, so the name-mangled attribute matches).
        model.__orig_forward = model.forward

        def stateless_forward(self, attention_mask, position_ids=None, inputs_embeds=None):
            # ``self`` is the patched model; delegate to the inner text model.
            result = self.model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                position_ids=position_ids,
            )
            return {"last_hidden_state": result.last_hidden_state}

        model.forward = types.MethodType(stateless_forward, model)
        super().__init__(config, model, model_kwargs)

    def __exit__(self, exc_type, exc_value, traceback):
        super().__exit__(exc_type, exc_value, traceback)
        # Undo the monkey-patch after the parent patcher has unwound its own.
        self._model.forward = self._model.__orig_forward


class Qwen3VLLanguageModelPatcher(OVDecoderModelPatcher):
def __init__(
self,
Expand Down
3 changes: 3 additions & 0 deletions optimum/intel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@
"OVModelForCustomTasks",
"OVModelForFeatureExtraction",
"OVModelForImageClassification",
"OVModelForImageTextToEmbedding",
"OVModelForMaskedLM",
"OVModelForPix2Struct",
"OVModelForQuestionAnswering",
Expand Down Expand Up @@ -125,6 +126,7 @@
"OVModelForCustomTasks",
"OVModelForFeatureExtraction",
"OVModelForImageClassification",
"OVModelForImageTextToEmbedding",
"OVModelForMaskedLM",
"OVModelForPix2Struct",
"OVModelForQuestionAnswering",
Expand Down Expand Up @@ -412,6 +414,7 @@
OVModelForMaskedLM,
OVModelForQuestionAnswering,
OVModelForSeq2SeqLM,
OVModelForImageTextToEmbedding,
OVModelForSequenceClassification,
OVModelForSpeechSeq2Seq,
OVModelForTextToSpeechSeq2Seq,
Expand Down
2 changes: 1 addition & 1 deletion optimum/intel/openvino/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
from .modeling_sam import OVSamModel
from .modeling_seq2seq import OVModelForPix2Struct, OVModelForSeq2SeqLM, OVModelForSpeechSeq2Seq, OVModelForVision2Seq
from .modeling_text2speech import OVModelForTextToSpeechSeq2Seq
from .modeling_visual_language import OVModelForVisualCausalLM
from .modeling_visual_language import OVModelForImageTextToEmbedding, OVModelForVisualCausalLM


if is_diffusers_available():
Expand Down
Loading