From c4eb6895c2c2b0ddd870832701e49c40a3794157 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Mon, 9 Feb 2026 13:38:42 +0800
Subject: [PATCH 01/39] support videochat_flash_qwen

---
 optimum/exporters/openvino/model_configs.py   | 188 +++++++++++++
 optimum/exporters/openvino/model_patcher.py   |  59 +++++
 optimum/exporters/openvino/utils.py           |   1 +
 .../openvino/modeling_visual_language.py      | 250 ++++++++++++++++++
 4 files changed, 498 insertions(+)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index ca12d455be..6cb23fbadb 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -197,6 +197,8 @@
     Qwen3MoeModelPatcher,
     QwenModelPatcher,
     SanaTextEncoderModelPatcher,
+    VideochatFlashQwenLanguageModelPatcher,
+    VideochatFlashQwenVisionEmbeddingModelPatcher,
     XverseModelPatcher,
     Zamba2ModelPatcher,
 )
@@ -4964,3 +4966,189 @@ class SiglipTextWithProjectionOpenVINOConfig(SiglipTextWithProjectionOnnxConfig)
 @register_in_tasks_manager("siglip-text", *["feature-extraction"])
 class SiglipTextOpenVINOConfig(SiglipTextOnnxConfig):
     pass
+
+
+class DummyVideoChatFlashQwenInputGenerator(DummyVisionInputGenerator):
+    """Generates dummy 5-D `pixel_values` (batch, channels, frames, height, width) for the vision tower export."""
+
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedVisionConfig,
+        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
+        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
+        width: int = DEFAULT_DUMMY_SHAPES["width"],
+        height: int = DEFAULT_DUMMY_SHAPES["height"],
+        visual_seq_length: int = DEFAULT_DUMMY_SHAPES["visual_seq_length"],
+        **kwargs,
+    ):
+        super().__init__(task, normalized_config, batch_size, num_channels, width, height, visual_seq_length, **kwargs)
+        # Make sure `num_frames` always exists so `generate` can report a clear error instead of AttributeError.
+        self.num_frames = getattr(self, "num_frames", None)
+        if hasattr(normalized_config, "config") and hasattr(normalized_config.config, "mm_local_num_frames"):
+            self.num_frames = normalized_config.config.mm_local_num_frames
+            # NOTE(review): 224x224 is hard-coded here rather than read from the config.
+            self.height = 224
+            self.width = 224
+            self.image_size = (self.height, self.width)
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        if input_name != "pixel_values":
+            # Defer to the parent generator instead of silently returning None for other inputs.
+            return super().generate(input_name, framework=framework, int_dtype=int_dtype, float_dtype=float_dtype)
+        if self.num_frames is None:
+            raise ValueError("`mm_local_num_frames` is missing from the config, cannot build dummy `pixel_values`.")
+        shape = [self.batch_size, self.num_channels, self.num_frames, self.height, self.width]
+        return self.random_float_tensor(shape=shape, framework=framework, dtype=float_dtype)
+
+
+class DummyVideoChatFlashQwenProjectorInputGenerator(DummyInputGenerator):
+    """Produces a random float tensor of shape (batch_size, 64, hidden_size) for the projector's `input`."""
+
+    SUPPORTED_INPUT_NAMES = ["input"]
+
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedTextConfig,
+        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
+        random_batch_size_range: Optional[Tuple[int, int]] = None,
+        **kwargs,
+    ):
+        # `random_batch_size_range` and extra kwargs are accepted but not used by this generator.
+        self.normalized_config = normalized_config
+        self.hidden_size = normalized_config.hidden_size
+        self.task = task
+        self.batch_size = batch_size
+        # Fixed dummy length for the second axis of the generated tensor.
+        self.num_patches = 64
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        """Return the dummy projector input; `int_dtype` is unused because the tensor is floating point."""
+        # The same tensor layout is returned whatever `input_name` is requested.
+        dummy_shape = [self.batch_size, self.num_patches, self.hidden_size]
+        return self.random_float_tensor(dummy_shape, framework=framework, dtype=float_dtype)
+
+
+class VideoChatFlashQWENProjectorOpenVINOConfig(OnnxConfig):
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyVideoChatFlashQwenProjectorInputGenerator,)
+    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig  # the dummy generator only reads `hidden_size` from it
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        return {"input": {0: "batch_size", 1: "num_patches", 2: "hidden_size"}}  # every axis gets a symbolic name
+
+
+class VideoChatFlashQwenConfigBehavior(str, enum.Enum):
+    LANGUAGE = "language"  # the language model (the full model when it has an `lm_head`)
+    VISION_EMBEDDINGS = "vision_embeddings"  # `model.get_vision_tower().vision_tower`
+    VISION_PROJECTION = "vision_projection"  # `model.get_model().mm_projector.mlp`
+    TEXT_EMBEDDINGS = "text_embeddings"  # `model.get_input_embeddings()`
+
+
+@register_in_tasks_manager("videochat_flash_qwen", *["image-text-to-text"], library_name="transformers")
+class VideoChatFlashQwenOpenVINOConfig(BaseVLMOpenVINOConfig):
+    MIN_TRANSFORMERS_VERSION = "4.42.0"
+    SUPPORTED_BEHAVIORS = [model_type.value for model_type in VideoChatFlashQwenConfigBehavior]
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyVideoChatFlashQwenInputGenerator,)
+
+    def __init__(
+        self,
+        config: "PretrainedConfig",
+        task: str = "feature-extraction",
+        int_dtype: str = "int64",
+        float_dtype: str = "fp32",
+        behavior: VideoChatFlashQwenConfigBehavior = VideoChatFlashQwenConfigBehavior.VISION_EMBEDDINGS,
+        preprocessors: Optional[List[Any]] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            config=config,
+            task=task,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+            behavior=behavior,
+            preprocessors=preprocessors,
+        )
+        self._orig_config = config  # full model config; `self._config` may be narrowed to the vision config below
+        if self._behavior == VideoChatFlashQwenConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"):
+            self._config = config.vision_config
+            self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        if not self._behavior == VideoChatFlashQwenConfigBehavior.VISION_EMBEDDINGS:
+            return {}  # other behaviors are exported through the configs returned by `with_behavior`
+        return {"pixel_values": {0: "batch_size", 1: "num_channels", 2: "num_frames", 3: "height", 4: "width"}}
+
+    def with_behavior(
+        self,
+        behavior: Union[str, VideoChatFlashQwenConfigBehavior],
+    ):
+        """
+        Creates the export config that matches the requested behavior.
+
+        Args:
+            behavior ([`VideoChatFlashQwenConfigBehavior`] or `str`):
+                The behavior to use for the new instance.
+        """
+        if isinstance(behavior, str) and not isinstance(behavior, VideoChatFlashQwenConfigBehavior):
+            behavior = VideoChatFlashQwenConfigBehavior(behavior)
+
+        if behavior == VideoChatFlashQwenConfigBehavior.VISION_PROJECTION:
+            export_config = VideoChatFlashQWENProjectorOpenVINOConfig(
+                self._orig_config,
+                task="feature-extraction",
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+            )
+            return export_config
+
+        if behavior == VideoChatFlashQwenConfigBehavior.TEXT_EMBEDDINGS:
+            return get_vlm_text_embeddings_config(
+                "qwen2", self._orig_config, self.int_dtype, self.float_dtype
+            )
+
+        if behavior == VideoChatFlashQwenConfigBehavior.LANGUAGE:
+            return get_vlm_text_generation_config(
+                "qwen2", self._orig_config, self.int_dtype, self.float_dtype
+            )
+
+        if behavior == VideoChatFlashQwenConfigBehavior.VISION_EMBEDDINGS:
+            return self.__class__(
+                self._orig_config,
+                task=self.task,
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+                behavior=behavior,
+                preprocessors=self._preprocessors,
+            )
+
+    def get_model_for_behavior(self, model, behavior: Union[str, VideoChatFlashQwenConfigBehavior]):
+        if isinstance(behavior, str) and not isinstance(behavior, VideoChatFlashQwenConfigBehavior):
+            behavior = VideoChatFlashQwenConfigBehavior(behavior)
+
+        if behavior == VideoChatFlashQwenConfigBehavior.VISION_PROJECTION:
+            return model.get_model().mm_projector.mlp
+
+        if behavior == VideoChatFlashQwenConfigBehavior.VISION_EMBEDDINGS:
+            return model.get_vision_tower().vision_tower
+
+        if behavior == VideoChatFlashQwenConfigBehavior.TEXT_EMBEDDINGS:
+            text_embedding = model.get_input_embeddings()
+            text_embedding.config = model.config  # the embedding module is returned with the model config attached
+            return text_embedding
+
+        if behavior == VideoChatFlashQwenConfigBehavior.LANGUAGE:
+            model.model.llm_compress_layer_list = []  # NOTE(review): mutates the passed model in place
+            return model.language_model if not hasattr(model, "lm_head") else model
+
+    def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None):
+        model_kwargs = model_kwargs or {}
+        if self._behavior == VideoChatFlashQwenConfigBehavior.LANGUAGE:
+            return VideochatFlashQwenLanguageModelPatcher(self, model, model_kwargs)
+
+        if self._behavior == VideoChatFlashQwenConfigBehavior.VISION_EMBEDDINGS:
+            return VideochatFlashQwenVisionEmbeddingModelPatcher(self, model, model_kwargs)
+
+        return super().patch_model_for_export(model, model_kwargs)  # remaining behaviors use the base patcher
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 56e550858c..163c8136a1 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -7519,3 +7519,62 @@ def __exit__(self, exc_type, exc_value, traceback):
                 afmoe_moe = layer.mlp
                 afmoe_moe.forward = afmoe_moe._orig_forward
                 del afmoe_moe.down_projs, afmoe_moe.gate_projs, afmoe_moe.up_projs
+
+
+class VideochatFlashQwenVisionEmbeddingModelPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: "PreTrainedModel",
+        model_kwargs: Dict[str, Any] = None,
+    ):
+        model.__orig_forward = model.forward  # saved so `__exit__` can restore the unpatched forward
+
+        def forward_wrap(self, pixel_values):
+            return self.__orig_forward(x=pixel_values)  # the exported `pixel_values` input is passed on as `x`
+
+        model.forward = types.MethodType(forward_wrap, model)  # patched before the base patcher is initialized
+        super().__init__(config, model, model_kwargs)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward  # put the original forward back after export
+
+
+class VideochatFlashQwenLanguageModelPatcher(ModelPatcher):
+    """Swaps `forward` for an export wrapper that runs `self.model` on `inputs_embeds` and returns logits first."""
+
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: "PreTrainedModel",
+        model_kwargs: Dict[str, Any] = None,
+    ):
+        # Saved so `__exit__` can restore the unpatched forward.
+        model.__orig_forward = model.forward
+
+        def forward_wrap(
+            self,
+            attention_mask,
+            position_ids=None,
+            past_key_values=None,
+            inputs_embeds=None,
+        ):
+            # The `self.model` call is unpacked as an `(outputs, labels)` pair; the second item is not needed here.
+            outputs, _labels = self.model(
+                input_ids=None,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                inputs_embeds=inputs_embeds,
+            )
+            # `outputs[0]` holds the hidden states; logits are cast with `.float()` before being returned.
+            logits = self.lm_head(outputs[0]).float()
+            return (logits,) + outputs[1:]
+
+        model.forward = types.MethodType(forward_wrap, model)
+        super().__init__(config, model, model_kwargs)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward
diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py
index de92645017..c293783d77 100644
--- a/optimum/exporters/openvino/utils.py
+++ b/optimum/exporters/openvino/utils.py
@@ -302,6 +302,7 @@ def get_submodels(model):
     "phi4_multimodal",
     "llama4",
     "minicpmo",
+    "videochat_flash_qwen",
 ]
 
 SSM_MODELS = ["mamba", "falcon_mamba", "zamba2", "lfm2", "granitemoehybrid"]
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index d7f1ffe7d2..de9851b53a 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -4386,6 +4386,255 @@ def preprocess_inputs(
         return inputs
 
 
+class _OVVideoChatFlashQwenForCausalLM(OVModelForVisualCausalLM):
+    additional_parts = ["vision_projection"]
+
+    def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
+        if input_ids is not None and input_ids.shape[1] == 1:
+            return None
+        image_features = self.vision_embeddings(pixel_values).last_hidden_state
+        image_features = self.multi_modal_projector(image_features)
+        return image_features
+
+    def pack_image_features(self, image_features, image_sizes, image_newline=None):
+        """
+        Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
+
+        Args:
+            image_features (`List[torch.Tensor]` of length num_images, each of shape `(num_patches, image_length, embed_dim)`)
+                List of image feature tensor, each contains all the visual feature of all patches.
+            image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
+                Actual image size of each images (H, W).
+            vision_feature_select_strategy (`str`)
+                The feature selection strategy used to select the vision feature from the vision backbone.
+            image_newline (`torch.Tensor` of shape `(embed_dim)`)
+                New line embedding vector.
+        Returns:
+            image_features (`torch.Tensor` of shape `(all_feat_len, embed_dim)`)
+            feature_lens (`List[int]`)
+                token length of each image in image_features
+        """
+        from transformers.models.llava_next_video.modeling_llava_next_video import (
+            get_anyres_image_grid_shape,
+            unpad_image,
+        )
+
+        new_image_features = []
+        feature_lens = []
+        vision_feature_select_strategy = self.config.vision_feature_select_strategy
+        for image_idx, image_feature in enumerate(image_features):
+            if image_feature.shape[0] > 1:
+                base_image_feature = image_feature[0]
+                image_feature = image_feature[1:]
+                height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size
+
+                num_patch_height, num_patch_width = get_anyres_image_grid_shape(
+                    image_sizes[image_idx],
+                    self.config.image_grid_pinpoints,
+                    self.config.vision_config.image_size,
+                )
+
+                if (
+                    np.prod(image_feature.shape) % (num_patch_height * num_patch_width * height * width) != 0
+                    and vision_feature_select_strategy == "default"
+                ):
+                    logger.warning_once(
+                        "Image feature shape does not line up with the provided patch size. "
+                        "You may be using the `default` vision_feature_select_strategy with a"
+                        " visual encoder that does not have CLS."
+                    )
+
+                image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
+                image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
+                image_feature = image_feature.flatten(1, 2).flatten(2, 3)
+                image_feature = unpad_image(image_feature, image_sizes[image_idx])
+                if image_newline is not None:
+                    image_feature = torch.cat(
+                        (
+                            image_feature,
+                            image_newline[:, None, None]
+                            .expand(*image_feature.shape[:-1], 1)
+                            .to(image_feature.device, image_feature.dtype),
+                        ),
+                        dim=-1,
+                    )
+                image_feature = image_feature.flatten(1, 2).transpose(0, 1)
+                image_feature = torch.cat((base_image_feature, image_feature), dim=0)
+            else:
+                image_feature = image_feature[0]
+                if image_newline is not None:
+                    image_feature = torch.cat((image_feature, image_newline[None].to(image_feature)), dim=0)
+            new_image_features.append(image_feature)
+            feature_lens.append(image_feature.size(0))
+        image_features = torch.cat(new_image_features, dim=0)
+        feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=image_features.device)
+        return image_features, feature_lens
+
+    @staticmethod
+    def preprocess_inputs(
+        text: str,
+        image: Optional["Image"] = None,
+        processor: Optional[AutoImageProcessor] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+        config: Optional[PretrainedConfig] = None,
+        video: Optional["VideoInput"] = None,
+        audio: Optional[np.ndarray] = None,
+    ):
+        if processor is None:
+            raise ValueError("Processor is required.")
+        if audio is not None:
+            raise ValueError("Audio input is not supported")
+        if getattr(processor, "chat_template", None) is not None:
+            chat_prompt = [{"role": "user", "content": [{"type": "text", "text": text}]}]
+            if image is not None:
+                chat_prompt[0]["content"].append({"type": "image"})
+            if video is not None:
+                chat_prompt[0]["content"].append({"type": "video"})
+            prompt = processor.apply_chat_template(chat_prompt, add_generation_prompt=True, tokenize=False)
+        else:
+            prompt = text
+            if image is not None and "<image>" not in prompt:
+                prompt = "<image>\n" + prompt
+            if video is not None and "<video>" not in prompt:
+                prompt = "<video>\n" + prompt
+
+        if is_transformers_version(">", "4.47.99") and getattr(processor, "patch_size", None) is None:
+            if (
+                getattr(config, "vision_config", None) is not None
+                and getattr(config.vision_config, "patch_size", None) is not None
+            ):
+                processor.patch_size = config.vision_config.patch_size
+            else:
+                raise ValueError(
+                    "Processor does not have `patch_size` attribute. Please fix the processor or provide `patch_size` in the config."
+                )
+
+        inputs = processor(images=image, text=prompt, videos=video, return_tensors="pt")
+        return inputs
+
+    def get_multimodal_embeddings(
+        self,
+        input_ids,
+        pixel_values=None,
+        attention_mask=None,
+        position_ids=None,
+        past_key_values=None,
+        image_sizes=None,
+        pixel_values_videos=None,
+        **kwargs,
+    ):
+        inputs_embeds = self.get_text_embeddings(input_ids, **kwargs)
+
+        if (
+            pixel_values is not None
+            and pixel_values.size(0) > 0
+            and self._support_new_processing
+            and past_key_values is None
+        ):
+            legacy_processing = (
+                (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
+            ).item()
+        elif (
+            pixel_values_videos is not None
+            and pixel_values_videos.size(0) > 0
+            and self._support_new_processing
+            and past_key_values is None
+        ):
+            legacy_processing = (
+                (input_ids == self.config.video_token_index).sum(1).max() < self.config.video_seq_length
+            ).item()
+        else:
+            legacy_processing = True
+
+        legacy_processing = (
+            legacy_processing.item() if isinstance(legacy_processing, torch.Tensor) else legacy_processing
+        )
+
+        if pixel_values is not None and pixel_values.size(0) > 0:
+            inputs_embeds, attention_mask, position_ids = self.add_image_features(
+                input_ids,
+                inputs_embeds,
+                pixel_values,
+                attention_mask,
+                position_ids,
+                image_sizes,
+                legacy_processing,
+                **kwargs,
+            )
+
+        if pixel_values_videos is not None and pixel_values_videos.size(0) > 0:
+            inputs_embeds, attention_mask, position_ids = self.add_video_features(
+                input_ids,
+                inputs_embeds,
+                pixel_values_videos,
+                attention_mask,
+                position_ids,
+                legacy_processing=legacy_processing,
+                **kwargs,
+            )
+
+        if legacy_processing and pixel_values is not None and past_key_values is not None and input_ids.shape[1] == 1:
+            attention_mask, position_ids = self._filter_unattended_tokens(input_ids, attention_mask, past_key_values)
+
+        return inputs_embeds, attention_mask, position_ids
+
+    def add_video_features(
+        self,
+        input_ids,
+        inputs_embeds,
+        pixel_values_videos,
+        attention_mask,
+        position_ids,
+        legacy_processing,
+        **kwargs,
+    ):
+        # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L732-L751
+        video_features = self.get_video_features(pixel_values_videos, input_ids)
+        if video_features is not None and len(video_features) != 0:
+            video_features = [feature.flatten(0, 1) for feature in video_features]
+            video_feature_lens = [feature.size(0) for feature in video_features]
+            video_features = torch.cat(video_features, dim=0)
+            video_feature_lens = torch.tensor(video_feature_lens, dtype=torch.long, device=video_features.device)
+
+            if legacy_processing:
+                inputs_embeds, attention_mask, position_ids = self.merge_vision_text_embeddings(
+                    video_features,
+                    inputs_embeds,
+                    video_feature_lens,
+                    input_ids,
+                    attention_mask,
+                    position_ids,
+                    legacy_processing,
+                    self.config.video_token_index,
+                )
+            else:
+                inputs_embeds = (
+                    torch.from_numpy(inputs_embeds) if isinstance(inputs_embeds, np.ndarray) else inputs_embeds
+                )
+                special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1)
+                special_image_mask = special_image_mask.expand_as(inputs_embeds)
+                if inputs_embeds[special_image_mask].numel() != video_features.numel():
+                    n_video_tokens = (input_ids == self.config.video_token_index).sum().item()
+                    n_video_features = video_features.shape[0]
+                    raise ValueError(
+                        f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
+                    )
+                inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features)
+        return inputs_embeds, attention_mask, position_ids
+
+    def get_video_features(self, pixel_values, input_ids=None, **kwargs):
+        # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L835
+        if input_ids is not None and input_ids.shape[1] == 1:
+            return None
+        batch_size, frames, channels, height, width = pixel_values.shape
+        pixel_values = pixel_values.reshape(batch_size * frames, channels, height, width)
+        selected_video_features = self.vision_embeddings(pixel_values).last_hidden_state
+        video_features = self.vision_resampler(selected_video_features)
+        video_features = self.multi_modal_projector(video_features)
+        video_features = torch.split(torch.from_numpy(video_features), frames, dim=0)
+        return video_features
+
+
 MODEL_TYPE_TO_CLS_MAPPING = {
     "llava": _OVLlavaForCausalLM,
     "llava_next": _OVLlavaNextForCausalLM,
@@ -4407,4 +4656,5 @@ def preprocess_inputs(
     "phi4_multimodal": _OVPhi4MMForCausalLM,
     "llama4": _OVLlama4ForCausalLM,
     "minicpmo": _OVMiniCPMOForCausalLM,
+    "videochat_flash_qwen": _OVVideoChatFlashQwenForCausalLM,
 }

From 3748931f509a6140e16ebd77b16c607b9ca44afc Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Tue, 10 Feb 2026 17:27:52 +0800
Subject: [PATCH 02/39] skip config value overrides when the exported model has no `config` attribute

---
 optimum/exporters/openvino/convert.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index 4b0652393d..d2d155856a 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -360,7 +360,8 @@ def export_pytorch(
             logger.info(f"Overriding {len(config.values_override)} configuration item(s)")
             for override_config_key, override_config_value in config.values_override.items():
                 logger.info(f"\t- {override_config_key} -> {override_config_value}")
-                setattr(model.config, override_config_key, override_config_value)
+                if hasattr(model, "config"):
+                    setattr(model.config, override_config_key, override_config_value)
 
         if input_shapes is None:
             input_shapes = {}  # will use the defaults from DEFAULT_DUMMY_SHAPES

From 64fb0292ea5f699cc118c841c8293ee02ac6cf55 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Thu, 5 Mar 2026 09:28:10 +0800
Subject: [PATCH 03/39] remove unused functions

---
 .../openvino/modeling_visual_language.py      | 204 +-----------------
 1 file changed, 1 insertion(+), 203 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index de9851b53a..6447e2ba49 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -4389,87 +4389,6 @@ def preprocess_inputs(
 class _OVVideoChatFlashQwenForCausalLM(OVModelForVisualCausalLM):
     additional_parts = ["vision_projection"]
 
-    def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
-        if input_ids is not None and input_ids.shape[1] == 1:
-            return None
-        image_features = self.vision_embeddings(pixel_values).last_hidden_state
-        image_features = self.multi_modal_projector(image_features)
-        return image_features
-
-    def pack_image_features(self, image_features, image_sizes, image_newline=None):
-        """
-        Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
-
-        Args:
-            image_features (`List[torch.Tensor]` of length num_images, each of shape `(num_patches, image_length, embed_dim)`)
-                List of image feature tensor, each contains all the visual feature of all patches.
-            image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
-                Actual image size of each images (H, W).
-            vision_feature_select_strategy (`str`)
-                The feature selection strategy used to select the vision feature from the vision backbone.
-            image_newline (`torch.Tensor` of shape `(embed_dim)`)
-                New line embedding vector.
-        Returns:
-            image_features (`torch.Tensor` of shape `(all_feat_len, embed_dim)`)
-            feature_lens (`List[int]`)
-                token length of each image in image_features
-        """
-        from transformers.models.llava_next_video.modeling_llava_next_video import (
-            get_anyres_image_grid_shape,
-            unpad_image,
-        )
-
-        new_image_features = []
-        feature_lens = []
-        vision_feature_select_strategy = self.config.vision_feature_select_strategy
-        for image_idx, image_feature in enumerate(image_features):
-            if image_feature.shape[0] > 1:
-                base_image_feature = image_feature[0]
-                image_feature = image_feature[1:]
-                height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size
-
-                num_patch_height, num_patch_width = get_anyres_image_grid_shape(
-                    image_sizes[image_idx],
-                    self.config.image_grid_pinpoints,
-                    self.config.vision_config.image_size,
-                )
-
-                if (
-                    np.prod(image_feature.shape) % (num_patch_height * num_patch_width * height * width) != 0
-                    and vision_feature_select_strategy == "default"
-                ):
-                    logger.warning_once(
-                        "Image feature shape does not line up with the provided patch size. "
-                        "You may be using the `default` vision_feature_select_strategy with a"
-                        " visual encoder that does not have CLS."
-                    )
-
-                image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
-                image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
-                image_feature = image_feature.flatten(1, 2).flatten(2, 3)
-                image_feature = unpad_image(image_feature, image_sizes[image_idx])
-                if image_newline is not None:
-                    image_feature = torch.cat(
-                        (
-                            image_feature,
-                            image_newline[:, None, None]
-                            .expand(*image_feature.shape[:-1], 1)
-                            .to(image_feature.device, image_feature.dtype),
-                        ),
-                        dim=-1,
-                    )
-                image_feature = image_feature.flatten(1, 2).transpose(0, 1)
-                image_feature = torch.cat((base_image_feature, image_feature), dim=0)
-            else:
-                image_feature = image_feature[0]
-                if image_newline is not None:
-                    image_feature = torch.cat((image_feature, image_newline[None].to(image_feature)), dim=0)
-            new_image_features.append(image_feature)
-            feature_lens.append(image_feature.size(0))
-        image_features = torch.cat(new_image_features, dim=0)
-        feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=image_features.device)
-        return image_features, feature_lens
-
     @staticmethod
     def preprocess_inputs(
         text: str,
@@ -4480,6 +4399,7 @@ def preprocess_inputs(
         video: Optional["VideoInput"] = None,
         audio: Optional[np.ndarray] = None,
     ):
+        # Note: The implementation of this function is not validated, it's only there to allow this subclass to be initialized.
         if processor is None:
             raise ValueError("Processor is required.")
         if audio is not None:
@@ -4512,128 +4432,6 @@ def preprocess_inputs(
         inputs = processor(images=image, text=prompt, videos=video, return_tensors="pt")
         return inputs
 
-    def get_multimodal_embeddings(
-        self,
-        input_ids,
-        pixel_values=None,
-        attention_mask=None,
-        position_ids=None,
-        past_key_values=None,
-        image_sizes=None,
-        pixel_values_videos=None,
-        **kwargs,
-    ):
-        inputs_embeds = self.get_text_embeddings(input_ids, **kwargs)
-
-        if (
-            pixel_values is not None
-            and pixel_values.size(0) > 0
-            and self._support_new_processing
-            and past_key_values is None
-        ):
-            legacy_processing = (
-                (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
-            ).item()
-        elif (
-            pixel_values_videos is not None
-            and pixel_values_videos.size(0) > 0
-            and self._support_new_processing
-            and past_key_values is None
-        ):
-            legacy_processing = (
-                (input_ids == self.config.video_token_index).sum(1).max() < self.config.video_seq_length
-            ).item()
-        else:
-            legacy_processing = True
-
-        legacy_processing = (
-            legacy_processing.item() if isinstance(legacy_processing, torch.Tensor) else legacy_processing
-        )
-
-        if pixel_values is not None and pixel_values.size(0) > 0:
-            inputs_embeds, attention_mask, position_ids = self.add_image_features(
-                input_ids,
-                inputs_embeds,
-                pixel_values,
-                attention_mask,
-                position_ids,
-                image_sizes,
-                legacy_processing,
-                **kwargs,
-            )
-
-        if pixel_values_videos is not None and pixel_values_videos.size(0) > 0:
-            inputs_embeds, attention_mask, position_ids = self.add_video_features(
-                input_ids,
-                inputs_embeds,
-                pixel_values_videos,
-                attention_mask,
-                position_ids,
-                legacy_processing=legacy_processing,
-                **kwargs,
-            )
-
-        if legacy_processing and pixel_values is not None and past_key_values is not None and input_ids.shape[1] == 1:
-            attention_mask, position_ids = self._filter_unattended_tokens(input_ids, attention_mask, past_key_values)
-
-        return inputs_embeds, attention_mask, position_ids
-
-    def add_video_features(
-        self,
-        input_ids,
-        inputs_embeds,
-        pixel_values_videos,
-        attention_mask,
-        position_ids,
-        legacy_processing,
-        **kwargs,
-    ):
-        # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L732-L751
-        video_features = self.get_video_features(pixel_values_videos, input_ids)
-        if video_features is not None and len(video_features) != 0:
-            video_features = [feature.flatten(0, 1) for feature in video_features]
-            video_feature_lens = [feature.size(0) for feature in video_features]
-            video_features = torch.cat(video_features, dim=0)
-            video_feature_lens = torch.tensor(video_feature_lens, dtype=torch.long, device=video_features.device)
-
-            if legacy_processing:
-                inputs_embeds, attention_mask, position_ids = self.merge_vision_text_embeddings(
-                    video_features,
-                    inputs_embeds,
-                    video_feature_lens,
-                    input_ids,
-                    attention_mask,
-                    position_ids,
-                    legacy_processing,
-                    self.config.video_token_index,
-                )
-            else:
-                inputs_embeds = (
-                    torch.from_numpy(inputs_embeds) if isinstance(inputs_embeds, np.ndarray) else inputs_embeds
-                )
-                special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1)
-                special_image_mask = special_image_mask.expand_as(inputs_embeds)
-                if inputs_embeds[special_image_mask].numel() != video_features.numel():
-                    n_video_tokens = (input_ids == self.config.video_token_index).sum().item()
-                    n_video_features = video_features.shape[0]
-                    raise ValueError(
-                        f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
-                    )
-                inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features)
-        return inputs_embeds, attention_mask, position_ids
-
-    def get_video_features(self, pixel_values, input_ids=None, **kwargs):
-        # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L835
-        if input_ids is not None and input_ids.shape[1] == 1:
-            return None
-        batch_size, frames, channels, height, width = pixel_values.shape
-        pixel_values = pixel_values.reshape(batch_size * frames, channels, height, width)
-        selected_video_features = self.vision_embeddings(pixel_values).last_hidden_state
-        video_features = self.vision_resampler(selected_video_features)
-        video_features = self.multi_modal_projector(video_features)
-        video_features = torch.split(torch.from_numpy(video_features), frames, dim=0)
-        return video_features
-
 
 MODEL_TYPE_TO_CLS_MAPPING = {
     "llava": _OVLlavaForCausalLM,

From e7483360f24f513a63906c06f178840bfbb81678 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Fri, 6 Mar 2026 21:54:55 +0800
Subject: [PATCH 04/39] fix hidden_size for vision projection

---
 optimum/exporters/openvino/model_configs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 6cb23fbadb..dd6f70f6bd 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -5015,7 +5015,7 @@ def __init__(
     ):
         self.task = task
         self.batch_size = batch_size
-        self.hidden_size = normalized_config.hidden_size
+        self.hidden_size = normalized_config.mm_hidden_size
         self.num_patches = 64
         self.normalized_config = normalized_config
 

From 553c49d8fba91b66d301125761592ba48422a575 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Thu, 12 Mar 2026 21:59:54 +0800
Subject: [PATCH 05/39] add preprocess_inputs

---
 .../openvino/modeling_visual_language.py      | 95 ++++++++++++++-----
 1 file changed, 71 insertions(+), 24 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index fa89acf0cc..0661bc7f38 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -28,6 +28,7 @@
 from transformers.modeling_outputs import BaseModelOutputWithPooling
 from transformers.models.qwen2_vl.modeling_qwen2_vl import VisionRotaryEmbedding
 from transformers.utils import ModelOutput
+from transformers import StoppingCriteria
 
 from ...exporters.openvino import main_export
 from ...exporters.openvino.stateful import ensure_stateful_is_available, model_has_input_output_name
@@ -4802,6 +4803,53 @@ def preprocess_inputs(
         return inputs
 
 
+def tokenizer_image_token(prompt, tokenizer, image_token_index=-200, return_tensors=None):
+    """Tokenize `prompt`, emitting `image_token_index` once for every "<image>" marker it contains.
+
+    Returns a list of ids, or a 1-D long tensor when `return_tensors == "pt"`; any other
+    non-None `return_tensors` value raises ValueError.
+    """
+    chunks = [tokenizer(piece).input_ids for piece in prompt.split("<image>")]
+    # A BOS that the tokenizer prepends is kept once up front and stripped from every chunk.
+    starts_with_bos = bool(chunks) and len(chunks[0]) > 0 and chunks[0][0] == tokenizer.bos_token_id
+    skip = 1 if starts_with_bos else 0
+    input_ids = [chunks[0][0]] if starts_with_bos else []
+    for position, chunk in enumerate(chunks):
+        if position > 0:
+            input_ids.append(image_token_index)
+        input_ids.extend(chunk[skip:])
+    if return_tensors is None:
+        return input_ids
+    if return_tensors != "pt":
+        raise ValueError(f"Unsupported tensor type: {return_tensors}")
+    return torch.tensor(input_ids, dtype=torch.long)
+
+class KeywordsStoppingCriteria(StoppingCriteria):
+    """Stop generation once any keyword appears at the end of the output (batch size 1 only)."""
+
+    def __init__(self, keywords, tokenizer, input_ids):
+        self.keywords, self.tokenizer, self.start_len = keywords, tokenizer, input_ids.shape[1]
+        self.keyword_ids = []
+        for keyword in keywords:
+            cur_keyword_ids = tokenizer(keyword).input_ids
+            # Drop a leading BOS so the ids can be matched against the tail of the output.
+            if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
+                cur_keyword_ids = cur_keyword_ids[1:]
+            self.keyword_ids.append(torch.tensor(cur_keyword_ids))
+
+    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)"  # TODO
+        offset = min(output_ids.shape[1] - self.start_len, 3)
+        self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
+        for keyword_id in self.keyword_ids:
+            # torch.equal yields one bool and is False on a length mismatch; `==` gave a tensor.
+            if torch.equal(output_ids[0, -keyword_id.shape[0] :], keyword_id):
+                return True
+        # Fall back to a text match on the last few decoded tokens.
+        outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
+        return any(keyword in outputs for keyword in self.keywords)
+
+
 class _OVVideoChatFlashQwenForCausalLM(OVModelForVisualCausalLM):
     additional_parts = ["vision_projection"]
 
@@ -4820,32 +4868,31 @@ def preprocess_inputs(
             raise ValueError("Processor is required.")
         if audio is not None:
             raise ValueError("Audio input is not supported")
-        if getattr(processor, "chat_template", None) is not None:
-            chat_prompt = [{"role": "user", "content": [{"type": "text", "text": text}]}]
-            if image is not None:
-                chat_prompt[0]["content"].append({"type": "image"})
-            if video is not None:
-                chat_prompt[0]["content"].append({"type": "video"})
-            prompt = processor.apply_chat_template(chat_prompt, add_generation_prompt=True, tokenize=False)
-        else:
-            prompt = text
-            if image is not None and "<image>" not in prompt:
-                prompt = "<image>\n" + prompt
-            if video is not None and "<video>" not in prompt:
-                prompt = "<video>\n" + prompt
+        if tokenizer is None:
+            raise ValueError("Tokenizer is required.")
 
-        if is_transformers_version(">", "4.47.99") and getattr(processor, "patch_size", None) is None:
-            if (
-                getattr(config, "vision_config", None) is not None
-                and getattr(config.vision_config, "patch_size", None) is not None
-            ):
-                processor.patch_size = config.vision_config.patch_size
-            else:
-                raise ValueError(
-                    "Processor does not have `patch_size` attribute. Please fix the processor or provide `patch_size` in the config."
-                )
+        # preprocess text
+        prompt = f"<image>\n{text}"
+        messages = [{"role": "user", "content": prompt}]
+        text_prompt = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True,
+        )
+        input_ids = tokenizer_image_token(text_prompt, tokenizer, -200, return_tensors="pt").unsqueeze(0)
 
-        inputs = processor(images=image, text=prompt, videos=video, return_tensors="pt")
+        # preprocess video
+        frames = [processor(images=video, return_tensors="pt")]
+
+        if tokenizer.pad_token_id is None:
+            if "qwen" in tokenizer.name_or_path.lower():
+                print("Setting pad token to bos token for qwen model.")
+                tokenizer.pad_token_id = 151643
+        attention_masks = input_ids.ne(tokenizer.pad_token_id).long()
+
+        stop_str = "<|im_end|>"
+        keywords = [stop_str]
+        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
+
+        inputs = {"images": frames, "inputs": input_ids, "attention_mask": attention_masks, "modalities": ["video"], "stopping_criteria": [stopping_criteria]}
         return inputs
 
 

From 97c12267cdd9f12aa10efdc76bd47743c910666b Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Sat, 14 Mar 2026 12:14:33 +0800
Subject: [PATCH 06/39] set default quantization config for videochat model

---
 optimum/intel/openvino/configuration.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index 2002e268ac..24b31607cf 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -426,6 +426,18 @@ class OVQuantizationMethod(str, Enum):
             "weight_only": True,
         },
     },
+    "VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B": {
+        "quantization_configs": {
+            "lm_model": {
+                "bits": 4,
+                "sym": False,
+                "group_size": 128,
+                "ratio": 1.0,
+            },
+            "text_embeddings_model": {"bits": 8, "sym": True, "weight_only": True},
+            "vision_embeddings_model": {"bits": 8, "sym": True, "weight_only": True},
+        },
+    },
 }
 
 _DEFAULT_8BIT_WQ_CONFIGS = {

From 74a8e5fbab65d7a8e809464a134d10ab9204faf0 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Sat, 14 Mar 2026 22:37:49 +0800
Subject: [PATCH 07/39] add rotary_pos_embed to vision_embedding

---
 optimum/exporters/openvino/model_configs.py | 10 ++-
 optimum/exporters/openvino/model_patcher.py | 75 ++++++++++++++++++++-
 2 files changed, 81 insertions(+), 4 deletions(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index e375536d98..2cb7f29992 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -201,6 +201,7 @@
     SanaTextEncoderModelPatcher,
     VideochatFlashQwenLanguageModelPatcher,
     VideochatFlashQwenVisionEmbeddingModelPatcher,
+    VideochatFlashQwenVisionProjectionModelPatcher,
     XverseModelPatcher,
     Zamba2ModelPatcher,
 )
@@ -5372,7 +5373,11 @@ class VideoChatFlashQWENProjectorOpenVINOConfig(OnnxConfig):
 
     @property
     def inputs(self) -> Dict[str, Dict[int, str]]:
-        return {"input": {0: "batch_size", 1: "num_patches", 2: "hidden_size"}}
+        return {"hidden_states": {0: "batch_size", 1: "num_patches", 2: "hidden_size"}}
+
+    def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None):
+        model_kwargs = model_kwargs or {}
+        return VideochatFlashQwenVisionProjectionModelPatcher(self, model, model_kwargs)
 
 
 class VideoChatFlashQwenConfigBehavior(str, enum.Enum):
@@ -5415,7 +5420,8 @@ def __init__(
     def inputs(self) -> Dict[str, Dict[int, str]]:
         if not self._behavior == VideoChatFlashQwenConfigBehavior.VISION_EMBEDDINGS:
             return {}
-        return {"pixel_values": {0: "batch_size", 1: "num_channels", 2: "num_frames", 3: "height", 4: "width"}}
+        return {"hidden_states": {0: "batch_size", 1: "num_channels", 2: "num_frames", 3: "height", 4: "width"},
+                "rotary_pos_emb": {0: "batch_size", 1: "num_tokens",2: "hidden_size"}}
 
     def with_behavior(
         self,
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index ffd3341ab2..e80e80dfb1 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -7647,8 +7647,79 @@ def __init__(
     ):
         model.__orig_forward = model.forward
 
-        def forward_wrap(self, pixel_values):
-            return self.__orig_forward(x=pixel_values)
+        def forward_wrap(self, hidden_states, rotary_pos_emb=None, mask=None, use_image=False):
+            hidden_states = self.patch_embed(hidden_states.type(self.dtype))
+            B, T, L, C = hidden_states.shape  # T: temporal; L: spatial
+            hidden_states = hidden_states.view([B, T * L, C])
+
+            # append cls token
+            cls_tokens = self.cls_token.expand(B, -1, -1)
+            hidden_states = torch.cat((cls_tokens, hidden_states), dim=1)
+
+            # add pos_embed
+            if self.sep_pos_embed:
+                raise NotImplementedError
+            else:
+                if use_image:
+                    if self.sep_image_video_pos_embed:
+                        rotary_pos_emb = self.img_pos_embed
+                    else:
+                        # (1, num_img_patches + 1, embed_dim)
+                        cls_pos_embed = self.pos_embed[:, 0:1, :]
+
+                        img_pos_embed = self.pos_embed[:, 1:, :].view(1, self.num_frames, self.patch_embed.num_patches // self.num_frames, self.embed_dim).mean(dim=1)
+
+                        rotary_pos_emb = torch.cat([cls_pos_embed, img_pos_embed], dim=1)
+                else:
+                    if rotary_pos_emb is None:
+                        rotary_pos_emb = self.pos_embed
+
+            hidden_states = hidden_states + rotary_pos_emb
+
+            # mask tokens, ~mask means visible
+            if mask is not None:
+                hidden_states = hidden_states[~mask].reshape(B, -1, C)
+            else:
+                hidden_states = hidden_states.reshape(B, -1, C)
+
+            residual = None
+
+            for idx, blk in enumerate(self.blocks):
+                if isinstance(hidden_states, tuple) and len(hidden_states) == 2:
+                    hidden_states, residual = hidden_states
+                hidden_states = blk(hidden_states, residual=residual)
+
+            if isinstance(hidden_states, tuple) and len(hidden_states) == 2:
+                hidden_states, residual = hidden_states
+                if residual is not None:
+                    hidden_states = hidden_states + residual
+
+            x_vis = hidden_states
+            if self.x_vis_only:
+                return x_vis
+            else:
+                x_pool_vis = self.clip_projector(x_vis)
+                return x_vis, x_pool_vis, None, None
+
+        model.forward = types.MethodType(forward_wrap, model)
+        super().__init__(config, model, model_kwargs)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward
+
+
+class VideochatFlashQwenVisionProjectionModelPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: "PreTrainedModel",
+        model_kwargs: Dict[str, Any] = None,
+    ):
+        model.__orig_forward = model.forward
+
+        def forward_wrap(self, hidden_states):
+            return self.__orig_forward(input=hidden_states)
 
         model.forward = types.MethodType(forward_wrap, model)
         super().__init__(config, model, model_kwargs)

From a0af467644173ef46ed7e868cd7c52bb35a26704 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Sat, 14 Mar 2026 23:03:54 +0800
Subject: [PATCH 08/39] update vision projection input name

---
 optimum/exporters/openvino/model_configs.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 2cb7f29992..ca2a9d6654 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -5306,6 +5306,7 @@ class SiglipTextOpenVINOConfig(SiglipTextOnnxConfig):
 
 
 class DummyVideoChatFlashQwenInputGenerator(DummyVisionInputGenerator):
+    SUPPORTED_INPUT_NAMES = ("hidden_states", "rotary_pos_emb")
     def __init__(
         self,
         task: str,
@@ -5320,12 +5321,14 @@ def __init__(
         super().__init__(task, normalized_config, batch_size, num_channels, width, height, visual_seq_length, **kwargs)
         if hasattr(normalized_config, "config") and hasattr(normalized_config.config, "mm_local_num_frames"):
             self.num_frames = normalized_config.config.mm_local_num_frames
-            self.height = 224
-            self.width = 224
-            self.image_size = (self.height, self.width)
+        self.height = 224
+        self.width = 224
+        self.image_size = (self.height, self.width)
+        self.patch_size = 14
+        self.embed_dim = 1408
 
     def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
-        if input_name == "pixel_values":
+        if input_name == "hidden_states":
             return self.random_float_tensor(
                 shape=[
                     self.batch_size,
@@ -5337,10 +5340,14 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
                 framework=framework,
                 dtype=float_dtype,
             )
+        elif input_name == "rotary_pos_emb":
+            grid_h, grid_w = self.height // self.patch_size, self.width // self.patch_size
+            grid_t = self.num_frames
+            return self.random_float_tensor([1, 1 + grid_h * grid_t * grid_w, self.embed_dim], framework=framework, dtype=float_dtype)
 
 
 class DummyVideoChatFlashQwenProjectorInputGenerator(DummyInputGenerator):
-    SUPPORTED_INPUT_NAMES = ["input"]
+    SUPPORTED_INPUT_NAMES = ["hidden_states"]
 
     def __init__(
         self,

From 70056d07e802786b52810488257d55091d587fc6 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Mon, 16 Mar 2026 09:00:23 +0800
Subject: [PATCH 09/39] use mm_hidden_size as embed_dim

---
 optimum/exporters/openvino/model_configs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index ca2a9d6654..7c3dd92b4c 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -5325,7 +5325,7 @@ def __init__(
         self.width = 224
         self.image_size = (self.height, self.width)
         self.patch_size = 14
-        self.embed_dim = 1408
+        self.embed_dim = normalized_config.config.mm_hidden_size
 
     def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
         if input_name == "hidden_states":

From 67f33c2eedeb644a74a4e9d7332742a66bdf0871 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Tue, 17 Mar 2026 23:53:16 +0800
Subject: [PATCH 10/39] add check for videochat

---
 optimum/commands/export/openvino.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index cd3280189e..6715a8c826 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -349,8 +349,23 @@ def run(self):
             get_default_quantization_config,
         )
         from ...intel.openvino.utils import TemporaryDirectory
-        from ...intel.utils.import_utils import is_nncf_available
+        from ...intel.utils.import_utils import is_nncf_available, is_transformers_version
         from ...intel.utils.modeling_utils import _infer_library_from_model_name_or_path
+        import os
+
+        is_local = os.path.isdir(self.args.model)
+        if (
+            "OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B" in self.args.model
+            and not is_local
+            and (is_transformers_version(">=", "4.49"))
+        ):
+            raise ValueError(
+                "The model OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B in hugging face "
+                "contains custom code and requires transformers version prior to 4.49. "
+                "It is recommended to install transformers version 4.48 in your environment or download "
+                "https://modelscope.cn/models/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B "
+                "to your local path and use local path to convert."
+            )
 
         if self.args.library is None:
             # TODO: add revision, subfolder and token to args

From b44b15de4cbc77bde827dcd060a379569a7e01cd Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Wed, 18 Mar 2026 10:13:56 +0800
Subject: [PATCH 11/39] Add pipeline for VideoChat

---
 .../openvino/modeling_visual_language.py      | 805 +++++++++++++++++-
 1 file changed, 801 insertions(+), 4 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 1c8c726f76..1f4b1ee4a2 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -5,10 +5,12 @@
 import math
 import os
 import warnings
+import ast
+import re
 from abc import abstractmethod
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import openvino
@@ -4852,6 +4854,332 @@ def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kw
 
 class _OVVideoChatFlashQwenForCausalLM(OVModelForVisualCausalLM):
     additional_parts = ["vision_projection"]
+    IMAGE_TOKEN_INDEX = -200
+    IGNORE_INDEX = -100
+
+    @staticmethod
+    def get_3d_sincos_pos_embed(embed_dim, grid_size, t_size, cls_token=False):
+        """
+        Build a fixed 3D (time, height, width) sin-cos position embedding table.
+
+        embed_dim: total embedding width, must be divisible by 4 (1/4 encodes time, 3/4 encodes space)
+        grid_size: int of the grid height and width
+        t_size: int of the temporal size
+        cls_token: when True, prepend an all-zero row for the CLS token
+        return:
+        pos_embed: [t_size*grid_size*grid_size, embed_dim] or [1+t_size*grid_size*grid_size, embed_dim] (w/o or w/ cls_token)
+        """
+        assert embed_dim % 4 == 0
+        embed_dim_spatial = embed_dim // 4 * 3
+        embed_dim_temporal = embed_dim // 4
+
+        # spatial
+        grid_h = np.arange(grid_size, dtype=np.float32)
+        grid_w = np.arange(grid_size, dtype=np.float32)
+        grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+        grid = np.stack(grid, axis=0)
+
+        grid = grid.reshape([2, 1, grid_size, grid_size])
+        pos_embed_spatial = _OVVideoChatFlashQwenForCausalLM.get_2d_sincos_pos_embed_from_grid(
+            embed_dim_spatial, grid
+        )
+
+        # temporal
+        grid_t = np.arange(t_size, dtype=np.float32)
+        pos_embed_temporal = _OVVideoChatFlashQwenForCausalLM.get_1d_sincos_pos_embed_from_grid(
+            embed_dim_temporal, grid_t
+        )
+
+        # concatenate: [T, H, W] order
+        pos_embed_temporal = pos_embed_temporal[:, np.newaxis, :]
+        pos_embed_temporal = np.repeat(pos_embed_temporal, grid_size**2, axis=1)  # [T, H*W, D // 4]
+        pos_embed_spatial = pos_embed_spatial[np.newaxis, :, :]
+        pos_embed_spatial = np.repeat(pos_embed_spatial, t_size, axis=0)  # [T, H*W, D // 4 * 3]
+
+        pos_embed = np.concatenate([pos_embed_temporal, pos_embed_spatial], axis=-1)
+        pos_embed = pos_embed.reshape([-1, embed_dim])  # [T*H*W, D]
+
+        if cls_token:
+            # NOTE: np.zeros defaults to float64, so the table is float64 on this path.
+            pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+        return pos_embed
+
+    @staticmethod
+    def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+        """Encode a 2-plane grid: half of `embed_dim` comes from grid[0], the other half from grid[1]."""
+        assert embed_dim % 2 == 0
+
+        # each plane is flattened and encoded independently with embed_dim // 2 channels
+        emb_h = _OVVideoChatFlashQwenForCausalLM.get_1d_sincos_pos_embed_from_grid(
+            embed_dim // 2, grid[0]
+        )  # (H*W, D/2)
+        emb_w = _OVVideoChatFlashQwenForCausalLM.get_1d_sincos_pos_embed_from_grid(
+            embed_dim // 2, grid[1]
+        )  # (H*W, D/2)
+        return np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
+
+
+    @staticmethod
+    def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+        """
+        Encode positions as [sin | cos] features over geometrically spaced frequencies.
+        embed_dim: output dimension for each position
+        pos: array of positions to be encoded, any shape; it is flattened to size (M,)
+        out: (M, D)
+        """
+        assert embed_dim % 2 == 0
+        omega = np.arange(embed_dim // 2, dtype=np.float32)
+        omega /= embed_dim / 2.0
+        omega = 1.0 / 10000**omega  # (D/2,)
+
+        pos = pos.reshape(-1)  # (M,)
+        out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product
+
+        emb_sin = np.sin(out)  # (M, D/2)
+        emb_cos = np.cos(out)  # (M, D/2)
+        return np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+
+    def __init__(
+        self,
+        language_model: ov.Model,
+        text_embeddings: ov.Model,
+        vision_embeddings: ov.Model,
+        config: PretrainedConfig = None,
+        device: str = "CPU",
+        dynamic_shapes: bool = None,
+        ov_config: Optional[Dict[str, str]] = None,
+        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
+        **kwargs,
+    ):
+        """Initialize the base model, then precompute the video and image sin-cos position tables."""
+        from torch import nn
+
+        super().__init__(
+            language_model=language_model,
+            text_embeddings=text_embeddings,
+            vision_embeddings=vision_embeddings,
+            config=config,
+            device=device,
+            dynamic_shapes=dynamic_shapes,
+            ov_config=ov_config,
+            model_save_dir=model_save_dir,
+            quantization_config=quantization_config,
+            **kwargs,
+        )
+        num_frames = getattr(config, "mm_local_num_frames", 8)
+        self.num_attention_heads = 16
+        self.patch_size = 14
+        self.image_size = 224
+        patches_per_side = self.image_size // self.patch_size
+        self.grid_size = (num_frames, patches_per_side, patches_per_side)  # (T, H, W)
+        self.num_patches = self.grid_size[0] * self.grid_size[1] * self.grid_size[2]
+        self.num_img_patches = self.grid_size[1] * self.grid_size[2]
+        # Read the width from the config like the exporter does; 1408 was the hard-coded value.
+        self.embed_dim = getattr(config, "mm_hidden_size", None) or 1408
+        self.pos_embed = nn.Parameter(torch.zeros(1, self.num_patches + 1, self.embed_dim))
+        self.img_pos_embed = nn.Parameter(torch.zeros(1, self.num_img_patches + 1, self.embed_dim))
+
+        # Video table: T frames of an H x W grid, plus a leading CLS row.
+        pos_embed = _OVVideoChatFlashQwenForCausalLM.get_3d_sincos_pos_embed(
+            self.embed_dim, self.grid_size[1], self.grid_size[0], cls_token=True
+        )
+        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
+        # Image table: the same grid with a single frame.
+        img_pos_embed = _OVVideoChatFlashQwenForCausalLM.get_3d_sincos_pos_embed(
+            self.embed_dim, self.grid_size[1], 1, cls_token=True
+        )
+        self.img_pos_embed.data.copy_(torch.from_numpy(img_pos_embed).float().unsqueeze(0))
+
+    @staticmethod
+    def bipartite_soft_matching(metric: torch.Tensor, r: int) -> Tuple[Callable, Callable]:
+        """
+        Applies ToMe with a balanced matching set (50%, 50%).
+
+        Input size is [batch, tokens, channels].
+        r indicates the number of tokens to remove (max 50% of tokens).
+        Returns a (merge, unmerge) pair of closures bound to the matching computed from `metric`.
+        """
+        protected = 0
+
+        t = metric.shape[1]
+        r = min(r, (t - protected) // 2)
+
+        assert r > 0, r
+
+        with torch.no_grad():
+            metric = metric / metric.norm(dim=-1, keepdim=True)
+            a, b = metric[..., ::2, :], metric[..., 1::2, :]
+            scores = a @ b.transpose(-1, -2)
+
+            node_max, node_idx = scores.max(dim=-1)
+            edge_idx = node_max.argsort(dim=-1, descending=True)[..., None]
+
+            unm_idx = edge_idx[..., r:, :]  # Unmerged Tokens
+            src_idx = edge_idx[..., :r, :]  # Merged Tokens
+            dst_idx = node_idx[..., None].gather(dim=-2, index=src_idx)
+
+        def merge(x: torch.Tensor, mode="mean") -> torch.Tensor:
+            # NOTE: `mode` is accepted but unused; merged tokens are always summed by scatter_add.
+            src, dst = x[..., ::2, :], x[..., 1::2, :]
+            n, t1, c = src.shape
+            unm = src.gather(dim=-2, index=unm_idx.expand(n, t1 - r, c))
+            src = src.gather(dim=-2, index=src_idx.expand(n, r, c))
+            dst = dst.scatter_add(-2, dst_idx.expand(n, r, c), src)
+
+            return torch.cat([unm, dst], dim=1)
+
+        def unmerge(x: torch.Tensor) -> torch.Tensor:
+            unm_len = unm_idx.shape[1]
+            unm, dst = x[..., :unm_len, :], x[..., unm_len:, :]
+            n, _, c = unm.shape
+
+            src = dst.gather(dim=-2, index=dst_idx.expand(n, r, c))
+
+            out = torch.zeros(n, metric.shape[1], c, device=x.device, dtype=x.dtype)
+
+            out[..., 1::2, :] = dst
+            out.scatter_(dim=-2, index=(2 * unm_idx).expand(n, unm_len, c), src=unm)
+            out.scatter_(dim=-2, index=(2 * src_idx).expand(n, r, c), src=src)
+
+            return out
+
+        return merge, unmerge
+
+    @staticmethod
+    def merge_wavg(
+        merge: Callable, x: torch.Tensor, size: torch.Tensor = None
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Apply `merge` as a size-weighted average and return `(merged_x, merged_size)`.
+
+        `size` weights each token of `x`; when omitted every token gets weight one
+        (a ones tensor shaped like `x[..., 0, None]`).
+        """
+        if size is None:
+            size = torch.ones_like(x[..., 0, None])
+        weighted_sum = merge(x * size, mode="sum")
+        size = merge(size, mode="sum")
+        return weighted_sum / size, size
+
+    def get_vision_embeddings(self, images):
+        """
+        Run the vision encoder on a `B T C H W` batch and return `B T*L C` features.
+
+        The first output token is dropped (presumably the CLS slot - confirm against the encoder).
+        """
+        if type(images) is list:
+            raise NotImplementedError
+        num_frames = images.shape[1]
+        single_frame = num_frames == 1
+        # One-frame inputs use the image positional table, longer clips the video one.
+        table = self.img_pos_embed if single_frame else self.pos_embed
+        encoded = self.vision_embeddings(
+            images.permute(0, 2, 1, 3, 4), rotary_pos_emb=table.detach(), use_image=single_frame
+        ).last_hidden_state[:, 1:, :]
+        if isinstance(encoded, np.ndarray):
+            return torch.from_numpy(encoded)
+        return encoded
+
+    def merge_tokens(self, x, target_num_token):
+        r"""
+        Reduce `x` ([batch, tokens, channels]) to `target_num_token` tokens with ToMe merging.
+
+        Each round removes at most half of the remaining tokens, so several rounds may run.
+        Example: `x = self.merge_tokens(torch.randn(10, 2560, c), target_num_token=1280)`.
+        """
+        b, p, c = x.shape
+        if not 0 < target_num_token <= p:
+            raise ValueError(f"cannot merge {p} tokens down to {target_num_token}")
+        if p == target_num_token:
+            return x  # nothing to merge
+
+        # Plan how many tokens each round removes (never more than half of what is left).
+        r_merge_list = []
+        remaining = p
+        while remaining != target_num_token:
+            step = min(remaining - target_num_token, remaining // 2)
+            r_merge_list.append(step)
+            remaining -= step
+
+        size = None
+        head = self.num_attention_heads
+        for r in r_merge_list:
+            # Matching metric: channels split per head and averaged over heads -> [b, p, c // head].
+            metric = x.reshape(b, p, head, c // head).mean(2)
+            merge, _ = _OVVideoChatFlashQwenForCausalLM.bipartite_soft_matching(metric, r)
+            x, size = _OVVideoChatFlashQwenForCausalLM.merge_wavg(merge, x, size)
+            p = x.shape[1]
+
+        return x
+
+    def get_vision_projection(self, x, compress=False, local_num_frames=-1):  # 64 tokens per single frame
+        """
+        Merge tokens down to the ToMe budget, then apply the vision projection.
+
+        With `compress`, frames are concatenated per clip and 16 tokens per frame are kept;
+        otherwise each entry of `x` keeps 64 tokens. A numpy projection result is converted to torch.
+        """
+        side = self.image_size // self.patch_size
+        assert side * side == x.shape[1]
+        if local_num_frames not in (-1, 1):
+            assert compress is True
+        token_budget = 64
+        if compress:
+            frames_per_clip = x.shape[0] if local_num_frames == -1 else local_num_frames
+            num_clips = 1 if local_num_frames == -1 else x.shape[0] // local_num_frames
+            x = x.reshape(num_clips, -1, x.shape[-1])
+            token_budget = 16 * frames_per_clip
+        merged = self.merge_tokens(x, target_num_token=token_budget)
+        projected = self.vision_projection(merged)
+        projected = torch.from_numpy(projected) if isinstance(projected, np.ndarray) else projected
+        return projected
+
+    @staticmethod
+    def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
+        """
+        Tokenize `prompt`, putting `image_token_index` wherever the text contains "<image>".
+
+        Returns a list of ids, or a 1-D long tensor for `return_tensors="pt"`; other values raise ValueError.
+        """
+        prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
+        # A BOS id that the tokenizer prepends to each chunk is kept once and stripped everywhere else.
+        offset = int(bool(prompt_chunks[0]) and prompt_chunks[0][0] == tokenizer.bos_token_id)
+        input_ids = prompt_chunks[0][:offset]
+        for idx, chunk in enumerate(prompt_chunks):
+            if idx:
+                input_ids.append(image_token_index)
+            input_ids.extend(chunk[offset:])
+        if return_tensors is None:
+            return input_ids
+        if return_tensors == "pt":
+            return torch.tensor(input_ids, dtype=torch.long)
+        raise ValueError(f"Unsupported tensor type: {return_tensors}")
+
+    class KeywordsStoppingCriteria(StoppingCriteria):
+        """Stop generation once any keyword shows up at the end of the generated sequence."""
+
+        def __init__(self, keywords, tokenizer, input_ids):
+            self.keywords = keywords
+            self.keyword_ids = []
+            for keyword in keywords:
+                cur_keyword_ids = tokenizer(keyword).input_ids
+                if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
+                    cur_keyword_ids = cur_keyword_ids[1:]
+                self.keyword_ids.append(torch.tensor(cur_keyword_ids))
+            self.tokenizer = tokenizer
+            self.start_len = input_ids.shape[1]
+
+        def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+            assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)"  # TODO
+            offset = min(output_ids.shape[1] - self.start_len, 3)
+            self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
+            # torch.equal yields one bool (False on a length mismatch); `==` produced a tensor here.
+            if any(torch.equal(output_ids[0, -kid.shape[0] :], kid) for kid in self.keyword_ids):
+                return True
+            # Fallback: look for the keyword text in the last few decoded tokens.
+            outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
+            return any(keyword in outputs for keyword in self.keywords)
 
     @staticmethod
     def preprocess_inputs(
@@ -4870,6 +5198,7 @@ def preprocess_inputs(
             raise ValueError("Audio input is not supported")
         if tokenizer is None:
             raise ValueError("Tokenizer is required.")
+        image_sizes = None
 
         # preprocess text
         prompt = f"<image>\n{text}"
@@ -4877,7 +5206,14 @@ def preprocess_inputs(
         text_prompt = tokenizer.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True,
         )
-        input_ids = tokenizer_image_token(text_prompt, tokenizer, -200, return_tensors="pt").unsqueeze(0)
+        input_ids = _OVVideoChatFlashQwenForCausalLM.tokenizer_image_token(text_prompt, tokenizer, _OVVideoChatFlashQwenForCausalLM.IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0)
+
+        if isinstance(video, list):
+            if isinstance(video[0], np.ndarray):
+                image_sizes = [video[0].shape[:2]]
+            else:
+                width, height = video[0].size
+                image_sizes = [(height, width)]
 
         # preprocess video
         frames = [processor(images=video, return_tensors="pt")]
@@ -4890,11 +5226,472 @@ def preprocess_inputs(
 
         stop_str = "<|im_end|>"
         keywords = [stop_str]
-        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
+        stopping_criteria = _OVVideoChatFlashQwenForCausalLM.KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
 
-        inputs = {"images": frames, "inputs": input_ids, "attention_mask": attention_masks, "modalities": ["video"], "stopping_criteria": [stopping_criteria]}
+        # inputs = {"images": frames, "inputs": input_ids, "attention_mask": attention_masks, "modalities": ["video"], "stopping_criteria": [stopping_criteria], "image_sizes":image_sizes}
+        inputs = {"images": frames, "inputs": input_ids, "attention_mask": attention_masks, "stopping_criteria": [stopping_criteria], "image_sizes":image_sizes}
         return inputs
 
+    def encode_video_image(self, images_list, video_idx_in_batch):
+        """Encode each batch entry with the vision encoder, then run it through the projection (connector) step."""
+        bs = len(images_list)
+
+        concat_images = []
+        concat_videos = []
+        for idx, image in enumerate(images_list):
+            if idx in video_idx_in_batch:
+                concat_videos.append(image)
+            else:
+                concat_images.append(image)
+        # Entries whose index is listed in video_idx_in_batch are videos; the rest are images.
+        has_image = len(concat_images) > 0
+        has_video = len(concat_videos) > 0
+
+        mm_local_num_frames = getattr(self.config, "mm_local_num_frames", -1)
+        assert mm_local_num_frames != -1
+        if has_image:
+            image_split_sizes = [image.shape[0] for image in concat_images]
+            concat_images = torch.cat([image.unsqueeze(1) for image in concat_images], dim=0)
+            # unsqueeze(1) above inserts a length-1 axis so each image is fed as a one-frame clip.
+            images_features = self.get_vision_embeddings(concat_images) # B_i, N, D
+            images_features = torch.split(images_features, image_split_sizes)
+
+        if has_video:
+            video_split_sizes = [video.shape[0] // mm_local_num_frames for video in concat_videos]
+            concat_videos = torch.cat([video.reshape(video.shape[0] // mm_local_num_frames, mm_local_num_frames, video.shape[1], video.shape[2], video.shape[3]) for video in concat_videos], dim=0)
+            # Frames were regrouped above into clips of mm_local_num_frames frames each.
+            videos_features = self.get_vision_embeddings(concat_videos) # B_v, N, D
+            videos_features = [v.reshape(-1, v.shape[-2] // mm_local_num_frames, v.shape[-1]) for v in torch.split(videos_features, video_split_sizes)]
+
+        all_videos_or_images_features = []
+        img_idx = 0
+        vid_idx = 0
+
+        for idx in range(bs):
+            if idx in video_idx_in_batch:
+                feat = self.get_vision_projection(videos_features[vid_idx], compress=True, local_num_frames=getattr(self.config, "mm_local_num_frames", -1))
+                vid_idx += 1
+            else:
+                feat = self.get_vision_projection(images_features[img_idx], compress=False)
+                img_idx += 1
+            # Features are collected in the original batch order.
+            all_videos_or_images_features.append(feat)
+
+        if has_video:
+            assert vid_idx == len(videos_features), f"vid: {vid_idx} != {len(videos_features)}"
+        if has_image:
+            assert img_idx == len(images_features), f"img: {img_idx} != {len(images_features)}"
+
+        return all_videos_or_images_features
+
+    @staticmethod
+    def select_best_resolution(original_size, possible_resolutions, max_resolutions, patch_size):
+        """
+        Selects the best resolution from a list of possible resolutions based on the original size.
+
+        Args:
+            original_size (tuple): The original size of the image in the format (width, height).
+            possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
+            max_resolutions (int, optional): Pixel budget; a candidate larger than one patch is skipped when its area plus one patch area exceeds it. `None` disables the check.
+            patch_size (int): Side length of one patch, used by the budget check.
+
+        Returns:
+            tuple: The best fit resolution in the format (width, height).
+        """
+        original_width, original_height = original_size
+        best_fit = None
+        max_effective_resolution = 0
+        min_wasted_resolution = float("inf")
+
+        for width, height in possible_resolutions:
+            # NOTE: the budget also has to cover one extra (global) patch.
+            if max_resolutions is not None and width * height != patch_size * patch_size and width * height + patch_size * patch_size > max_resolutions:
+                continue
+            # Calculate the downscaled size to keep the aspect ratio
+            scale = min(width / original_width, height / original_height)
+            downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
+            # Calculate effective and wasted resolutions
+            effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
+            wasted_resolution = (width * height) - effective_resolution
+            if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
+                max_effective_resolution = effective_resolution
+                min_wasted_resolution = wasted_resolution
+                best_fit = (width, height)
+
+        assert best_fit is not None, f"Can't find suitable fit in {possible_resolutions} at max:{max_resolutions}"
+        return best_fit
+
+    @staticmethod
+    def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size, max_resolutions=None):
+        """
+        Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
+
+        Args:
+            image_size (tuple): The size of the input image in the format (width, height).
+            grid_pinpoints (str or list): Possible resolutions: a list, the string form of a list, or a "(AxB)...(CxD)" range of grid counts that is expanded and scaled by patch_size.
+            patch_size (int): The size of each image patch.
+            max_resolutions (int, optional): Forwarded unchanged to `select_best_resolution`.
+
+        Returns:
+            tuple: The shape of the image patch grid in the format (width, height).
+        """
+        if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
+            assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
+            # Use regex to extract the range from the input string
+            matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
+            range_start = tuple(map(int, matches[0]))
+            range_end = tuple(map(int, matches[-1]))
+            # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1])
+            grid_pinpoints = [(i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)]
+            # Multiply all elements by patch_size
+            grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
+        if isinstance(grid_pinpoints, list):
+            possible_resolutions = grid_pinpoints
+        else:
+            possible_resolutions = ast.literal_eval(grid_pinpoints)
+        width, height = _OVVideoChatFlashQwenForCausalLM.select_best_resolution(image_size, possible_resolutions, max_resolutions=max_resolutions, patch_size=patch_size)
+
+        return width // patch_size, height // patch_size
+
+    def get_text_embeddings(self, input_ids):
+        """
+        Embed token ids through the parent implementation; numpy results are converted to torch.
+
+        A 1-D `input_ids` gets a temporary batch axis, which is stripped from the result again.
+        """
+        was_unbatched = input_ids.ndim == 1
+        batched_ids = input_ids.unsqueeze(0) if was_unbatched else input_ids
+        embeds = super().get_text_embeddings(batched_ids)
+        if was_unbatched and embeds.ndim > 0 and embeds.shape[0] == 1:
+            embeds = embeds[0]
+        # The parent may hand back a numpy array; callers here expect a torch tensor.
+        return torch.from_numpy(embeds) if isinstance(embeds, np.ndarray) else embeds
+
+
+    # def get_multimodal_embeddings(self, input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities=["image"], image_sizes=None):
+    def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask=None, position_ids=None, past_key_values=None, labels=None, modalities=None, image_sizes=None, **kwargs):
+        """
+        Build `inputs_embeds` by splicing the projected vision features into the text embeddings at every image-token position.
+        Returns `(inputs_embeds, attention_mask, position_ids)`; without `pixel_values` only the text embeddings are computed.
+        """
+        images = pixel_values
+
+        if images is None:
+            inputs_embeds = self.get_text_embeddings(input_ids)
+            return inputs_embeds, attention_mask, position_ids
+        else:
+            modalities = ["video"]
+
+        # NOTE: the caller-supplied `modalities` is ignored; visual input is always handled as video.
+        if type(images) is list:
+            images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images]
+
+        video_idx_in_batch = []
+        for idx, modality in enumerate(modalities):
+            if modality == "video":
+                video_idx_in_batch.append(idx)
+
+        images_list = []
+        for image in images:
+            if image.ndim == 4:
+                images_list.append(image)
+            else:
+                images_list.append(image.unsqueeze(0))
+
+        vision_encode_type = getattr(self.config, "vision_encode_type", "image")
+        mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat")
+        image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square")
+        frame_aspect_ratio = getattr(self.config, "frame_aspect_ratio", "square")
+        mm_newline_position = getattr(self.config, "mm_newline_position", "nothing")
+
+        if vision_encode_type == "video_image": # video backbone, process video with compress
+            image_features = self.encode_video_image(images_list, video_idx_in_batch=video_idx_in_batch)
+        else:
+            raise NotImplementedError(vision_encode_type)
+
+        if mm_patch_merge_type == "flat":
+            image_features = [x.flatten(0, 1) for x in image_features]
+        elif mm_patch_merge_type.startswith("spatial"):
+            new_image_features = []
+            for image_idx, image_feature in enumerate(image_features):
+
+                if image_idx in video_idx_in_batch:  # video operations
+
+                    if "anyres" in frame_aspect_ratio:
+                        raise NotImplementedError
+                    else:
+                        frame_feature = image_feature
+
+                    if "pad" in mm_patch_merge_type:
+                        if mm_newline_position == 'one_token':
+                            frame_feature = frame_feature.flatten(0, 1)
+                            if "unpad" in mm_patch_merge_type:
+                                frame_feature = torch.cat((frame_feature, self.model.image_newline[None].to(frame_feature.device)), dim=0)
+                            else:
+                                frame_feature = torch.cat((frame_feature, self.model.frame_newline[None].to(frame_feature.device)), dim=0)
+                        elif mm_newline_position == 'nothing':
+                            frame_feature = frame_feature.flatten(0, 1)
+                        else:
+                            raise NotImplementedError("add pad please!!")
+                    else:
+                        frame_feature = frame_feature.flatten(0, 1)
+
+                    # Every video branch ends with the frame axis flattened into the token axis.
+                    image_feature = frame_feature
+
+                elif image_feature.shape[0] > 1:  # multi patches and multi images operations
+                    base_image_feature = image_feature[0]
+                    image_feature = image_feature[1:]
+                    origin_size = image_feature.shape
+
+                    height = width = 8
+                    assert height * width == base_image_feature.shape[0], f"height:{height}, width: {width}, base_image_feature: {base_image_feature.shape}"
+
+                    if "anyres_max" in image_aspect_ratio:
+                        matched_anyres_max_num_patches = re.match(r"anyres_max_(\d+)", image_aspect_ratio)
+                        if matched_anyres_max_num_patches:
+                            max_num_patches = int(matched_anyres_max_num_patches.group(1))
+
+                    if "anyres" in image_aspect_ratio:
+                        vision_tower_image_size = 224
+                        try:
+                            num_patch_width, num_patch_height = _OVVideoChatFlashQwenForCausalLM.get_anyres_image_grid_shape(image_sizes[image_idx], self.config.image_grid_pinpoints, vision_tower_image_size, max_resolutions=None)
+                        except Exception as e:
+                            print(f"Error: {e}")
+                            raise e
+                            # num_patch_width, num_patch_height = 2, 2
+
+                        image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
+                    else:
+                        raise NotImplementedError(image_aspect_ratio)
+                        # (unreachable fixed 2x2 reshape removed: only anyres aspect ratios are supported here)
+
+                    if "maxpool2x2" in mm_patch_merge_type:
+                        raise NotImplementedError
+                    elif "unpad" in mm_patch_merge_type and "anyres_max" in image_aspect_ratio and matched_anyres_max_num_patches:
+                        raise NotImplementedError
+                    elif "unpad" in mm_patch_merge_type:
+                        raise NotImplementedError
+                    else:
+                        image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
+                        image_feature = image_feature.flatten(0, 3)
+                    if "nobase" in mm_patch_merge_type:
+                        pass
+                    else:
+                        try:
+                            image_feature = torch.cat((base_image_feature, image_feature), dim=0)
+                        except Exception as e:
+                            raise ValueError(f"{num_patch_width} {num_patch_height} now: base_image_feature: {base_image_feature.shape}, {image_feature.shape}, image_sizes[image_idx]: {image_sizes[image_idx]}, origin_size: {origin_size}, {image_sizes[image_idx]}, {self.config.image_grid_pinpoints}, {vision_tower_image_size}") from e
+                else:  # single image operations
+                    image_feature = image_feature[0]
+                    if "unpad" in mm_patch_merge_type:
+                        image_feature = torch.cat((image_feature, self.model.image_newline[None]), dim=0)
+
+                # image_feature is now the flat token sequence for this batch entry.
+                new_image_features.append(image_feature)
+            image_features = new_image_features
+        else:
+            raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}")
+
+        # TODO: image start / end is not implemented here to support pretraining.
+        if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr(self.config, "mm_use_im_start_end", False):
+            raise NotImplementedError
+
+        # Let's just add dummy tensors if they do not exist,
+        # it is a headache to deal with None all the time.
+        # But it is not ideal, and if you have a better idea,
+        # please open an issue / submit a PR, thanks.
+        _labels = labels
+        _position_ids = position_ids
+        _attention_mask = attention_mask
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
+        else:
+            attention_mask = attention_mask.bool()
+        if position_ids is None:
+            position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
+        if labels is None:
+            labels = torch.full_like(input_ids, _OVVideoChatFlashQwenForCausalLM.IGNORE_INDEX)
+
+        # Drop masked-out positions so each row only holds its real tokens.
+        input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
+        labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
+
+        new_input_embeds = []
+        new_labels = []
+        cur_image_idx = 0
+
+        mm_llm_compress = getattr(self.config, "mm_llm_compress", False)
+
+        if mm_llm_compress:
+            self.language_model.model.llm_compress_type = getattr(self.config, "llm_compress_type", "attention")
+            self.language_model.model.llm_compress_layer_list = getattr(self.config, "llm_compress_layer_list", [8, 16, 24])
+            self.language_model.model.llm_image_token_ratio_list = getattr(self.config, "llm_image_token_ratio_list", [1.0, 0.5, 0.25, 0.125])
+            first_image_token_position = []
+            text_prompt_lens = []
+        else:
+            self.language_model.model.llm_compress_type = "attention"
+            self.language_model.model.llm_compress_layer_list = []
+            self.language_model.model.llm_image_token_ratio_list = []
+            first_image_token_position = []
+            text_prompt_lens = []
+
+        # Insert the image embeddings row by row.
+        for batch_idx, cur_input_ids in enumerate(input_ids):
+            num_images = (cur_input_ids == _OVVideoChatFlashQwenForCausalLM.IMAGE_TOKEN_INDEX).sum()
+
+            if mm_llm_compress:
+                ####### copy from pdrop, only support single image/video NOTE ##################
+                # record image position for further dropping
+                image_index = torch.where(cur_input_ids == _OVVideoChatFlashQwenForCausalLM.IMAGE_TOKEN_INDEX)[0].tolist()
+                assert len(image_index) == 1, f"Only support singe/video: {image_index}"
+                if image_index == []:
+                    first_image_token_position.append(-1)
+                else:
+                    first_image_token_position.append(image_index[0])
+
+                # record input instruction length in inference mode
+                if not self.training:
+                    if image_index == []:
+                        assert num_images == 0, num_images
+                    else:
+                        assert num_images == 1, f"num_images={num_images}"
+                    text_prompt_lens.append(cur_input_ids.shape[0] - num_images)   # consider image place holder
+
+                ###############################################
+
+            # A row without image tokens still consumes one feature slot (as an empty slice).
+            if num_images == 0:
+                cur_image_features = image_features[cur_image_idx]
+                cur_input_embeds_1 = self.get_text_embeddings(cur_input_ids)
+                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
+                new_input_embeds.append(cur_input_embeds)
+                new_labels.append(labels[batch_idx])
+                cur_image_idx += 1
+                continue
+
+            image_token_indices = [-1] + torch.where(cur_input_ids == _OVVideoChatFlashQwenForCausalLM.IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
+            cur_input_ids_noim = []
+            cur_labels = labels[batch_idx]
+            cur_labels_noim = []
+            for i in range(len(image_token_indices) - 1):
+                cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1 : image_token_indices[i + 1]])
+                cur_labels_noim.append(cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]])
+            split_sizes = [x.shape[0] for x in cur_labels_noim]
+            cur_input_embeds = self.get_text_embeddings(torch.cat(cur_input_ids_noim))
+            cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
+            cur_new_input_embeds = []
+            cur_new_labels = []
+
+            for i in range(num_images + 1):
+                cur_new_input_embeds.append(cur_input_embeds_no_im[i])
+                cur_new_labels.append(cur_labels_noim[i])
+                if i < num_images:
+                    try:
+                        cur_image_features = image_features[cur_image_idx]
+                    except IndexError:
+                        print(f"cur_image_idx={cur_image_idx} is not ok")
+                        cur_image_features = image_features[cur_image_idx - 1]
+                    cur_image_idx += 1
+                    cur_new_input_embeds.append(cur_image_features)
+                    cur_new_labels.append(torch.full((cur_image_features.shape[0],), _OVVideoChatFlashQwenForCausalLM.IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))
+
+            cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]
+
+            # Text and vision pieces are concatenated in prompt order.
+            cur_new_input_embeds = torch.cat(cur_new_input_embeds)
+            cur_new_labels = torch.cat(cur_new_labels)
+
+            new_input_embeds.append(cur_new_input_embeds)
+            new_labels.append(cur_new_labels)
+
+
+        if mm_llm_compress:
+            self.language_model.model.first_image_token_position = first_image_token_position
+            self.language_model.model.text_prompt_lens = text_prompt_lens
+            self.language_model.model.num_image_token_lens = [image_feature.shape[0] for image_feature in image_features]
+
+        # Truncate sequences to max length as image embeddings can make the sequence longer
+        tokenizer_model_max_length = getattr(self.config, "tokenizer_model_max_length", None)
+        # Do not zip with `modalities` here: it holds a single entry and would drop every other batch row.
+
+        new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
+        new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
+
+        # Combine them
+        max_len = max(x.shape[0] for x in new_input_embeds)
+        batch_size = len(new_input_embeds)
+
+        new_input_embeds_padded = []
+        new_labels_padded = torch.full((batch_size, max_len), _OVVideoChatFlashQwenForCausalLM.IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
+        attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
+        position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
+        # Pad every row to max_len on the side given by config.tokenizer_padding_side.
+
+        for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
+            cur_len = cur_new_embed.shape[0]
+            if getattr(self.config, "tokenizer_padding_side", "right") == "left":
+                new_input_embeds_padded.append(torch.cat((torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device), cur_new_embed), dim=0))
+                if cur_len > 0:
+                    new_labels_padded[i, -cur_len:] = cur_new_labels
+                    attention_mask[i, -cur_len:] = True
+                    position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
+            else:
+                new_input_embeds_padded.append(torch.cat((cur_new_embed, torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0))
+                if cur_len > 0:
+                    new_labels_padded[i, :cur_len] = cur_new_labels
+                    attention_mask[i, :cur_len] = True
+                    position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
+
+        new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
+        # Outputs the caller did not supply are returned as None again below.
+
+        if _labels is None:
+            new_labels = None
+        else:
+            new_labels = new_labels_padded
+
+        if _attention_mask is None:
+            attention_mask = None
+        else:
+            attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
+
+        if _position_ids is None:
+            position_ids = None
+
+        # new_labels is computed above but is not part of the returned tuple.
+        return new_input_embeds, attention_mask, position_ids
+
+    def _update_model_kwargs_for_generation(
+        self,
+        outputs: ModelOutput,
+        model_kwargs: Dict[str, Any],
+        is_encoder_decoder: bool = False,
+        num_new_tokens: int = 1,
+    ) -> Dict[str, Any]:
+        """
+        Advance the generation kwargs, then drop the vision inputs and resize the attention mask.
+
+        The mask is replaced by all-ones of width `_past_length + 1` whenever it is shorter than that
+        (presumably because the spliced vision features lengthen the cached sequence - confirm).
+        """
+        updated = super()._update_model_kwargs_for_generation(
+            outputs=outputs,
+            model_kwargs=model_kwargs,
+            is_encoder_decoder=is_encoder_decoder,
+            num_new_tokens=num_new_tokens,
+        )
+        for stale_key in ("images", "image_sizes"):
+            updated.pop(stale_key, None)
+        required_len = self.language_model._past_length + 1
+        mask = updated.get("attention_mask")
+        if mask is None or mask.shape[1] >= required_len:
+            return updated
+        updated["attention_mask"] = torch.ones(
+            (mask.shape[0], required_len), dtype=mask.dtype, device=mask.device
+        )
+        return updated
+
 
 MODEL_TYPE_TO_CLS_MAPPING = {
     "llava": _OVLlavaForCausalLM,

From 7bb536d2fd25714e5f98a6446a8d4064c070a2b3 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Wed, 18 Mar 2026 13:08:00 +0800
Subject: [PATCH 12/39] support text only

---
 .../openvino/modeling_visual_language.py      | 36 +++++++++----------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 1f4b1ee4a2..98df805614 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -5199,38 +5199,38 @@ def preprocess_inputs(
         if tokenizer is None:
             raise ValueError("Tokenizer is required.")
         image_sizes = None
+        frames = None
+        results = {}
 
         # preprocess text
-        prompt = f"<image>\n{text}"
+        prompt = f"<image>\n{text}" if (image is not None or video is not None) else text
         messages = [{"role": "user", "content": prompt}]
         text_prompt = tokenizer.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True,
         )
         input_ids = _OVVideoChatFlashQwenForCausalLM.tokenizer_image_token(text_prompt, tokenizer, _OVVideoChatFlashQwenForCausalLM.IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0)
-
-        if isinstance(video, list):
-            if isinstance(video[0], np.ndarray):
-                image_sizes = [video[0].shape[:2]]
-            else:
-                width, height = video[0].size
-                image_sizes = [(height, width)]
+        results["input_ids"] = input_ids
 
         # preprocess video
-        frames = [processor(images=video, return_tensors="pt")]
+        if video is not None:
+            if isinstance(video, list):
+                if isinstance(video[0], np.ndarray):
+                    image_sizes = [video[0].shape[:2]]
+                else:
+                    width, height = video[0].size
+                    image_sizes = [(height, width)]
+            frames = [processor(images=video, return_tensors="pt")]
+            results["images"] = frames
+            results["image_sizes"] = image_sizes
 
         if tokenizer.pad_token_id is None:
             if "qwen" in tokenizer.name_or_path.lower():
                 print("Setting pad token to bos token for qwen model.")
                 tokenizer.pad_token_id = 151643
         attention_masks = input_ids.ne(tokenizer.pad_token_id).long()
+        results["attention_mask"] = attention_masks
 
-        stop_str = "<|im_end|>"
-        keywords = [stop_str]
-        stopping_criteria = _OVVideoChatFlashQwenForCausalLM.KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
-
-        # inputs = {"images": frames, "inputs": input_ids, "attention_mask": attention_masks, "modalities": ["video"], "stopping_criteria": [stopping_criteria], "image_sizes":image_sizes}
-        inputs = {"images": frames, "inputs": input_ids, "attention_mask": attention_masks, "stopping_criteria": [stopping_criteria], "image_sizes":image_sizes}
-        return inputs
+        return results
 
     def encode_video_image(self, images_list, video_idx_in_batch):
         # video encoder编码后按图像的connector处理
@@ -5370,10 +5370,6 @@ def get_text_embeddings(self, input_ids):
 
     # def get_multimodal_embeddings(self, input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities=["image"], image_sizes=None):
     def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask=None, position_ids=None, past_key_values=None, labels=None, modalities=["image"], image_sizes=None, **kwargs):
-        # assert type(modalities) is list, modalities
-        print('get_multimodal_embeddings enter')
-        # if input_ids is not None:
-        #     print(f'input_ids: {input_ids.shape}, attention_mask: {attention_mask.shape}, position_ids: {position_ids.shape}')
         images = pixel_values
 
         if images is None:

From 2cf85fd8fbd5a1d39c11104eaa54558871d1b0ac Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Wed, 18 Mar 2026 13:54:46 +0800
Subject: [PATCH 13/39] remove unused code

---
 .../openvino/modeling_visual_language.py      | 122 +++---------------
 1 file changed, 15 insertions(+), 107 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 98df805614..68856463a8 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -4805,58 +4805,12 @@ def preprocess_inputs(
         return inputs
 
 
-def tokenizer_image_token(prompt, tokenizer, image_token_index=-200, return_tensors=None):
-    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
-
-    def insert_separator(X, sep):
-        return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]
-
-    input_ids = []
-    offset = 0
-    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
-        offset = 1
-        input_ids.append(prompt_chunks[0][0])
-
-    for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
-        input_ids.extend(x[offset:])
-
-    if return_tensors is not None:
-        if return_tensors == "pt":
-            return torch.tensor(input_ids, dtype=torch.long)
-        raise ValueError(f"Unsupported tensor type: {return_tensors}")
-    return input_ids
-
-class KeywordsStoppingCriteria(StoppingCriteria):
-    def __init__(self, keywords, tokenizer, input_ids):
-        self.keywords = keywords
-        self.keyword_ids = []
-        for keyword in keywords:
-            cur_keyword_ids = tokenizer(keyword).input_ids
-            if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
-                cur_keyword_ids = cur_keyword_ids[1:]
-            self.keyword_ids.append(torch.tensor(cur_keyword_ids))
-        self.tokenizer = tokenizer
-        self.start_len = input_ids.shape[1]
-
-    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
-        assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)"  # TODO
-        offset = min(output_ids.shape[1] - self.start_len, 3)
-        self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
-        for keyword_id in self.keyword_ids:
-            if output_ids[0, -keyword_id.shape[0] :] == keyword_id:
-                return True
-        outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
-        for keyword in self.keywords:
-            if keyword in outputs:
-                return True
-        return False
-
-
 class _OVVideoChatFlashQwenForCausalLM(OVModelForVisualCausalLM):
     additional_parts = ["vision_projection"]
     IMAGE_TOKEN_INDEX = -200
     IGNORE_INDEX = -100
 
+    # Adopted from https://huggingface.co/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B/blob/main/vision_tower_builder.py#L181
     def get_3d_sincos_pos_embed(embed_dim, grid_size, t_size, cls_token=False):
         """
         grid_size: int of the grid height and width
@@ -4904,6 +4858,7 @@ def get_3d_sincos_pos_embed(embed_dim, grid_size, t_size, cls_token=False):
             )
         return pos_embed
 
+    # Adopted from https://huggingface.co/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B/blob/main/vision_tower_builder.py#L141
     def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
         assert embed_dim % 2 == 0
 
@@ -4919,6 +4874,7 @@ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
         return emb
 
 
+    # Adopted from https://huggingface.co/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B/blob/main/vision_tower_builder.py#L156
     def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
         """
         embed_dim: output dimension for each position
@@ -4991,6 +4947,7 @@ def __init__(
         )
         self.img_pos_embed.data.copy_(torch.from_numpy(img_pos_embed).float().unsqueeze(0))
 
+    # Adopted from https://huggingface.co/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B/blob/main/mm_projector_builder.py#L6
     def bipartite_soft_matching(
         metric: torch.Tensor,
         r: int,
@@ -5046,6 +5003,7 @@ def unmerge(x: torch.Tensor) -> torch.Tensor:
 
         return merge, unmerge
 
+    # Adopted from https://huggingface.co/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B/blob/main/mm_projector_builder.py#L62
     def merge_wavg(
         merge: Callable, x: torch.Tensor, size: torch.Tensor = None
     ) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -5081,6 +5039,7 @@ def get_vision_embeddings(self, images):
 
         return videos_features
 
+    # Adopted from https://huggingface.co/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B/blob/main/mm_projector_builder.py#L96
     def merge_tokens(self, x, target_num_token):
         r"""
         x = torch.randn(10, 2560, c)
@@ -5135,6 +5094,7 @@ def get_vision_projection(self, x, compress=False, local_num_frames=-1): # 单
         x = torch.from_numpy(x) if isinstance(x, np.ndarray) else x
         return x
 
+    # Adopted from https://huggingface.co/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B/blob/main/mm_utils.py#L797
     def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
         prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
 
@@ -5156,31 +5116,6 @@ def insert_separator(X, sep):
             raise ValueError(f"Unsupported tensor type: {return_tensors}")
         return input_ids
 
-    class KeywordsStoppingCriteria(StoppingCriteria):
-        def __init__(self, keywords, tokenizer, input_ids):
-            self.keywords = keywords
-            self.keyword_ids = []
-            for keyword in keywords:
-                cur_keyword_ids = tokenizer(keyword).input_ids
-                if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
-                    cur_keyword_ids = cur_keyword_ids[1:]
-                self.keyword_ids.append(torch.tensor(cur_keyword_ids))
-            self.tokenizer = tokenizer
-            self.start_len = input_ids.shape[1]
-
-        def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
-            assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)"  # TODO
-            offset = min(output_ids.shape[1] - self.start_len, 3)
-            self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
-            for keyword_id in self.keyword_ids:
-                if output_ids[0, -keyword_id.shape[0] :] == keyword_id:
-                    return True
-            outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
-            for keyword in self.keywords:
-                if keyword in outputs:
-                    return True
-            return False
-
     @staticmethod
     def preprocess_inputs(
         text: str,
@@ -5269,7 +5204,7 @@ def encode_video_image(self, images_list, video_idx_in_batch):
 
         for idx in range(bs):
             if idx in video_idx_in_batch:
-                feat = self.get_vision_projection(videos_features[vid_idx], compress=True, local_num_frames=getattr(self.config, "mm_local_num_frames", -1))
+                feat = self.get_vision_projection(videos_features[vid_idx], compress=True, local_num_frames=mm_local_num_frames)
                 vid_idx += 1
             else:
                 feat = self.get_vision_projection(images_features[img_idx], compress=False)
@@ -5284,6 +5219,7 @@ def encode_video_image(self, images_list, video_idx_in_batch):
 
         return all_videos_or_images_features
 
+    # Adopted from https://huggingface.co/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B/blob/main/mm_utils.py#L502-L537
     def select_best_resolution(original_size, possible_resolutions, max_resolutions, patch_size):
         """
         Selects the best resolution from a list of possible resolutions based on the original size.
@@ -5321,6 +5257,7 @@ def select_best_resolution(original_size, possible_resolutions, max_resolutions,
         assert best_fit is not None, f"Can't find suitable fit in {possible_resolutions} at max:{max_resolutions}"
         return best_fit
 
+    # Adopted from https://huggingface.co/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B/blob/main/mm_utils.py#L601-L631
     def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size, max_resolutions=None):
         """
         Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
@@ -5367,9 +5304,8 @@ def get_text_embeddings(self, input_ids):
         text_embed = torch.from_numpy(text_embed) if isinstance(text_embed, np.ndarray) else text_embed
         return text_embed
 
-
-    # def get_multimodal_embeddings(self, input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities=["image"], image_sizes=None):
-    def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask=None, position_ids=None, past_key_values=None, labels=None, modalities=["image"], image_sizes=None, **kwargs):
+    # Adopted from https://huggingface.co/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B/blob/main/modeling_videochat_flash.py#L183
+    def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask=None, position_ids=None, modalities=["image"], image_sizes=None, **kwargs):
         images = pixel_values
 
         if images is None:
@@ -5497,7 +5433,6 @@ def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask
         # it is a headache to deal with None all the time.
         # But it is not ideal, and if you have a better idea,
         # please open an issue / submit a PR, thanks.
-        _labels = labels
         _position_ids = position_ids
         _attention_mask = attention_mask
         if attention_mask is None:
@@ -5506,15 +5441,10 @@ def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask
             attention_mask = attention_mask.bool()
         if position_ids is None:
             position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
-        if labels is None:
-            labels = torch.full_like(input_ids, _OVVideoChatFlashQwenForCausalLM.IGNORE_INDEX)
 
-        print(f'input_ids: {input_ids.shape}, attention_mask: {attention_mask.shape}, position_ids: {position_ids.shape}, labels: {labels.shape}')
         input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
-        labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
 
         new_input_embeds = []
-        new_labels = []
         cur_image_idx = 0
 
         mm_llm_compress = getattr(self.config, "mm_llm_compress", False)
@@ -5562,26 +5492,20 @@ def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask
                 cur_input_embeds_1 = self.get_text_embeddings(cur_input_ids)
                 cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
                 new_input_embeds.append(cur_input_embeds)
-                new_labels.append(labels[batch_idx])
                 cur_image_idx += 1
                 continue
 
             image_token_indices = [-1] + torch.where(cur_input_ids == _OVVideoChatFlashQwenForCausalLM.IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
             cur_input_ids_noim = []
-            cur_labels = labels[batch_idx]
-            cur_labels_noim = []
             for i in range(len(image_token_indices) - 1):
                 cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1 : image_token_indices[i + 1]])
-                cur_labels_noim.append(cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]])
-            split_sizes = [x.shape[0] for x in cur_labels_noim]
+            split_sizes = [x.shape[0] for x in cur_input_ids_noim]
             cur_input_embeds = self.get_text_embeddings(torch.cat(cur_input_ids_noim))
             cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
             cur_new_input_embeds = []
-            cur_new_labels = []
 
             for i in range(num_images + 1):
                 cur_new_input_embeds.append(cur_input_embeds_no_im[i])
-                cur_new_labels.append(cur_labels_noim[i])
                 if i < num_images:
                     try:
                         cur_image_features = image_features[cur_image_idx]
@@ -5590,16 +5514,13 @@ def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask
                         cur_image_features = image_features[cur_image_idx - 1]
                     cur_image_idx += 1
                     cur_new_input_embeds.append(cur_image_features)
-                    cur_new_labels.append(torch.full((cur_image_features.shape[0],), _OVVideoChatFlashQwenForCausalLM.IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))
 
             cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]
 
             # import pdb; pdb.set_trace()
             cur_new_input_embeds = torch.cat(cur_new_input_embeds)
-            cur_new_labels = torch.cat(cur_new_labels)
 
             new_input_embeds.append(cur_new_input_embeds)
-            new_labels.append(cur_new_labels)
 
 
         if mm_llm_compress:
@@ -5611,42 +5532,33 @@ def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask
         tokenizer_model_max_length = getattr(self.config, "tokenizer_model_max_length", None)
         # rank_print("Finishing Inserting")
 
-        new_input_embeds = [x[:tokenizer_model_max_length] for x, modality in zip(new_input_embeds, modalities)]
-        new_labels = [x[:tokenizer_model_max_length] for x, modality in zip(new_labels, modalities)]
+        new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
 
         # Combine them
         max_len = max(x.shape[0] for x in new_input_embeds)
         batch_size = len(new_input_embeds)
 
         new_input_embeds_padded = []
-        new_labels_padded = torch.full((batch_size, max_len), _OVVideoChatFlashQwenForCausalLM.IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
         attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
         position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
         # print("Prepare pos id")
 
-        for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
+        for i, cur_new_embed in enumerate(new_input_embeds):
             cur_len = cur_new_embed.shape[0]
             if getattr(self.config, "tokenizer_padding_side", "right") == "left":
                 new_input_embeds_padded.append(torch.cat((torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device), cur_new_embed), dim=0))
                 if cur_len > 0:
-                    new_labels_padded[i, -cur_len:] = cur_new_labels
                     attention_mask[i, -cur_len:] = True
                     position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
             else:
                 new_input_embeds_padded.append(torch.cat((cur_new_embed, torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0))
                 if cur_len > 0:
-                    new_labels_padded[i, :cur_len] = cur_new_labels
                     attention_mask[i, :cur_len] = True
                     position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
 
         new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
         # print("tokenizer padding")
 
-        if _labels is None:
-            new_labels = None
-        else:
-            new_labels = new_labels_padded
-
         if _attention_mask is None:
             attention_mask = None
         else:
@@ -5655,7 +5567,6 @@ def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask
         if _position_ids is None:
             position_ids = None
 
-        # return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
         return new_input_embeds, attention_mask, position_ids
 
     def _update_model_kwargs_for_generation(
@@ -5671,9 +5582,6 @@ def _update_model_kwargs_for_generation(
             is_encoder_decoder=is_encoder_decoder,
             num_new_tokens=num_new_tokens,
         )
-        # print('_update_model_kwargs_for_generation enter')
-        # for key, value in model_kwargs.items():
-        #     print(f'{key}: {value.shape if isinstance(value, torch.Tensor) else None}')
 
         model_kwargs.pop("images", None)
         model_kwargs.pop("image_sizes", None)

From 2ad5818566ceb3f03d8a4729ffaa9ef60ba9725f Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Wed, 18 Mar 2026 22:23:17 +0800
Subject: [PATCH 14/39] add videochat test

---
 .../openvino/modeling_visual_language.py      |  4 +-
 tests/openvino/test_export.py                 |  3 +-
 tests/openvino/test_seq2seq.py                | 44 +++++++++++++++----
 tests/openvino/utils_tests.py                 |  2 +
 4 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 68856463a8..7f290218eb 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -4806,6 +4806,8 @@ def preprocess_inputs(
 
 
 class _OVVideoChatFlashQwenForCausalLM(OVModelForVisualCausalLM):
+    from transformers import AutoModel
+    auto_model_class = AutoModel
     additional_parts = ["vision_projection"]
     IMAGE_TOKEN_INDEX = -200
     IGNORE_INDEX = -100
@@ -4928,7 +4930,7 @@ def __init__(
         self.grid_size = (num_frames, self.image_size // self.patch_size, self.image_size // self.patch_size) # (T, H, W)
         self.num_patches = self.grid_size[0] * self.grid_size[1] * self.grid_size[2]
         self.num_img_patches = self.grid_size[1] * self.grid_size[2]
-        self.embed_dim = 1408
+        self.embed_dim = getattr(config, "mm_hidden_size", 1408)
         self.pos_embed = nn.Parameter(torch.zeros(1, self.num_patches + 1, self.embed_dim))
         self.img_pos_embed = nn.Parameter(torch.zeros(1, self.num_img_patches + 1, self.embed_dim))
         pos_embed = _OVVideoChatFlashQwenForCausalLM.get_3d_sincos_pos_embed(
diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py
index 9519cea1ec..909854cf5e 100644
--- a/tests/openvino/test_export.py
+++ b/tests/openvino/test_export.py
@@ -93,6 +93,7 @@ class ExportModelTest(unittest.TestCase):
         "stable-diffusion-3": OVStableDiffusion3Pipeline,
         "flux": OVFluxPipeline,
         "ltx-video": OVLTXPipeline,
+        "videochat_flash_qwen": OVModelForVisualCausalLM,
     }
 
     if is_transformers_version(">=", "4.48.0"):
@@ -146,7 +147,7 @@ def _openvino_export(
             model_class = TasksManager.get_model_class_for_task(task, library=library_name)
             model = model_class(f"hf_hub:{model_name}", pretrained=True, exportable=True)
             TasksManager.standardize_model_attributes(model_name, model, library_name=library_name)
-        elif model_type == "llava":
+        elif model_type in ["llava", "videochat_flash_qwen"]:
             model = MODEL_TYPE_TO_CLS_MAPPING[model_type].auto_model_class.from_pretrained(
                 model_name, **loading_kwargs
             )
diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py
index 73e12b5584..63f1c05dfd 100644
--- a/tests/openvino/test_seq2seq.py
+++ b/tests/openvino/test_seq2seq.py
@@ -530,8 +530,9 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin):
         "minicpmv",
         "phi3_v",
         "qwen2_vl",
+        "videochat_flash_qwen",
     ]
-    SUPPORT_VIDEO = ["llava_next_video", "qwen2_vl"]
+    SUPPORT_VIDEO = ["llava_next_video", "qwen2_vl", "videochat_flash_qwen"]
     SUPPORT_AUDIO = []
     OVMODEL_CLASS = OVModelForVisualCausalLM
     TASK = "image-text-to-text"
@@ -559,7 +560,7 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin):
         # remote code models differs after transformers v4.54
         SUPPORTED_ARCHITECTURES = set(SUPPORTED_ARCHITECTURES) - {"llava-qwen2", "phi3_v", "phi4mm"}
 
-    REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"]
+    REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm", "videochat_flash_qwen"]
     IMAGE = Image.open(
         requests.get(
             TEST_IMAGE_URL,
@@ -630,6 +631,8 @@ def test_find_untested_architectures(self):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_compare_to_transformers(self, model_arch):
+        if model_arch == "videochat_flash_qwen":
+            self.skipTest("Skipping comparison against Transformers because videochat_flash_qwen in OV does not support image input")
         def compare_outputs(inputs, ov_model, transformers_model, generation_config):
             transformers_inputs = copy.deepcopy(inputs)
             ov_outputs = ov_model.generate(**inputs, generation_config=generation_config)
@@ -875,13 +878,16 @@ def test_generate_utils(self, model_arch):
             model_id, export=True, trust_remote_code=trust_remote_code, device=OPENVINO_DEVICE
         )
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_remote_code)
-        question = "Describe image"
         preprocessors = self.get_preprocessors(model_arch)
-        inputs = model.preprocess_inputs(**preprocessors, text=question, image=self.IMAGE.resize((600, 600)))
-        # General case
-        outputs = model.generate(**inputs, max_new_tokens=10)
-        outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
-        self.assertIsInstance(outputs[0], str)
+
+        # videochat_flash_qwen does not support image input
+        if model_arch != "videochat_flash_qwen":
+            question = "Describe image"
+            inputs = model.preprocess_inputs(**preprocessors, text=question, image=self.IMAGE.resize((600, 600)))
+            # General case
+            outputs = model.generate(**inputs, max_new_tokens=10)
+            outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
+            self.assertIsInstance(outputs[0], str)
 
         # GOT-OCR2 does not support text-only input
         if model_arch != "got_ocr2":
@@ -907,7 +913,11 @@ def test_generate_utils(self, model_arch):
                     repo_type="dataset",
                     user_agent=http_user_agent(),
                 )
-                input_video, _ = load_video(video_path, num_frames=2, backend="opencv")
+                if model_arch == "videochat_flash_qwen":
+                    # videochat_flash_qwen needs the number of frames to be a multiple of 4
+                    input_video, _ = load_video(video_path, num_frames=8, backend="opencv")
+                else:
+                    input_video, _ = load_video(video_path, num_frames=2, backend="opencv")
                 question = "Why is this video funny?"
                 inputs = model.preprocess_inputs(**preprocessors, text=question, video=input_video)
                 outputs = model.generate(**inputs, max_new_tokens=10)
@@ -953,6 +963,22 @@ def get_preprocessors(self, model_arch):
                 model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
             )
             preprocessors = {"processor": None, "tokenizer": tokenizer, "config": config}
+        elif model_arch == "videochat_flash_qwen":
+            class VideochatProcessorWrapper:
+                def __init__(self, model_id):
+                    from transformers import AutoModel
+                    hf_model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
+                    self.processor = hf_model.get_vision_tower().image_processor.preprocess
+                    self.model_dtype = hf_model.dtype
+                    del hf_model
+
+                def __call__(self, images, return_tensors):
+                    return self.processor(images, return_tensors="pt")["pixel_values"].to(dtype=self.model_dtype)
+            processor = VideochatProcessorWrapper(model_id)
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            preprocessors = {"processor": processor, "tokenizer": tokenizer, "config": config}
         else:
             processor = AutoProcessor.from_pretrained(
                 model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index fe6d584d2f..c340a3949f 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -228,6 +228,7 @@
     "ltx-video": "optimum-intel-internal-testing/tiny-random-ltx-video",
     "zamba2": "optimum-intel-internal-testing/tiny-random-zamba2",
     "qwen3_eagle3": "AngelSlim/Qwen3-1.7B_eagle3",
+    "videochat_flash_qwen": "optimum-intel-internal-testing/tiny-videochat-flash-qwen",
 }
 
 EAGLE3_MODELS = {"qwen3_eagle3": ("AngelSlim/Qwen3-1.7B_eagle3", "Qwen/Qwen3-1.7B")}
@@ -399,6 +400,7 @@
     "minicpm3",
     "deepseek",
     "qwen3_eagle3",
+    "videochat_flash_qwen",
 )
 
 

From 74dcc9dd8d1f11d55a3f51c0a6ee103f6796849a Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Thu, 19 Mar 2026 08:44:24 +0800
Subject: [PATCH 15/39] add test dependencies

---
 setup.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/setup.py b/setup.py
index b86c176463..0803e340c2 100644
--- a/setup.py
+++ b/setup.py
@@ -63,6 +63,9 @@
     "vocos",
     "vector_quantize_pytorch",
     "openvino-genai",
+    "av",
+    "decord",
+    "imageio",
 ]
 
 QUALITY_REQUIRE = ["black~=23.1", "ruff==0.4.4"]

From 80df5c22a4343ad7a9da7d92b939300d23b760c1 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Thu, 19 Mar 2026 09:37:13 +0800
Subject: [PATCH 16/39] fix style check issue

---
 optimum/exporters/openvino/model_configs.py   |  21 +-
 optimum/exporters/openvino/model_patcher.py   |   7 +-
 .../openvino/modeling_visual_language.py      | 219 ++++++++++++------
 tests/openvino/test_seq2seq.py                |  19 +-
 4 files changed, 188 insertions(+), 78 deletions(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index c123efbe20..d9663fa99a 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -5308,6 +5308,7 @@ class SiglipTextOpenVINOConfig(SiglipTextOnnxConfig):
 
 class DummyVideoChatFlashQwenInputGenerator(DummyVisionInputGenerator):
     SUPPORTED_INPUT_NAMES = ("hidden_states", "rotary_pos_emb")
+
     def __init__(
         self,
         task: str,
@@ -5344,7 +5345,9 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
         elif input_name == "rotary_pos_emb":
             grid_h, grid_w = self.height // self.patch_size, self.width // self.patch_size
             grid_t = self.num_frames
-            return self.random_float_tensor([1, 1 + grid_h * grid_t * grid_w, self.embed_dim], framework=framework, dtype=float_dtype)
+            return self.random_float_tensor(
+                [1, 1 + grid_h * grid_t * grid_w, self.embed_dim], framework=framework, dtype=float_dtype
+            )
 
 
 class DummyVideoChatFlashQwenProjectorInputGenerator(DummyInputGenerator):
@@ -5428,8 +5431,10 @@ def __init__(
     def inputs(self) -> Dict[str, Dict[int, str]]:
         if not self._behavior == VideoChatFlashQwenConfigBehavior.VISION_EMBEDDINGS:
             return {}
-        return {"hidden_states": {0: "batch_size", 1: "num_channels", 2: "num_frames", 3: "height", 4: "width"},
-                "rotary_pos_emb": {0: "batch_size", 1: "num_tokens",2: "hidden_size"}}
+        return {
+            "hidden_states": {0: "batch_size", 1: "num_channels", 2: "num_frames", 3: "height", 4: "width"},
+            "rotary_pos_emb": {0: "batch_size", 1: "num_tokens", 2: "hidden_size"},
+        }
 
     def with_behavior(
         self,
@@ -5455,14 +5460,10 @@ def with_behavior(
             return export_config
 
         if behavior == VideoChatFlashQwenConfigBehavior.TEXT_EMBEDDINGS:
-            return get_vlm_text_embeddings_config(
-                "qwen2", self._orig_config, self.int_dtype, self.float_dtype
-            )
+            return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype)
 
         if behavior == VideoChatFlashQwenConfigBehavior.LANGUAGE:
-            return get_vlm_text_generation_config(
-                "qwen2", self._orig_config, self.int_dtype, self.float_dtype
-            )
+            return get_vlm_text_generation_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype)
 
         if behavior == VideoChatFlashQwenConfigBehavior.VISION_EMBEDDINGS:
             return self.__class__(
@@ -5502,6 +5503,8 @@ def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[
             return VideochatFlashQwenVisionEmbeddingModelPatcher(self, model, model_kwargs)
 
         return super().patch_model_for_export(model, model_kwargs)
+
+
 @register_in_tasks_manager(
     "hunyuan_v1_dense",
     *[
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index f2e0c46d6c..a15d40c820 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -7669,7 +7669,11 @@ def forward_wrap(self, hidden_states, rotary_pos_emb=None, mask=None, use_image=
                         # (1, num_img_patches + 1, embed_dim)
                         cls_pos_embed = self.pos_embed[:, 0:1, :]
 
-                        img_pos_embed = self.pos_embed[:, 1:, :].view(1, self.num_frames, self.patch_embed.num_patches // self.num_frames, self.embed_dim).mean(dim=1)
+                        img_pos_embed = (
+                            self.pos_embed[:, 1:, :]
+                            .view(1, self.num_frames, self.patch_embed.num_patches // self.num_frames, self.embed_dim)
+                            .mean(dim=1)
+                        )
 
                         rotary_pos_emb = torch.cat([cls_pos_embed, img_pos_embed], dim=1)
                 else:
@@ -7769,6 +7773,7 @@ def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
         self._model.forward = self._model.__orig_forward
 
+
 # adopted from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/models/llama/modeling_llama.py#L197
 class LlamaEagle3Attention(LlamaAttention):
     """
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 7f290218eb..59aa9e5b8c 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -4807,6 +4807,7 @@ def preprocess_inputs(
 
 class _OVVideoChatFlashQwenForCausalLM(OVModelForVisualCausalLM):
     from transformers import AutoModel
+
     auto_model_class = AutoModel
     additional_parts = ["vision_projection"]
     IMAGE_TOKEN_INDEX = -200
@@ -4831,9 +4832,7 @@ def get_3d_sincos_pos_embed(embed_dim, grid_size, t_size, cls_token=False):
         grid = np.stack(grid, axis=0)
 
         grid = grid.reshape([2, 1, grid_size, grid_size])
-        pos_embed_spatial = _OVVideoChatFlashQwenForCausalLM.get_2d_sincos_pos_embed_from_grid(
-            embed_dim_spatial, grid
-        )
+        pos_embed_spatial = _OVVideoChatFlashQwenForCausalLM.get_2d_sincos_pos_embed_from_grid(embed_dim_spatial, grid)
 
         # temporal
         grid_t = np.arange(t_size, dtype=np.float32)
@@ -4843,21 +4842,15 @@ def get_3d_sincos_pos_embed(embed_dim, grid_size, t_size, cls_token=False):
 
         # concate: [T, H, W] order
         pos_embed_temporal = pos_embed_temporal[:, np.newaxis, :]
-        pos_embed_temporal = np.repeat(
-            pos_embed_temporal, grid_size**2, axis=1
-        )  # [T, H*W, D // 4]
+        pos_embed_temporal = np.repeat(pos_embed_temporal, grid_size**2, axis=1)  # [T, H*W, D // 4]
         pos_embed_spatial = pos_embed_spatial[np.newaxis, :, :]
-        pos_embed_spatial = np.repeat(
-            pos_embed_spatial, t_size, axis=0
-        )  # [T, H*W, D // 4 * 3]
+        pos_embed_spatial = np.repeat(pos_embed_spatial, t_size, axis=0)  # [T, H*W, D // 4 * 3]
 
         pos_embed = np.concatenate([pos_embed_temporal, pos_embed_spatial], axis=-1)
         pos_embed = pos_embed.reshape([-1, embed_dim])  # [T*H*W, D]
 
         if cls_token:
-            pos_embed = np.concatenate(
-                [np.zeros([1, embed_dim]), pos_embed], axis=0
-            )
+            pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
         return pos_embed
 
     # Adopted from https://huggingface.co/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B/blob/main/vision_tower_builder.py#L141
@@ -4875,7 +4868,6 @@ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
         emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
         return emb
 
-
     # Adopted from https://huggingface.co/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B/blob/main/vision_tower_builder.py#L156
     def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
         """
@@ -4911,6 +4903,7 @@ def __init__(
         **kwargs,
     ):
         from torch import nn
+
         super().__init__(
             language_model=language_model,
             text_embeddings=text_embeddings,
@@ -4927,25 +4920,23 @@ def __init__(
         self.num_attention_heads = 16
         self.patch_size = 14
         self.image_size = 224
-        self.grid_size = (num_frames, self.image_size // self.patch_size, self.image_size // self.patch_size) # (T, H, W)
+        self.grid_size = (
+            num_frames,
+            self.image_size // self.patch_size,
+            self.image_size // self.patch_size,
+        )  # (T, H, W)
         self.num_patches = self.grid_size[0] * self.grid_size[1] * self.grid_size[2]
         self.num_img_patches = self.grid_size[1] * self.grid_size[2]
         self.embed_dim = getattr(config, "mm_hidden_size", 1408)
         self.pos_embed = nn.Parameter(torch.zeros(1, self.num_patches + 1, self.embed_dim))
         self.img_pos_embed = nn.Parameter(torch.zeros(1, self.num_img_patches + 1, self.embed_dim))
         pos_embed = _OVVideoChatFlashQwenForCausalLM.get_3d_sincos_pos_embed(
-            self.pos_embed.shape[-1],
-            self.grid_size[1], # height & weight
-            self.grid_size[0], # t_size
-            cls_token=True
+            self.pos_embed.shape[-1], self.grid_size[1], self.grid_size[0], cls_token=True
         )
         self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
 
         img_pos_embed = _OVVideoChatFlashQwenForCausalLM.get_3d_sincos_pos_embed(
-            self.pos_embed.shape[-1],
-            self.grid_size[1], # height & weight
-            1,
-            cls_token=True
+            self.pos_embed.shape[-1], self.grid_size[1], 1, cls_token=True
         )
         self.img_pos_embed.data.copy_(torch.from_numpy(img_pos_embed).float().unsqueeze(0))
 
@@ -4984,7 +4975,7 @@ def merge(x: torch.Tensor, mode="mean") -> torch.Tensor:
             n, t1, c = src.shape
             unm = src.gather(dim=-2, index=unm_idx.expand(n, t1 - r, c))
             src = src.gather(dim=-2, index=src_idx.expand(n, r, c))
-            dst = dst.scatter_add(-2, dst_idx.expand(n, r, c), src) # , reduce=mode)
+            dst = dst.scatter_add(-2, dst_idx.expand(n, r, c), src)  # , reduce=mode)
 
             return torch.cat([unm, dst], dim=1)
 
@@ -5006,9 +4997,7 @@ def unmerge(x: torch.Tensor) -> torch.Tensor:
         return merge, unmerge
 
     # Adopted from https://huggingface.co/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B/blob/main/mm_projector_builder.py#L62
-    def merge_wavg(
-        merge: Callable, x: torch.Tensor, size: torch.Tensor = None
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    def merge_wavg(merge: Callable, x: torch.Tensor, size: torch.Tensor = None) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         Applies the merge function by taking a weighted average based on token size.
         Returns the merged tensor and the new token sizes.
@@ -5034,7 +5023,9 @@ def get_vision_embeddings(self, images):
                 pos_embeds = self.img_pos_embed.detach()
             else:
                 pos_embeds = self.pos_embed.detach()
-            image_embeds = self.vision_embeddings(images, rotary_pos_emb=pos_embeds, use_image=(T == 1)).last_hidden_state
+            image_embeds = self.vision_embeddings(
+                images, rotary_pos_emb=pos_embeds, use_image=(T == 1)
+            ).last_hidden_state
             image_embeds = image_embeds[:, 1:, :]
 
             videos_features = torch.from_numpy(image_embeds) if isinstance(image_embeds, np.ndarray) else image_embeds
@@ -5064,17 +5055,14 @@ def merge_tokens(self, x, target_num_token):
 
         dim = c // head
         for r in r_merge_list:
-            metric = x.reshape(b, p, head, dim).mean(2) # [b, p, c//head]
-            merge, _ = _OVVideoChatFlashQwenForCausalLM.bipartite_soft_matching(
-                metric,
-                r
-            )
+            metric = x.reshape(b, p, head, dim).mean(2)  # [b, p, c//head]
+            merge, _ = _OVVideoChatFlashQwenForCausalLM.bipartite_soft_matching(metric, r)
             x, size = _OVVideoChatFlashQwenForCausalLM.merge_wavg(merge, x, size)
             _, p, _ = x.shape
 
         return x
 
-    def get_vision_projection(self, x, compress=False, local_num_frames=-1): # 单帧64
+    def get_vision_projection(self, x, compress=False, local_num_frames=-1):
         height = width = self.image_size // self.patch_size
         assert height * width == x.shape[1]
 
@@ -5143,9 +5131,13 @@ def preprocess_inputs(
         prompt = f"<image>\n{text}" if (image is not None or video is not None) else text
         messages = [{"role": "user", "content": prompt}]
         text_prompt = tokenizer.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True,
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
         )
-        input_ids = _OVVideoChatFlashQwenForCausalLM.tokenizer_image_token(text_prompt, tokenizer, _OVVideoChatFlashQwenForCausalLM.IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0)
+        input_ids = _OVVideoChatFlashQwenForCausalLM.tokenizer_image_token(
+            text_prompt, tokenizer, _OVVideoChatFlashQwenForCausalLM.IMAGE_TOKEN_INDEX, return_tensors="pt"
+        ).unsqueeze(0)
         results["input_ids"] = input_ids
 
         # preprocess video
@@ -5190,15 +5182,30 @@ def encode_video_image(self, images_list, video_idx_in_batch):
             image_split_sizes = [image.shape[0] for image in concat_images]
             concat_images = torch.cat([image.unsqueeze(1) for image in concat_images], dim=0)
             # print("input vit image.shape:", concat_images.shape)
-            images_features = self.get_vision_embeddings(concat_images) # B_i, N, D
+            images_features = self.get_vision_embeddings(concat_images)  # B_i, N, D
             images_features = torch.split(images_features, image_split_sizes)
 
         if has_video:
             video_split_sizes = [video.shape[0] // mm_local_num_frames for video in concat_videos]
-            concat_videos = torch.cat([video.reshape(video.shape[0] // mm_local_num_frames, mm_local_num_frames, video.shape[1], video.shape[2], video.shape[3]) for video in concat_videos], dim=0)
+            concat_videos = torch.cat(
+                [
+                    video.reshape(
+                        video.shape[0] // mm_local_num_frames,
+                        mm_local_num_frames,
+                        video.shape[1],
+                        video.shape[2],
+                        video.shape[3],
+                    )
+                    for video in concat_videos
+                ],
+                dim=0,
+            )
             # print("input vit video.shape:", concat_videos.shape)
-            videos_features = self.get_vision_embeddings(concat_videos) # B_v, N, D
-            videos_features = [v.reshape(-1, v.shape[-2] // mm_local_num_frames, v.shape[-1]) for v in torch.split(videos_features, video_split_sizes)]
+            videos_features = self.get_vision_embeddings(concat_videos)  # B_v, N, D
+            videos_features = [
+                v.reshape(-1, v.shape[-2] // mm_local_num_frames, v.shape[-1])
+                for v in torch.split(videos_features, video_split_sizes)
+            ]
 
         all_videos_or_images_features = []
         img_idx = 0
@@ -5206,7 +5213,9 @@ def encode_video_image(self, images_list, video_idx_in_batch):
 
         for idx in range(bs):
             if idx in video_idx_in_batch:
-                feat = self.get_vision_projection(videos_features[vid_idx], compress=True, local_num_frames=mm_local_num_frames)
+                feat = self.get_vision_projection(
+                    videos_features[vid_idx], compress=True, local_num_frames=mm_local_num_frames
+                )
                 vid_idx += 1
             else:
                 feat = self.get_vision_projection(images_features[img_idx], compress=False)
@@ -5240,7 +5249,7 @@ def select_best_resolution(original_size, possible_resolutions, max_resolutions,
 
         for width, height in possible_resolutions:
             if max_resolutions != None and (width * height != patch_size * patch_size):
-                if (width * height+patch_size*patch_size) > max_resolutions: # NOTE 要算一个global
+                if (width * height + patch_size * patch_size) > max_resolutions:  # NOTE 要算一个global
                     continue
             # Calculate the downscaled size to keep the aspect ratio
             scale = min(width / original_width, height / original_height)
@@ -5250,7 +5259,9 @@ def select_best_resolution(original_size, possible_resolutions, max_resolutions,
             effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
             wasted_resolution = (width * height) - effective_resolution
 
-            if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
+            if effective_resolution > max_effective_resolution or (
+                effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
+            ):
                 max_effective_resolution = effective_resolution
                 min_wasted_resolution = wasted_resolution
                 best_fit = (width, height)
@@ -5279,14 +5290,20 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size, max_reso
             range_start = tuple(map(int, matches[0]))
             range_end = tuple(map(int, matches[-1]))
             # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1])
-            grid_pinpoints = [(i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)]
+            grid_pinpoints = [
+                (i, j)
+                for i in range(range_start[0], range_end[0] + 1)
+                for j in range(range_start[1], range_end[1] + 1)
+            ]
             # Multiply all elements by patch_size
             grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
         if type(grid_pinpoints) is list:
             possible_resolutions = grid_pinpoints
         else:
             possible_resolutions = ast.literal_eval(grid_pinpoints)
-        width, height = _OVVideoChatFlashQwenForCausalLM.select_best_resolution(image_size, possible_resolutions, max_resolutions=max_resolutions, patch_size=patch_size)
+        width, height = _OVVideoChatFlashQwenForCausalLM.select_best_resolution(
+            image_size, possible_resolutions, max_resolutions=max_resolutions, patch_size=patch_size
+        )
 
         # print("get width/patch size", width, patch_size, flush=True)
 
@@ -5307,7 +5324,16 @@ def get_text_embeddings(self, input_ids):
         return text_embed
 
     # Adopted from https://huggingface.co/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B/blob/main/modeling_videochat_flash.py#L183
-    def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask=None, position_ids=None, modalities=["image"], image_sizes=None, **kwargs):
+    def get_multimodal_embeddings(
+        self,
+        input_ids,
+        pixel_values=None,
+        attention_mask=None,
+        position_ids=None,
+        modalities=["image"],
+        image_sizes=None,
+        **kwargs,
+    ):
         images = pixel_values
 
         if images is None:
@@ -5338,7 +5364,7 @@ def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask
         frame_aspect_ratio = getattr(self.config, "frame_aspect_ratio", "square")
         mm_newline_position = getattr(self.config, "mm_newline_position", "nothing")
 
-        if vision_encode_type == "video_image": # video backbone, process video with compress
+        if vision_encode_type == "video_image":  # video backbone, process video with compress
             image_features = self.encode_video_image(images_list, video_idx_in_batch=video_idx_in_batch)
         else:
             raise NotImplementedError(vision_encode_type)
@@ -5357,13 +5383,17 @@ def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask
                         frame_feature = image_feature
 
                     if "pad" in mm_patch_merge_type:
-                        if mm_newline_position == 'one_token':
+                        if mm_newline_position == "one_token":
                             frame_feature = frame_feature.flatten(0, 1)
                             if "unpad" in mm_patch_merge_type:
-                                frame_feature = torch.cat((frame_feature, self.model.image_newline[None].to(frame_feature.device)), dim=0)
+                                frame_feature = torch.cat(
+                                    (frame_feature, self.model.image_newline[None].to(frame_feature.device)), dim=0
+                                )
                             else:
-                                frame_feature = torch.cat((frame_feature, self.model.frame_newline[None].to(frame_feature.device)), dim=0)
-                        elif mm_newline_position == 'nothing':
+                                frame_feature = torch.cat(
+                                    (frame_feature, self.model.frame_newline[None].to(frame_feature.device)), dim=0
+                                )
+                        elif mm_newline_position == "nothing":
                             frame_feature = frame_feature.flatten(0, 1)
                         else:
                             raise NotImplementedError("add pad please!!")
@@ -5379,7 +5409,9 @@ def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask
                     origin_size = image_feature.shape
 
                     height = width = 8
-                    assert height * width == base_image_feature.shape[0], f"height:{height}, width: {width}, base_image_feature: {base_image_feature.shape}"
+                    assert (
+                        height * width == base_image_feature.shape[0]
+                    ), f"height:{height}, width: {width}, base_image_feature: {base_image_feature.shape}"
 
                     if "anyres_max" in image_aspect_ratio:
                         matched_anyres_max_num_patches = re.match(r"anyres_max_(\d+)", image_aspect_ratio)
@@ -5389,7 +5421,15 @@ def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask
                     if "anyres" in image_aspect_ratio:
                         vision_tower_image_size = 224
                         try:
-                            num_patch_width, num_patch_height = _OVVideoChatFlashQwenForCausalLM.get_anyres_image_grid_shape(image_sizes[image_idx], self.config.image_grid_pinpoints, vision_tower_image_size, max_resolutions=None)
+                            (
+                                num_patch_width,
+                                num_patch_height,
+                            ) = _OVVideoChatFlashQwenForCausalLM.get_anyres_image_grid_shape(
+                                image_sizes[image_idx],
+                                self.config.image_grid_pinpoints,
+                                vision_tower_image_size,
+                                max_resolutions=None,
+                            )
                         except Exception as e:
                             print(f"Error: {e}")
                             raise e
@@ -5402,7 +5442,11 @@ def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask
 
                     if "maxpool2x2" in mm_patch_merge_type:
                         raise NotImplementedError
-                    elif "unpad" in mm_patch_merge_type and "anyres_max" in image_aspect_ratio and matched_anyres_max_num_patches:
+                    elif (
+                        "unpad" in mm_patch_merge_type
+                        and "anyres_max" in image_aspect_ratio
+                        and matched_anyres_max_num_patches
+                    ):
                         raise NotImplementedError
                     elif "unpad" in mm_patch_merge_type:
                         raise NotImplementedError
@@ -5415,7 +5459,9 @@ def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask
                         try:
                             image_feature = torch.cat((base_image_feature, image_feature), dim=0)
                         except Exception as e:
-                            raise ValueError(f"{num_patch_width} {num_patch_height} now: base_image_feature: {base_image_feature.shape}, {image_feature.shape}, image_sizes[image_idx]: {image_sizes[image_idx]}, origin_size: {origin_size}, {image_sizes[image_idx]}, {self.config.image_grid_pinpoints}, {vision_tower_image_size}")
+                            raise ValueError(
+                                f"{num_patch_width} {num_patch_height} now: base_image_feature: {base_image_feature.shape}, {image_feature.shape}, image_sizes[image_idx]: {image_sizes[image_idx]}, origin_size: {origin_size}, {image_sizes[image_idx]}, {self.config.image_grid_pinpoints}, {vision_tower_image_size}"
+                            )
                 else:  # single image operations
                     image_feature = image_feature[0]
                     if "unpad" in mm_patch_merge_type:
@@ -5444,7 +5490,9 @@ def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask
         if position_ids is None:
             position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
 
-        input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
+        input_ids = [
+            cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)
+        ]
 
         new_input_embeds = []
         cur_image_idx = 0
@@ -5453,8 +5501,12 @@ def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask
 
         if mm_llm_compress:
             self.language_model.model.llm_compress_type = getattr(self.config, "llm_compress_type", "attention")
-            self.language_model.model.llm_compress_layer_list = getattr(self.config, "llm_compress_layer_list", [8, 16, 24])
-            self.language_model.model.llm_image_token_ratio_list = getattr(self.config, "llm_image_token_ratio_list", [1.0, 0.5, 0.25, 0.125])
+            self.language_model.model.llm_compress_layer_list = getattr(
+                self.config, "llm_compress_layer_list", [8, 16, 24]
+            )
+            self.language_model.model.llm_image_token_ratio_list = getattr(
+                self.config, "llm_image_token_ratio_list", [1.0, 0.5, 0.25, 0.125]
+            )
             first_image_token_position = []
             text_prompt_lens = []
         else:
@@ -5471,7 +5523,9 @@ def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask
             if mm_llm_compress:
                 ####### copy from pdrop, only support single image/video NOTE ##################
                 # record image position for further dropping
-                image_index = torch.where(cur_input_ids == _OVVideoChatFlashQwenForCausalLM.IMAGE_TOKEN_INDEX)[0].tolist()
+                image_index = torch.where(cur_input_ids == _OVVideoChatFlashQwenForCausalLM.IMAGE_TOKEN_INDEX)[
+                    0
+                ].tolist()
                 assert len(image_index) == 1, f"Only support singe/video: {image_index}"
                 if image_index == []:
                     first_image_token_position.append(-1)
@@ -5484,7 +5538,7 @@ def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask
                         assert num_images == 0, num_images
                     else:
                         assert num_images == 1, f"num_images={num_images}"
-                    text_prompt_lens.append(cur_input_ids.shape[0] - num_images)   # consider image place holder
+                    text_prompt_lens.append(cur_input_ids.shape[0] - num_images)  # consider image place holder
 
                 ###############################################
 
@@ -5497,7 +5551,11 @@ def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask
                 cur_image_idx += 1
                 continue
 
-            image_token_indices = [-1] + torch.where(cur_input_ids == _OVVideoChatFlashQwenForCausalLM.IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
+            image_token_indices = (
+                [-1]
+                + torch.where(cur_input_ids == _OVVideoChatFlashQwenForCausalLM.IMAGE_TOKEN_INDEX)[0].tolist()
+                + [cur_input_ids.shape[0]]
+            )
             cur_input_ids_noim = []
             for i in range(len(image_token_indices) - 1):
                 cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1 : image_token_indices[i + 1]])
@@ -5524,11 +5582,12 @@ def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask
 
             new_input_embeds.append(cur_new_input_embeds)
 
-
         if mm_llm_compress:
             self.language_model.model.first_image_token_position = first_image_token_position
             self.language_model.model.text_prompt_lens = text_prompt_lens
-            self.language_model.model.num_image_token_lens = [image_feature.shape[0] for image_feature in image_features]
+            self.language_model.model.num_image_token_lens = [
+                image_feature.shape[0] for image_feature in image_features
+            ]
 
         # Truncate sequences to max length as image embeddings can make the sequence longer
         tokenizer_model_max_length = getattr(self.config, "tokenizer_model_max_length", None)
@@ -5548,15 +5607,43 @@ def get_multimodal_embeddings(self, input_ids, pixel_values=None, attention_mask
         for i, cur_new_embed in enumerate(new_input_embeds):
             cur_len = cur_new_embed.shape[0]
             if getattr(self.config, "tokenizer_padding_side", "right") == "left":
-                new_input_embeds_padded.append(torch.cat((torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device), cur_new_embed), dim=0))
+                new_input_embeds_padded.append(
+                    torch.cat(
+                        (
+                            torch.zeros(
+                                (max_len - cur_len, cur_new_embed.shape[1]),
+                                dtype=cur_new_embed.dtype,
+                                device=cur_new_embed.device,
+                            ),
+                            cur_new_embed,
+                        ),
+                        dim=0,
+                    )
+                )
                 if cur_len > 0:
                     attention_mask[i, -cur_len:] = True
-                    position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
+                    position_ids[i, -cur_len:] = torch.arange(
+                        0, cur_len, dtype=position_ids.dtype, device=position_ids.device
+                    )
             else:
-                new_input_embeds_padded.append(torch.cat((cur_new_embed, torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0))
+                new_input_embeds_padded.append(
+                    torch.cat(
+                        (
+                            cur_new_embed,
+                            torch.zeros(
+                                (max_len - cur_len, cur_new_embed.shape[1]),
+                                dtype=cur_new_embed.dtype,
+                                device=cur_new_embed.device,
+                            ),
+                        ),
+                        dim=0,
+                    )
+                )
                 if cur_len > 0:
                     attention_mask[i, :cur_len] = True
-                    position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
+                    position_ids[i, :cur_len] = torch.arange(
+                        0, cur_len, dtype=position_ids.dtype, device=position_ids.device
+                    )
 
         new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
         # print("tokenizer padding")
diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py
index 63f1c05dfd..fb11eadb9f 100644
--- a/tests/openvino/test_seq2seq.py
+++ b/tests/openvino/test_seq2seq.py
@@ -560,7 +560,16 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin):
         # remote code models differs after transformers v4.54
         SUPPORTED_ARCHITECTURES = set(SUPPORTED_ARCHITECTURES) - {"llava-qwen2", "phi3_v", "phi4mm"}
 
-    REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm", "videochat_flash_qwen"]
+    REMOTE_CODE_MODELS = [
+        "internvl_chat",
+        "minicpmv",
+        "minicpmo",
+        "llava-qwen2",
+        "phi3_v",
+        "maira2",
+        "phi4mm",
+        "videochat_flash_qwen",
+    ]
     IMAGE = Image.open(
         requests.get(
             TEST_IMAGE_URL,
@@ -632,7 +641,10 @@ def test_find_untested_architectures(self):
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_compare_to_transformers(self, model_arch):
         if model_arch == "videochat_flash_qwen":
-            self.skipTest("Skipping comparison against Transformers because videochat_flash_qwen in OV does not support image input")
+            self.skipTest(
+                "Skipping comparison against Transformers because videochat_flash_qwen in OV does not support image input"
+            )
+
         def compare_outputs(inputs, ov_model, transformers_model, generation_config):
             transformers_inputs = copy.deepcopy(inputs)
             ov_outputs = ov_model.generate(**inputs, generation_config=generation_config)
@@ -964,9 +976,11 @@ def get_preprocessors(self, model_arch):
             )
             preprocessors = {"processor": None, "tokenizer": tokenizer, "config": config}
         elif model_arch == "videochat_flash_qwen":
+
             class VideochatProcessorWrapper:
                 def __init__(self, model_id):
                     from transformers import AutoModel
+
                     hf_model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
                     self.processor = hf_model.get_vision_tower().image_processor.preprocess
                     self.model_dtype = hf_model.dtype
@@ -974,6 +988,7 @@ def __init__(self, model_id):
 
                 def __call__(self, images, return_tensors):
                     return self.processor(images, return_tensors="pt")["pixel_values"].to(dtype=self.model_dtype)
+
             processor = VideochatProcessorWrapper(model_id)
             tokenizer = AutoTokenizer.from_pretrained(
                 model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS

From f7835a041752540b645a1840bb38d77616e9a19f Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Mon, 23 Mar 2026 14:27:14 +0800
Subject: [PATCH 17/39] fix style check issue

---
 optimum/intel/openvino/modeling_visual_language.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 59aa9e5b8c..3dde0c2b89 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -5339,12 +5339,12 @@ def get_multimodal_embeddings(
         if images is None:
             inputs_embeds = self.get_text_embeddings(input_ids)
             return inputs_embeds, attention_mask, position_ids
-        else:
-            modalities = ["video"]
 
         # rank_print(modalities)
         if type(images) is list:
             images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images]
+            if images[0].shape[0] > 1:
+                modalities = ["video"]
 
         video_idx_in_batch = []
         for _ in range(len(modalities)):
@@ -5374,9 +5374,7 @@ def get_multimodal_embeddings(
         elif mm_patch_merge_type.startswith("spatial"):
             new_image_features = []
             for image_idx, image_feature in enumerate(image_features):
-
                 if image_idx in video_idx_in_batch:  # video operations
-
                     if "anyres" in frame_aspect_ratio:
                         raise NotImplementedError
                     else:

From 82e4c229a50b6fd8749548f6218ca1d3c1d20772 Mon Sep 17 00:00:00 2001
From: Fang Xu <fang.xu@intel.com>
Date: Tue, 24 Mar 2026 09:13:47 +0800
Subject: [PATCH 18/39] Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 optimum/exporters/openvino/model_configs.py        |  4 +++-
 optimum/intel/openvino/modeling_visual_language.py | 13 +++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index d9663fa99a..8be42938fa 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -5433,7 +5433,9 @@ def inputs(self) -> Dict[str, Dict[int, str]]:
             return {}
         return {
             "hidden_states": {0: "batch_size", 1: "num_channels", 2: "num_frames", 3: "height", 4: "width"},
-            "rotary_pos_emb": {0: "batch_size", 1: "num_tokens", 2: "hidden_size"},
+            # rotary_pos_emb has a fixed leading dimension of 1 in the dummy generator,
+            # so we do not associate axis 0 with batch_size and keep only dynamic axes here.
+            "rotary_pos_emb": {1: "num_tokens", 2: "hidden_size"},
         }
 
     def with_behavior(
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 3dde0c2b89..62a4e11991 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -30,7 +30,6 @@
 from transformers.modeling_outputs import BaseModelOutputWithPooling
 from transformers.models.qwen2_vl.modeling_qwen2_vl import VisionRotaryEmbedding
 from transformers.utils import ModelOutput
-from transformers import StoppingCriteria
 
 from ...exporters.openvino import main_export
 from ...exporters.openvino.stateful import ensure_stateful_is_available, model_has_input_output_name
@@ -5154,7 +5153,7 @@ def preprocess_inputs(
 
         if tokenizer.pad_token_id is None:
             if "qwen" in tokenizer.name_or_path.lower():
-                print("Setting pad token to bos token for qwen model.")
+                logger.info("Setting pad token to bos token for qwen model.")
                 tokenizer.pad_token_id = 151643
         attention_masks = input_ids.ne(tokenizer.pad_token_id).long()
         results["attention_mask"] = attention_masks
@@ -5330,12 +5329,14 @@ def get_multimodal_embeddings(
         pixel_values=None,
         attention_mask=None,
         position_ids=None,
-        modalities=["image"],
+        modalities=None,
         image_sizes=None,
         **kwargs,
     ):
         images = pixel_values
 
+        if modalities is None:
+            modalities = ["image"]
         if images is None:
             inputs_embeds = self.get_text_embeddings(input_ids)
             return inputs_embeds, attention_mask, position_ids
@@ -5428,9 +5429,9 @@ def get_multimodal_embeddings(
                                 vision_tower_image_size,
                                 max_resolutions=None,
                             )
-                        except Exception as e:
-                            print(f"Error: {e}")
-                            raise e
+                        except Exception:
+                            logger.exception("Error while computing anyres image grid shape")
+                            raise
                             # num_patch_width, num_patch_height = 2, 2
 
                         image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)

From 3a0e310bc63a8e73455b548b8258d60884cca63d Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Tue, 24 Mar 2026 02:36:17 +0000
Subject: [PATCH 19/39] apply code review comments

---
 optimum/exporters/openvino/convert.py              | 5 ++---
 optimum/exporters/openvino/model_configs.py        | 7 +++----
 optimum/exporters/openvino/model_patcher.py        | 4 +---
 optimum/intel/openvino/modeling_visual_language.py | 5 +++--
 4 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index d9bd00cdc3..2a1f0849ba 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -360,12 +360,11 @@ def export_pytorch(
         model.eval()
 
         # Check if we need to override certain configuration item
-        if config.values_override is not None:
+        if config.values_override is not None and hasattr(model, "config"):
             logger.info(f"Overriding {len(config.values_override)} configuration item(s)")
             for override_config_key, override_config_value in config.values_override.items():
                 logger.info(f"\t- {override_config_key} -> {override_config_value}")
-                if hasattr(model, "config"):
-                    setattr(model.config, override_config_key, override_config_value)
+                setattr(model.config, override_config_key, override_config_value)
 
         if input_shapes is None:
             input_shapes = {}  # will use the defaults from DEFAULT_DUMMY_SHAPES
diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 8be42938fa..352ede94d4 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -5321,13 +5321,12 @@ def __init__(
         **kwargs,
     ):
         super().__init__(task, normalized_config, batch_size, num_channels, width, height, visual_seq_length, **kwargs)
-        if hasattr(normalized_config, "config") and hasattr(normalized_config.config, "mm_local_num_frames"):
-            self.num_frames = normalized_config.config.mm_local_num_frames
+        self.num_frames = getattr(normalized_config.config, "mm_local_num_frames", 4)
+        self.embed_dim = getattr(normalized_config.config, "mm_hidden_size", 1408)
         self.height = 224
         self.width = 224
         self.image_size = (self.height, self.width)
         self.patch_size = 14
-        self.embed_dim = normalized_config.config.mm_hidden_size
 
     def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
         if input_name == "hidden_states":
@@ -5363,7 +5362,7 @@ def __init__(
     ):
         self.task = task
         self.batch_size = batch_size
-        self.hidden_size = normalized_config.mm_hidden_size
+        self.hidden_size = normalized_config.config.mm_hidden_size
         self.num_patches = 64
         self.normalized_config = normalized_config
 
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index a15d40c820..ea35b00e11 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -7751,9 +7751,7 @@ def forward_wrap(
             past_key_values=None,
             inputs_embeds=None,
         ):
-            from transformers.cache_utils import DynamicCache
-
-            outputs, labels = self.model(
+            outputs, _ = self.model(
                 input_ids=None,
                 attention_mask=attention_mask,
                 position_ids=position_ids,
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 62a4e11991..303e2f4384 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -5247,7 +5247,7 @@ def select_best_resolution(original_size, possible_resolutions, max_resolutions,
         min_wasted_resolution = float("inf")
 
         for width, height in possible_resolutions:
-            if max_resolutions != None and (width * height != patch_size * patch_size):
+            if max_resolutions is not None and (width * height != patch_size * patch_size):
                 if (width * height + patch_size * patch_size) > max_resolutions:  # NOTE 要算一个global
                     continue
             # Calculate the downscaled size to keep the aspect ratio
@@ -5266,7 +5266,8 @@ def select_best_resolution(original_size, possible_resolutions, max_resolutions,
                 best_fit = (width, height)
 
         # print(f"original_size={original_size}, possible_resolutions={possible_resolutions}, max_resolutions={max_resolutions}, best_fit={best_fit}")
-        assert best_fit is not None, f"Can't find suitable fit in {possible_resolutions} at max:{max_resolutions}"
+        if best_fit is None:
+            raise ValueError(f"Can't find suitable fit in {possible_resolutions} at max:{max_resolutions}")
         return best_fit
 
     # Adopted from https://huggingface.co/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B/blob/main/mm_utils.py#L601-L631

From 190a796e4faf6d3aadf0398db3be5baaecc111ac Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Tue, 24 Mar 2026 02:58:44 +0000
Subject: [PATCH 20/39] fix code style

---
 optimum/commands/export/openvino.py                |  3 ++-
 optimum/intel/openvino/modeling_visual_language.py | 14 ++++++--------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 6715a8c826..e91e3ddacd 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -341,6 +341,8 @@ def parse_args(parser: "ArgumentParser"):
         return parse_args_openvino(parser)
 
     def run(self):
+        import os
+
         from ...exporters.openvino.__main__ import _main_quantize, _merge_move, main_export
         from ...intel.openvino.configuration import (
             _DEFAULT_4BIT_WQ_CONFIG,
@@ -351,7 +353,6 @@ def run(self):
         from ...intel.openvino.utils import TemporaryDirectory
         from ...intel.utils.import_utils import is_nncf_available, is_transformers_version
         from ...intel.utils.modeling_utils import _infer_library_from_model_name_or_path
-        import os
 
         is_local = os.path.isdir(self.args.model)
         if (
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 303e2f4384..47dcdd6753 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -1,12 +1,12 @@
+import ast
 import copy
 import enum
 import inspect
 import logging
 import math
 import os
-import warnings
-import ast
 import re
+import warnings
 from abc import abstractmethod
 from dataclasses import dataclass
 from pathlib import Path
@@ -5011,7 +5011,7 @@ def merge_wavg(merge: Callable, x: torch.Tensor, size: torch.Tensor = None) -> T
         return x, size
 
     def get_vision_embeddings(self, images):
-        if type(images) is list:
+        if isinstance(images, list):
             raise NotImplementedError
         else:
             # input: B T C H W
@@ -5297,7 +5297,7 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size, max_reso
             ]
             # Multiply all elements by patch_size
             grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
-        if type(grid_pinpoints) is list:
+        if isinstance(grid_pinpoints, list):
             possible_resolutions = grid_pinpoints
         else:
             possible_resolutions = ast.literal_eval(grid_pinpoints)
@@ -5343,7 +5343,7 @@ def get_multimodal_embeddings(
             return inputs_embeds, attention_mask, position_ids
 
         # rank_print(modalities)
-        if type(images) is list:
+        if isinstance(images, list):
             images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images]
             if images[0].shape[0] > 1:
                 modalities = ["video"]
@@ -5415,8 +5415,6 @@ def get_multimodal_embeddings(
 
                     if "anyres_max" in image_aspect_ratio:
                         matched_anyres_max_num_patches = re.match(r"anyres_max_(\d+)", image_aspect_ratio)
-                        if matched_anyres_max_num_patches:
-                            max_num_patches = int(matched_anyres_max_num_patches.group(1))
 
                     if "anyres" in image_aspect_ratio:
                         vision_tower_image_size = 224
@@ -5458,7 +5456,7 @@ def get_multimodal_embeddings(
                     else:
                         try:
                             image_feature = torch.cat((base_image_feature, image_feature), dim=0)
-                        except Exception as e:
+                        except Exception:
                             raise ValueError(
                                 f"{num_patch_width} {num_patch_height} now: base_image_feature: {base_image_feature.shape}, {image_feature.shape}, image_sizes[image_idx]: {image_sizes[image_idx]}, origin_size: {origin_size}, {image_sizes[image_idx]}, {self.config.image_grid_pinpoints}, {vision_tower_image_size}"
                             )

From 45724c44eefbbb2aae91920325f25ce07b1076eb Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Tue, 24 Mar 2026 03:21:37 +0000
Subject: [PATCH 21/39] fix failure with transformers 4.45

---
 .../intel/openvino/modeling_visual_language.py    | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 47dcdd6753..2e9b30ebe1 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -5128,12 +5128,15 @@ def preprocess_inputs(
 
         # preprocess text
         prompt = f"<image>\n{text}" if (image is not None or video is not None) else text
-        messages = [{"role": "user", "content": prompt}]
-        text_prompt = tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True,
-        )
+        if getattr(tokenizer, "chat_template", None) is not None:
+            messages = [{"role": "user", "content": prompt}]
+            text_prompt = tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True,
+            )
+        else:
+            text_prompt = prompt
         input_ids = _OVVideoChatFlashQwenForCausalLM.tokenizer_image_token(
             text_prompt, tokenizer, _OVVideoChatFlashQwenForCausalLM.IMAGE_TOKEN_INDEX, return_tensors="pt"
         ).unsqueeze(0)

From 77482afdc173984fee253f67a4c277436f7a7358 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Tue, 24 Mar 2026 06:05:04 +0000
Subject: [PATCH 22/39] update _DEFAULT_4BIT_WQ_CONFIGS

---
 optimum/intel/openvino/configuration.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index 80b3452d75..c6b6941ae2 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -428,7 +428,7 @@ class OVQuantizationMethod(str, Enum):
             "weight_only": True,
         },
     },
-    "VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B": {
+    "OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B": {
         "quantization_configs": {
             "lm_model": {
                 "bits": 4,

From 87f5b701be242b0ef66996e3a49ed030d42dc091 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Thu, 26 Mar 2026 01:02:56 +0800
Subject: [PATCH 23/39] add test to compare inference results with transformers

---
 .../openvino/modeling_visual_language.py      | 40 +++++++++++++----
 tests/openvino/test_seq2seq.py                | 44 +++++++++++++------
 2 files changed, 62 insertions(+), 22 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 2e9b30ebe1..fe542daf41 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -5122,8 +5122,8 @@ def preprocess_inputs(
             raise ValueError("Audio input is not supported")
         if tokenizer is None:
             raise ValueError("Tokenizer is required.")
-        image_sizes = None
-        frames = None
+        image_sizes = []
+        frames = []
         results = {}
 
         # preprocess text
@@ -5146,11 +5146,27 @@ def preprocess_inputs(
         if video is not None:
             if isinstance(video, list):
                 if isinstance(video[0], np.ndarray):
-                    image_sizes = [video[0].shape[:2]]
+                    image_size = video[0].shape[:2]
                 else:
                     width, height = video[0].size
-                    image_sizes = [(height, width)]
-            frames = [processor(images=video, return_tensors="pt")]
+                    image_size = (height, width)
+                image_sizes.append(image_size)
+            frames.append(processor(images=video, return_tensors="pt"))
+
+        # preprocess image
+        if image is not None:
+            from PIL.Image import Image as PILImage
+
+            if isinstance(image, PILImage):
+                width, height = image.size
+                image_size = (height, width)
+            else:
+                image_size = image.shape[:2]
+            image_frame = processor(images=image, return_tensors="pt")
+            frames.append(image_frame)
+            image_sizes.append(image_size)
+
+        if len(frames) >= 1:
             results["images"] = frames
             results["image_sizes"] = image_sizes
 
@@ -5339,8 +5355,6 @@ def get_multimodal_embeddings(
     ):
         images = pixel_values
 
-        if modalities is None:
-            modalities = ["image"]
         if images is None:
             inputs_embeds = self.get_text_embeddings(input_ids)
             return inputs_embeds, attention_mask, position_ids
@@ -5348,8 +5362,16 @@ def get_multimodal_embeddings(
         # rank_print(modalities)
         if isinstance(images, list):
             images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images]
-            if images[0].shape[0] > 1:
-                modalities = ["video"]
+            if modalities is None:
+                modalities = []
+                for image in images:
+                    if image.shape[0] > 1:
+                        modalities.append("video")
+                    else:
+                        modalities.append("image")
+
+        if modalities is None:
+            modalities = ["image"]
 
         video_idx_in_batch = []
         for _ in range(len(modalities)):
diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py
index fb11eadb9f..0c77f7d22c 100644
--- a/tests/openvino/test_seq2seq.py
+++ b/tests/openvino/test_seq2seq.py
@@ -610,6 +610,10 @@ def get_transformer_model_class(self, model_arch):
             from transformers import Qwen2VLForConditionalGeneration
 
             return Qwen2VLForConditionalGeneration
+        if model_arch == "videochat_flash_qwen":
+            from transformers import AutoModel
+
+            return AutoModel
         return AutoModelForCausalLM
 
     def _check_device_and_request(self, ov_model, expected_device, has_request):
@@ -640,16 +644,19 @@ def test_find_untested_architectures(self):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_compare_to_transformers(self, model_arch):
-        if model_arch == "videochat_flash_qwen":
-            self.skipTest(
-                "Skipping comparison against Transformers because videochat_flash_qwen in OV does not support image input"
-            )
-
-        def compare_outputs(inputs, ov_model, transformers_model, generation_config):
+        def compare_outputs(inputs, ov_model, transformers_model, generation_config, has_image=False, has_video=True):
             transformers_inputs = copy.deepcopy(inputs)
+            if model_arch == "videochat_flash_qwen":
+                input_ids = transformers_inputs.pop("input_ids")
+                transformers_inputs["inputs"] = input_ids
+                transformers_inputs["modalities"] = []
+                if has_video:
+                    transformers_inputs["modalities"].append("video")
+                if has_image:
+                    transformers_inputs["modalities"].append("image")
             ov_outputs = ov_model.generate(**inputs, generation_config=generation_config)
             # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them
-            if model_arch in ["minicpmv", "minicpmo", "internvl_chat"]:
+            if model_arch in ["minicpmv", "minicpmo", "internvl_chat", "videochat_flash_qwen"]:
                 ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
             with torch.no_grad():
                 transformers_outputs = transformers_model.generate(
@@ -714,6 +721,10 @@ def compare_outputs(inputs, ov_model, transformers_model, generation_config):
             from transformers.cache_utils import DynamicCache
 
             transformers_inputs["past_key_values"] = DynamicCache()
+        if model_arch == "videochat_flash_qwen":
+            input_ids = transformers_inputs.pop("input_ids")
+            transformers_inputs["inputs"] = input_ids
+            transformers_inputs["modalities"] = ["image"]
 
         test_device = "AUTO"
         ov_model.to(test_device)
@@ -726,20 +737,23 @@ def compare_outputs(inputs, ov_model, transformers_model, generation_config):
         self._check_device_and_request(ov_model, test_device, False)
 
         # pytorch minicpmv and internvl_chat are not designed to be used via forward
-        if model_arch not in ["minicpmv", "minicpmo", "internvl_chat"]:
+        if model_arch not in ["minicpmv", "minicpmo", "internvl_chat", "videochat_flash_qwen"]:
             set_seed(SEED)
             ov_outputs = ov_model(**inputs)
             set_seed(SEED)
             with torch.no_grad():
                 transformers_outputs = transformers_model(**transformers_inputs)
+
             self.assertTrue(
                 torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=4e-3),
                 f"Max abs diff {(torch.abs(ov_outputs.logits - transformers_outputs.logits).max())}",
             )
 
         ov_model.generation_config.eos_token_id = None
-        transformers_model.generation_config.eos_token_id = None
-        transformers_model.generation_config.do_sample = False
+        # For videochat_flash_qwen, generation_config is None in transformers model, so we need to check it before setting eos_token_id
+        if transformers_model.generation_config is not None:
+            transformers_model.generation_config.eos_token_id = None
+            transformers_model.generation_config.do_sample = False
         ov_model.config.eos_token_id = None
         transformers_model.config.eos_token_id = None
         ov_model.generation_config.do_sample = False
@@ -787,7 +801,7 @@ def compare_outputs(inputs, ov_model, transformers_model, generation_config):
                 transformers_outputs = transformers_outputs[1].sequences
 
         # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them
-        if model_arch in ["minicpmv", "minicpmo", "internvl_chat"]:
+        if model_arch in ["minicpmv", "minicpmo", "internvl_chat", "videochat_flash_qwen"]:
             ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
         self.assertTrue(
             torch.equal(ov_outputs, transformers_outputs),
@@ -807,14 +821,18 @@ def compare_outputs(inputs, ov_model, transformers_model, generation_config):
                 repo_type="dataset",
                 user_agent=http_user_agent(),
             )
-            input_video, _ = load_video(video_path, num_frames=2, backend="opencv")
+            # videochat_flash_qwen needs frames to be multiple of 4
+            if model_arch == "videochat_flash_qwen":
+                input_video, _ = load_video(video_path, num_frames=4, backend="opencv")
+            else:
+                input_video, _ = load_video(video_path, num_frames=2, backend="opencv")
             question = "Why is this video funny?"
             inputs = ov_model.preprocess_inputs(**preprocessors, text=question, video=input_video)
             compare_outputs(inputs, ov_model, transformers_model, gen_config)
 
             # check video+image scenario
             inputs = ov_model.preprocess_inputs(**preprocessors, text=question, video=input_video, image=image)
-            compare_outputs(inputs, ov_model, transformers_model, gen_config)
+            compare_outputs(inputs, ov_model, transformers_model, gen_config, has_image=True)
 
         if model_arch in self.SUPPORT_AUDIO:
             input_audio = self._generate_random_audio_data()

From c7245a6e46c75310221174300830446fb5b23215 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Wed, 25 Mar 2026 16:33:29 +0800
Subject: [PATCH 24/39] apply comments

---
 docs/source/openvino/models.mdx                    |  1 +
 optimum/exporters/openvino/convert.py              |  2 +-
 optimum/exporters/openvino/model_configs.py        | 12 ++++++++----
 optimum/intel/openvino/modeling_visual_language.py |  4 ++--
 4 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx
index c11505fa4d..ad18014fc4 100644
--- a/docs/source/openvino/models.mdx
+++ b/docs/source/openvino/models.mdx
@@ -150,6 +150,7 @@ Here is the list of the supported architectures :
 - TROCR
 - UniSpeech
 - UniSpeech SAT
+- VideoChat-Flash-Qwen
 - Vision Encoder Decoder
 - Vit
 - Wav2Vec2
diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index 2a1f0849ba..60d90f53e0 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -360,7 +360,7 @@ def export_pytorch(
         model.eval()
 
         # Check if we need to override certain configuration item
-        if config.values_override is not None and hasattr(model, "config"):
+        if config.values_override is not None:
             logger.info(f"Overriding {len(config.values_override)} configuration item(s)")
             for override_config_key, override_config_value in config.values_override.items():
                 logger.info(f"\t- {override_config_key} -> {override_config_value}")
diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 352ede94d4..f5cf6a76b1 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -5377,7 +5377,7 @@ def generate(
         return self.random_float_tensor(shape, framework=framework, dtype=float_dtype)
 
 
-class VideoChatFlashQWENProjectorOpenVINOConfig(OnnxConfig):
+class VideoChatFlashQwenProjectorOpenVINOConfig(OnnxConfig):
     DUMMY_INPUT_GENERATOR_CLASSES = (DummyVideoChatFlashQwenProjectorInputGenerator,)
     NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
 
@@ -5452,7 +5452,7 @@ def with_behavior(
             behavior = VideoChatFlashQwenConfigBehavior(behavior)
 
         if behavior == VideoChatFlashQwenConfigBehavior.VISION_PROJECTION:
-            export_config = VideoChatFlashQWENProjectorOpenVINOConfig(
+            export_config = VideoChatFlashQwenProjectorOpenVINOConfig(
                 self._orig_config,
                 task="feature-extraction",
                 int_dtype=self.int_dtype,
@@ -5481,10 +5481,14 @@ def get_model_for_behavior(self, model, behavior: Union[str, VideoChatFlashQwenC
             behavior = VideoChatFlashQwenConfigBehavior(behavior)
 
         if behavior == VideoChatFlashQwenConfigBehavior.VISION_PROJECTION:
-            return model.get_model().mm_projector.mlp
+            vision_projector = model.get_model().mm_projector.mlp
+            vision_projector.config = model.config
+            return vision_projector
 
         if behavior == VideoChatFlashQwenConfigBehavior.VISION_EMBEDDINGS:
-            return model.get_vision_tower().vision_tower
+            vision_tower = model.get_vision_tower().vision_tower
+            vision_tower.config = model.config
+            return vision_tower
 
         if behavior == VideoChatFlashQwenConfigBehavior.TEXT_EMBEDDINGS:
             text_embedding = model.get_input_embeddings()
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index fe542daf41..fbcdd2a22f 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -5180,7 +5180,7 @@ def preprocess_inputs(
         return results
 
     def encode_video_image(self, images_list, video_idx_in_batch):
-        # video encoder编码后按图像的connector处理
+        # process the video encoder output using image connector
         bs = len(images_list)
 
         concat_images = []
@@ -5267,7 +5267,7 @@ def select_best_resolution(original_size, possible_resolutions, max_resolutions,
 
         for width, height in possible_resolutions:
             if max_resolutions is not None and (width * height != patch_size * patch_size):
-                if (width * height + patch_size * patch_size) > max_resolutions:  # NOTE 要算一个global
+                if (width * height + patch_size * patch_size) > max_resolutions:
                     continue
             # Calculate the downscaled size to keep the aspect ratio
             scale = min(width / original_width, height / original_height)

From e0b425101ffe8c9c833e82da3133baacca852baf Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Thu, 26 Mar 2026 20:39:03 +0800
Subject: [PATCH 25/39] fix test failure on transformers 4.45

---
 tests/openvino/test_seq2seq.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py
index 0c77f7d22c..788b0fe05e 100644
--- a/tests/openvino/test_seq2seq.py
+++ b/tests/openvino/test_seq2seq.py
@@ -530,9 +530,8 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin):
         "minicpmv",
         "phi3_v",
         "qwen2_vl",
-        "videochat_flash_qwen",
     ]
-    SUPPORT_VIDEO = ["llava_next_video", "qwen2_vl", "videochat_flash_qwen"]
+    SUPPORT_VIDEO = ["llava_next_video", "qwen2_vl"]
     SUPPORT_AUDIO = []
     OVMODEL_CLASS = OVModelForVisualCausalLM
     TASK = "image-text-to-text"
@@ -557,6 +556,9 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin):
         SUPPORT_VIDEO += ["qwen3_vl"]
 
     if is_transformers_version(">=", "4.54.0"):
+        # the layers attribute of DynamicCache is used in videochat_flash_qwen model
+        SUPPORTED_ARCHITECTURES.append("videochat_flash_qwen")
+        SUPPORT_VIDEO.append("videochat_flash_qwen")
         # remote code models differs after transformers v4.54
         SUPPORTED_ARCHITECTURES = set(SUPPORTED_ARCHITECTURES) - {"llava-qwen2", "phi3_v", "phi4mm"}
 

From 60c9a4b1f109824a71050570dbc7f0ea96388520 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Fri, 27 Mar 2026 00:01:12 +0800
Subject: [PATCH 26/39] pad frame number to a multiple of 4

---
 .../openvino/modeling_visual_language.py      | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index fbcdd2a22f..45eaab9a82 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -5125,6 +5125,7 @@ def preprocess_inputs(
         image_sizes = []
         frames = []
         results = {}
+        local_num_frames = getattr(config, "mm_local_num_frames", 4)
 
         # preprocess text
         prompt = f"<image>\n{text}" if (image is not None or video is not None) else text
@@ -5144,13 +5145,27 @@ def preprocess_inputs(
 
         # preprocess video
         if video is not None:
-            if isinstance(video, list):
+            if isinstance(video, np.ndarray):
+                num_frames = video.shape[0]
+                image_size = video.shape[1:3]
+                if num_frames % local_num_frames != 0:
+                    pad_frames = local_num_frames - (num_frames % local_num_frames)
+                    pad = np.repeat(video[-1:], pad_frames, axis=0)
+                    video = np.concatenate([video, pad], axis=0)
+            elif isinstance(video, list):
+                num_frames = len(video)
                 if isinstance(video[0], np.ndarray):
                     image_size = video[0].shape[:2]
                 else:
                     width, height = video[0].size
                     image_size = (height, width)
-                image_sizes.append(image_size)
+                if num_frames % local_num_frames != 0:
+                    pad_frames = local_num_frames - (num_frames % local_num_frames)
+                    video = video + [video[-1]] * pad_frames
+            else:
+                raise ValueError("Unsupported video type: {}".format(type(video)))
+
+            image_sizes.append(image_size)
             frames.append(processor(images=video, return_tensors="pt"))
 
         # preprocess image

From 33ca87595dc75b996ca9e168a78b77e7e98d6ea2 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Fri, 27 Mar 2026 20:00:17 +0800
Subject: [PATCH 27/39] fix file locking issue on windows

---
 optimum/intel/openvino/quantization.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 3741725e5d..daac7920a1 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -1472,6 +1472,12 @@ def _quantize_ovbasemodel(
                 ov_model_name, pipeline_quantization_config.default_config
             )
             if config is None:
+                if immediate_save:
+                    # Unload skipped submodels early so their IR files are not held open on Windows.
+                    # This can avoid later _merge_move failures caused by locked .bin files.
+                    ov_model = self.model.ov_models[ov_model_name]
+                    if ov_model is not None:
+                        self.model._unload_ov_model(ov_model)
                 continue
             ov_model = self.model.ov_models[ov_model_name]
             nncf_dataset = calibration_dataset.get(ov_model_name, None) if calibration_dataset else None

From 303d87526163c8f30499e95adf19dcc1a2394df4 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Tue, 31 Mar 2026 16:50:47 +0800
Subject: [PATCH 28/39] apply comments

---
 optimum/commands/export/openvino.py         | 18 +-----------------
 optimum/exporters/openvino/model_configs.py |  3 ++-
 2 files changed, 3 insertions(+), 18 deletions(-)

diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index e91e3ddacd..cd3280189e 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -341,8 +341,6 @@ def parse_args(parser: "ArgumentParser"):
         return parse_args_openvino(parser)
 
     def run(self):
-        import os
-
         from ...exporters.openvino.__main__ import _main_quantize, _merge_move, main_export
         from ...intel.openvino.configuration import (
             _DEFAULT_4BIT_WQ_CONFIG,
@@ -351,23 +349,9 @@ def run(self):
             get_default_quantization_config,
         )
         from ...intel.openvino.utils import TemporaryDirectory
-        from ...intel.utils.import_utils import is_nncf_available, is_transformers_version
+        from ...intel.utils.import_utils import is_nncf_available
         from ...intel.utils.modeling_utils import _infer_library_from_model_name_or_path
 
-        is_local = os.path.isdir(self.args.model)
-        if (
-            "OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B" in self.args.model
-            and not is_local
-            and (is_transformers_version(">=", "4.49"))
-        ):
-            raise ValueError(
-                "The model OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B in hugging face "
-                "contains custom code and requires transformers version prior to 4.49. "
-                "It is recommended to install transformers version 4.48 in your environment or download "
-                "https://modelscope.cn/models/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B "
-                "to your local path and use local path to convert."
-            )
-
         if self.args.library is None:
             # TODO: add revision, subfolder and token to args
             library_name = _infer_library_from_model_name_or_path(
diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index f5cf6a76b1..403be8a47b 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -5399,7 +5399,8 @@ class VideoChatFlashQwenConfigBehavior(str, enum.Enum):
 
 @register_in_tasks_manager("videochat_flash_qwen", *["image-text-to-text"], library_name="transformers")
 class VideoChatFlashQwenOpenVINOConfig(BaseVLMOpenVINOConfig):
-    MIN_TRANSFORMERS_VERSION = "4.42.0"
+    MIN_TRANSFORMERS_VERSION = "4.45.0"
+    MAX_TRANSFORMERS_VERSION = "4.57.99"
     SUPPORTED_BEHAVIORS = [model_type.value for model_type in VideoChatFlashQwenConfigBehavior]
     DUMMY_INPUT_GENERATOR_CLASSES = (DummyVideoChatFlashQwenInputGenerator,)
 

From 7379ab377578c52decad93783ecab0e736a22871 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Tue, 31 Mar 2026 22:04:35 +0800
Subject: [PATCH 29/39] update comments

---
 optimum/exporters/openvino/model_configs.py | 9 +++++++++
 optimum/intel/openvino/quantization.py      | 3 ++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 403be8a47b..e43314b200 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -5414,6 +5414,15 @@ def __init__(
         preprocessors: Optional[List[Any]] = None,
         **kwargs,
     ):
+        import os
+
+        if (
+            is_transformers_version(">=", "4.49.0")
+            and not os.path.isdir(config.name_or_path)
+            and config.name_or_path == "OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B"
+        ):
+            raise Exception("This model is not supported for transformers version >= 4.49.0")
+
         super().__init__(
             config=config,
             task=task,
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index daac7920a1..6152fe74eb 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -1473,7 +1473,8 @@ def _quantize_ovbasemodel(
             )
             if config is None:
                 if immediate_save:
-                    # Unload skipped submodels early so their IR files are not held open on Windows.
+                    # The submodels being quantized is unloaded after quantization, 
+                    # so the skipped submodels should also be unloaded to avoid keeping their IR files open on Windows.
                     # This can avoid later _merge_move failures caused by locked .bin files.
                     ov_model = self.model.ov_models[ov_model_name]
                     if ov_model is not None:

From 2cd021436230e8cb29c37de4c3819177bb14ac12 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Wed, 1 Apr 2026 08:50:04 +0800
Subject: [PATCH 30/39] fix code style

---
 optimum/intel/openvino/quantization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 6152fe74eb..dad2ec70b6 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -1473,7 +1473,7 @@ def _quantize_ovbasemodel(
             )
             if config is None:
                 if immediate_save:
-                    # The submodels being quantized is unloaded after quantization, 
+                    # The submodels being quantized are unloaded after quantization,
                     # so the skipped submodels should also be unloaded to avoid keeping their IR files open on Windows.
                     # This can avoid later _merge_move failures caused by locked .bin files.
                     ov_model = self.model.ov_models[ov_model_name]

From e1dba190e1b2565e892752a06fc06bff86604aa9 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Fri, 3 Apr 2026 15:45:48 +0800
Subject: [PATCH 31/39] add default image_preprocess

---
 .../openvino/modeling_visual_language.py      | 62 +++++++++++++++++--
 1 file changed, 57 insertions(+), 5 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 45eaab9a82..ebbb9a33c0 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -5105,6 +5105,50 @@ def insert_separator(X, sep):
             raise ValueError(f"Unsupported tensor type: {return_tensors}")
         return input_ids
 
+        # Adopted from https://huggingface.co/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B/blob/main/vision_tower_builder.py#L681
+
+    def image_preprocess(images, return_tensors, target_size=None):
+        from functools import partial, reduce
+        from PIL.Image import Image as PILImage
+        from transformers.image_processing_utils import BatchFeature
+        from transformers.image_utils import ChannelDimension, PILImageResampling, to_numpy_array
+        from transformers.image_transforms import (
+            convert_to_rgb,
+            normalize,
+            rescale,
+            resize,
+            to_channel_dimension_format,
+        )
+
+        if isinstance(images, PILImage):
+            images = [images]
+        else:
+            # to adapt video data
+            images = [to_numpy_array(image) for image in images]
+            assert isinstance(images, list)
+
+        if target_size is None:
+            target_size = (224, 224)
+
+        data_format = ChannelDimension.FIRST
+        rescale_factor = 1 / 255
+        image_mean = (0.485, 0.456, 0.406)
+        image_std = (0.229, 0.224, 0.225)
+
+        transforms = [
+            convert_to_rgb,
+            to_numpy_array,
+            partial(resize, size=target_size, resample=PILImageResampling.BICUBIC, data_format=data_format),
+            partial(rescale, scale=rescale_factor, data_format=data_format),
+            partial(normalize, mean=image_mean, std=image_std, data_format=data_format),
+            partial(to_channel_dimension_format, channel_dim=data_format, input_channel_dim=data_format),
+        ]
+
+        images = reduce(lambda x, f: [*map(f, x)], transforms, images)
+        data = {"pixel_values": images}
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
     @staticmethod
     def preprocess_inputs(
         text: str,
@@ -5115,9 +5159,6 @@ def preprocess_inputs(
         video: Optional["VideoInput"] = None,
         audio: Optional[np.ndarray] = None,
     ):
-        # Note: The implementation of this function is not validated, it's only there to allow this subclass to be initialized.
-        if processor is None:
-            raise ValueError("Processor is required.")
         if audio is not None:
             raise ValueError("Audio input is not supported")
         if tokenizer is None:
@@ -5166,7 +5207,13 @@ def preprocess_inputs(
                 raise ValueError("Unsupported video type: {}".format(type(video)))
 
             image_sizes.append(image_size)
-            frames.append(processor(images=video, return_tensors="pt"))
+            if processor is not None:
+                processed_images = processor(images=video, return_tensors="pt")
+            else:
+                processed_images = _OVVideoChatFlashQwenForCausalLM.image_preprocess(
+                    images=video, return_tensors="pt"
+                )["pixel_values"]
+            frames.append(processed_images)
 
         # preprocess image
         if image is not None:
@@ -5177,7 +5224,12 @@ def preprocess_inputs(
                 image_size = (height, width)
             else:
                 image_size = image.shape[:2]
-            image_frame = processor(images=image, return_tensors="pt")
+            if processor is not None:
+                image_frame = processor(images=image, return_tensors="pt")
+            else:
+                image_frame = _OVVideoChatFlashQwenForCausalLM.image_preprocess(images=image, return_tensors="pt")[
+                    "pixel_values"
+                ]
             frames.append(image_frame)
             image_sizes.append(image_size)
 

From 0f1907d5a94b747c611799aa89fb0f2919d25036 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Fri, 3 Apr 2026 19:55:10 +0800
Subject: [PATCH 32/39] apply comments

---
 optimum/exporters/openvino/model_configs.py   |  33 ++--
 optimum/exporters/openvino/model_patcher.py   |  74 +--------
 .../openvino/modeling_visual_language.py      | 147 ++++++++++--------
 3 files changed, 98 insertions(+), 156 deletions(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index e43314b200..009302db5b 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -200,9 +200,8 @@
     Qwen3VLVisionEmbMergerPatcher,
     QwenModelPatcher,
     SanaTextEncoderModelPatcher,
-    VideochatFlashQwenLanguageModelPatcher,
-    VideochatFlashQwenVisionEmbeddingModelPatcher,
-    VideochatFlashQwenVisionProjectionModelPatcher,
+    VideoChatFlashQwenLanguageModelPatcher,
+    VideoChatFlashQwenVisionEmbeddingModelPatcher,
     XverseModelPatcher,
     Zamba2ModelPatcher,
 )
@@ -5323,6 +5322,7 @@ def __init__(
         super().__init__(task, normalized_config, batch_size, num_channels, width, height, visual_seq_length, **kwargs)
         self.num_frames = getattr(normalized_config.config, "mm_local_num_frames", 4)
         self.embed_dim = getattr(normalized_config.config, "mm_hidden_size", 1408)
+        # The input image size and patch size for the vision encoder cannot be obtained from the config, so we set them to fixed values according to the original implementation.
         self.height = 224
         self.width = 224
         self.image_size = (self.height, self.width)
@@ -5350,7 +5350,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
 
 
 class DummyVideoChatFlashQwenProjectorInputGenerator(DummyInputGenerator):
-    SUPPORTED_INPUT_NAMES = ["hidden_states"]
+    SUPPORTED_INPUT_NAMES = ["input"]
 
     def __init__(
         self,
@@ -5363,6 +5363,7 @@ def __init__(
         self.task = task
         self.batch_size = batch_size
         self.hidden_size = normalized_config.config.mm_hidden_size
+        # The original implementation with projector_type 'tome16_mlp_hd64' uses a fixed number of patches (64).
         self.num_patches = 64
         self.normalized_config = normalized_config
 
@@ -5383,12 +5384,7 @@ class VideoChatFlashQwenProjectorOpenVINOConfig(OnnxConfig):
 
     @property
     def inputs(self) -> Dict[str, Dict[int, str]]:
-        return {"hidden_states": {0: "batch_size", 1: "num_patches", 2: "hidden_size"}}
-
-    def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None):
-        model_kwargs = model_kwargs or {}
-        return VideochatFlashQwenVisionProjectionModelPatcher(self, model, model_kwargs)
-
+        return {"input": {0: "batch_size", 1: "num_patches", 2: "hidden_size"}}
 
 class VideoChatFlashQwenConfigBehavior(str, enum.Enum):
     LANGUAGE = "language"
@@ -5399,7 +5395,7 @@ class VideoChatFlashQwenConfigBehavior(str, enum.Enum):
 
 @register_in_tasks_manager("videochat_flash_qwen", *["image-text-to-text"], library_name="transformers")
 class VideoChatFlashQwenOpenVINOConfig(BaseVLMOpenVINOConfig):
-    MIN_TRANSFORMERS_VERSION = "4.45.0"
+    MIN_TRANSFORMERS_VERSION = "4.49.0"
     MAX_TRANSFORMERS_VERSION = "4.57.99"
     SUPPORTED_BEHAVIORS = [model_type.value for model_type in VideoChatFlashQwenConfigBehavior]
     DUMMY_INPUT_GENERATOR_CLASSES = (DummyVideoChatFlashQwenInputGenerator,)
@@ -5414,15 +5410,6 @@ def __init__(
         preprocessors: Optional[List[Any]] = None,
         **kwargs,
     ):
-        import os
-
-        if (
-            is_transformers_version(">=", "4.49.0")
-            and not os.path.isdir(config.name_or_path)
-            and config.name_or_path == "OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B"
-        ):
-            raise Exception("This model is not supported for transformers version >= 4.49.0")
-
         super().__init__(
             config=config,
             task=task,
@@ -5441,7 +5428,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]:
         if not self._behavior == VideoChatFlashQwenConfigBehavior.VISION_EMBEDDINGS:
             return {}
         return {
-            "hidden_states": {0: "batch_size", 1: "num_channels", 2: "num_frames", 3: "height", 4: "width"},
+            "hidden_states": {0: "batch_size", 2: "num_frames", 3: "height", 4: "width"},
             # rotary_pos_emb has a fixed leading dimension of 1 in the dummy generator,
             # so we do not associate axis 0 with batch_size and keep only dynamic axes here.
             "rotary_pos_emb": {1: "num_tokens", 2: "hidden_size"},
@@ -5512,10 +5499,10 @@ def get_model_for_behavior(self, model, behavior: Union[str, VideoChatFlashQwenC
     def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None):
         model_kwargs = model_kwargs or {}
         if self._behavior == VideoChatFlashQwenConfigBehavior.LANGUAGE:
-            return VideochatFlashQwenLanguageModelPatcher(self, model, model_kwargs)
+            return VideoChatFlashQwenLanguageModelPatcher(self, model, model_kwargs)
 
         if self._behavior == VideoChatFlashQwenConfigBehavior.VISION_EMBEDDINGS:
-            return VideochatFlashQwenVisionEmbeddingModelPatcher(self, model, model_kwargs)
+            return VideoChatFlashQwenVisionEmbeddingModelPatcher(self, model, model_kwargs)
 
         return super().patch_model_for_export(model, model_kwargs)
 
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index ea35b00e11..de0eae6355 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -7640,7 +7640,7 @@ def __exit__(self, exc_type, exc_value, traceback):
                 del afmoe_moe.down_projs, afmoe_moe.gate_projs, afmoe_moe.up_projs
 
 
-class VideochatFlashQwenVisionEmbeddingModelPatcher(ModelPatcher):
+class VideoChatFlashQwenVisionEmbeddingModelPatcher(ModelPatcher):
     def __init__(
         self,
         config: "OnnxConfig",
@@ -7649,7 +7649,7 @@ def __init__(
     ):
         model.__orig_forward = model.forward
 
-        def forward_wrap(self, hidden_states, rotary_pos_emb=None, mask=None, use_image=False):
+        def forward_wrap(self, hidden_states, rotary_pos_emb):
             hidden_states = self.patch_embed(hidden_states.type(self.dtype))
             B, T, L, C = hidden_states.shape  # T: temporal; L: spatial
             hidden_states = hidden_states.view([B, T * L, C])
@@ -7657,75 +7657,13 @@ def forward_wrap(self, hidden_states, rotary_pos_emb=None, mask=None, use_image=
             # append cls token
             cls_tokens = self.cls_token.expand(B, -1, -1)
             hidden_states = torch.cat((cls_tokens, hidden_states), dim=1)
-
-            # add pos_embed
-            if self.sep_pos_embed:
-                raise NotImplementedError
-            else:
-                if use_image:
-                    if self.sep_image_video_pos_embed:
-                        rotary_pos_emb = self.img_pos_embed
-                    else:
-                        # (1, num_img_patches + 1, embed_dim)
-                        cls_pos_embed = self.pos_embed[:, 0:1, :]
-
-                        img_pos_embed = (
-                            self.pos_embed[:, 1:, :]
-                            .view(1, self.num_frames, self.patch_embed.num_patches // self.num_frames, self.embed_dim)
-                            .mean(dim=1)
-                        )
-
-                        rotary_pos_emb = torch.cat([cls_pos_embed, img_pos_embed], dim=1)
-                else:
-                    if rotary_pos_emb is None:
-                        rotary_pos_emb = self.pos_embed
-
             hidden_states = hidden_states + rotary_pos_emb
-
-            # mask tokens, ~mask means visible
-            if mask is not None:
-                hidden_states = hidden_states[~mask].reshape(B, -1, C)
-            else:
-                hidden_states = hidden_states.reshape(B, -1, C)
-
-            residual = None
+            hidden_states = hidden_states.reshape(B, -1, C)
 
             for idx, blk in enumerate(self.blocks):
-                if isinstance(hidden_states, tuple) and len(hidden_states) == 2:
-                    hidden_states, residual = hidden_states
-                hidden_states = blk(hidden_states, residual=residual)
-
-            if isinstance(hidden_states, tuple) and len(hidden_states) == 2:
-                hidden_states, residual = hidden_states
-                if residual is not None:
-                    hidden_states = hidden_states + residual
-
-            x_vis = hidden_states
-            if self.x_vis_only:
-                return x_vis
-            else:
-                x_pool_vis = self.clip_projector(x_vis)
-                return x_vis, x_pool_vis, None, None
-
-        model.forward = types.MethodType(forward_wrap, model)
-        super().__init__(config, model, model_kwargs)
+                hidden_states = blk(hidden_states, residual=None)
 
-    def __exit__(self, exc_type, exc_value, traceback):
-        super().__exit__(exc_type, exc_value, traceback)
-        self._model.forward = self._model.__orig_forward
-
-
-class VideochatFlashQwenVisionProjectionModelPatcher(ModelPatcher):
-    def __init__(
-        self,
-        config: "OnnxConfig",
-        model: "PreTrainedModel",
-        model_kwargs: Dict[str, Any] = None,
-    ):
-        model.__orig_forward = model.forward
-
-        def forward_wrap(self, hidden_states):
-            return self.__orig_forward(input=hidden_states)
+            return hidden_states
 
         model.forward = types.MethodType(forward_wrap, model)
         super().__init__(config, model, model_kwargs)
@@ -7735,7 +7673,7 @@ def __exit__(self, exc_type, exc_value, traceback):
         self._model.forward = self._model.__orig_forward
 
 
-class VideochatFlashQwenLanguageModelPatcher(ModelPatcher):
+class VideoChatFlashQwenLanguageModelPatcher(ModelPatcher):
     def __init__(
         self,
         config: "OnnxConfig",
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index ebbb9a33c0..7e14011b6f 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -1,4 +1,3 @@
-import ast
 import copy
 import enum
 import inspect
@@ -4825,36 +4824,37 @@ def get_3d_sincos_pos_embed(embed_dim, grid_size, t_size, cls_token=False):
         embed_dim_temporal = embed_dim // 4
 
         # spatial
-        grid_h = np.arange(grid_size, dtype=np.float32)
-        grid_w = np.arange(grid_size, dtype=np.float32)
-        grid = np.meshgrid(grid_w, grid_h)  # here w goes first
-        grid = np.stack(grid, axis=0)
+        grid_h = torch.arange(grid_size, dtype=torch.float32)
+        grid_w = torch.arange(grid_size, dtype=torch.float32)
+        grid = torch.meshgrid(grid_w, grid_h, indexing="xy")  # here w goes first
+        grid = torch.stack(grid, dim=0)
 
-        grid = grid.reshape([2, 1, grid_size, grid_size])
+        grid = grid.reshape(2, 1, grid_size, grid_size)
         pos_embed_spatial = _OVVideoChatFlashQwenForCausalLM.get_2d_sincos_pos_embed_from_grid(embed_dim_spatial, grid)
 
         # temporal
-        grid_t = np.arange(t_size, dtype=np.float32)
+        grid_t = torch.arange(t_size, dtype=torch.float32)
         pos_embed_temporal = _OVVideoChatFlashQwenForCausalLM.get_1d_sincos_pos_embed_from_grid(
             embed_dim_temporal, grid_t
         )
 
         # concate: [T, H, W] order
-        pos_embed_temporal = pos_embed_temporal[:, np.newaxis, :]
-        pos_embed_temporal = np.repeat(pos_embed_temporal, grid_size**2, axis=1)  # [T, H*W, D // 4]
-        pos_embed_spatial = pos_embed_spatial[np.newaxis, :, :]
-        pos_embed_spatial = np.repeat(pos_embed_spatial, t_size, axis=0)  # [T, H*W, D // 4 * 3]
+        pos_embed_temporal = pos_embed_temporal[:, None, :]
+        pos_embed_temporal = pos_embed_temporal.repeat(1, grid_size**2, 1)  # [T, H*W, D // 4]
+        pos_embed_spatial = pos_embed_spatial[None, :, :]
+        pos_embed_spatial = pos_embed_spatial.repeat(t_size, 1, 1)  # [T, H*W, D // 4 * 3]
 
-        pos_embed = np.concatenate([pos_embed_temporal, pos_embed_spatial], axis=-1)
-        pos_embed = pos_embed.reshape([-1, embed_dim])  # [T*H*W, D]
+        pos_embed = torch.cat([pos_embed_temporal, pos_embed_spatial], dim=-1)
+        pos_embed = pos_embed.reshape(-1, embed_dim)  # [T*H*W, D]
 
         if cls_token:
-            pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+            pos_embed = torch.cat([torch.zeros((1, embed_dim), dtype=pos_embed.dtype), pos_embed], dim=0)
         return pos_embed
 
     # Adopted from https://huggingface.co/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B/blob/main/vision_tower_builder.py#L141
     def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
         assert embed_dim % 2 == 0
+        grid = grid if isinstance(grid, torch.Tensor) else torch.as_tensor(grid, dtype=torch.float32)
 
         # use half of dimensions to encode grid_h
         emb_h = _OVVideoChatFlashQwenForCausalLM.get_1d_sincos_pos_embed_from_grid(
@@ -4864,7 +4864,7 @@ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
             embed_dim // 2, grid[1]
         )  # (H*W, D/2)
 
-        emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
+        emb = torch.cat([emb_h, emb_w], dim=1)  # (H*W, D)
         return emb
 
     # Adopted from https://huggingface.co/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B/blob/main/vision_tower_builder.py#L156
@@ -4875,17 +4875,18 @@ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
         out: (M, D)
         """
         assert embed_dim % 2 == 0
-        omega = np.arange(embed_dim // 2, dtype=np.float32)
+        omega = torch.arange(embed_dim // 2, dtype=torch.float32)
         omega /= embed_dim / 2.0
-        omega = 1.0 / 10000**omega  # (D/2,)
+        omega = 1.0 / (10000**omega)  # (D/2,)
 
-        pos = pos.reshape(-1)  # (M,)
-        out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product
+        pos = pos if isinstance(pos, torch.Tensor) else torch.as_tensor(pos, dtype=torch.float32)
+        pos = pos.reshape(-1).to(dtype=torch.float32)  # (M,)
+        out = torch.einsum("m,d->md", pos, omega)  # (M, D/2), outer product
 
-        emb_sin = np.sin(out)  # (M, D/2)
-        emb_cos = np.cos(out)  # (M, D/2)
+        emb_sin = torch.sin(out)  # (M, D/2)
+        emb_cos = torch.cos(out)  # (M, D/2)
 
-        emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+        emb = torch.cat([emb_sin, emb_cos], dim=1)  # (M, D)
         return emb
 
     def __init__(
@@ -4932,12 +4933,12 @@ def __init__(
         pos_embed = _OVVideoChatFlashQwenForCausalLM.get_3d_sincos_pos_embed(
             self.pos_embed.shape[-1], self.grid_size[1], self.grid_size[0], cls_token=True
         )
-        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
+        self.pos_embed.data.copy_(pos_embed.to(dtype=self.pos_embed.dtype).unsqueeze(0))
 
         img_pos_embed = _OVVideoChatFlashQwenForCausalLM.get_3d_sincos_pos_embed(
             self.pos_embed.shape[-1], self.grid_size[1], 1, cls_token=True
         )
-        self.img_pos_embed.data.copy_(torch.from_numpy(img_pos_embed).float().unsqueeze(0))
+        self.img_pos_embed.data.copy_(img_pos_embed.to(dtype=self.img_pos_embed.dtype).unsqueeze(0))
 
     # Adopted from https://huggingface.co/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B/blob/main/mm_projector_builder.py#L6
     def bipartite_soft_matching(
@@ -4945,10 +4946,31 @@ def bipartite_soft_matching(
         r: int,
     ) -> Tuple[Callable, Callable]:
         """
-        Applies ToMe with a balanced matching set (50%, 50%).
+        Build balanced ToMe token matching operators for vision token compression.
+        In this model's vision path, it is the core matching step used by
+        ``merge_tokens`` to progressively shrink visual token sequences before
+        ``vision_projection``. This reduces the token count passed into the
+        language-model side of the multimodal pipeline, improving memory/latency
+        while keeping high-similarity visual information aggregated.
+
+        This function splits tokens into two interleaved groups (even/odd positions),
+        computes pairwise similarity between the two groups, and selects the top-``r``
+        pairs to merge. It returns two closures:
+
+        - ``merge``: merges matched source tokens into destination tokens to reduce
+            sequence length while preserving information.
+        - ``unmerge``: restores merged tokens back to the original token layout,
+            which is useful for shape recovery or downstream alignment.
 
-        Input size is [batch, tokens, channels].
-        r indicates the number of tokens to remove (max 50% of tokens).
+        Args:
+            metric (`torch.Tensor`): Token features with shape ``[batch, tokens, channels]``
+                    used to compute matching similarity.
+            r (`int`): Number of tokens to remove by merging. It is internally capped
+                    at half of available tokens.
+
+        Returns:
+            `Tuple[Callable, Callable]`: ``(merge, unmerge)`` operators for reversible
+            token reduction.
         """
         protected = 0
 
@@ -5011,23 +5033,22 @@ def merge_wavg(merge: Callable, x: torch.Tensor, size: torch.Tensor = None) -> T
         return x, size
 
     def get_vision_embeddings(self, images):
-        if isinstance(images, list):
-            raise NotImplementedError
+        # Upstream preprocessing provides BTCHW, but the vision tower expects BCTHW,
+        # so we permute dimensions before running the visual encoder.
+        # We then keep patch tokens in [B, T*L, C] (dropping cls later) because
+        # downstream token merging/projection operates on a flattened token sequence.
+        T = images.shape[1]
+        images = images.permute(0, 2, 1, 3, 4)
+        if T == 1:
+            pos_embeds = self.img_pos_embed.detach()
         else:
-            # input: B T C H W
-            # output: B T*L C
-            T = images.shape[1]
-            images = images.permute(0, 2, 1, 3, 4)
-            if T == 1:
-                pos_embeds = self.img_pos_embed.detach()
-            else:
-                pos_embeds = self.pos_embed.detach()
-            image_embeds = self.vision_embeddings(
-                images, rotary_pos_emb=pos_embeds, use_image=(T == 1)
-            ).last_hidden_state
-            image_embeds = image_embeds[:, 1:, :]
+            pos_embeds = self.pos_embed.detach()
+        image_embeds = self.vision_embeddings(
+            images, rotary_pos_emb=pos_embeds, use_image=(T == 1)
+        ).last_hidden_state
+        image_embeds = image_embeds[:, 1:, :]
 
-            videos_features = torch.from_numpy(image_embeds) if isinstance(image_embeds, np.ndarray) else image_embeds
+        videos_features = torch.from_numpy(image_embeds) if isinstance(image_embeds, np.ndarray) else image_embeds
 
         return videos_features
 
@@ -5039,16 +5060,17 @@ def merge_tokens(self, x, target_num_token):
         """
         size = None
         b, p, c = x.shape
-        tmp_p = p
+        current_num_tokens = p
+        # Number of tokens to merge at each iterative ToMe step until reaching target_num_token.
         r_merge_list = []
-        assert tmp_p > target_num_token, f"{tmp_p} should greater than {target_num_token}"
-        while tmp_p != target_num_token:
-            if tmp_p - target_num_token <= (tmp_p // 2):
-                r_merge_list.append(tmp_p - target_num_token)
+        assert current_num_tokens > target_num_token, f"{current_num_tokens} should greater than {target_num_token}"
+        while current_num_tokens != target_num_token:
+            if current_num_tokens - target_num_token <= (current_num_tokens // 2):
+                r_merge_list.append(current_num_tokens - target_num_token)
                 break
             else:
-                r_merge_list.append(tmp_p // 2)
-                tmp_p = tmp_p - (tmp_p // 2)
+                r_merge_list.append(current_num_tokens // 2)
+                current_num_tokens = current_num_tokens - (current_num_tokens // 2)
 
         head = self.num_attention_heads
 
@@ -5386,7 +5408,8 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size, max_reso
         if isinstance(grid_pinpoints, list):
             possible_resolutions = grid_pinpoints
         else:
-            possible_resolutions = ast.literal_eval(grid_pinpoints)
+            pairs = re.findall(r"\(\s*(\d+)\s*,\s*(\d+)\s*\)", grid_pinpoints)
+            possible_resolutions = [(int(w), int(h)) for w, h in pairs]
         width, height = _OVVideoChatFlashQwenForCausalLM.select_best_resolution(
             image_size, possible_resolutions, max_resolutions=max_resolutions, patch_size=patch_size
         )
@@ -5510,25 +5533,19 @@ def get_multimodal_embeddings(
 
                     if "anyres" in image_aspect_ratio:
                         vision_tower_image_size = 224
-                        try:
-                            (
-                                num_patch_width,
-                                num_patch_height,
-                            ) = _OVVideoChatFlashQwenForCausalLM.get_anyres_image_grid_shape(
-                                image_sizes[image_idx],
-                                self.config.image_grid_pinpoints,
-                                vision_tower_image_size,
-                                max_resolutions=None,
-                            )
-                        except Exception:
-                            logger.exception("Error while computing anyres image grid shape")
-                            raise
-                            # num_patch_width, num_patch_height = 2, 2
+                        (
+                            num_patch_width,
+                            num_patch_height,
+                        ) = _OVVideoChatFlashQwenForCausalLM.get_anyres_image_grid_shape(
+                            image_sizes[image_idx],
+                            self.config.image_grid_pinpoints,
+                            vision_tower_image_size,
+                            max_resolutions=None,
+                        )
 
                         image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
                     else:
                         raise NotImplementedError(image_aspect_ratio)
-                        image_feature = image_feature.view(2, 2, height, width, -1)
 
                     if "maxpool2x2" in mm_patch_merge_type:
                         raise NotImplementedError

From 9bca64a5c49109d4cb3dd596ffdc6a816d5fee8c Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Sat, 4 Apr 2026 05:55:23 +0800
Subject: [PATCH 33/39] update tests

---
 tests/openvino/test_seq2seq.py | 26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py
index 788b0fe05e..be37bf000f 100644
--- a/tests/openvino/test_seq2seq.py
+++ b/tests/openvino/test_seq2seq.py
@@ -823,11 +823,7 @@ def compare_outputs(inputs, ov_model, transformers_model, generation_config, has
                 repo_type="dataset",
                 user_agent=http_user_agent(),
             )
-            # videochat_flash_qwen needs frames to be multiple of 4
-            if model_arch == "videochat_flash_qwen":
-                input_video, _ = load_video(video_path, num_frames=4, backend="opencv")
-            else:
-                input_video, _ = load_video(video_path, num_frames=2, backend="opencv")
+            input_video, _ = load_video(video_path, num_frames=2, backend="opencv")
             question = "Why is this video funny?"
             inputs = ov_model.preprocess_inputs(**preprocessors, text=question, video=input_video)
             compare_outputs(inputs, ov_model, transformers_model, gen_config)
@@ -912,14 +908,12 @@ def test_generate_utils(self, model_arch):
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_remote_code)
         preprocessors = self.get_preprocessors(model_arch)
 
-        # videochat_flash_qwen does not support image input
-        if model_arch != "videochat_flash_qwen":
-            question = "Describe image"
-            inputs = model.preprocess_inputs(**preprocessors, text=question, image=self.IMAGE.resize((600, 600)))
-            # General case
-            outputs = model.generate(**inputs, max_new_tokens=10)
-            outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
-            self.assertIsInstance(outputs[0], str)
+        question = "Describe image"
+        inputs = model.preprocess_inputs(**preprocessors, text=question, image=self.IMAGE.resize((600, 600)))
+        # General case
+        outputs = model.generate(**inputs, max_new_tokens=10)
+        outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
+        self.assertIsInstance(outputs[0], str)
 
         # GOT-OCR2 does not support text-only input
         if model_arch != "got_ocr2":
@@ -945,11 +939,7 @@ def test_generate_utils(self, model_arch):
                     repo_type="dataset",
                     user_agent=http_user_agent(),
                 )
-                if model_arch == "videochat_flash_qwen":
-                    # videochat_flash_qwen need frame number to be multiple of 4
-                    input_video, _ = load_video(video_path, num_frames=8, backend="opencv")
-                else:
-                    input_video, _ = load_video(video_path, num_frames=2, backend="opencv")
+                input_video, _ = load_video(video_path, num_frames=2, backend="opencv")
                 question = "Why is this video funny?"
                 inputs = model.preprocess_inputs(**preprocessors, text=question, video=input_video)
                 outputs = model.generate(**inputs, max_new_tokens=10)

From ea1a5a069059817ae9f0754e56dc3c4287a9f33e Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Fri, 3 Apr 2026 21:30:49 +0800
Subject: [PATCH 34/39] update tests

---
 optimum/exporters/openvino/model_configs.py   |  1 +
 .../openvino/modeling_visual_language.py      |  4 +---
 tests/openvino/test_seq2seq.py                | 22 +------------------
 3 files changed, 3 insertions(+), 24 deletions(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 009302db5b..331a8e2bfe 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -5386,6 +5386,7 @@ class VideoChatFlashQwenProjectorOpenVINOConfig(OnnxConfig):
     def inputs(self) -> Dict[str, Dict[int, str]]:
         return {"input": {0: "batch_size", 1: "num_patches", 2: "hidden_size"}}
 
+
 class VideoChatFlashQwenConfigBehavior(str, enum.Enum):
     LANGUAGE = "language"
     VISION_EMBEDDINGS = "vision_embeddings"
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 7e14011b6f..eaf2a694d4 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -5043,9 +5043,7 @@ def get_vision_embeddings(self, images):
             pos_embeds = self.img_pos_embed.detach()
         else:
             pos_embeds = self.pos_embed.detach()
-        image_embeds = self.vision_embeddings(
-            images, rotary_pos_emb=pos_embeds, use_image=(T == 1)
-        ).last_hidden_state
+        image_embeds = self.vision_embeddings(images, rotary_pos_emb=pos_embeds).last_hidden_state
         image_embeds = image_embeds[:, 1:, :]
 
         videos_features = torch.from_numpy(image_embeds) if isinstance(image_embeds, np.ndarray) else image_embeds
diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py
index be37bf000f..9f32f0aa96 100644
--- a/tests/openvino/test_seq2seq.py
+++ b/tests/openvino/test_seq2seq.py
@@ -745,7 +745,6 @@ def compare_outputs(inputs, ov_model, transformers_model, generation_config, has
             set_seed(SEED)
             with torch.no_grad():
                 transformers_outputs = transformers_model(**transformers_inputs)
-
             self.assertTrue(
                 torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=4e-3),
                 f"Max abs diff {(torch.abs(ov_outputs.logits - transformers_outputs.logits).max())}",
@@ -980,30 +979,11 @@ def get_preprocessors(self, model_arch):
                 model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
             )
             preprocessors = {"processor": processor, "tokenizer": tokenizer, "config": config}
-        elif model_arch == "internvl_chat":
+        elif model_arch in ["internvl_chat", "videochat_flash_qwen"]:
             tokenizer = AutoTokenizer.from_pretrained(
                 model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
             )
             preprocessors = {"processor": None, "tokenizer": tokenizer, "config": config}
-        elif model_arch == "videochat_flash_qwen":
-
-            class VideochatProcessorWrapper:
-                def __init__(self, model_id):
-                    from transformers import AutoModel
-
-                    hf_model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
-                    self.processor = hf_model.get_vision_tower().image_processor.preprocess
-                    self.model_dtype = hf_model.dtype
-                    del hf_model
-
-                def __call__(self, images, return_tensors):
-                    return self.processor(images, return_tensors="pt")["pixel_values"].to(dtype=self.model_dtype)
-
-            processor = VideochatProcessorWrapper(model_id)
-            tokenizer = AutoTokenizer.from_pretrained(
-                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
-            )
-            preprocessors = {"processor": processor, "tokenizer": tokenizer, "config": config}
         else:
             processor = AutoProcessor.from_pretrained(
                 model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS

From 33b17e89a822aadbc8da4aec80ec6336d2a92c22 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Sat, 4 Apr 2026 06:57:09 +0800
Subject: [PATCH 35/39] remove NotImplemented exceptions

---
 .../openvino/modeling_visual_language.py      | 58 +++++--------------
 1 file changed, 13 insertions(+), 45 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index eaf2a694d4..9b5f2d87b8 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -5043,7 +5043,9 @@ def get_vision_embeddings(self, images):
             pos_embeds = self.img_pos_embed.detach()
         else:
             pos_embeds = self.pos_embed.detach()
-        image_embeds = self.vision_embeddings(images, rotary_pos_emb=pos_embeds).last_hidden_state
+        image_embeds = self.vision_embeddings(
+            images, rotary_pos_emb=pos_embeds
+        ).last_hidden_state
         image_embeds = image_embeds[:, 1:, :]
 
         videos_features = torch.from_numpy(image_embeds) if isinstance(image_embeds, np.ndarray) else image_embeds
@@ -5119,10 +5121,8 @@ def insert_separator(X, sep):
         for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
             input_ids.extend(x[offset:])
 
-        if return_tensors is not None:
-            if return_tensors == "pt":
-                return torch.tensor(input_ids, dtype=torch.long)
-            raise ValueError(f"Unsupported tensor type: {return_tensors}")
+        if return_tensors == "pt":
+            return torch.tensor(input_ids, dtype=torch.long)
         return input_ids
 
         # Adopted from https://huggingface.co/OpenGVLab/VideoChat-Flash-Qwen2_5-7B_InternVideo2-1B/blob/main/vision_tower_builder.py#L681
@@ -5412,8 +5412,6 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size, max_reso
             image_size, possible_resolutions, max_resolutions=max_resolutions, patch_size=patch_size
         )
 
-        # print("get width/patch size", width, patch_size, flush=True)
-
         return width // patch_size, height // patch_size
 
     def get_text_embeddings(self, input_ids):
@@ -5479,10 +5477,8 @@ def get_multimodal_embeddings(
         frame_aspect_ratio = getattr(self.config, "frame_aspect_ratio", "square")
         mm_newline_position = getattr(self.config, "mm_newline_position", "nothing")
 
-        if vision_encode_type == "video_image":  # video backbone, process video with compress
-            image_features = self.encode_video_image(images_list, video_idx_in_batch=video_idx_in_batch)
-        else:
-            raise NotImplementedError(vision_encode_type)
+        # video backbone, process video with compress
+        image_features = self.encode_video_image(images_list, video_idx_in_batch=video_idx_in_batch)
 
         if mm_patch_merge_type == "flat":
             image_features = [x.flatten(0, 1) for x in image_features]
@@ -5490,10 +5486,7 @@ def get_multimodal_embeddings(
             new_image_features = []
             for image_idx, image_feature in enumerate(image_features):
                 if image_idx in video_idx_in_batch:  # video operations
-                    if "anyres" in frame_aspect_ratio:
-                        raise NotImplementedError
-                    else:
-                        frame_feature = image_feature
+                    frame_feature = image_feature
 
                     if "pad" in mm_patch_merge_type:
                         if mm_newline_position == "one_token":
@@ -5508,8 +5501,6 @@ def get_multimodal_embeddings(
                                 )
                         elif mm_newline_position == "nothing":
                             frame_feature = frame_feature.flatten(0, 1)
-                        else:
-                            raise NotImplementedError("add pad please!!")
                     else:
                         frame_feature = frame_feature.flatten(0, 1)
 
@@ -5542,46 +5533,24 @@ def get_multimodal_embeddings(
                         )
 
                         image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
-                    else:
-                        raise NotImplementedError(image_aspect_ratio)
-
-                    if "maxpool2x2" in mm_patch_merge_type:
-                        raise NotImplementedError
-                    elif (
-                        "unpad" in mm_patch_merge_type
-                        and "anyres_max" in image_aspect_ratio
-                        and matched_anyres_max_num_patches
-                    ):
-                        raise NotImplementedError
-                    elif "unpad" in mm_patch_merge_type:
-                        raise NotImplementedError
-                    else:
-                        image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
-                        image_feature = image_feature.flatten(0, 3)
+
+                    image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
+                    image_feature = image_feature.flatten(0, 3)
                     if "nobase" in mm_patch_merge_type:
                         pass
                     else:
-                        try:
-                            image_feature = torch.cat((base_image_feature, image_feature), dim=0)
-                        except Exception:
-                            raise ValueError(
-                                f"{num_patch_width} {num_patch_height} now: base_image_feature: {base_image_feature.shape}, {image_feature.shape}, image_sizes[image_idx]: {image_sizes[image_idx]}, origin_size: {origin_size}, {image_sizes[image_idx]}, {self.config.image_grid_pinpoints}, {vision_tower_image_size}"
-                            )
+                        image_feature = torch.cat((base_image_feature, image_feature), dim=0)
+
                 else:  # single image operations
                     image_feature = image_feature[0]
                     if "unpad" in mm_patch_merge_type:
                         image_feature = torch.cat((image_feature, self.model.image_newline[None]), dim=0)
 
-                # print(f"image/video_feature.shape: {image_feature.shape}")
                 new_image_features.append(image_feature)
             image_features = new_image_features
         else:
             raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}")
 
-        # TODO: image start / end is not implemented here to support pretraining.
-        if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr(self.config, "mm_use_im_start_end", False):
-            raise NotImplementedError
-
         # Let's just add dummy tensors if they do not exist,
         # it is a headache to deal with None all the time.
         # But it is not ideal, and if you have a better idea,
@@ -5751,7 +5720,6 @@ def get_multimodal_embeddings(
                     )
 
         new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
-        # print("tokenizer padding")
 
         if _attention_mask is None:
             attention_mask = None

From b67cdf09081c156e5d3f075ba06c9534989a8244 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Fri, 3 Apr 2026 22:30:55 +0800
Subject: [PATCH 36/39] fix code style

---
 optimum/intel/openvino/modeling_visual_language.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 9b5f2d87b8..45468297f6 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -5043,9 +5043,7 @@ def get_vision_embeddings(self, images):
             pos_embeds = self.img_pos_embed.detach()
         else:
             pos_embeds = self.pos_embed.detach()
-        image_embeds = self.vision_embeddings(
-            images, rotary_pos_emb=pos_embeds
-        ).last_hidden_state
+        image_embeds = self.vision_embeddings(images, rotary_pos_emb=pos_embeds).last_hidden_state
         image_embeds = image_embeds[:, 1:, :]
 
         videos_features = torch.from_numpy(image_embeds) if isinstance(image_embeds, np.ndarray) else image_embeds

From 8a2811dadce2c433fccdd177526140a1c7d9649b Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Fri, 3 Apr 2026 22:44:22 +0800
Subject: [PATCH 37/39] fix code style

---
 optimum/intel/openvino/modeling_visual_language.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 45468297f6..2524c76fcb 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -5127,9 +5127,9 @@ def insert_separator(X, sep):
 
     def image_preprocess(images, return_tensors, target_size=None):
         from functools import partial, reduce
+
         from PIL.Image import Image as PILImage
         from transformers.image_processing_utils import BatchFeature
-        from transformers.image_utils import ChannelDimension, PILImageResampling, to_numpy_array
         from transformers.image_transforms import (
             convert_to_rgb,
             normalize,
@@ -5137,6 +5137,7 @@ def image_preprocess(images, return_tensors, target_size=None):
             resize,
             to_channel_dimension_format,
         )
+        from transformers.image_utils import ChannelDimension, PILImageResampling, to_numpy_array
 
         if isinstance(images, PILImage):
             images = [images]
@@ -5469,10 +5470,8 @@ def get_multimodal_embeddings(
             else:
                 images_list.append(image.unsqueeze(0))
 
-        vision_encode_type = getattr(self.config, "vision_encode_type", "image")
         mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat")
         image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square")
-        frame_aspect_ratio = getattr(self.config, "frame_aspect_ratio", "square")
         mm_newline_position = getattr(self.config, "mm_newline_position", "nothing")
 
         # video backbone, process video with compress
@@ -5508,16 +5507,12 @@ def get_multimodal_embeddings(
                 elif image_feature.shape[0] > 1:  # multi patches and multi images operations
                     base_image_feature = image_feature[0]
                     image_feature = image_feature[1:]
-                    origin_size = image_feature.shape
 
                     height = width = 8
                     assert (
                         height * width == base_image_feature.shape[0]
                     ), f"height:{height}, width: {width}, base_image_feature: {base_image_feature.shape}"
 
-                    if "anyres_max" in image_aspect_ratio:
-                        matched_anyres_max_num_patches = re.match(r"anyres_max_(\d+)", image_aspect_ratio)
-
                     if "anyres" in image_aspect_ratio:
                         vision_tower_image_size = 224
                         (

From b4bdb5099b6d7be65ed1c38fd7d7b55d44560a96 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Sat, 4 Apr 2026 10:04:55 +0800
Subject: [PATCH 38/39] test videochat export when transformers>=4.49

---
 tests/openvino/test_export.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py
index 909854cf5e..6aeb198e85 100644
--- a/tests/openvino/test_export.py
+++ b/tests/openvino/test_export.py
@@ -93,14 +93,13 @@ class ExportModelTest(unittest.TestCase):
         "stable-diffusion-3": OVStableDiffusion3Pipeline,
         "flux": OVFluxPipeline,
         "ltx-video": OVLTXPipeline,
-        "videochat_flash_qwen": OVModelForVisualCausalLM,
     }
 
     if is_transformers_version(">=", "4.48.0"):
         SUPPORTED_ARCHITECTURES.update({"cohere2": OVModelForCausalLM})
 
     if is_transformers_version(">=", "4.49"):
-        SUPPORTED_ARCHITECTURES.update({"zamba2": OVModelForCausalLM})
+        SUPPORTED_ARCHITECTURES.update({"zamba2": OVModelForCausalLM, "videochat_flash_qwen": OVModelForVisualCausalLM})
 
     if is_transformers_version(">=", "4.53.0"):
         SUPPORTED_ARCHITECTURES.update({"granitemoehybrid": OVModelForCausalLM})

From 72c94dae36f94e689f406beca86a1045e1e49755 Mon Sep 17 00:00:00 2001
From: xufang <fang.xu@intel.com>
Date: Sat, 4 Apr 2026 10:44:54 +0800
Subject: [PATCH 39/39] fix code style

---
 tests/openvino/test_export.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py
index 6aeb198e85..38dc7dedfb 100644
--- a/tests/openvino/test_export.py
+++ b/tests/openvino/test_export.py
@@ -99,7 +99,9 @@ class ExportModelTest(unittest.TestCase):
         SUPPORTED_ARCHITECTURES.update({"cohere2": OVModelForCausalLM})
 
     if is_transformers_version(">=", "4.49"):
-        SUPPORTED_ARCHITECTURES.update({"zamba2": OVModelForCausalLM, "videochat_flash_qwen": OVModelForVisualCausalLM})
+        SUPPORTED_ARCHITECTURES.update(
+            {"zamba2": OVModelForCausalLM, "videochat_flash_qwen": OVModelForVisualCausalLM}
+        )
 
     if is_transformers_version(">=", "4.53.0"):
         SUPPORTED_ARCHITECTURES.update({"granitemoehybrid": OVModelForCausalLM})