From f88c6a8bb9a666b4c60d059ae1def4f9858fcca9 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Tue, 3 Mar 2026 05:15:16 +0200 Subject: [PATCH 01/39] add support gigachat3 --- optimum/exporters/openvino/model_patcher.py | 327 ++++++-------------- 1 file changed, 98 insertions(+), 229 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 557cd1f8d1..31048a005d 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3758,8 +3758,8 @@ class DeepseekPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() self_attn = { - "deepseek_v3": deepseek_v3_attn_forward, - "deepseek_v2": deepseek_v2_attn_forward, + "deepseek_v3": make_deepseek_attn_forward(version=3), + "deepseek_v2": make_deepseek_attn_forward(version=2), "deepseek": minicpm3_attn_forward, } @@ -3770,249 +3770,118 @@ def __enter__(self): block.self_attn.forward = types.MethodType(self_attn_fwd, block.self_attn) if hasattr(block.mlp, "moe_infer"): block.mlp._org_moe_infer = block.mlp.moe_infer - block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) + elif hasattr(block.mlp, "experts"): + block.mlp._org_moe_infer = None + block.mlp.ep_rank = 0 + block.mlp.experts_per_rank = len(block.mlp.experts) + block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) for block in self._model.model.layers: - block.self_attn.forward = block.self_attn._orig_forward - if hasattr(block.mlp, "_orig_moe_infer"): - block.mlp.moe_infer = block.mlp._orig_moe_infer - - -def deepseek_v3_attn_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value=None, - output_attentions: bool = False, - use_cache: bool = False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - # modified from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/modeling_deepseek.py#L751 - def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - orig_dtype = k.dtype - cos = cos[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] - q_fp32 = q.to(dtype=torch.float32, device=q.device) - k_fp32 = k.to(dtype=torch.float32, device=k.device) - q_embed = (q_fp32 * cos) + (rotate_half(q_fp32) * sin) - k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin) - return q_embed.to(dtype=orig_dtype), k_embed.to(dtype=orig_dtype) - - if output_attentions: - return self._orig_forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - if self.q_lora_rank is None: - q = self.q_proj(hidden_states) - else: - q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) - q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2) - q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) - - compressed_kv = self.kv_a_proj_with_mqa(hidden_states) - compressed_kv, k_pe = 
torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2) - kv = ( - self.kv_b_proj(self.kv_a_layernorm(compressed_kv)) - .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) - .transpose(1, 2) - ) - - k_nope, value_states = torch.split(kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1) - kv_seq_len = value_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids) - - # Difference with original code, k_pe.new_empty create constant tensor in torchscript - query_states = torch.concat([q_nope, q_pe], dim=-1) - # query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) - # query_states[:, :, :, : self.qk_nope_head_dim] = q_nope - # query_states[:, :, :, self.qk_nope_head_dim :] = q_pe - key_states = torch.concat([k_nope, k_pe.expand(-1, self.num_heads, -1, -1)], dim=-1) - # key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) - # key_states[:, :, :, : self.qk_nope_head_dim] = k_nope - # key_states[:, :, :, self.qk_nope_head_dim :] = k_pe - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - - attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) + if hasattr(block.self_attn, "_orig_forward"): + block.self_attn.forward = block.self_attn._orig_forward + if hasattr(block.mlp, "_org_moe_infer"): + if block.mlp._org_moe_infer is not None: + block.mlp.moe_infer = block.mlp._org_moe_infer + else: + delattr(block.mlp, "moe_infer") + if hasattr(block.mlp, "ep_rank"): + delattr(block.mlp, "ep_rank") + if hasattr(block.mlp, "experts_per_rank"): + delattr(block.mlp, "experts_per_rank") + + +def make_deepseek_attn_forward(version: int = 3): + """Return a MLA attention forward function for the given DeepSeek version. 
+ + Both deepseek_v2 and deepseek_v3 share identical MLA attention logic — the + only differences are: + - v3: ``position_embeddings`` is a ``(cos, sin)`` tuple; RoPE applied via + ``apply_rotary_pos_emb(q_rot, k_rot, cos, sin)``; cache stores cos/sin. + - v2: ``position_embeddings`` is a complex ``freqs_cis`` tensor; RoPE applied + via ``apply_rotary_emb(q_pe, k_pe, freqs_cis)`` (complex multiplication). + """ - attn_output = self.o_proj(attn_output) + def deepseek_attn_forward( + self, + hidden_states: torch.Tensor, + position_embeddings=None, + attention_mask: Optional[torch.Tensor] = None, + past_key_value=None, + past_key_values=None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + batch_size, seq_length = hidden_states.shape[:-1] + query_shape = (batch_size, seq_length, -1, self.qk_head_dim) + key_shape = (batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim) - return attn_output, None, past_key_value + if self.q_lora_rank is None: + q = self.q_proj(hidden_states) + else: + q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) + q = q.view(query_shape).transpose(1, 2) + q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) + k_nope, k_pe = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + k_nope = self.kv_b_proj(self.kv_a_layernorm(k_nope)).view(key_shape).transpose(1, 2) + k_nope, value_states = torch.split(k_nope, [self.qk_nope_head_dim, self.v_head_dim], dim=-1) + k_pe = k_pe.view(batch_size, 1, seq_length, self.qk_rope_head_dim) + + if version == 3: + from transformers.models.deepseek_v3.modeling_deepseek_v3 import apply_rotary_pos_emb + + cos, sin = position_embeddings + q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin) + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + elif version == 2: + + def apply_rotary_emb( + xq: torch.Tensor, + xk: torch.Tensor, + freqs_cis: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) + xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) + # Broadcast to [1, 1, seq_len, dim // 2] + freqs_cis = freqs_cis.unsqueeze(1).to(xq_.device) -def deepseek_v2_attn_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value=None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - # modified from https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/blob/main/modeling_deepseek.py#L806 - def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) + xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).type_as(xq) + xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).type_as(xk) + return xq_out, xk_out - def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_pe, k_pe = apply_rotary_emb(q_pe, k_pe, position_embeddings.to(q_pe.device)) + cache_kwargs = {"cache_position": cache_position} - b, h, s, d = q.shape - q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, 
s, d) + else: + raise ValueError(f"Unsupported DeepSeek version: {version}") - b, h, s, d = k.shape - k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d) + # Use expand+cat instead of new_empty+slice to avoid constant tensors in torchscript + k_pe = k_pe.expand(*k_nope.shape[:-1], -1) + query_states = torch.cat((q_nope, q_pe), dim=-1) + key_states = torch.cat((k_nope, k_pe), dim=-1) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed + kv_cache = past_key_value if past_key_value is not None else past_key_values + if kv_cache is not None: + key_states, value_states = kv_cache.update(key_states, value_states, self.layer_idx, cache_kwargs) - if output_attentions: - return self._orig_forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + is_causal=self.is_causal and attention_mask is None and seq_length > 1, ) - bsz, q_len, _ = hidden_states.shape - - if self.q_lora_rank is None: - q = self.q_proj(hidden_states) - else: - q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) - q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2) - q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) - - compressed_kv = self.kv_a_proj_with_mqa(hidden_states) - compressed_kv, k_pe = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2) - kv = ( - self.kv_b_proj(self.kv_a_layernorm(compressed_kv)) - .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) - .transpose(1, 2) - ) - - k_nope, value_states = torch.split(kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1) - kv_seq_len = value_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids) - - # Difference with original code, k_pe.new_empty create constant tensor in torchscript - query_states = torch.concat([q_nope, q_pe], dim=-1) - # query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) - # query_states[:, :, :, : self.qk_nope_head_dim] = q_nope - # query_states[:, :, :, self.qk_nope_head_dim :] = q_pe - key_states = torch.concat([k_nope, k_pe.expand(-1, self.num_heads, -1, -1)], dim=-1) - # key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) - # key_states[:, :, :, : self.qk_nope_head_dim] = k_nope - # key_states[:, :, :, self.qk_nope_head_dim :] = k_pe - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
-            is_causal=self.is_causal and attention_mask is None and q_len > 1,
-        )
-        attn_output = attn_output.transpose(1, 2).contiguous()
-
-        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
-
-        attn_output = self.o_proj(attn_output)
+        attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, None
 
-    return attn_output, None, past_key_value
+    return deepseek_attn_forward
 
 
 def deepseek_moe_infer(self, x, topk_ids, topk_weight):


From c26ffe8f88344f96a909fc8eeba2c378518f83b9 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Tue, 3 Mar 2026 14:54:57 +0200
Subject: [PATCH 02/39] support gigachat3

---
 optimum/exporters/openvino/model_patcher.py | 140 ++++++++++++--------
 tests/openvino/test_decoder.py              |   2 +-
 tests/openvino/utils_tests.py               |   1 -
 3 files changed, 88 insertions(+), 55 deletions(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 31048a005d..3ff17e85b6 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -3795,91 +3795,125 @@ def __exit__(self, exc_type, exc_value, traceback):
 def make_deepseek_attn_forward(version: int = 3):
     """Return a MLA attention forward function for the given DeepSeek version.
 
-    Both deepseek_v2 and deepseek_v3 share identical MLA attention logic — the
-    only differences are:
-    - v3: ``position_embeddings`` is a ``(cos, sin)`` tuple; RoPE applied via
-      ``apply_rotary_pos_emb(q_rot, k_rot, cos, sin)``; cache stores cos/sin.
-    - v2: ``position_embeddings`` is a complex ``freqs_cis`` tensor; RoPE applied
-      via ``apply_rotary_emb(q_pe, k_pe, freqs_cis)`` (complex multiplication).
+    Args:
+        version: 2 for deepseek_v2 (uses freqs_cis), 3 for deepseek_v3 (uses cos/sin tuple)
     """
+    from typing import Callable
+
+    from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+
+    if version == 3:
+        from transformers.models.deepseek_v3.modeling_deepseek_v3 import (
+            apply_rotary_pos_emb,
+            apply_rotary_pos_emb_interleave,
+            eager_attention_forward,
+        )
+    elif version == 2:
+
+        def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+            batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+            if n_rep == 1:
+                return hidden_states
+            hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+            return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+        def apply_rotary_emb(xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor):
+            xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+            xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+            freqs_cis = freqs_cis.unsqueeze(1).to(xq_.device)
+            xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).type_as(xq)
+            xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).type_as(xk)
+            return xq_out, xk_out
+
+        def eager_attention_forward(module, query, key, value, attention_mask, scaling, dropout=0.0, **kwargs):
+            key = repeat_kv(key, module.num_key_value_groups)
+            value = repeat_kv(value, module.num_key_value_groups)
+            attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
+            if attention_mask is not None:
+                attn_weights = attn_weights + attention_mask
+            attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+            attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+            attn_output = 
torch.matmul(attn_weights, value) + return attn_output.transpose(1, 2).contiguous(), attn_weights + else: + raise ValueError(f"Unsupported DeepSeek version: {version}") def deepseek_attn_forward( self, hidden_states: torch.Tensor, - position_embeddings=None, - attention_mask: Optional[torch.Tensor] = None, - past_key_value=None, + position_embeddings, + attention_mask: Optional[torch.Tensor], + past_key_value: Optional[Cache] = None, past_key_values=None, cache_position: Optional[torch.LongTensor] = None, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: batch_size, seq_length = hidden_states.shape[:-1] - query_shape = (batch_size, seq_length, -1, self.qk_head_dim) - key_shape = (batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim) if self.q_lora_rank is None: - q = self.q_proj(hidden_states) + q_states = self.q_proj(hidden_states) else: - q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) - q = q.view(query_shape).transpose(1, 2) - q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + q_states = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) + q_states = q_states.view(batch_size, seq_length, -1, self.qk_head_dim).transpose(1, 2) + q_pass, q_rot = torch.split(q_states, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) compressed_kv = self.kv_a_proj_with_mqa(hidden_states) - k_nope, k_pe = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - k_nope = self.kv_b_proj(self.kv_a_layernorm(k_nope)).view(key_shape).transpose(1, 2) - k_nope, value_states = torch.split(k_nope, [self.qk_nope_head_dim, self.v_head_dim], dim=-1) - k_pe = k_pe.view(batch_size, 1, seq_length, self.qk_rope_head_dim) + k_pass, k_rot = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + k_pass = self.kv_b_proj(self.kv_a_layernorm(k_pass)) + k_pass = k_pass.view(batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim).transpose(1, 2) + k_pass, value_states = torch.split(k_pass, [self.qk_nope_head_dim, self.v_head_dim], dim=-1) + k_rot = k_rot.view(batch_size, 1, seq_length, self.qk_rope_head_dim) if version == 3: - from transformers.models.deepseek_v3.modeling_deepseek_v3 import apply_rotary_pos_emb - cos, sin = position_embeddings - q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin) + if self.config.rope_interleave: + q_rot, k_rot = apply_rotary_pos_emb_interleave(q_rot, k_rot, cos, sin) + else: + q_rot, k_rot = apply_rotary_pos_emb(q_rot, k_rot, cos, sin) cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - elif version == 2: - - def apply_rotary_emb( - xq: torch.Tensor, - xk: torch.Tensor, - freqs_cis: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) - xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) - - # Broadcast to [1, 1, seq_len, dim // 2] - freqs_cis = freqs_cis.unsqueeze(1).to(xq_.device) - - xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).type_as(xq) - xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).type_as(xk) - return xq_out, xk_out - - q_pe, k_pe = apply_rotary_emb(q_pe, k_pe, position_embeddings.to(q_pe.device)) - cache_kwargs = {"cache_position": cache_position} - + kv_cache = past_key_value else: - raise ValueError(f"Unsupported DeepSeek version: {version}") + q_rot, k_rot = apply_rotary_emb(q_rot, k_rot, position_embeddings.to(q_rot.device)) + cache_kwargs = {"cache_position": cache_position} + 
kv_cache = past_key_values - # Use expand+cat instead of new_empty+slice to avoid constant tensors in torchscript - k_pe = k_pe.expand(*k_nope.shape[:-1], -1) - query_states = torch.cat((q_nope, q_pe), dim=-1) - key_states = torch.cat((k_nope, k_pe), dim=-1) + k_rot = k_rot.expand(*k_pass.shape[:-1], -1) + query_states = torch.cat((q_pass, q_rot), dim=-1) + key_states = torch.cat((k_pass, k_rot), dim=-1) - kv_cache = past_key_value if past_key_value is not None else past_key_values if kv_cache is not None: key_states, value_states = kv_cache.update(key_states, value_states, self.layer_idx, cache_kwargs) - attn_output = torch.nn.functional.scaled_dot_product_attention( + is_flash_attn = "flash" in self.config._attn_implementation + if is_flash_attn and self.qk_head_dim != self.v_head_dim: + value_states = F.pad(value_states, [0, self.qk_head_dim - self.v_head_dim]) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + if version == 2: + attention_interface = ALL_ATTENTION_FUNCTIONS.get_interface( + self.config._attn_implementation, eager_attention_forward + ) + else: + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, query_states, key_states, value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - is_causal=self.is_causal and attention_mask is None and seq_length > 1, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, ) + if is_flash_attn and self.qk_head_dim != self.v_head_dim: + attn_output = attn_output[:, :, :, : self.v_head_dim] + attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous() attn_output = self.o_proj(attn_output) - return attn_output, None + return attn_output, attn_weights return deepseek_attn_forward diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 91121023d8..736d7379de 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -383,7 +383,7 @@ def test_compare_to_transformers(self, model_arch): transformers_outputs = transformers_model(**tokens) # Compare tensor outputs - atol = 3e-3 if model_arch in ["minicpm", "qwen2-moe"] else 1e-4 + atol = 3e-2 if model_arch in ["deepseek"] else 3e-3 if model_arch in ["minicpm", "qwen2-moe"] else 1e-4 # quantized models have different logits value range if "awq" not in model_arch and "gptq" not in model_arch: self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, equal_nan=True, atol=atol)) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 230ec88e45..472733b1f3 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -395,7 +395,6 @@ "exaone4", "decilm", "minicpm3", - "deepseek", "qwen3_eagle3", ) From 751cd02b2736ec37e4367a1ca3de54d184bc23bf Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Tue, 3 Mar 2026 23:09:13 +0200 Subject: [PATCH 03/39] add tests & create tiny model --- tests/openvino/test_decoder.py | 11 ++++++++--- tests/openvino/utils_tests.py | 1 + 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 736d7379de..12443d1465 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -117,7 +117,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("glm", 
"mistral-nemo", "phimoe") if is_transformers_version("<", "4.54.0"): - SUPPORTED_ARCHITECTURES += ("deepseek",) + SUPPORTED_ARCHITECTURES += ("deepseek", "gigachat3") # gptq and awq install disabled for windows test environment if platform.system() != "Windows" and is_transformers_version("<", "4.56.0"): @@ -227,6 +227,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "minicpm3": 6, "phimoe": 2, "deepseek": 2, + "gigachat3": 2, "opt_gptq": 12, "mixtral_awq": 2, "gemma3_text": 2, @@ -383,9 +384,13 @@ def test_compare_to_transformers(self, model_arch): transformers_outputs = transformers_model(**tokens) # Compare tensor outputs - atol = 3e-2 if model_arch in ["deepseek"] else 3e-3 if model_arch in ["minicpm", "qwen2-moe"] else 1e-4 + atol = 3e-2 if model_arch in ["deepseek", "gigachat3"] else 3e-3 if model_arch in ["minicpm", "qwen2-moe"] else 1e-4 # quantized models have different logits value range if "awq" not in model_arch and "gptq" not in model_arch: + diff = torch.abs(ov_outputs.logits - transformers_outputs.logits) + print(f"\nMax diff: {diff.max()}, Mean diff: {diff.mean()}, aftol: {atol}") + print(f"OV logits sample: {ov_outputs.logits[0, 0, :5]}") + print(f"TF logits sample: {transformers_outputs.logits[0, 0, :5]}") self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, equal_nan=True, atol=atol)) # Qwen tokenizer does not support padding @@ -410,7 +415,7 @@ def test_compare_to_transformers(self, model_arch): ov_outputs = ov_model.generate(**tokens, generation_config=gen_config) # TODO: add back once https://huggingface.co/katuni4ka/tiny-random-minicpm3/discussions/1 merged (for all models) as current modeling incompatible with transformers >= v4.49 - if model_arch in {"deepseek"} and is_transformers_version(">=", "4.49"): + if model_arch in {"deepseek", "gigachat3"} and is_transformers_version(">=", "4.49"): self.skipTest("Incompatible modeling code") additional_inputs = {} diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 472733b1f3..f7de13a928 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -70,6 +70,7 @@ "deberta-v2": "optimum-intel-internal-testing/tiny-random-DebertaV2Model", "decilm": "optimum-intel-internal-testing/tiny-random-decilm", "deepseek": "optimum-intel-internal-testing/tiny-random-deepseek-v3", + "gigachat3": "mohamedahraf273/tiny-random-gigachat3", "deit": "optimum-intel-internal-testing/tiny-random-DeiTModel", "convnext": "optimum-intel-internal-testing/tiny-random-convnext", "convnextv2": "optimum-intel-internal-testing/tiny-random-ConvNextV2Model", From 0d16b4f41e2bd87f72ded8d066a61e5a3cf417ef Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Wed, 4 Mar 2026 11:15:51 +0200 Subject: [PATCH 04/39] add tests and fix issues --- optimum/exporters/openvino/model_configs.py | 3 +-- optimum/exporters/openvino/model_patcher.py | 1 + tests/openvino/test_decoder.py | 20 +++++++++++--------- tests/openvino/test_export.py | 3 +++ tests/openvino/test_exporters_cli.py | 16 +++++++++++++--- tests/openvino/utils_tests.py | 1 + 6 files changed, 30 insertions(+), 14 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 7ffe158396..d295f9b266 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -4080,8 +4080,7 @@ class M2M100OpenVINOConfig(BartOpenVINOConfig): ) @register_in_tasks_manager("deepseek", *["text-generation", 
"text-generation-with-past"], library_name="transformers") class DeepseekOpenVINOConfig(MiniCPM3OpenVINOConfig): - MIN_TRANSFORMERS_VERSION = "4.46.0" - MAX_TRANSFORMERS_VERSION = "4.53.3" + MIN_TRANSFORMERS_VERSION = "4.53.0" _MODEL_PATCHER = DeepseekPatcher diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 3ff17e85b6..a1e9d111f1 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3835,6 +3835,7 @@ def eager_attention_forward(module, query, key, value, attention_mask, scaling, attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) attn_output = torch.matmul(attn_weights, value) return attn_output.transpose(1, 2).contiguous(), attn_weights + else: raise ValueError(f"Unsupported DeepSeek version: {version}") diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 12443d1465..d3cb6eac72 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -116,13 +116,13 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.46.0"): SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "phimoe") - if is_transformers_version("<", "4.54.0"): - SUPPORTED_ARCHITECTURES += ("deepseek", "gigachat3") - # gptq and awq install disabled for windows test environment if platform.system() != "Windows" and is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("opt_gptq", "mixtral_awq") + if is_transformers_version(">=", "4.53.0"): + SUPPORTED_ARCHITECTURES += ("deepseek", "gigachat3") + if is_transformers_version(">", "4.47"): SUPPORTED_ARCHITECTURES += ("olmo2",) @@ -384,13 +384,15 @@ def test_compare_to_transformers(self, model_arch): transformers_outputs = transformers_model(**tokens) # Compare tensor outputs - atol = 3e-2 if model_arch in ["deepseek", "gigachat3"] else 3e-3 if model_arch in ["minicpm", "qwen2-moe"] else 1e-4 + atol_by_arch = { + "deepseek": 3e-2, + "gigachat3": 3e-2, + "minicpm": 3e-3, + "qwen2-moe": 3e-3, + } + atol = atol_by_arch.get(model_arch, 1e-4) # quantized models have different logits value range if "awq" not in model_arch and "gptq" not in model_arch: - diff = torch.abs(ov_outputs.logits - transformers_outputs.logits) - print(f"\nMax diff: {diff.max()}, Mean diff: {diff.mean()}, aftol: {atol}") - print(f"OV logits sample: {ov_outputs.logits[0, 0, :5]}") - print(f"TF logits sample: {transformers_outputs.logits[0, 0, :5]}") self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, equal_nan=True, atol=atol)) # Qwen tokenizer does not support padding @@ -668,7 +670,7 @@ def test_beam_search(self, model_arch): return # TODO: add back once https://huggingface.co/katuni4ka/tiny-random-minicpm3/discussions/1 merged (for all models) as current modeling incompatible with transformers >= v4.49 - if model_arch in {"deepseek"} and is_transformers_version(">=", "4.49"): + if model_arch in {"deepseek", "gigachat3"}: self.skipTest("Incompatible modeling code") tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in REMOTE_CODE_MODELS) diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index d1c373e2bc..1cb27d2f97 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -98,6 +98,9 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "4.48.0"): SUPPORTED_ARCHITECTURES.update({"cohere2": OVModelForCausalLM}) + if 
is_transformers_version(">=", "4.53.0"): + SUPPORTED_ARCHITECTURES.update({"deepseek": OVModelForCausalLM, "gigachat3": OVModelForCausalLM}) + if is_transformers_version(">=", "4.49"): SUPPORTED_ARCHITECTURES.update({"zamba2": OVModelForCausalLM}) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index edbc01e310..c8b0eec341 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -159,6 +159,13 @@ class OVCLIExportTestCase(unittest.TestCase): ] ) + if is_transformers_version(">=", "4.53.0"): + SUPPORTED_ARCHITECTURES.extend( + [ + ("text-generation-with-past", "gigachat3"), + ] + ) + if is_transformers_version(">=", "4.57.0"): SUPPORTED_ARCHITECTURES.extend( [ @@ -197,6 +204,7 @@ class OVCLIExportTestCase(unittest.TestCase): "exaone4": 2, "bitnet": 2, "granitemoehybrid": 2, + "gigachat3": 2, } TOKENIZER_CHAT_TEMPLATE_TESTS_MODELS = { @@ -299,9 +307,11 @@ class OVCLIExportTestCase(unittest.TestCase): "whisper", "f8e4m3", "--dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code", - {"encoder": 16, "decoder": 26, "decoder_with_past": 23} - if is_transformers_version("<=", "4.45") - else {"encoder": 16, "decoder": 26, "decoder_with_past": 25}, + ( + {"encoder": 16, "decoder": 26, "decoder_with_past": 23} + if is_transformers_version("<=", "4.45") + else {"encoder": 16, "decoder": 26, "decoder_with_past": 25} + ), ( {"encoder": {"f8e4m3": 14}, "decoder": {"f8e4m3": 22}, "decoder_with_past": {"f8e4m3": 17}} if is_transformers_version("<=", "4.45") diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index f7de13a928..35d74ebcb0 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -371,6 +371,7 @@ "lfm2": {"model": 52}, "hunyuan_v1_dense": {"model": 32}, "qwen3_eagle3": {"model": 20}, + "gigachat3": {"model": 58}, } TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg" From f2a1e533759fef1b0c2be059aff1d6151010047f Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Wed, 4 Mar 2026 11:18:38 +0200 Subject: [PATCH 05/39] fix version skip test --- tests/openvino/test_decoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index d3cb6eac72..7a36240a70 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -670,7 +670,7 @@ def test_beam_search(self, model_arch): return # TODO: add back once https://huggingface.co/katuni4ka/tiny-random-minicpm3/discussions/1 merged (for all models) as current modeling incompatible with transformers >= v4.49 - if model_arch in {"deepseek", "gigachat3"}: + if model_arch in {"deepseek"} and is_transformers_version(">=", "4.49"): self.skipTest("Incompatible modeling code") tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in REMOTE_CODE_MODELS) From aac19fb9624be81bdcd62eef2fc7223a0c13d9b2 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Wed, 4 Mar 2026 12:13:11 +0200 Subject: [PATCH 06/39] add docs & modify patcher --- docs/source/openvino/models.mdx | 1 + optimum/exporters/openvino/model_patcher.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index 51200060e8..d62b9e654a 100644 --- a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -61,6 +61,7 @@ Here is the list of the supported architectures : - Falcon - Falcon-Mamba - Flaubert +- 
GigaChat3 - GLM-4 - GLM-Edge - GPT-2 diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index a1e9d111f1..b338930ce0 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3770,11 +3770,12 @@ def __enter__(self): block.self_attn.forward = types.MethodType(self_attn_fwd, block.self_attn) if hasattr(block.mlp, "moe_infer"): block.mlp._org_moe_infer = block.mlp.moe_infer + block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) elif hasattr(block.mlp, "experts"): block.mlp._org_moe_infer = None block.mlp.ep_rank = 0 block.mlp.experts_per_rank = len(block.mlp.experts) - block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) + block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) From 5c134eb352092176a75b87a63194b161d969550c Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Wed, 4 Mar 2026 12:15:31 +0200 Subject: [PATCH 07/39] modify patcher --- optimum/exporters/openvino/model_patcher.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index b338930ce0..3cb2f0f553 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3794,11 +3794,6 @@ def __exit__(self, exc_type, exc_value, traceback): def make_deepseek_attn_forward(version: int = 3): - """Return a MLA attention forward function for the given DeepSeek version. - - Args: - version: 2 for deepseek_v2 (uses freqs_cis), 3 for deepseek_v3 (uses cos/sin tuple) - """ from typing import Callable from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS From f231bca4d53adc663611064dcceda19be8d2fbed Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Wed, 4 Mar 2026 12:22:05 +0200 Subject: [PATCH 08/39] modify patcher --- optimum/exporters/openvino/model_patcher.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 3cb2f0f553..dbc17c49d2 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3769,10 +3769,10 @@ def __enter__(self): block.self_attn._orig_forward = block.self_attn.forward block.self_attn.forward = types.MethodType(self_attn_fwd, block.self_attn) if hasattr(block.mlp, "moe_infer"): - block.mlp._org_moe_infer = block.mlp.moe_infer + block.mlp._orig_moe_infer = block.mlp.moe_infer block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) elif hasattr(block.mlp, "experts"): - block.mlp._org_moe_infer = None + block.mlp._orig_moe_infer = None block.mlp.ep_rank = 0 block.mlp.experts_per_rank = len(block.mlp.experts) block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) @@ -3782,9 +3782,9 @@ def __exit__(self, exc_type, exc_value, traceback): for block in self._model.model.layers: if hasattr(block.self_attn, "_orig_forward"): block.self_attn.forward = block.self_attn._orig_forward - if hasattr(block.mlp, "_org_moe_infer"): - if block.mlp._org_moe_infer is not None: - block.mlp.moe_infer = block.mlp._org_moe_infer + if hasattr(block.mlp, "_orig_moe_infer"): + if block.mlp._orig_moe_infer is not None: + block.mlp.moe_infer = block.mlp._orig_moe_infer else: delattr(block.mlp, "moe_infer") if hasattr(block.mlp, "ep_rank"): From 
8f18ff5f4dda72fd2d73c3032262d42db50f4b98 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Thu, 5 Mar 2026 15:18:14 +0200 Subject: [PATCH 09/39] fix issues --- optimum/exporters/openvino/convert.py | 20 ++++++++++ optimum/exporters/openvino/model_configs.py | 1 + optimum/exporters/openvino/model_patcher.py | 44 ++++++++++++++++++++- tests/openvino/test_decoder.py | 23 ++++------- tests/openvino/utils_tests.py | 8 ++-- 5 files changed, 75 insertions(+), 21 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index eda3c7e01b..20f3fe12eb 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -716,7 +716,27 @@ def export_from_model( generation_config = getattr(model, "generation_config", None) if generation_config is not None: try: + # Preserve the original `transformers_version` from the source model's generation_config.json. + # Starting in transformers 4.50, _prepare_generation_config() applies model-default generation + # parameters (do_sample, temperature, top_p, …) when the user-provided GenerationConfig uses + # the global default value for those fields AND the stored `transformers_version` is >= 4.50. + # Exporting bumps the version to the current transformers release, which causes user-supplied + # params (e.g. do_sample=False) to be silently overridden by the model defaults at inference + # time. Preserving the original version keeps the OV model consistent with the PT original. + orig_transformers_version = getattr(generation_config, "transformers_version", None) generation_config.save_pretrained(output) + if orig_transformers_version is not None: + import json as _json + from pathlib import Path as _Path + + gen_cfg_path = _Path(output) / "generation_config.json" + if gen_cfg_path.exists(): + with open(gen_cfg_path, "r", encoding="utf-8") as _f: + _cfg = _json.load(_f) + if _cfg.get("transformers_version") != orig_transformers_version: + _cfg["transformers_version"] = orig_transformers_version + with open(gen_cfg_path, "w", encoding="utf-8") as _f: + _json.dump(_cfg, _f, indent=2) except Exception as exception: logger.warning( f"The generation config will not be saved, saving failed with following error:\n{exception}" diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index d295f9b266..de4916c7a4 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -4081,6 +4081,7 @@ class M2M100OpenVINOConfig(BartOpenVINOConfig): @register_in_tasks_manager("deepseek", *["text-generation", "text-generation-with-past"], library_name="transformers") class DeepseekOpenVINOConfig(MiniCPM3OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.53.0" + MAX_TRANSFORMERS_VERSION = None _MODEL_PATCHER = DeepseekPatcher diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index dbc17c49d2..40c8c606da 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3769,10 +3769,21 @@ def __enter__(self): block.self_attn._orig_forward = block.self_attn.forward block.self_attn.forward = types.MethodType(self_attn_fwd, block.self_attn) if hasattr(block.mlp, "moe_infer"): + # old interface (transformers < 4.57): moe_infer(self, x, topk_ids, topk_weight) block.mlp._orig_moe_infer = block.mlp.moe_infer + block.mlp._orig_moe = None + block.mlp.ep_rank = getattr(block.mlp, "ep_rank", 0) + block.mlp.experts_per_rank = 
getattr(block.mlp, "experts_per_rank", len(block.mlp.experts)) block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) + elif hasattr(block.mlp, "moe") and hasattr(block.mlp, "experts"): + # new interface (transformers >= 4.57): moe(self, hidden_states, topk_indices, topk_weights) + block.mlp._orig_moe = block.mlp.moe + block.mlp._orig_moe_infer = None + block.mlp.moe = types.MethodType(deepseek_moe, block.mlp) elif hasattr(block.mlp, "experts"): + # fallback: patch by injecting moe_infer with required attributes block.mlp._orig_moe_infer = None + block.mlp._orig_moe = None block.mlp.ep_rank = 0 block.mlp.experts_per_rank = len(block.mlp.experts) block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) @@ -3782,15 +3793,21 @@ def __exit__(self, exc_type, exc_value, traceback): for block in self._model.model.layers: if hasattr(block.self_attn, "_orig_forward"): block.self_attn.forward = block.self_attn._orig_forward + if hasattr(block.mlp, "_orig_moe"): + if block.mlp._orig_moe is not None: + block.mlp.moe = block.mlp._orig_moe + delattr(block.mlp, "_orig_moe") if hasattr(block.mlp, "_orig_moe_infer"): if block.mlp._orig_moe_infer is not None: block.mlp.moe_infer = block.mlp._orig_moe_infer else: - delattr(block.mlp, "moe_infer") + if hasattr(block.mlp, "moe_infer"): + delattr(block.mlp, "moe_infer") if hasattr(block.mlp, "ep_rank"): delattr(block.mlp, "ep_rank") if hasattr(block.mlp, "experts_per_rank"): delattr(block.mlp, "experts_per_rank") + delattr(block.mlp, "_orig_moe_infer") def make_deepseek_attn_forward(version: int = 3): @@ -3868,7 +3885,7 @@ def deepseek_attn_forward( else: q_rot, k_rot = apply_rotary_pos_emb(q_rot, k_rot, cos, sin) cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - kv_cache = past_key_value + kv_cache = past_key_values if past_key_values is not None else past_key_value else: q_rot, k_rot = apply_rotary_emb(q_rot, k_rot, position_embeddings.to(q_rot.device)) cache_kwargs = {"cache_position": cache_position} @@ -3949,6 +3966,29 @@ def deepseek_moe_infer(self, x, topk_ids, topk_weight): return final_out +def deepseek_moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor): + """ + Replacement for DeepseekV3MoE.moe (transformers >= 4.57). + The original skips experts with no tokens (data-dependent control flow that breaks tracing). + This version unconditionally runs all experts to produce a traceable static graph. 
+ """ + final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype) + expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=len(self.experts)) + expert_mask = expert_mask.permute(2, 0, 1) + + for expert_idx in range(len(self.experts)): + expert = self.experts[expert_idx] + mask = expert_mask[expert_idx] + token_indices, weight_indices = torch.where(mask) + expert_weights = topk_weights[token_indices, weight_indices] + expert_input = hidden_states[token_indices] + expert_output = expert(expert_input) + weighted_output = expert_output * expert_weights.unsqueeze(-1) + final_hidden_states.index_add_(0, token_indices, weighted_output) + + return final_hidden_states.type(hidden_states.dtype) + + class Qwen2VLLanguageModelPatcher(OVDecoderModelPatcher): def __init__( self, diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 7a36240a70..2110c1b191 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -24,7 +24,6 @@ from optimum.exporters.openvino.model_configs import ( BitnetOpenVINOConfig, - DeepseekOpenVINOConfig, LFM2OpenVINOConfig, Qwen3VLOpenVINOConfig, ) @@ -287,11 +286,6 @@ def test_find_untested_architectures(self): if "llama4_text" in supported_architectures: supported_architectures.remove("llama4_text") - if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MAX_TRANSFORMERS_VERSION)): - if "deepseek_v2" in supported_architectures: - supported_architectures.remove("deepseek_v2") - if "deepseek_v3" in supported_architectures: - supported_architectures.remove("deepseek_v3") if is_transformers_version("<", str(BitnetOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"bitnet"} if is_transformers_version("<", str(LFM2OpenVINOConfig.MIN_TRANSFORMERS_VERSION)): @@ -385,10 +379,9 @@ def test_compare_to_transformers(self, model_arch): # Compare tensor outputs atol_by_arch = { - "deepseek": 3e-2, - "gigachat3": 3e-2, "minicpm": 3e-3, "qwen2-moe": 3e-3, + "gigachat3": 3e-3, } atol = atol_by_arch.get(model_arch, 1e-4) # quantized models have different logits value range @@ -400,6 +393,12 @@ def test_compare_to_transformers(self, model_arch): return tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + + # Gigachat3 tokenizer add token_type_ids which DeepSeekV3 + # and similar models do not accept in generate(); strip it so both OV and PT calls succeed. 
+ if model_arch in ["gigachat3"]: + tokens.pop("token_type_ids", None) + ov_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None ov_model.config.eos_token_id = None @@ -416,10 +415,6 @@ def test_compare_to_transformers(self, model_arch): ov_outputs = ov_model.generate(**tokens, generation_config=gen_config) - # TODO: add back once https://huggingface.co/katuni4ka/tiny-random-minicpm3/discussions/1 merged (for all models) as current modeling incompatible with transformers >= v4.49 - if model_arch in {"deepseek", "gigachat3"} and is_transformers_version(">=", "4.49"): - self.skipTest("Incompatible modeling code") - additional_inputs = {} # gemma2 does not support dynamic cache, it is unfair to compare dynamic cache result vs hybrid cache, # align cache representation in torch model @@ -669,10 +664,6 @@ def test_beam_search(self, model_arch): if model_arch in ["lfm2", "granitemoehybrid"]: return - # TODO: add back once https://huggingface.co/katuni4ka/tiny-random-minicpm3/discussions/1 merged (for all models) as current modeling incompatible with transformers >= v4.49 - if model_arch in {"deepseek"} and is_transformers_version(">=", "4.49"): - self.skipTest("Incompatible modeling code") - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in REMOTE_CODE_MODELS) if model_arch == "persimmon": tokenizer.pad_token_id = tokenizer.bos_token_id diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 35d74ebcb0..4c2dbf1065 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -70,7 +70,7 @@ "deberta-v2": "optimum-intel-internal-testing/tiny-random-DebertaV2Model", "decilm": "optimum-intel-internal-testing/tiny-random-decilm", "deepseek": "optimum-intel-internal-testing/tiny-random-deepseek-v3", - "gigachat3": "mohamedahraf273/tiny-random-gigachat3", + "gigachat3": "optimum-intel-internal-testing/tiny-random-gigachat3", "deit": "optimum-intel-internal-testing/tiny-random-DeiTModel", "convnext": "optimum-intel-internal-testing/tiny-random-convnext", "convnextv2": "optimum-intel-internal-testing/tiny-random-ConvNextV2Model", @@ -551,8 +551,10 @@ def get_supported_model_for_library(library_name): if supported_model_type[model_type].get("openvino"): export_config = next(iter(supported_model_type[model_type]["openvino"].values())) - min_transformers = str(getattr(export_config.func, "MIN_TRANSFORMERS_VERSION", "0")) - max_transformers = str(getattr(export_config.func, "MAX_TRANSFORMERS_VERSION", "999")) + raw_min = getattr(export_config.func, "MIN_TRANSFORMERS_VERSION", None) + raw_max = getattr(export_config.func, "MAX_TRANSFORMERS_VERSION", None) + min_transformers = str(raw_min) if raw_min is not None else "0" + max_transformers = str(raw_max) if raw_max is not None else "999" if is_transformers_version(">=", min_transformers) and is_transformers_version("<=", max_transformers): valid_model.add(model_type) From 5049ce373aebfeedff0f12cc4be200340ca4eef9 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Thu, 5 Mar 2026 15:56:48 +0200 Subject: [PATCH 10/39] update test --- tests/openvino/test_decoder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 2110c1b191..4e2d12f56e 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -381,7 +381,6 @@ def test_compare_to_transformers(self, model_arch): atol_by_arch = { "minicpm": 3e-3, "qwen2-moe": 3e-3, - "gigachat3": 3e-3, } atol = 
atol_by_arch.get(model_arch, 1e-4) # quantized models have different logits value range From 722dc5334302cd3678ff4400bad6f74a71000845 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Fri, 6 Mar 2026 10:33:17 +0200 Subject: [PATCH 11/39] update tests --- tests/openvino/test_decoder.py | 6 +----- tests/openvino/test_exporters_cli.py | 8 +++----- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 4e2d12f56e..ee7ee6c1ea 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -378,11 +378,7 @@ def test_compare_to_transformers(self, model_arch): transformers_outputs = transformers_model(**tokens) # Compare tensor outputs - atol_by_arch = { - "minicpm": 3e-3, - "qwen2-moe": 3e-3, - } - atol = atol_by_arch.get(model_arch, 1e-4) + atol = 3e-3 if model_arch in ["minicpm", "qwen2-moe"] else 1e-4 # quantized models have different logits value range if "awq" not in model_arch and "gptq" not in model_arch: self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, equal_nan=True, atol=atol)) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index c8b0eec341..b14b4fe40a 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -307,11 +307,9 @@ class OVCLIExportTestCase(unittest.TestCase): "whisper", "f8e4m3", "--dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code", - ( - {"encoder": 16, "decoder": 26, "decoder_with_past": 23} - if is_transformers_version("<=", "4.45") - else {"encoder": 16, "decoder": 26, "decoder_with_past": 25} - ), + {"encoder": 16, "decoder": 26, "decoder_with_past": 23} + if is_transformers_version("<=", "4.45") + else {"encoder": 16, "decoder": 26, "decoder_with_past": 25}, ( {"encoder": {"f8e4m3": 14}, "decoder": {"f8e4m3": 22}, "decoder_with_past": {"f8e4m3": 17}} if is_transformers_version("<=", "4.45") From 28a6330daa3f49529bb9fa3bf553f5bc644968cf Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Fri, 6 Mar 2026 11:57:04 +0200 Subject: [PATCH 12/39] fix test issue --- tests/openvino/test_decoder.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index ee7ee6c1ea..2f47694b5f 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -24,6 +24,7 @@ from optimum.exporters.openvino.model_configs import ( BitnetOpenVINOConfig, + DeepseekOpenVINOConfig, LFM2OpenVINOConfig, Qwen3VLOpenVINOConfig, ) @@ -290,6 +291,9 @@ def test_find_untested_architectures(self): supported_architectures -= {"bitnet"} if is_transformers_version("<", str(LFM2OpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"lfm2"} + # deepseek_v2 and deepseek_v3 are aliases of the same architecture tested under "deepseek" + if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): + supported_architectures -= {"deepseek_v2", "deepseek_v3"} # qwen3_vl_text a part of qwen3_vl architecture and is tested in seq2seq group if is_transformers_version(">=", str(Qwen3VLOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): From 07efafd24bda7a2e076e5ce59430543861199e46 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Fri, 6 Mar 2026 12:00:37 +0200 Subject: [PATCH 13/39] fix test issue --- tests/openvino/test_decoder.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/openvino/test_decoder.py 
From 07efafd24bda7a2e076e5ce59430543861199e46 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Fri, 6 Mar 2026 12:00:37 +0200
Subject: [PATCH 13/39] fix test issue

---
 tests/openvino/test_decoder.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py
index 2f47694b5f..c19ef2ade9 100644
--- a/tests/openvino/test_decoder.py
+++ b/tests/openvino/test_decoder.py
@@ -287,13 +287,15 @@ def test_find_untested_architectures(self):
         if "llama4_text" in supported_architectures:
             supported_architectures.remove("llama4_text")
 
+        if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MAX_TRANSFORMERS_VERSION)):
+            if "deepseek_v2" in supported_architectures:
+                supported_architectures.remove("deepseek_v2")
+            if "deepseek_v3" in supported_architectures:
+                supported_architectures.remove("deepseek_v3")
         if is_transformers_version("<", str(BitnetOpenVINOConfig.MIN_TRANSFORMERS_VERSION)):
             supported_architectures -= {"bitnet"}
         if is_transformers_version("<", str(LFM2OpenVINOConfig.MIN_TRANSFORMERS_VERSION)):
             supported_architectures -= {"lfm2"}
-        # deepseek_v2 and deepseek_v3 are aliases of the same architecture tested under "deepseek"
-        if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MIN_TRANSFORMERS_VERSION)):
-            supported_architectures -= {"deepseek_v2", "deepseek_v3"}
 
         # qwen3_vl_text a part of qwen3_vl architecture and is tested in seq2seq group
         if is_transformers_version(">=", str(Qwen3VLOpenVINOConfig.MIN_TRANSFORMERS_VERSION)):

From c52c62a5e8f04cd80ef8c50b019cd587564631b0 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Mon, 9 Mar 2026 13:22:01 +0200
Subject: [PATCH 14/39] fix tests

---
 tests/openvino/test_decoder.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py
index c19ef2ade9..74e0673dbb 100644
--- a/tests/openvino/test_decoder.py
+++ b/tests/openvino/test_decoder.py
@@ -287,11 +287,8 @@ def test_find_untested_architectures(self):
         if "llama4_text" in supported_architectures:
             supported_architectures.remove("llama4_text")
 
-        if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MAX_TRANSFORMERS_VERSION)):
-            if "deepseek_v2" in supported_architectures:
-                supported_architectures.remove("deepseek_v2")
-            if "deepseek_v3" in supported_architectures:
-                supported_architectures.remove("deepseek_v3")
+        if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MIN_TRANSFORMERS_VERSION)):
+            supported_architectures -= {"deepseek_v2", "deepseek_v3"}
         if is_transformers_version("<", str(BitnetOpenVINOConfig.MIN_TRANSFORMERS_VERSION)):
             supported_architectures -= {"bitnet"}
         if is_transformers_version("<", str(LFM2OpenVINOConfig.MIN_TRANSFORMERS_VERSION)):
             supported_architectures -= {"lfm2"}

From a63a52d9f457d8b5beeb9950ad5ef91d02dc7abf Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Mon, 9 Mar 2026 15:26:59 +0200
Subject: [PATCH 15/39] fix conflict

---
 tests/openvino/utils_tests.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index 4c2dbf1065..455baab74d 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -173,6 +173,7 @@
     "qwen3": "optimum-intel-internal-testing/tiny-random-qwen3",
     "qwen3_moe": "optimum-intel-internal-testing/tiny-random-qwen3moe",
     "qwen3_vl": "optimum-intel-internal-testing/tiny-random-qwen3-vl",
+    "qwen3_next": "optimum-intel-internal-testing/tiny-random-qwen3-next",
     "rembert": "optimum-intel-internal-testing/tiny-random-rembert",
     "resnet": "optimum-intel-internal-testing/tiny-random-resnet",
     "roberta": "optimum-intel-internal-testing/tiny-random-roberta",
@@ -372,6 +373,7 @@
     "hunyuan_v1_dense": {"model": 32},
     "qwen3_eagle3": {"model": 20},
     "gigachat3": {"model": 58},
+    "qwen3_next": {"model": 100},
 }
 
 TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg"

From f8bdfe5591d9c35ed61cd526dfca45fc8397ee24 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Mon, 9 Mar 2026 15:38:49 +0200
Subject: [PATCH 16/39] fix conflict

---
 tests/openvino/utils_tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index 455baab74d..f53bd364e1 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -372,8 +372,8 @@
     "lfm2": {"model": 52},
     "hunyuan_v1_dense": {"model": 32},
     "qwen3_eagle3": {"model": 20},
-    "gigachat3": {"model": 58},
     "qwen3_next": {"model": 100},
+    "gigachat3": {"model": 58},
 }
 
 TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg"

From 5b32d32522dde091d73b20f697542e4e35c8a6a9 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Fri, 13 Mar 2026 16:47:36 +0200
Subject: [PATCH 17/39] revert convert.py changes

---
 optimum/exporters/openvino/convert.py | 20 --------------------
 tests/openvino/test_decoder.py        |  3 +++
 2 files changed, 3 insertions(+), 20 deletions(-)

diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index b50b990ab7..60d90f53e0 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -721,27 +721,7 @@ def export_from_model(
     generation_config = getattr(model, "generation_config", None)
     if generation_config is not None:
         try:
-            # Preserve the original `transformers_version` from the source model's generation_config.json.
-            # Starting in transformers 4.50, _prepare_generation_config() applies model-default generation
-            # parameters (do_sample, temperature, top_p, …) when the user-provided GenerationConfig uses
-            # the global default value for those fields AND the stored `transformers_version` is >= 4.50.
-            # Exporting bumps the version to the current transformers release, which causes user-supplied
-            # params (e.g. do_sample=False) to be silently overridden by the model defaults at inference
-            # time. Preserving the original version keeps the OV model consistent with the PT original.
-            orig_transformers_version = getattr(generation_config, "transformers_version", None)
             generation_config.save_pretrained(output)
-            if orig_transformers_version is not None:
-                import json as _json
-                from pathlib import Path as _Path
-
-                gen_cfg_path = _Path(output) / "generation_config.json"
-                if gen_cfg_path.exists():
-                    with open(gen_cfg_path, "r", encoding="utf-8") as _f:
-                        _cfg = _json.load(_f)
-                    if _cfg.get("transformers_version") != orig_transformers_version:
-                        _cfg["transformers_version"] = orig_transformers_version
-                        with open(gen_cfg_path, "w", encoding="utf-8") as _f:
-                            _json.dump(_cfg, _f, indent=2)
         except Exception as exception:
             logger.warning(
                 f"The generation config will not be saved, saving failed with following error:\n{exception}"
             )

diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py
index a393782973..d998bd3e40 100644
--- a/tests/openvino/test_decoder.py
+++ b/tests/openvino/test_decoder.py
@@ -401,6 +401,9 @@ def test_compare_to_transformers(self, model_arch):
         if model_arch in ["gigachat3"]:
             tokens.pop("token_type_ids", None)
 
+        if model_arch == "deepseek":
+            gen_config.do_sample = False
+
         ov_model.generation_config.eos_token_id = None
         transformers_model.generation_config.eos_token_id = None
         ov_model.config.eos_token_id = None
From 04b4d9f3ee1a49843c9a3860a2bef3036e5d8c17 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Fri, 13 Mar 2026 16:53:41 +0200
Subject: [PATCH 18/39] revert convert.py changes

---
 tests/openvino/test_decoder.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py
index d998bd3e40..653f831cb0 100644
--- a/tests/openvino/test_decoder.py
+++ b/tests/openvino/test_decoder.py
@@ -401,8 +401,6 @@ def test_compare_to_transformers(self, model_arch):
         if model_arch in ["gigachat3"]:
             tokens.pop("token_type_ids", None)
 
-        if model_arch == "deepseek":
-            gen_config.do_sample = False
 
         ov_model.generation_config.eos_token_id = None
         transformers_model.generation_config.eos_token_id = None
@@ -418,6 +416,9 @@ def test_compare_to_transformers(self, model_arch):
             do_sample=False,
         )
 
+        if model_arch == "deepseek":
+            gen_config.do_sample = False
+
         ov_outputs = ov_model.generate(**tokens, generation_config=gen_config)
 
         additional_inputs = {}
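For context on why these fixes insist on do_sample=False: with sampling enabled, identical logits from the OpenVINO and PyTorch models can still decode to different tokens, so token-level comparisons are only meaningful under greedy decoding. A toy demonstration:

    import torch

    logits = torch.tensor([[2.0, 1.9, 0.1]])
    greedy = logits.argmax(-1)                          # deterministic: token 0
    sampled = torch.multinomial(logits.softmax(-1), 1)  # varies run to run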
From c0ba5d0b77a2cf417f3f8078fa738499fcbe90fb Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Fri, 13 Mar 2026 17:08:25 +0200
Subject: [PATCH 19/39] revert convert.py changes

---
 tests/openvino/test_decoder.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py
index 653f831cb0..c5d26c8d4a 100644
--- a/tests/openvino/test_decoder.py
+++ b/tests/openvino/test_decoder.py
@@ -401,6 +401,9 @@ def test_compare_to_transformers(self, model_arch):
         if model_arch in ["gigachat3"]:
             tokens.pop("token_type_ids", None)
 
+        if model_arch == "deepseek":
+            ov_model.generation_config.do_sample = False
+            transformers_model.generation_config.do_sample = False
 
         ov_model.generation_config.eos_token_id = None
         transformers_model.generation_config.eos_token_id = None
@@ -416,9 +419,6 @@ def test_compare_to_transformers(self, model_arch):
             do_sample=False,
         )
 
-        if model_arch == "deepseek":
-            gen_config.do_sample = False
-
         ov_outputs = ov_model.generate(**tokens, generation_config=gen_config)
 
         additional_inputs = {}
@@ -778,8 +778,10 @@ def test_beam_search(self, model_arch):
         ov_model_stateless.config.eos_token_id = None
         transformers_model.config.eos_token_id = None
 
-        if is_transformers_version(">=", "4.51"):
-            additional_inputs["use_model_defaults"] = False
+        # For deepseek, sampling is enabled by default, but we need to disable it for the test
+        if model_arch == "deepseek":
+            ov_model.generation_config.do_sample = False
+            transformers_model.generation_config.do_sample = False
 
         for gen_config in gen_configs:
             if gen_config.do_sample and model_arch in ["baichuan2-13b", "olmo", "zamba2"]:

From 63a956d049303b99481d595bc2bfbfbd98cee712 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Fri, 13 Mar 2026 17:34:51 +0200
Subject: [PATCH 20/39] revert convert.py changes

---
 tests/openvino/test_decoder.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py
index c5d26c8d4a..1306ce2b47 100644
--- a/tests/openvino/test_decoder.py
+++ b/tests/openvino/test_decoder.py
@@ -778,10 +778,8 @@ def test_beam_search(self, model_arch):
         ov_model_stateless.config.eos_token_id = None
         transformers_model.config.eos_token_id = None
 
-        # For deepseek, sampling is enabled by default, but we need to disable it for the test
-        if model_arch == "deepseek":
-            ov_model.generation_config.do_sample = False
-            transformers_model.generation_config.do_sample = False
+        if is_transformers_version(">=", "4.51"):
+        additional_inputs["use_model_defaults"] = False
 
         for gen_config in gen_configs:
             if gen_config.do_sample and model_arch in ["baichuan2-13b", "olmo", "zamba2"]:

From acd8148f2134a3548c002aef289f7be1075bf5f2 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Fri, 13 Mar 2026 17:35:31 +0200
Subject: [PATCH 21/39] revert convert.py changes

---
 tests/openvino/test_decoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py
index 1306ce2b47..89bcacf9a4 100644
--- a/tests/openvino/test_decoder.py
+++ b/tests/openvino/test_decoder.py
@@ -779,7 +779,7 @@ def test_beam_search(self, model_arch):
         ov_model_stateless.config.eos_token_id = None
         transformers_model.config.eos_token_id = None
 
         if is_transformers_version(">=", "4.51"):
-        additional_inputs["use_model_defaults"] = False
+            additional_inputs["use_model_defaults"] = False
 
         for gen_config in gen_configs:
             if gen_config.do_sample and model_arch in ["baichuan2-13b", "olmo", "zamba2"]:
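Patch 22 below replaces the data-dependent expert loop in deepseek_moe (torch.where over a one-hot mask) with a dense formulation that traces to a static graph. The trick, reduced to toy shapes as a standalone sketch (all names here are illustrative, not the patched module's):

    import torch

    tokens, hidden, experts, topk = 4, 8, 3, 2
    x = torch.randn(tokens, hidden)
    topk_ids = torch.randint(0, experts, (tokens, topk))
    topk_w = torch.rand(tokens, topk)

    routing = torch.zeros(tokens, experts)
    routing.scatter_(1, topk_ids, topk_w)               # dense (tokens, experts) weights

    w = torch.randn(experts, hidden, hidden)            # stand-in for stacked expert weights
    h = x.unsqueeze(0).expand(experts, tokens, hidden)  # every expert sees every token
    y = torch.bmm(h, w)                                 # (experts, tokens, hidden)
    out = (y * routing.t().unsqueeze(-1)).sum(dim=0)    # weight and reduce over experts

Every expert runs on every token, so the graph shape no longer depends on the routing decisions, which is what the tracer needs.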
From dbc143293bf9e2ef8c1138b66a615c92cc3c3a4a Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Sun, 15 Mar 2026 15:56:58 +0200
Subject: [PATCH 22/39] update deepseek's patcher

---
 optimum/exporters/openvino/model_patcher.py | 70 ++++++++++++++++-----
 1 file changed, 55 insertions(+), 15 deletions(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index dc600868cf..6d5e1f19c8 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -3781,6 +3781,34 @@ def __enter__(self):
                 # new interface (transformers >= 4.57): moe(self, hidden_states, topk_indices, topk_weights)
                 block.mlp._orig_moe = block.mlp.moe
                 block.mlp._orig_moe_infer = None
+                num_experts = len(block.mlp.experts)
+                block.mlp.gate_projs = (
+                    torch.concat(
+                        tuple(block.mlp.experts[i].gate_proj.weight.unsqueeze(0) for i in range(num_experts)),
+                        dim=0,
+                    )
+                    .transpose(1, 2)
+                    .float()
+                )
+
+                block.mlp.up_projs = (
+                    torch.concat(
+                        tuple(block.mlp.experts[i].up_proj.weight.unsqueeze(0) for i in range(num_experts)),
+                        dim=0,
+                    )
+                    .transpose(1, 2)
+                    .float()
+                )
+
+                block.mlp.down_projs = (
+                    torch.concat(
+                        tuple(block.mlp.experts[i].down_proj.weight.unsqueeze(0) for i in range(num_experts)),
+                        dim=0,
+                    )
+                    .transpose(1, 2)
+                    .float()
+                )
+
                 block.mlp.moe = types.MethodType(deepseek_moe, block.mlp)
             elif hasattr(block.mlp, "experts"):
                 # fallback: patch by injecting moe_infer with required attributes
@@ -3798,6 +3826,12 @@ def __exit__(self, exc_type, exc_value, traceback):
             if hasattr(block.mlp, "_orig_moe"):
                 if block.mlp._orig_moe is not None:
                     block.mlp.moe = block.mlp._orig_moe
+                    if hasattr(block.mlp, "gate_projs"):
+                        del block.mlp.gate_projs
+                    if hasattr(block.mlp, "up_projs"):
+                        del block.mlp.up_projs
+                    if hasattr(block.mlp, "down_projs"):
+                        del block.mlp.down_projs
                 delattr(block.mlp, "_orig_moe")
             if hasattr(block.mlp, "_orig_moe_infer"):
                 if block.mlp._orig_moe_infer is not None:
@@ -3974,21 +4008,27 @@ def deepseek_moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor,
     The original skips experts with no tokens (data-dependent control flow that breaks tracing).
     This version unconditionally runs all experts to produce a traceable static graph.
     """
-    final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype)
-    expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=len(self.experts))
-    expert_mask = expert_mask.permute(2, 0, 1)
-
-    for expert_idx in range(len(self.experts)):
-        expert = self.experts[expert_idx]
-        mask = expert_mask[expert_idx]
-        token_indices, weight_indices = torch.where(mask)
-        expert_weights = topk_weights[token_indices, weight_indices]
-        expert_input = hidden_states[token_indices]
-        expert_output = expert(expert_input)
-        weighted_output = expert_output * expert_weights.unsqueeze(-1)
-        final_hidden_states.index_add_(0, token_indices, weighted_output)
-
-    return final_hidden_states.type(hidden_states.dtype)
+    num_experts = len(self.experts)
+    batch_tokens, hidden_dim = hidden_states.shape
+
+    routing = torch.zeros(
+        batch_tokens, num_experts,
+        dtype=topk_weights.dtype,
+        device=hidden_states.device
+    )
+    routing.scatter_(1, topk_indices, topk_weights)
+
+    hidden_states = hidden_states.repeat(num_experts, 1)
+    hidden_states = hidden_states.view(num_experts, batch_tokens, hidden_dim)
+    act_fn = self.experts[0].act_fn
+    gate = torch.bmm(hidden_states, self.gate_projs)
+    up = torch.bmm(hidden_states, self.up_projs)
+    gate_up = act_fn(gate) * up
+    next_states = torch.bmm(gate_up, self.down_projs)
+    routing = routing.transpose(0, 1).unsqueeze(-1)
+    next_states = next_states * routing
+    next_states = next_states.sum(dim=0)
+    return next_states.type(hidden_states.dtype)
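One detail worth spelling out in the transpose shuffling between this patch and the next: nn.Linear stores its weight as (out_features, in_features), so a per-expert bmm needs the (in, out) orientation, whether that transpose is baked in at patch time (patch 22) or applied inside the forward (patch 23). A quick standalone check of the layout:

    import torch

    lin = torch.nn.Linear(8, 16, bias=False)
    x = torch.randn(4, 8)
    # Linear computes x @ W.T, so the stored weight must be transposed
    # before it can be used as a plain (in, out) matmul operand
    assert torch.allclose(lin(x), x @ lin.weight.transpose(0, 1), atol=1e-6)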
From 47d2910f4d89598c5e6df61287d53553524bbc30 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Tue, 17 Mar 2026 16:27:29 +0200
Subject: [PATCH 23/39] modify patcher

---
 optimum/exporters/openvino/model_patcher.py | 70 +++++++++++++--------
 1 file changed, 44 insertions(+), 26 deletions(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 6d5e1f19c8..1934e2a5ae 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -3782,33 +3782,51 @@ def __enter__(self):
                 block.mlp._orig_moe = block.mlp.moe
                 block.mlp._orig_moe_infer = None
                 num_experts = len(block.mlp.experts)
-                block.mlp.gate_projs = (
-                    torch.concat(
-                        tuple(block.mlp.experts[i].gate_proj.weight.unsqueeze(0) for i in range(num_experts)),
-                        dim=0,
-                    )
-                    .transpose(1, 2)
-                    .float()
-                )
 
-                block.mlp.up_projs = (
-                    torch.concat(
-                        tuple(block.mlp.experts[i].up_proj.weight.unsqueeze(0) for i in range(num_experts)),
-                        dim=0,
-                    )
-                    .transpose(1, 2)
-                    .float()
+                # Concatenate expert weights
+                gate_projs = torch.concat(
+                    tuple(block.mlp.experts[i].gate_proj.weight.unsqueeze(0) for i in range(num_experts)),
+                    dim=0,
                 )
+                up_projs = torch.concat(
+                    tuple(block.mlp.experts[i].up_proj.weight.unsqueeze(0) for i in range(num_experts)),
+                    dim=0,
+                )
+                down_projs = torch.concat(
+                    tuple(block.mlp.experts[i].down_proj.weight.unsqueeze(0) for i in range(num_experts)),
+                    dim=0,
+                )
 
-                block.mlp.down_projs = (
-                    torch.concat(
-                        tuple(block.mlp.experts[i].down_proj.weight.unsqueeze(0) for i in range(num_experts)),
-                        dim=0,
-                    )
-                    .transpose(1, 2)
-                    .float()
-                )
+                # Handle OpenVINO version check with proper version string parsing
+                import re
+                import warnings
+
+                from packaging import version
+
+                import openvino as ov
+
+                ov_version_str = ov.__version__
+                version_match = re.match(r'(\d+\.\d+\.\d+)', ov_version_str)
+                if version_match:
+                    ov_version = version.parse(version_match.group(1))
+                    if ov_version <= version.parse("2026.0.0"):
+                        warnings.warn(
+                            "This model works best with OpenVINO 2026.1 or later. "
+                            "Earlier versions require float() conversion for MoE weights, "
+                            "which may affect performance."
+                        )
+                        block.mlp.gate_projs = gate_projs.float()
+                        block.mlp.up_projs = up_projs.float()
+                        block.mlp.down_projs = down_projs.float()
+                    else:
+                        block.mlp.gate_projs = gate_projs
+                        block.mlp.up_projs = up_projs
+                        block.mlp.down_projs = down_projs
+                else:
+                    block.mlp.gate_projs = gate_projs
+                    block.mlp.up_projs = up_projs
+                    block.mlp.down_projs = down_projs
 
                 block.mlp.moe = types.MethodType(deepseek_moe, block.mlp)
             elif hasattr(block.mlp, "experts"):
                 # fallback: patch by injecting moe_infer with required attributes
@@ -4021,10 +4039,10 @@ def deepseek_moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor,
     hidden_states = hidden_states.repeat(num_experts, 1)
     hidden_states = hidden_states.view(num_experts, batch_tokens, hidden_dim)
     act_fn = self.experts[0].act_fn
-    gate = torch.bmm(hidden_states, self.gate_projs)
-    up = torch.bmm(hidden_states, self.up_projs)
+    gate = torch.bmm(hidden_states, self.gate_projs.transpose(1, 2))
+    up = torch.bmm(hidden_states, self.up_projs.transpose(1, 2))
     gate_up = act_fn(gate) * up
-    next_states = torch.bmm(gate_up, self.down_projs)
+    next_states = torch.bmm(gate_up, self.down_projs.transpose(1, 2))
     routing = routing.transpose(0, 1).unsqueeze(-1)
     next_states = next_states * routing
     next_states = next_states.sum(dim=0)
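The manual re + packaging dance above exists because openvino version strings can carry build metadata that packaging rejects outright; the next patch swaps it for the in-house is_openvino_version helper, which wraps the same idea. A sketch with a hypothetical build string:

    import re
    from packaging import version

    ov_version_str = "2025.3.0-12345-abcdef"  # hypothetical build string
    # version.parse() would raise InvalidVersion on the full string,
    # so only the leading release segment is compared
    m = re.match(r"(\d+\.\d+\.\d+)", ov_version_str)
    assert m is not None
    assert version.parse(m.group(1)) < version.parse("2026.1.0")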
From eb601d9a35567195f8b4fd26e7976ac451a0d118 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Fri, 20 Mar 2026 12:15:12 +0200
Subject: [PATCH 24/39] update patcher

---
 optimum/exporters/openvino/model_patcher.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 1934e2a5ae..e71a418234 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -54,7 +54,12 @@
     override_arguments,
     sdpa_mask_without_vmap,
 )
-from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version
+from optimum.intel.utils.import_utils import (
+    is_diffusers_version,
+    is_openvino_version,
+    is_torch_version,
+    is_transformers_version,
+)
 
 from ._ov_ops import convert_recurrent_attention_cell
 
@@ -3797,19 +3802,16 @@ def __enter__(self):
                     dim=0,
                 )
 
-                # Handle OpenVINO version check with proper version string parsing
+                # Handle OpenVINO version check
                 import re
                 import warnings
 
-                from packaging import version
-
                 import openvino as ov
 
                 ov_version_str = ov.__version__
                 version_match = re.match(r'(\d+\.\d+\.\d+)', ov_version_str)
                 if version_match:
-                    ov_version = version.parse(version_match.group(1))
-                    if ov_version <= version.parse("2026.0.0"):
+                    if is_openvino_version("<=", "2026.0.0"):
                         warnings.warn(
                             "This model works best with OpenVINO 2026.1 or later. "
                             "Earlier versions require float() conversion for MoE weights, "

From fe1b84a50de10837ed6d46832c09a5fe254df7a1 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Mon, 23 Mar 2026 10:47:52 +0200
Subject: [PATCH 25/39] removed unnecessary check

---
 optimum/exporters/openvino/model_patcher.py | 28 +++++++--------------
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index e71a418234..120098da4d 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -3803,27 +3803,17 @@ def __enter__(self):
                 )
 
                 # Handle OpenVINO version check
-                import re
                 import warnings
 
-                import openvino as ov
-
-                ov_version_str = ov.__version__
-                version_match = re.match(r'(\d+\.\d+\.\d+)', ov_version_str)
-                if version_match:
-                    if is_openvino_version("<=", "2026.0.0"):
-                        warnings.warn(
-                            "This model works best with OpenVINO 2026.1 or later. "
-                            "Earlier versions require float() conversion for MoE weights, "
-                            "which may affect performance."
-                        )
-                        block.mlp.gate_projs = gate_projs.float()
-                        block.mlp.up_projs = up_projs.float()
-                        block.mlp.down_projs = down_projs.float()
-                    else:
-                        block.mlp.gate_projs = gate_projs
-                        block.mlp.up_projs = up_projs
-                        block.mlp.down_projs = down_projs
+                if is_openvino_version("<=", "2026.0.0"):
+                    warnings.warn(
+                        "This model works best with OpenVINO 2026.1 or later. "
+                        "Earlier versions require float() conversion for MoE weights, "
+                        "which may affect performance."
+                    )
+                    block.mlp.gate_projs = gate_projs.float()
+                    block.mlp.up_projs = up_projs.float()
+                    block.mlp.down_projs = down_projs.float()
                 else:
                     block.mlp.gate_projs = gate_projs
                     block.mlp.up_projs = up_projs
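The condition change in the next patch is not cosmetic: a "<=" gate against 2026.0.0 lets a hypothetical 2026.0.1 bugfix release skip the float() path, while "<" against 2026.1.0 keeps every pre-2026.1 build on it. The comparisons themselves, verifiable standalone:

    from packaging import version

    assert version.parse("2026.0.1") > version.parse("2026.0.0")       # escapes a "<=" 2026.0.0 gate
    assert version.parse("2026.0.1") < version.parse("2026.1.0")       # caught by "<" 2026.1.0
    assert version.parse("2026.1.0.dev0") < version.parse("2026.1.0")  # dev builds also stay covered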
From e86e7d9a41f63836c5402fde6d73215b180ceda9 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Mon, 23 Mar 2026 11:15:43 +0200
Subject: [PATCH 26/39] fix patcher

---
 optimum/exporters/openvino/model_patcher.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 120098da4d..5a91ea981f 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -3805,11 +3805,12 @@ def __enter__(self):
                 # Handle OpenVINO version check
                 import warnings
 
-                if is_openvino_version("<=", "2026.0.0"):
+                if is_openvino_version("<", "2026.1.0"):
                     warnings.warn(
                         "This model works best with OpenVINO 2026.1 or later. "
                         "Earlier versions require float() conversion for MoE weights, "
-                        "which may affect performance."
+                        "which may affect performance. "
+                        "OpenVINO 2026.1 includes a fix for torch.bmm dtype handling."
                     )
                     block.mlp.gate_projs = gate_projs.float()
                     block.mlp.up_projs = up_projs.float()
                     block.mlp.down_projs = down_projs.float()

From cbc2005b6fa16c0ae59ca806023cfc91dca845e6 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Mon, 23 Mar 2026 11:18:43 +0200
Subject: [PATCH 27/39] fix version

---
 optimum/exporters/openvino/model_patcher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 5a91ea981f..bf99733807 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -3807,7 +3807,7 @@ def __enter__(self):
                 if is_openvino_version("<", "2026.1.0"):
                     warnings.warn(
-                        "This model works best with OpenVINO 2026.1 or later. "
+                        "This model works best with OpenVINO 2026.0 or later. "
                         "Earlier versions require float() conversion for MoE weights, "
                         "which may affect performance. "
                         "OpenVINO 2026.1 includes a fix for torch.bmm dtype handling."

From 199da923b72b4c7f4d4c31faf3cc27746b379129 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Mon, 23 Mar 2026 11:19:13 +0200
Subject: [PATCH 28/39] fix version

---
 optimum/exporters/openvino/model_patcher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index bf99733807..5a91ea981f 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -3807,7 +3807,7 @@ def __enter__(self):
                 if is_openvino_version("<", "2026.1.0"):
                     warnings.warn(
-                        "This model works best with OpenVINO 2026.0 or later. "
+                        "This model works best with OpenVINO 2026.1 or later. "
                         "Earlier versions require float() conversion for MoE weights, "
                         "which may affect performance. "
                         "OpenVINO 2026.1 includes a fix for torch.bmm dtype handling."
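Patch 29 below reverts to explicit per-version attention forwards for DeepSeek. As background for reading them, a shape-level sketch of the multi-head latent attention both forwards implement: keys and values are jointly down-projected into a low-rank latent plus a small shared rotary slice, then up-projected per head. Dimensions here are toy stand-ins, not real config values:

    import torch

    bsz, q_len, heads, hidden = 1, 4, 2, 32
    kv_lora_rank, qk_nope, qk_rope, v_dim = 16, 8, 4, 8

    x = torch.randn(bsz, q_len, hidden)
    kv_a = torch.nn.Linear(hidden, kv_lora_rank + qk_rope, bias=False)  # joint down-projection
    compressed = kv_a(x)
    k_pass, k_rot = torch.split(compressed, [kv_lora_rank, qk_rope], dim=-1)

    # k_rot is a single rotary component broadcast across all heads;
    # k_pass is up-projected per head into no-RoPE key halves and values
    kv_b = torch.nn.Linear(kv_lora_rank, heads * (qk_nope + v_dim), bias=False)
    kv = kv_b(k_pass).view(bsz, q_len, heads, qk_nope + v_dim).transpose(1, 2)
    k_nope, v = torch.split(kv, [qk_nope, v_dim], dim=-1)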
From 1dee64cc03c0cf446115b468d3fd1441aef13841 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Mon, 23 Mar 2026 23:12:54 +0200 Subject: [PATCH 29/39] revert refactoring --- optimum/exporters/openvino/model_configs.py | 4 +- optimum/exporters/openvino/model_patcher.py | 377 +++++++++++++------- tests/openvino/test_decoder.py | 25 +- tests/openvino/test_export.py | 2 +- tests/openvino/test_exporters_cli.py | 2 +- tests/openvino/utils_tests.py | 7 +- 6 files changed, 274 insertions(+), 143 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 35f33a793f..0624624a77 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -4081,8 +4081,8 @@ class M2M100OpenVINOConfig(BartOpenVINOConfig): ) @register_in_tasks_manager("deepseek", *["text-generation", "text-generation-with-past"], library_name="transformers") class DeepseekOpenVINOConfig(MiniCPM3OpenVINOConfig): - MIN_TRANSFORMERS_VERSION = "4.53.0" - MAX_TRANSFORMERS_VERSION = None + MIN_TRANSFORMERS_VERSION = "4.46.0" + MAX_TRANSFORMERS_VERSION = "4.53.3" _MODEL_PATCHER = DeepseekPatcher diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 5a91ea981f..26ec653030 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3765,30 +3765,32 @@ class DeepseekPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() self_attn = { - "deepseek_v3": make_deepseek_attn_forward(version=3), - "deepseek_v2": make_deepseek_attn_forward(version=2), + "deepseek_v3": deepseek_v3_attn_forward, + "deepseek_v2": deepseek_v2_attn_forward, "deepseek": minicpm3_attn_forward, } self_attn_fwd = self_attn.get(self._model.config.model_type) for block in self._model.model.layers: + # Patch attention if self_attn_fwd is not None: block.self_attn._orig_forward = block.self_attn.forward block.self_attn.forward = types.MethodType(self_attn_fwd, block.self_attn) + + # Patch MoE if hasattr(block.mlp, "moe_infer"): - # old interface (transformers < 4.57): moe_infer(self, x, topk_ids, topk_weight) block.mlp._orig_moe_infer = block.mlp.moe_infer block.mlp._orig_moe = None block.mlp.ep_rank = getattr(block.mlp, "ep_rank", 0) block.mlp.experts_per_rank = getattr(block.mlp, "experts_per_rank", len(block.mlp.experts)) block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) + elif hasattr(block.mlp, "moe") and hasattr(block.mlp, "experts"): - # new interface (transformers >= 4.57): moe(self, hidden_states, topk_indices, topk_weights) block.mlp._orig_moe = block.mlp.moe block.mlp._orig_moe_infer = None - num_experts = len(block.mlp.experts) - # Concatenate expert weights + # Pre-concatenate expert weights for vectorized computation + num_experts = len(block.mlp.experts) gate_projs = torch.concat( tuple(block.mlp.experts[i].gate_proj.weight.unsqueeze(0) for i in range(num_experts)), dim=0, @@ -3802,16 +3804,7 @@ def __enter__(self): dim=0, ) - # Handle OpenVINO version check - import warnings - if is_openvino_version("<", "2026.1.0"): - warnings.warn( - "This model works best with OpenVINO 2026.1 or later. " - "Earlier versions require float() conversion for MoE weights, " - "which may affect performance. " - "OpenVINO 2026.1 includes a fix for torch.bmm dtype handling." 
- ) block.mlp.gate_projs = gate_projs.float() block.mlp.up_projs = up_projs.float() block.mlp.down_projs = down_projs.float() @@ -3821,19 +3814,15 @@ def __enter__(self): block.mlp.down_projs = down_projs block.mlp.moe = types.MethodType(deepseek_moe, block.mlp) - elif hasattr(block.mlp, "experts"): - # fallback: patch by injecting moe_infer with required attributes - block.mlp._orig_moe_infer = None - block.mlp._orig_moe = None - block.mlp.ep_rank = 0 - block.mlp.experts_per_rank = len(block.mlp.experts) - block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) for block in self._model.model.layers: + # Restore attention if hasattr(block.self_attn, "_orig_forward"): block.self_attn.forward = block.self_attn._orig_forward + + # Restore MoE - handle both interfaces if hasattr(block.mlp, "_orig_moe"): if block.mlp._orig_moe is not None: block.mlp.moe = block.mlp._orig_moe @@ -3844,6 +3833,7 @@ def __exit__(self, exc_type, exc_value, traceback): if hasattr(block.mlp, "down_projs"): del block.mlp.down_projs delattr(block.mlp, "_orig_moe") + if hasattr(block.mlp, "_orig_moe_infer"): if block.mlp._orig_moe_infer is not None: block.mlp.moe_infer = block.mlp._orig_moe_infer @@ -3857,126 +3847,261 @@ def __exit__(self, exc_type, exc_value, traceback): delattr(block.mlp, "_orig_moe_infer") -def make_deepseek_attn_forward(version: int = 3): - from typing import Callable +def deepseek_v3_attn_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value=None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, # ← ADD THIS + **kwargs, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # modified from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/modeling_deepseek.py#L751 + def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + orig_dtype = k.dtype + cos = cos[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] + q_fp32 = q.to(dtype=torch.float32, device=q.device) + k_fp32 = k.to(dtype=torch.float32, device=k.device) + q_embed = (q_fp32 * cos) + (rotate_half(q_fp32) * sin) + k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin) + return q_embed.to(dtype=orig_dtype), k_embed.to(dtype=orig_dtype) - from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS + if not hasattr(self, 'q_head_dim'): + self.q_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim - if version == 3: - from transformers.models.deepseek_v3.modeling_deepseek_v3 import ( - apply_rotary_pos_emb, - apply_rotary_pos_emb_interleave, - eager_attention_forward, + if output_attentions: + return self._orig_forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, ) - elif version == 2: - - def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - batch, 
num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - def apply_rotary_emb(xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor): - xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) - xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) - freqs_cis = freqs_cis.unsqueeze(1).to(xq_.device) - xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).type_as(xq) - xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).type_as(xk) - return xq_out, xk_out - - def eager_attention_forward(module, query, key, value, attention_mask, scaling, dropout=0.0, **kwargs): - key = repeat_kv(key, module.num_key_value_groups) - value = repeat_kv(value, module.num_key_value_groups) - attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling - if attention_mask is not None: - attn_weights = attn_weights + attention_mask - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) - attn_output = torch.matmul(attn_weights, value) - return attn_output.transpose(1, 2).contiguous(), attn_weights + bsz, q_len, _ = hidden_states.size() + + if self.q_lora_rank is None: + q = self.q_proj(hidden_states) else: - raise ValueError(f"Unsupported DeepSeek version: {version}") + q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) + q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2) + q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) - def deepseek_attn_forward( - self, - hidden_states: torch.Tensor, - position_embeddings, - attention_mask: Optional[torch.Tensor], - past_key_value: Optional[Cache] = None, - past_key_values=None, - cache_position: Optional[torch.LongTensor] = None, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - batch_size, seq_length = hidden_states.shape[:-1] + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) + compressed_kv, k_pe = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2) + kv = ( + self.kv_b_proj(self.kv_a_layernorm(compressed_kv)) + .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) + .transpose(1, 2) + ) - if self.q_lora_rank is None: - q_states = self.q_proj(hidden_states) - else: - q_states = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) - q_states = q_states.view(batch_size, seq_length, -1, self.qk_head_dim).transpose(1, 2) - q_pass, q_rot = torch.split(q_states, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) - - compressed_kv = self.kv_a_proj_with_mqa(hidden_states) - k_pass, k_rot = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - k_pass = self.kv_b_proj(self.kv_a_layernorm(k_pass)) - k_pass = k_pass.view(batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim).transpose(1, 2) - k_pass, value_states = torch.split(k_pass, [self.qk_nope_head_dim, self.v_head_dim], dim=-1) - k_rot = k_rot.view(batch_size, 1, seq_length, self.qk_rope_head_dim) - - if version == 3: - cos, sin = position_embeddings - if self.config.rope_interleave: - q_rot, k_rot = apply_rotary_pos_emb_interleave(q_rot, k_rot, cos, 
sin) - else: - q_rot, k_rot = apply_rotary_pos_emb(q_rot, k_rot, cos, sin) - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - kv_cache = past_key_values if past_key_values is not None else past_key_value - else: - q_rot, k_rot = apply_rotary_emb(q_rot, k_rot, position_embeddings.to(q_rot.device)) - cache_kwargs = {"cache_position": cache_position} - kv_cache = past_key_values + k_nope, value_states = torch.split(kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1) + kv_seq_len = value_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - k_rot = k_rot.expand(*k_pass.shape[:-1], -1) - query_states = torch.cat((q_pass, q_rot), dim=-1) - key_states = torch.cat((k_pass, k_rot), dim=-1) + new_interface = False # Set to True if using new rotary embedding interface + if hasattr(self, 'rotary_emb'): + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids) + else: + from transformers.models.deepseek_v3.modeling_deepseek_v3 import apply_rotary_pos_emb - if kv_cache is not None: - key_states, value_states = kv_cache.update(key_states, value_states, self.layer_idx, cache_kwargs) + cos, sin = position_embeddings + q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin) + new_interface = True - is_flash_attn = "flash" in self.config._attn_implementation - if is_flash_attn and self.qk_head_dim != self.v_head_dim: - value_states = F.pad(value_states, [0, self.qk_head_dim - self.v_head_dim]) - attention_interface: Callable = eager_attention_forward - if self.config._attn_implementation != "eager": - if version == 2: - attention_interface = ALL_ATTENTION_FUNCTIONS.get_interface( - self.config._attn_implementation, eager_attention_forward - ) - else: - attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids) - attn_output, attn_weights = attention_interface( - self, - query_states, - key_states, - value_states, - attention_mask, - dropout=0.0 if not self.training else self.attention_dropout, - scaling=self.scaling, - **kwargs, + # Difference with original code, k_pe.new_empty create constant tensor in torchscript + query_states = torch.concat([q_nope, q_pe], dim=-1) + # query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) + # query_states[:, :, :, : self.qk_nope_head_dim] = q_nope + # query_states[:, :, :, self.qk_nope_head_dim :] = q_pe + key_states = torch.concat([k_nope, k_pe.expand(-1, self.num_heads, -1, -1)], dim=-1) + # key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) + # key_states[:, :, :, : self.qk_nope_head_dim] = k_nope + # key_states[:, :, :, self.qk_nope_head_dim :] = k_pe + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA 
with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) + + attn_output = self.o_proj(attn_output) + + if new_interface: + return attn_output, None + + return attn_output, None, past_key_value + + +def deepseek_v2_attn_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value=None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # modified from https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/blob/main/modeling_deepseek.py#L806 + def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + + b, h, s, d = q.shape + q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d) + + b, h, s, d = k.shape + k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d) + + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + if output_attentions: + return self._orig_forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, ) - if is_flash_attn and self.qk_head_dim != self.v_head_dim: - attn_output = attn_output[:, :, :, : self.v_head_dim] + bsz, q_len, _ = hidden_states.shape - attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous() - attn_output = self.o_proj(attn_output) - return attn_output, attn_weights + if self.q_lora_rank is None: + q = self.q_proj(hidden_states) + else: + q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) + q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2) + q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) + compressed_kv, k_pe = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2) + kv = ( + self.kv_b_proj(self.kv_a_layernorm(compressed_kv)) + .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) + .transpose(1, 2) + ) + + k_nope, value_states = torch.split(kv, [self.qk_nope_head_dim, self.v_head_dim], 
dim=-1) + kv_seq_len = value_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids) + + # Difference with original code, k_pe.new_empty create constant tensor in torchscript + query_states = torch.concat([q_nope, q_pe], dim=-1) + # query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) + # query_states[:, :, :, : self.qk_nope_head_dim] = q_nope + # query_states[:, :, :, self.qk_nope_head_dim :] = q_pe + key_states = torch.concat([k_nope, k_pe.expand(-1, self.num_heads, -1, -1)], dim=-1) + # key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) + # key_states[:, :, :, : self.qk_nope_head_dim] = k_nope + # key_states[:, :, :, self.qk_nope_head_dim :] = k_pe + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) - return deepseek_attn_forward + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
+ is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value def deepseek_moe_infer(self, x, topk_ids, topk_weight): diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 89bcacf9a4..a5df1f1cb2 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -119,13 +119,13 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.46.0"): SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "phimoe") + if is_transformers_version("<", "4.54.0"): + SUPPORTED_ARCHITECTURES += ("deepseek", "gigachat3",) + # gptq and awq install disabled for windows test environment if platform.system() != "Windows" and is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("opt_gptq", "mixtral_awq") - if is_transformers_version(">=", "4.53.0"): - SUPPORTED_ARCHITECTURES += ("deepseek", "gigachat3") - if is_transformers_version(">", "4.47"): SUPPORTED_ARCHITECTURES += ("olmo2",) @@ -291,8 +291,11 @@ def test_find_untested_architectures(self): if "llama4_text" in supported_architectures: supported_architectures.remove("llama4_text") - if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): - supported_architectures -= {"deepseek_v2", "deepseek_v3"} + if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MAX_TRANSFORMERS_VERSION)): + if "deepseek_v2" in supported_architectures: + supported_architectures.remove("deepseek_v2") + if "deepseek_v3" in supported_architectures: + supported_architectures.remove("deepseek_v3") if is_transformers_version("<", str(BitnetOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"bitnet"} if is_transformers_version("<", str(LFM2OpenVINOConfig.MIN_TRANSFORMERS_VERSION)): @@ -401,10 +404,6 @@ def test_compare_to_transformers(self, model_arch): if model_arch in ["gigachat3"]: tokens.pop("token_type_ids", None) - if model_arch == "deepseek": - ov_model.generation_config.do_sample = False - transformers_model.generation_config.do_sample = False - ov_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None ov_model.config.eos_token_id = None @@ -421,6 +420,10 @@ def test_compare_to_transformers(self, model_arch): ov_outputs = ov_model.generate(**tokens, generation_config=gen_config) + # TODO: add back once https://huggingface.co/katuni4ka/tiny-random-minicpm3/discussions/1 merged (for all models) as current modeling incompatible with transformers >= v4.49 + if model_arch in {"deepseek"} and is_transformers_version(">=", "4.49"): + self.skipTest("Incompatible modeling code") + additional_inputs = {} # gemma2 does not support dynamic cache, it is unfair to compare dynamic cache result vs hybrid cache, # align cache representation in torch model @@ -670,6 +673,10 @@ def test_beam_search(self, model_arch): if model_arch in ["lfm2", "granitemoehybrid"]: return + # TODO: add back once https://huggingface.co/katuni4ka/tiny-random-minicpm3/discussions/1 merged (for all models) as current modeling incompatible with transformers >= v4.49 + if model_arch in {"deepseek"} and is_transformers_version(">=", "4.49"): + self.skipTest("Incompatible modeling code") + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in REMOTE_CODE_MODELS) if 
model_arch == "persimmon": tokenizer.pad_token_id = tokenizer.bos_token_id diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 16b5d528cc..90332cd397 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -98,7 +98,7 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "4.48.0"): SUPPORTED_ARCHITECTURES.update({"cohere2": OVModelForCausalLM}) - if is_transformers_version(">=", "4.53.0"): + if is_transformers_version(">=", "4.46.0") and is_transformers_version("<=", "4.53.3"): SUPPORTED_ARCHITECTURES.update({"deepseek": OVModelForCausalLM, "gigachat3": OVModelForCausalLM}) if is_transformers_version(">=", "4.49"): diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index ad649baecf..61a1fce622 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -159,7 +159,7 @@ class OVCLIExportTestCase(unittest.TestCase): ] ) - if is_transformers_version(">=", "4.53.0"): + if is_transformers_version(">=", "4.46.0") and is_transformers_version("<=", "4.53.3"): SUPPORTED_ARCHITECTURES.extend( [ ("text-generation-with-past", "gigachat3"), diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index f53bd364e1..6bd099c4cd 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -399,6 +399,7 @@ "exaone4", "decilm", "minicpm3", + "deepseek", "qwen3_eagle3", ) @@ -553,10 +554,8 @@ def get_supported_model_for_library(library_name): if supported_model_type[model_type].get("openvino"): export_config = next(iter(supported_model_type[model_type]["openvino"].values())) - raw_min = getattr(export_config.func, "MIN_TRANSFORMERS_VERSION", None) - raw_max = getattr(export_config.func, "MAX_TRANSFORMERS_VERSION", None) - min_transformers = str(raw_min) if raw_min is not None else "0" - max_transformers = str(raw_max) if raw_max is not None else "999" + min_transformers = str(getattr(export_config.func, "MIN_TRANSFORMERS_VERSION", "0")) + max_transformers = str(getattr(export_config.func, "MAX_TRANSFORMERS_VERSION", "999")) if is_transformers_version(">=", min_transformers) and is_transformers_version("<=", max_transformers): valid_model.add(model_type) From 225aed31af4067d76cc720de2211f43a4965cd17 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Tue, 24 Mar 2026 00:09:37 +0200 Subject: [PATCH 30/39] update doc --- optimum/exporters/openvino/model_patcher.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 26ec653030..00dbf68ae1 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4140,9 +4140,7 @@ def deepseek_moe_infer(self, x, topk_ids, topk_weight): def deepseek_moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor): """ - Replacement for DeepseekV3MoE.moe (transformers >= 4.57). - The original skips experts with no tokens (data-dependent control flow that breaks tracing). - This version unconditionally runs all experts to produce a traceable static graph. + Vectorized MoE forward for DeepSeek-V3. 
""" num_experts = len(self.experts) batch_tokens, hidden_dim = hidden_states.shape From 4174478f9d76e86fc490a1add385d2a7b760bfc6 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Tue, 24 Mar 2026 23:25:47 +0200 Subject: [PATCH 31/39] modify based on review --- optimum/exporters/openvino/model_patcher.py | 17 +++++++---------- tests/openvino/test_decoder.py | 5 ++++- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 00dbf68ae1..b14897a0fb 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3856,7 +3856,7 @@ def deepseek_v3_attn_forward( past_key_value=None, output_attentions: bool = False, use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, # ← ADD THIS + cache_position: Optional[torch.LongTensor] = None, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: # modified from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/modeling_deepseek.py#L751 @@ -3876,7 +3876,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin) return q_embed.to(dtype=orig_dtype), k_embed.to(dtype=orig_dtype) - if not hasattr(self, 'q_head_dim'): + if not hasattr(self, "q_head_dim"): self.q_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim if output_attentions: @@ -3920,8 +3920,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): ) kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - new_interface = False # Set to True if using new rotary embedding interface - if hasattr(self, 'rotary_emb'): + new_interface = False # Set to True if using new rotary embedding interface + if hasattr(self, "rotary_emb"): cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids) else: @@ -3931,7 +3931,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin) new_interface = True - q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids) # Difference with original code, k_pe.new_empty create constant tensor in torchscript @@ -3977,6 +3976,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): attn_output = self.o_proj(attn_output) if new_interface: + # Some models (e.g. 
gigachat3) expect 2-tuple return (attn_output, attn_weights) + # Returning 3-tuple breaks tracing with "too many values to unpack" return attn_output, None return attn_output, None, past_key_value @@ -4145,11 +4146,7 @@ def deepseek_moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, num_experts = len(self.experts) batch_tokens, hidden_dim = hidden_states.shape - routing = torch.zeros( - batch_tokens, num_experts, - dtype=topk_weights.dtype, - device=hidden_states.device - ) + routing = torch.zeros(batch_tokens, num_experts, dtype=topk_weights.dtype, device=hidden_states.device) routing.scatter_(1, topk_indices, topk_weights) hidden_states = hidden_states.repeat(num_experts, 1) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index a5df1f1cb2..4dfdea8647 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -120,7 +120,10 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "phimoe") if is_transformers_version("<", "4.54.0"): - SUPPORTED_ARCHITECTURES += ("deepseek", "gigachat3",) + SUPPORTED_ARCHITECTURES += ( + "deepseek", + "gigachat3", + ) # gptq and awq install disabled for windows test environment if platform.system() != "Windows" and is_transformers_version("<", "4.56.0"): From 9ae11622b641f34801d454a32b11f5cafd884ef6 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Thu, 26 Mar 2026 02:39:39 +0200 Subject: [PATCH 32/39] fix issues --- optimum/exporters/openvino/model_configs.py | 4 +- optimum/exporters/openvino/model_patcher.py | 180 +++++++++++--------- tests/openvino/test_decoder.py | 28 +-- tests/openvino/test_export.py | 2 +- tests/openvino/test_exporters_cli.py | 2 +- tests/openvino/utils_tests.py | 7 +- 6 files changed, 120 insertions(+), 103 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 0624624a77..35f33a793f 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -4081,8 +4081,8 @@ class M2M100OpenVINOConfig(BartOpenVINOConfig): ) @register_in_tasks_manager("deepseek", *["text-generation", "text-generation-with-past"], library_name="transformers") class DeepseekOpenVINOConfig(MiniCPM3OpenVINOConfig): - MIN_TRANSFORMERS_VERSION = "4.46.0" - MAX_TRANSFORMERS_VERSION = "4.53.3" + MIN_TRANSFORMERS_VERSION = "4.53.0" + MAX_TRANSFORMERS_VERSION = None _MODEL_PATCHER = DeepseekPatcher diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index b14897a0fb..f20d3bf38c 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3772,25 +3772,23 @@ def __enter__(self): self_attn_fwd = self_attn.get(self._model.config.model_type) for block in self._model.model.layers: - # Patch attention if self_attn_fwd is not None: block.self_attn._orig_forward = block.self_attn.forward block.self_attn.forward = types.MethodType(self_attn_fwd, block.self_attn) - - # Patch MoE if hasattr(block.mlp, "moe_infer"): + # old interface (transformers < 4.57): moe_infer(self, x, topk_ids, topk_weight) block.mlp._orig_moe_infer = block.mlp.moe_infer block.mlp._orig_moe = None block.mlp.ep_rank = getattr(block.mlp, "ep_rank", 0) block.mlp.experts_per_rank = getattr(block.mlp, "experts_per_rank", len(block.mlp.experts)) block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) - elif hasattr(block.mlp, "moe") and hasattr(block.mlp, 
"experts"): + # new interface (transformers >= 4.57): moe(self, hidden_states, topk_indices, topk_weights) block.mlp._orig_moe = block.mlp.moe block.mlp._orig_moe_infer = None - - # Pre-concatenate expert weights for vectorized computation num_experts = len(block.mlp.experts) + + # Concatenate expert weights gate_projs = torch.concat( tuple(block.mlp.experts[i].gate_proj.weight.unsqueeze(0) for i in range(num_experts)), dim=0, @@ -3805,6 +3803,12 @@ def __enter__(self): ) if is_openvino_version("<", "2026.1.0"): + logger.warning( + "This model works best with OpenVINO 2026.1 or later. " + "Earlier versions require float() conversion for MoE weights, " + "which may affect performance. " + "OpenVINO 2026.1 includes a fix for torch.bmm dtype handling." + ) block.mlp.gate_projs = gate_projs.float() block.mlp.up_projs = up_projs.float() block.mlp.down_projs = down_projs.float() @@ -3814,15 +3818,19 @@ def __enter__(self): block.mlp.down_projs = down_projs block.mlp.moe = types.MethodType(deepseek_moe, block.mlp) + elif hasattr(block.mlp, "experts"): + # fallback: patch by injecting moe_infer with required attributes + block.mlp._orig_moe_infer = None + block.mlp._orig_moe = None + block.mlp.ep_rank = 0 + block.mlp.experts_per_rank = len(block.mlp.experts) + block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) for block in self._model.model.layers: - # Restore attention if hasattr(block.self_attn, "_orig_forward"): block.self_attn.forward = block.self_attn._orig_forward - - # Restore MoE - handle both interfaces if hasattr(block.mlp, "_orig_moe"): if block.mlp._orig_moe is not None: block.mlp.moe = block.mlp._orig_moe @@ -3833,7 +3841,6 @@ def __exit__(self, exc_type, exc_value, traceback): if hasattr(block.mlp, "down_projs"): del block.mlp.down_projs delattr(block.mlp, "_orig_moe") - if hasattr(block.mlp, "_orig_moe_infer"): if block.mlp._orig_moe_infer is not None: block.mlp.moe_infer = block.mlp._orig_moe_infer @@ -3850,45 +3857,44 @@ def __exit__(self, exc_type, exc_value, traceback): def deepseek_v3_attn_forward( self, hidden_states: torch.Tensor, + position_embeddings=None, attention_mask: Optional[torch.Tensor] = None, - position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value=None, + past_key_values=None, + cache_position: Optional[torch.LongTensor] = None, output_attentions: bool = False, use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: # modified from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/modeling_deepseek.py#L751 def rotate_half(x): - """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] x2 = x[..., x.shape[-1] // 2 :] return torch.cat((-x2, x1), dim=-1) def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): orig_dtype = k.dtype - cos = cos[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] - q_fp32 = q.to(dtype=torch.float32, device=q.device) - k_fp32 = k.to(dtype=torch.float32, device=k.device) + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_fp32 = q.to(dtype=torch.float32) + k_fp32 = k.to(dtype=torch.float32) q_embed = (q_fp32 * cos) + (rotate_half(q_fp32) * sin) 
k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin)
         return q_embed.to(dtype=orig_dtype), k_embed.to(dtype=orig_dtype)
 
-    if not hasattr(self, "q_head_dim"):
-        self.q_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim
-
     if output_attentions:
         return self._orig_forward(
             hidden_states=hidden_states,
             attention_mask=attention_mask,
             position_ids=position_ids,
+            position_embeddings=position_embeddings,
             past_key_value=past_key_value,
+            past_key_values=past_key_values,
             output_attentions=output_attentions,
             use_cache=use_cache,
             cache_position=cache_position,
             **kwargs,
         )
 
     bsz, q_len, _ = hidden_states.size()
@@ -3897,60 +3903,84 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
         q = self.q_proj(hidden_states)
     else:
         q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
-    q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
-    q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+    q = q.view(bsz, q_len, self.num_heads, self.qk_head_dim).transpose(1, 2)
 
+    q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
     compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
-    compressed_kv, k_pe = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
-    k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
-    kv = (
-        self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
-        .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
-        .transpose(1, 2)
+
+    k_pass, k_rot = torch.split(
+        compressed_kv,
+        [self.kv_lora_rank, self.qk_rope_head_dim],
+        dim=-1,
     )
 
-    k_nope, value_states = torch.split(kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
-    kv_seq_len = value_states.shape[-2]
-    if past_key_value is not None:
-        if self.layer_idx is None:
-            raise ValueError(
-                f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
-                "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
-                "with a layer index."
-            )
-        kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+    k_pass = self.kv_b_proj(self.kv_a_layernorm(k_pass))
+    k_pass = k_pass.view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim).transpose(1, 2)
 
-    new_interface = False # Set to True if using new rotary embedding interface
-    if hasattr(self, "rotary_emb"):
-        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
-    else:
-        from transformers.models.deepseek_v3.modeling_deepseek_v3 import apply_rotary_pos_emb
+    k_pass, value_states = torch.split(
+        k_pass,
+        [self.qk_nope_head_dim, self.v_head_dim],
+        dim=-1,
+    )
+
+    k_rot = k_rot.view(bsz, 1, q_len, self.qk_rope_head_dim)
+
+    new_interface = position_embeddings is not None and not hasattr(self, "rotary_emb")
+
+    if new_interface:
+        from transformers.models.deepseek_v3.modeling_deepseek_v3 import apply_rotary_pos_emb_interleave
 
         cos, sin = position_embeddings
-        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin)
-        new_interface = True
-    q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
+        if getattr(self.config, "rope_interleave", False):
+            try:
+                q_pe, k_rot = apply_rotary_pos_emb_interleave(q_pe, k_rot, cos, sin)
+            except Exception as e:
+                raise RuntimeError(
+                    "Failed to apply interleaved rotary position embeddings, "
+                    f"which may be due to an incompatible transformers version; try `pip install transformers>=4.57.1`: {e}"
+                )
+        else:
+            q_pe, k_rot = apply_rotary_pos_emb(q_pe, k_rot, cos, sin)
 
-    # Difference with original code, k_pe.new_empty create constant tensor in torchscript
-    query_states = torch.concat([q_nope, q_pe], dim=-1)
-    # query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
-    # query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
-    # query_states[:, :, :, self.qk_nope_head_dim :] = q_pe
-    key_states = torch.concat([k_nope, k_pe.expand(-1, self.num_heads, -1, -1)], dim=-1)
-    # key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
-    # key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
-    # key_states[:, :, :, self.qk_nope_head_dim :] = k_pe
-    if past_key_value is not None:
-        cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
-        key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        kv_cache = past_key_values if past_key_values is not None else past_key_value
 
-    if attention_mask is not None:
-        if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
-            raise ValueError(
-                f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
-            )
+    else:
+        kv_seq_len = value_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+        kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        q_pe, k_rot = apply_rotary_pos_emb(q_pe, k_rot, cos, sin, position_ids)
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+
+        kv_cache = past_key_value
+
+    k_rot = k_rot.expand(*k_pass.shape[:-1], -1)
+    query_states = torch.cat((q_nope, q_pe), dim=-1)
+    key_states = torch.cat((k_pass, k_rot), dim=-1)
+
+    if kv_cache is not None:
+        if new_interface:
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = kv_cache.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+            if attention_mask is not None:
+                attention_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+
+        else:
+            cache_kwargs = {"sin": sin, "cos": cos}
+            key_states, value_states = kv_cache.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
     # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
     # Reference: https://github.com/pytorch/pytorch/issues/112577.
@@ -3967,8 +3997,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
         dropout_p=self.attention_dropout if self.training else 0.0,
         # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
         is_causal=self.is_causal and attention_mask is None and q_len > 1,
+        scale=None if not new_interface else self.scaling,
     )
-
     attn_output = attn_output.transpose(1, 2).contiguous()
 
     attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
@@ -3976,8 +4006,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     attn_output = self.o_proj(attn_output)
 
     if new_interface:
-        # Some models (e.g. gigachat3) expect 2-tuple return (attn_output, attn_weights)
-        # Returning 3-tuple breaks tracing with "too many values to unpack"
        return attn_output, None
 
     return attn_output, None, past_key_value
@@ -4141,25 +4169,23 @@ def deepseek_moe_infer(self, x, topk_ids, topk_weight):
 
 def deepseek_moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor):
     """
-    Vectorized MoE forward for DeepSeek-V3.
+    Vectorized MoE forward that matches the original per-expert behavior.
""" + orig_dtype = hidden_states.dtype num_experts = len(self.experts) - batch_tokens, hidden_dim = hidden_states.shape - + batch_tokens, _ = hidden_states.shape routing = torch.zeros(batch_tokens, num_experts, dtype=topk_weights.dtype, device=hidden_states.device) routing.scatter_(1, topk_indices, topk_weights) - - hidden_states = hidden_states.repeat(num_experts, 1) - hidden_states = hidden_states.view(num_experts, batch_tokens, hidden_dim) + expanded = hidden_states.unsqueeze(0).expand(num_experts, -1, -1) act_fn = self.experts[0].act_fn - gate = torch.bmm(hidden_states, self.gate_projs.transpose(1, 2)) - up = torch.bmm(hidden_states, self.up_projs.transpose(1, 2)) + gate = torch.bmm(expanded, self.gate_projs.transpose(1, 2)) + up = torch.bmm(expanded, self.up_projs.transpose(1, 2)) gate_up = act_fn(gate) * up next_states = torch.bmm(gate_up, self.down_projs.transpose(1, 2)) routing = routing.transpose(0, 1).unsqueeze(-1) next_states = next_states * routing next_states = next_states.sum(dim=0) - return next_states.type(hidden_states.dtype) + return next_states.to(orig_dtype) class Qwen2VLLanguageModelPatcher(OVDecoderModelPatcher): diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 4dfdea8647..89bcacf9a4 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -119,16 +119,13 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.46.0"): SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "phimoe") - if is_transformers_version("<", "4.54.0"): - SUPPORTED_ARCHITECTURES += ( - "deepseek", - "gigachat3", - ) - # gptq and awq install disabled for windows test environment if platform.system() != "Windows" and is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("opt_gptq", "mixtral_awq") + if is_transformers_version(">=", "4.53.0"): + SUPPORTED_ARCHITECTURES += ("deepseek", "gigachat3") + if is_transformers_version(">", "4.47"): SUPPORTED_ARCHITECTURES += ("olmo2",) @@ -294,11 +291,8 @@ def test_find_untested_architectures(self): if "llama4_text" in supported_architectures: supported_architectures.remove("llama4_text") - if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MAX_TRANSFORMERS_VERSION)): - if "deepseek_v2" in supported_architectures: - supported_architectures.remove("deepseek_v2") - if "deepseek_v3" in supported_architectures: - supported_architectures.remove("deepseek_v3") + if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): + supported_architectures -= {"deepseek_v2", "deepseek_v3"} if is_transformers_version("<", str(BitnetOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"bitnet"} if is_transformers_version("<", str(LFM2OpenVINOConfig.MIN_TRANSFORMERS_VERSION)): @@ -407,6 +401,10 @@ def test_compare_to_transformers(self, model_arch): if model_arch in ["gigachat3"]: tokens.pop("token_type_ids", None) + if model_arch == "deepseek": + ov_model.generation_config.do_sample = False + transformers_model.generation_config.do_sample = False + ov_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None ov_model.config.eos_token_id = None @@ -423,10 +421,6 @@ def test_compare_to_transformers(self, model_arch): ov_outputs = ov_model.generate(**tokens, generation_config=gen_config) - # TODO: add back once https://huggingface.co/katuni4ka/tiny-random-minicpm3/discussions/1 merged (for all models) as current modeling incompatible with transformers >= v4.49 - if 
model_arch in {"deepseek"} and is_transformers_version(">=", "4.49"): - self.skipTest("Incompatible modeling code") - additional_inputs = {} # gemma2 does not support dynamic cache, it is unfair to compare dynamic cache result vs hybrid cache, # align cache representation in torch model @@ -676,10 +670,6 @@ def test_beam_search(self, model_arch): if model_arch in ["lfm2", "granitemoehybrid"]: return - # TODO: add back once https://huggingface.co/katuni4ka/tiny-random-minicpm3/discussions/1 merged (for all models) as current modeling incompatible with transformers >= v4.49 - if model_arch in {"deepseek"} and is_transformers_version(">=", "4.49"): - self.skipTest("Incompatible modeling code") - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in REMOTE_CODE_MODELS) if model_arch == "persimmon": tokenizer.pad_token_id = tokenizer.bos_token_id diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 90332cd397..16b5d528cc 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -98,7 +98,7 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "4.48.0"): SUPPORTED_ARCHITECTURES.update({"cohere2": OVModelForCausalLM}) - if is_transformers_version(">=", "4.46.0") and is_transformers_version("<=", "4.53.3"): + if is_transformers_version(">=", "4.53.0"): SUPPORTED_ARCHITECTURES.update({"deepseek": OVModelForCausalLM, "gigachat3": OVModelForCausalLM}) if is_transformers_version(">=", "4.49"): diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 61a1fce622..ad649baecf 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -159,7 +159,7 @@ class OVCLIExportTestCase(unittest.TestCase): ] ) - if is_transformers_version(">=", "4.46.0") and is_transformers_version("<=", "4.53.3"): + if is_transformers_version(">=", "4.53.0"): SUPPORTED_ARCHITECTURES.extend( [ ("text-generation-with-past", "gigachat3"), diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 6bd099c4cd..f53bd364e1 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -399,7 +399,6 @@ "exaone4", "decilm", "minicpm3", - "deepseek", "qwen3_eagle3", ) @@ -554,8 +553,10 @@ def get_supported_model_for_library(library_name): if supported_model_type[model_type].get("openvino"): export_config = next(iter(supported_model_type[model_type]["openvino"].values())) - min_transformers = str(getattr(export_config.func, "MIN_TRANSFORMERS_VERSION", "0")) - max_transformers = str(getattr(export_config.func, "MAX_TRANSFORMERS_VERSION", "999")) + raw_min = getattr(export_config.func, "MIN_TRANSFORMERS_VERSION", None) + raw_max = getattr(export_config.func, "MAX_TRANSFORMERS_VERSION", None) + min_transformers = str(raw_min) if raw_min is not None else "0" + max_transformers = str(raw_max) if raw_max is not None else "999" if is_transformers_version(">=", min_transformers) and is_transformers_version("<=", max_transformers): valid_model.add(model_type) From 5dbc1c8c22e2a60a39f3d8b115740a485234efa3 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Thu, 26 Mar 2026 02:41:54 +0200 Subject: [PATCH 33/39] fix issues --- optimum/exporters/openvino/model_patcher.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index f20d3bf38c..a86be32aab 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ 
b/optimum/exporters/openvino/model_patcher.py @@ -3869,16 +3869,17 @@ def deepseek_v3_attn_forward( ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: # modified from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/modeling_deepseek.py#L751 def rotate_half(x): + """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] x2 = x[..., x.shape[-1] // 2 :] return torch.cat((-x2, x1), dim=-1) def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): orig_dtype = k.dtype - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_fp32 = q.to(dtype=torch.float32) - k_fp32 = k.to(dtype=torch.float32) + cos = cos[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] + q_fp32 = q.to(dtype=torch.float32, device=q.device) + k_fp32 = k.to(dtype=torch.float32, device=k.device) q_embed = (q_fp32 * cos) + (rotate_half(q_fp32) * sin) k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin) return q_embed.to(dtype=orig_dtype), k_embed.to(dtype=orig_dtype) From f7043c717aea9f7bf8ba2622a5e78d13d247870e Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Thu, 26 Mar 2026 02:43:31 +0200 Subject: [PATCH 34/39] fix issues --- optimum/exporters/openvino/model_patcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index a86be32aab..029ed27679 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3904,9 +3904,9 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): q = self.q_proj(hidden_states) else: q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) - q = q.view(bsz, q_len, self.num_heads, self.qk_head_dim).transpose(1, 2) q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) k_pass, k_rot = torch.split( From 2877a8e9be7974ae9437bb126e34dbbf0aa0809f Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Thu, 26 Mar 2026 02:50:07 +0200 Subject: [PATCH 35/39] fix issues --- optimum/exporters/openvino/model_patcher.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 029ed27679..d6c898fbbb 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4000,6 +4000,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): is_causal=self.is_causal and attention_mask is None and q_len > 1, scale=None if not new_interface else self.scaling, ) + attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) From 2c2d31bf7d8f7cd76a2d728a452b64debef5be01 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Thu, 26 Mar 2026 15:51:16 +0200 Subject: [PATCH 36/39] fix issues --- optimum/exporters/openvino/model_patcher.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index d6c898fbbb..4f1853d0a8 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3929,20 +3929,23 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, 
unsqueeze_dim=1):
     new_interface = position_embeddings is not None and not hasattr(self, "rotary_emb")
 
     if new_interface:
-        from transformers.models.deepseek_v3.modeling_deepseek_v3 import apply_rotary_pos_emb_interleave
+        from transformers.models.deepseek_v3.modeling_deepseek_v3 import (
+            apply_rotary_pos_emb as deepseek_v3_apply_rotary_pos_emb,
+            apply_rotary_pos_emb_interleave as deepseek_v3_apply_rotary_pos_emb_interleave,
+        )
 
         cos, sin = position_embeddings
 
         if getattr(self.config, "rope_interleave", False):
             try:
-                q_pe, k_rot = apply_rotary_pos_emb_interleave(q_pe, k_rot, cos, sin)
+                q_pe, k_rot = deepseek_v3_apply_rotary_pos_emb_interleave(q_pe, k_rot, cos, sin)
             except Exception as e:
                 raise RuntimeError(
                     "Failed to apply interleaved rotary position embeddings, "
                     f"which may be due to an incompatible transformers version; try `pip install transformers>=4.57.1`: {e}"
                 )
         else:
-            q_pe, k_rot = apply_rotary_pos_emb(q_pe, k_rot, cos, sin)
+            q_pe, k_rot = deepseek_v3_apply_rotary_pos_emb(q_pe, k_rot, cos, sin)

From 5cb2d8b17275f8de37601f02ae96e1b89c9d099d Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Thu, 26 Mar 2026 20:01:11 +0200
Subject: [PATCH 37/39] fix issues

---
 optimum/exporters/openvino/model_patcher.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 4f1853d0a8..50056b479c 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -3931,6 +3931,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     if new_interface:
         from transformers.models.deepseek_v3.modeling_deepseek_v3 import (
             apply_rotary_pos_emb as deepseek_v3_apply_rotary_pos_emb,
+        )
+        from transformers.models.deepseek_v3.modeling_deepseek_v3 import (
             apply_rotary_pos_emb_interleave as deepseek_v3_apply_rotary_pos_emb_interleave,
         )
 
@@ -4179,14 +4181,15 @@ def deepseek_moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor,
     orig_dtype = hidden_states.dtype
     num_experts = len(self.experts)
     batch_tokens, _ = hidden_states.shape
-    routing = torch.zeros(batch_tokens, num_experts, dtype=topk_weights.dtype, device=hidden_states.device)
-    routing.scatter_(1, topk_indices, topk_weights)
-    expanded = hidden_states.unsqueeze(0).expand(num_experts, -1, -1)
+    compute_dtype = torch.promote_types(hidden_states.dtype, self.gate_projs.dtype)
+    routing = torch.zeros(batch_tokens, num_experts, dtype=compute_dtype, device=hidden_states.device)
+    routing.scatter_(1, topk_indices, topk_weights.to(dtype=compute_dtype))
+    expanded = hidden_states.to(dtype=compute_dtype).unsqueeze(0).expand(num_experts, -1, -1)
     act_fn = self.experts[0].act_fn
-    gate = torch.bmm(expanded, self.gate_projs.transpose(1, 2))
-    up = torch.bmm(expanded, self.up_projs.transpose(1, 2))
+    gate = torch.bmm(expanded, self.gate_projs.to(dtype=compute_dtype).transpose(1, 2))
+    up = torch.bmm(expanded, self.up_projs.to(dtype=compute_dtype).transpose(1, 2))
     gate_up = act_fn(gate) * up
-    next_states = torch.bmm(gate_up, self.down_projs.transpose(1, 2))
+    next_states = torch.bmm(gate_up, self.down_projs.to(dtype=compute_dtype).transpose(1, 2))
     routing = routing.transpose(0, 1).unsqueeze(-1)
     next_states = next_states * routing
     next_states = next_states.sum(dim=0)

From 39e770cd612ddfef1cdca8dd13f5bc544e9e166f Mon Sep 17 00:00:00 2001
From: Mohamed Ashraf 
<117025882+Mohamed-Ashraf273@users.noreply.github.com> Date: Mon, 30 Mar 2026 16:29:00 +0200 Subject: [PATCH 38/39] Remove Flaubert and add GigaChat3 to models list --- docs/source/openvino/models.mdx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index 03ec2999e8..041f07a9a9 100644 --- a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -60,9 +60,8 @@ Here is the list of the supported architectures : - EXAONE 4 - Falcon - Falcon-Mamba -- Flaubert -- GigaChat3 - FlauBERT +- GigaChat3 - GLM-4 - GLM-Edge - GPT-2 @@ -187,4 +186,4 @@ Here is the list of the supported architectures : - All Transformer and CLIP-based models. ## [OpenCLIP](https://github.com/mlfoundations/open_clip) -- All CLIP-based models \ No newline at end of file +- All CLIP-based models From aec4a90159ead91bf4d55167febb706775db33f7 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Mon, 30 Mar 2026 16:38:14 +0200 Subject: [PATCH 39/39] update docs --- docs/source/openvino/models.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index 041f07a9a9..6339ade837 100644 --- a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -186,4 +186,4 @@ Here is the list of the supported architectures : - All Transformer and CLIP-based models. ## [OpenCLIP](https://github.com/mlfoundations/open_clip) -- All CLIP-based models +- All CLIP-based models \ No newline at end of file
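
Note on the vectorized MoE rewrite: the batched-matmul dispatch that deepseek_moe
uses in the patches above can be sanity-checked against a plain per-expert loop
with a small standalone script. The sketch below is illustrative only and is not
part of the patch series: the sizes are made up, and it assumes SiLU as the expert
activation (the act_fn the patcher reads from self.experts[0]) and distinct top-k
expert indices per token, since routing.scatter_ would silently overwrite
duplicate indices.

    import torch

    torch.manual_seed(0)

    # Made-up sizes; the real values come from the model config.
    num_experts, top_k = 4, 2
    batch_tokens, hidden_dim, moe_inter_dim = 3, 8, 16

    # Stacked per-expert weights, analogous to the pre-concatenated
    # gate_projs / up_projs / down_projs built in DeepseekPatcher.__enter__.
    gate_projs = torch.randn(num_experts, moe_inter_dim, hidden_dim)
    up_projs = torch.randn(num_experts, moe_inter_dim, hidden_dim)
    down_projs = torch.randn(num_experts, hidden_dim, moe_inter_dim)

    hidden_states = torch.randn(batch_tokens, hidden_dim)
    topk_indices = torch.stack([torch.randperm(num_experts)[:top_k] for _ in range(batch_tokens)])
    topk_weights = torch.rand(batch_tokens, top_k)

    # Dense [tokens, experts] routing matrix, zero for unselected experts.
    routing = torch.zeros(batch_tokens, num_experts)
    routing.scatter_(1, topk_indices, topk_weights)

    # Vectorized path: run every token through every expert with three
    # batched matmuls, then mask and sum with the routing weights.
    expanded = hidden_states.unsqueeze(0).expand(num_experts, -1, -1)
    gate = torch.bmm(expanded, gate_projs.transpose(1, 2))
    up = torch.bmm(expanded, up_projs.transpose(1, 2))
    per_expert = torch.bmm(torch.nn.functional.silu(gate) * up, down_projs.transpose(1, 2))
    vectorized = (per_expert * routing.t().unsqueeze(-1)).sum(dim=0)

    # Reference path: explicit loop over each token's selected experts.
    reference = torch.zeros_like(hidden_states)
    for t in range(batch_tokens):
        for i, w in zip(topk_indices[t].tolist(), topk_weights[t]):
            g = gate_projs[i] @ hidden_states[t]
            u = up_projs[i] @ hidden_states[t]
            reference[t] += w * (down_projs[i] @ (torch.nn.functional.silu(g) * u))

    assert torch.allclose(vectorized, reference, atol=1e-5)
    print("vectorized MoE matches the per-expert loop")

Running every token through every expert and masking afterwards spends extra FLOPs,
but it removes the data-dependent control flow of the original expert loop, which is
what keeps the traced graph static for export; that is also why the patcher
pre-concatenates the expert weights once in __enter__ instead of indexing them per
token.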