From f88c6a8bb9a666b4c60d059ae1def4f9858fcca9 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Tue, 3 Mar 2026 05:15:16 +0200 Subject: [PATCH 01/39] add support gigachat3 --- optimum/exporters/openvino/model_patcher.py | 327 ++++++-------------- 1 file changed, 98 insertions(+), 229 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 557cd1f8d1..31048a005d 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3758,8 +3758,8 @@ class DeepseekPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() self_attn = { - "deepseek_v3": deepseek_v3_attn_forward, - "deepseek_v2": deepseek_v2_attn_forward, + "deepseek_v3": make_deepseek_attn_forward(version=3), + "deepseek_v2": make_deepseek_attn_forward(version=2), "deepseek": minicpm3_attn_forward, } @@ -3770,249 +3770,118 @@ def __enter__(self): block.self_attn.forward = types.MethodType(self_attn_fwd, block.self_attn) if hasattr(block.mlp, "moe_infer"): block.mlp._org_moe_infer = block.mlp.moe_infer - block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) + elif hasattr(block.mlp, "experts"): + block.mlp._org_moe_infer = None + block.mlp.ep_rank = 0 + block.mlp.experts_per_rank = len(block.mlp.experts) + block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) for block in self._model.model.layers: - block.self_attn.forward = block.self_attn._orig_forward - if hasattr(block.mlp, "_orig_moe_infer"): - block.mlp.moe_infer = block.mlp._orig_moe_infer - - -def deepseek_v3_attn_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value=None, - output_attentions: bool = False, - use_cache: bool = False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - # modified from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/modeling_deepseek.py#L751 - def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - orig_dtype = k.dtype - cos = cos[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] - q_fp32 = q.to(dtype=torch.float32, device=q.device) - k_fp32 = k.to(dtype=torch.float32, device=k.device) - q_embed = (q_fp32 * cos) + (rotate_half(q_fp32) * sin) - k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin) - return q_embed.to(dtype=orig_dtype), k_embed.to(dtype=orig_dtype) - - if output_attentions: - return self._orig_forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - if self.q_lora_rank is None: - q = self.q_proj(hidden_states) - else: - q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) - q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2) - q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) - - compressed_kv = self.kv_a_proj_with_mqa(hidden_states) - compressed_kv, k_pe = 
torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2) - kv = ( - self.kv_b_proj(self.kv_a_layernorm(compressed_kv)) - .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) - .transpose(1, 2) - ) - - k_nope, value_states = torch.split(kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1) - kv_seq_len = value_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids) - - # Difference with original code, k_pe.new_empty create constant tensor in torchscript - query_states = torch.concat([q_nope, q_pe], dim=-1) - # query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) - # query_states[:, :, :, : self.qk_nope_head_dim] = q_nope - # query_states[:, :, :, self.qk_nope_head_dim :] = q_pe - key_states = torch.concat([k_nope, k_pe.expand(-1, self.num_heads, -1, -1)], dim=-1) - # key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) - # key_states[:, :, :, : self.qk_nope_head_dim] = k_nope - # key_states[:, :, :, self.qk_nope_head_dim :] = k_pe - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - - attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) + if hasattr(block.self_attn, "_orig_forward"): + block.self_attn.forward = block.self_attn._orig_forward + if hasattr(block.mlp, "_org_moe_infer"): + if block.mlp._org_moe_infer is not None: + block.mlp.moe_infer = block.mlp._org_moe_infer + else: + delattr(block.mlp, "moe_infer") + if hasattr(block.mlp, "ep_rank"): + delattr(block.mlp, "ep_rank") + if hasattr(block.mlp, "experts_per_rank"): + delattr(block.mlp, "experts_per_rank") + + +def make_deepseek_attn_forward(version: int = 3): + """Return a MLA attention forward function for the given DeepSeek version. 
+ + Both deepseek_v2 and deepseek_v3 share identical MLA attention logic — the + only differences are: + - v3: ``position_embeddings`` is a ``(cos, sin)`` tuple; RoPE applied via + ``apply_rotary_pos_emb(q_rot, k_rot, cos, sin)``; cache stores cos/sin. + - v2: ``position_embeddings`` is a complex ``freqs_cis`` tensor; RoPE applied + via ``apply_rotary_emb(q_pe, k_pe, freqs_cis)`` (complex multiplication). + """ - attn_output = self.o_proj(attn_output) + def deepseek_attn_forward( + self, + hidden_states: torch.Tensor, + position_embeddings=None, + attention_mask: Optional[torch.Tensor] = None, + past_key_value=None, + past_key_values=None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + batch_size, seq_length = hidden_states.shape[:-1] + query_shape = (batch_size, seq_length, -1, self.qk_head_dim) + key_shape = (batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim) - return attn_output, None, past_key_value + if self.q_lora_rank is None: + q = self.q_proj(hidden_states) + else: + q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) + q = q.view(query_shape).transpose(1, 2) + q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) + k_nope, k_pe = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + k_nope = self.kv_b_proj(self.kv_a_layernorm(k_nope)).view(key_shape).transpose(1, 2) + k_nope, value_states = torch.split(k_nope, [self.qk_nope_head_dim, self.v_head_dim], dim=-1) + k_pe = k_pe.view(batch_size, 1, seq_length, self.qk_rope_head_dim) + + if version == 3: + from transformers.models.deepseek_v3.modeling_deepseek_v3 import apply_rotary_pos_emb + + cos, sin = position_embeddings + q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin) + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + elif version == 2: + + def apply_rotary_emb( + xq: torch.Tensor, + xk: torch.Tensor, + freqs_cis: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) + xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) + # Broadcast to [1, 1, seq_len, dim // 2] + freqs_cis = freqs_cis.unsqueeze(1).to(xq_.device) -def deepseek_v2_attn_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value=None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - # modified from https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/blob/main/modeling_deepseek.py#L806 - def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) + xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).type_as(xq) + xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).type_as(xk) + return xq_out, xk_out - def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_pe, k_pe = apply_rotary_emb(q_pe, k_pe, position_embeddings.to(q_pe.device)) + cache_kwargs = {"cache_position": cache_position} - b, h, s, d = q.shape - q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, 
s, d) + else: + raise ValueError(f"Unsupported DeepSeek version: {version}") - b, h, s, d = k.shape - k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d) + # Use expand+cat instead of new_empty+slice to avoid constant tensors in torchscript + k_pe = k_pe.expand(*k_nope.shape[:-1], -1) + query_states = torch.cat((q_nope, q_pe), dim=-1) + key_states = torch.cat((k_nope, k_pe), dim=-1) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed + kv_cache = past_key_value if past_key_value is not None else past_key_values + if kv_cache is not None: + key_states, value_states = kv_cache.update(key_states, value_states, self.layer_idx, cache_kwargs) - if output_attentions: - return self._orig_forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + is_causal=self.is_causal and attention_mask is None and seq_length > 1, ) - bsz, q_len, _ = hidden_states.shape - - if self.q_lora_rank is None: - q = self.q_proj(hidden_states) - else: - q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) - q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2) - q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) - - compressed_kv = self.kv_a_proj_with_mqa(hidden_states) - compressed_kv, k_pe = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2) - kv = ( - self.kv_b_proj(self.kv_a_layernorm(compressed_kv)) - .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) - .transpose(1, 2) - ) - - k_nope, value_states = torch.split(kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1) - kv_seq_len = value_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids) - - # Difference with original code, k_pe.new_empty create constant tensor in torchscript - query_states = torch.concat([q_nope, q_pe], dim=-1) - # query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) - # query_states[:, :, :, : self.qk_nope_head_dim] = q_nope - # query_states[:, :, :, self.qk_nope_head_dim :] = q_pe - key_states = torch.concat([k_nope, k_pe.expand(-1, self.num_heads, -1, -1)], dim=-1) - # key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) - # key_states[:, :, :, : self.qk_nope_head_dim] = k_nope - # key_states[:, :, :, self.qk_nope_head_dim :] = k_pe - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
-            is_causal=self.is_causal and attention_mask is None and q_len > 1,
-        )
-        attn_output = attn_output.transpose(1, 2).contiguous()
-
-        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
-
-        attn_output = self.o_proj(attn_output)
+        attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, None
 
-    return attn_output, None, past_key_value
+    return deepseek_attn_forward
 
 
 def deepseek_moe_infer(self, x, topk_ids, topk_weight):


From c26ffe8f88344f96a909fc8eeba2c378518f83b9 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Tue, 3 Mar 2026 14:54:57 +0200
Subject: [PATCH 02/39] support gigachat3

---
 optimum/exporters/openvino/model_patcher.py | 140 ++++++++++++--------
 tests/openvino/test_decoder.py              |   2 +-
 tests/openvino/utils_tests.py               |   1 -
 3 files changed, 88 insertions(+), 55 deletions(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 31048a005d..3ff17e85b6 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -3795,91 +3795,125 @@ def __exit__(self, exc_type, exc_value, traceback):
 def make_deepseek_attn_forward(version: int = 3):
     """Return a MLA attention forward function for the given DeepSeek version.
 
-    Both deepseek_v2 and deepseek_v3 share identical MLA attention logic — the
-    only differences are:
-    - v3: ``position_embeddings`` is a ``(cos, sin)`` tuple; RoPE applied via
-      ``apply_rotary_pos_emb(q_rot, k_rot, cos, sin)``; cache stores cos/sin.
-    - v2: ``position_embeddings`` is a complex ``freqs_cis`` tensor; RoPE applied
-      via ``apply_rotary_emb(q_pe, k_pe, freqs_cis)`` (complex multiplication).
+    Args:
+        version: 2 for deepseek_v2 (uses freqs_cis), 3 for deepseek_v3 (uses cos/sin tuple)
     """
+    from typing import Callable
+
+    from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+
+    if version == 3:
+        from transformers.models.deepseek_v3.modeling_deepseek_v3 import (
+            apply_rotary_pos_emb,
+            apply_rotary_pos_emb_interleave,
+            eager_attention_forward,
+        )
+    elif version == 2:
+
+        def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+            batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+            if n_rep == 1:
+                return hidden_states
+            hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+            return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+        def apply_rotary_emb(xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor):
+            xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+            xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+            freqs_cis = freqs_cis.unsqueeze(1).to(xq_.device)
+            xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).type_as(xq)
+            xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).type_as(xk)
+            return xq_out, xk_out
+
+        def eager_attention_forward(module, query, key, value, attention_mask, scaling, dropout=0.0, **kwargs):
+            key = repeat_kv(key, module.num_key_value_groups)
+            value = repeat_kv(value, module.num_key_value_groups)
+            attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
+            if attention_mask is not None:
+                attn_weights = attn_weights + attention_mask
+            attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+            attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+            attn_output = 
torch.matmul(attn_weights, value) + return attn_output.transpose(1, 2).contiguous(), attn_weights + else: + raise ValueError(f"Unsupported DeepSeek version: {version}") def deepseek_attn_forward( self, hidden_states: torch.Tensor, - position_embeddings=None, - attention_mask: Optional[torch.Tensor] = None, - past_key_value=None, + position_embeddings, + attention_mask: Optional[torch.Tensor], + past_key_value: Optional[Cache] = None, past_key_values=None, cache_position: Optional[torch.LongTensor] = None, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: batch_size, seq_length = hidden_states.shape[:-1] - query_shape = (batch_size, seq_length, -1, self.qk_head_dim) - key_shape = (batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim) if self.q_lora_rank is None: - q = self.q_proj(hidden_states) + q_states = self.q_proj(hidden_states) else: - q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) - q = q.view(query_shape).transpose(1, 2) - q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + q_states = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) + q_states = q_states.view(batch_size, seq_length, -1, self.qk_head_dim).transpose(1, 2) + q_pass, q_rot = torch.split(q_states, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) compressed_kv = self.kv_a_proj_with_mqa(hidden_states) - k_nope, k_pe = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - k_nope = self.kv_b_proj(self.kv_a_layernorm(k_nope)).view(key_shape).transpose(1, 2) - k_nope, value_states = torch.split(k_nope, [self.qk_nope_head_dim, self.v_head_dim], dim=-1) - k_pe = k_pe.view(batch_size, 1, seq_length, self.qk_rope_head_dim) + k_pass, k_rot = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + k_pass = self.kv_b_proj(self.kv_a_layernorm(k_pass)) + k_pass = k_pass.view(batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim).transpose(1, 2) + k_pass, value_states = torch.split(k_pass, [self.qk_nope_head_dim, self.v_head_dim], dim=-1) + k_rot = k_rot.view(batch_size, 1, seq_length, self.qk_rope_head_dim) if version == 3: - from transformers.models.deepseek_v3.modeling_deepseek_v3 import apply_rotary_pos_emb - cos, sin = position_embeddings - q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin) + if self.config.rope_interleave: + q_rot, k_rot = apply_rotary_pos_emb_interleave(q_rot, k_rot, cos, sin) + else: + q_rot, k_rot = apply_rotary_pos_emb(q_rot, k_rot, cos, sin) cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - elif version == 2: - - def apply_rotary_emb( - xq: torch.Tensor, - xk: torch.Tensor, - freqs_cis: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) - xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) - - # Broadcast to [1, 1, seq_len, dim // 2] - freqs_cis = freqs_cis.unsqueeze(1).to(xq_.device) - - xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).type_as(xq) - xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).type_as(xk) - return xq_out, xk_out - - q_pe, k_pe = apply_rotary_emb(q_pe, k_pe, position_embeddings.to(q_pe.device)) - cache_kwargs = {"cache_position": cache_position} - + kv_cache = past_key_value else: - raise ValueError(f"Unsupported DeepSeek version: {version}") + q_rot, k_rot = apply_rotary_emb(q_rot, k_rot, position_embeddings.to(q_rot.device)) + cache_kwargs = {"cache_position": cache_position} + 
kv_cache = past_key_values - # Use expand+cat instead of new_empty+slice to avoid constant tensors in torchscript - k_pe = k_pe.expand(*k_nope.shape[:-1], -1) - query_states = torch.cat((q_nope, q_pe), dim=-1) - key_states = torch.cat((k_nope, k_pe), dim=-1) + k_rot = k_rot.expand(*k_pass.shape[:-1], -1) + query_states = torch.cat((q_pass, q_rot), dim=-1) + key_states = torch.cat((k_pass, k_rot), dim=-1) - kv_cache = past_key_value if past_key_value is not None else past_key_values if kv_cache is not None: key_states, value_states = kv_cache.update(key_states, value_states, self.layer_idx, cache_kwargs) - attn_output = torch.nn.functional.scaled_dot_product_attention( + is_flash_attn = "flash" in self.config._attn_implementation + if is_flash_attn and self.qk_head_dim != self.v_head_dim: + value_states = F.pad(value_states, [0, self.qk_head_dim - self.v_head_dim]) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + if version == 2: + attention_interface = ALL_ATTENTION_FUNCTIONS.get_interface( + self.config._attn_implementation, eager_attention_forward + ) + else: + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, query_states, key_states, value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - is_causal=self.is_causal and attention_mask is None and seq_length > 1, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, ) + if is_flash_attn and self.qk_head_dim != self.v_head_dim: + attn_output = attn_output[:, :, :, : self.v_head_dim] + attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous() attn_output = self.o_proj(attn_output) - return attn_output, None + return attn_output, attn_weights return deepseek_attn_forward diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 91121023d8..736d7379de 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -383,7 +383,7 @@ def test_compare_to_transformers(self, model_arch): transformers_outputs = transformers_model(**tokens) # Compare tensor outputs - atol = 3e-3 if model_arch in ["minicpm", "qwen2-moe"] else 1e-4 + atol = 3e-2 if model_arch in ["deepseek"] else 3e-3 if model_arch in ["minicpm", "qwen2-moe"] else 1e-4 # quantized models have different logits value range if "awq" not in model_arch and "gptq" not in model_arch: self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, equal_nan=True, atol=atol)) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 230ec88e45..472733b1f3 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -395,7 +395,6 @@ "exaone4", "decilm", "minicpm3", - "deepseek", "qwen3_eagle3", ) From 751cd02b2736ec37e4367a1ca3de54d184bc23bf Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Tue, 3 Mar 2026 23:09:13 +0200 Subject: [PATCH 03/39] add tests & create tiny model --- tests/openvino/test_decoder.py | 11 ++++++++--- tests/openvino/utils_tests.py | 1 + 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 736d7379de..12443d1465 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -117,7 +117,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("glm", 
"mistral-nemo", "phimoe") if is_transformers_version("<", "4.54.0"): - SUPPORTED_ARCHITECTURES += ("deepseek",) + SUPPORTED_ARCHITECTURES += ("deepseek", "gigachat3") # gptq and awq install disabled for windows test environment if platform.system() != "Windows" and is_transformers_version("<", "4.56.0"): @@ -227,6 +227,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "minicpm3": 6, "phimoe": 2, "deepseek": 2, + "gigachat3": 2, "opt_gptq": 12, "mixtral_awq": 2, "gemma3_text": 2, @@ -383,9 +384,13 @@ def test_compare_to_transformers(self, model_arch): transformers_outputs = transformers_model(**tokens) # Compare tensor outputs - atol = 3e-2 if model_arch in ["deepseek"] else 3e-3 if model_arch in ["minicpm", "qwen2-moe"] else 1e-4 + atol = 3e-2 if model_arch in ["deepseek", "gigachat3"] else 3e-3 if model_arch in ["minicpm", "qwen2-moe"] else 1e-4 # quantized models have different logits value range if "awq" not in model_arch and "gptq" not in model_arch: + diff = torch.abs(ov_outputs.logits - transformers_outputs.logits) + print(f"\nMax diff: {diff.max()}, Mean diff: {diff.mean()}, aftol: {atol}") + print(f"OV logits sample: {ov_outputs.logits[0, 0, :5]}") + print(f"TF logits sample: {transformers_outputs.logits[0, 0, :5]}") self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, equal_nan=True, atol=atol)) # Qwen tokenizer does not support padding @@ -410,7 +415,7 @@ def test_compare_to_transformers(self, model_arch): ov_outputs = ov_model.generate(**tokens, generation_config=gen_config) # TODO: add back once https://huggingface.co/katuni4ka/tiny-random-minicpm3/discussions/1 merged (for all models) as current modeling incompatible with transformers >= v4.49 - if model_arch in {"deepseek"} and is_transformers_version(">=", "4.49"): + if model_arch in {"deepseek", "gigachat3"} and is_transformers_version(">=", "4.49"): self.skipTest("Incompatible modeling code") additional_inputs = {} diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 472733b1f3..f7de13a928 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -70,6 +70,7 @@ "deberta-v2": "optimum-intel-internal-testing/tiny-random-DebertaV2Model", "decilm": "optimum-intel-internal-testing/tiny-random-decilm", "deepseek": "optimum-intel-internal-testing/tiny-random-deepseek-v3", + "gigachat3": "mohamedahraf273/tiny-random-gigachat3", "deit": "optimum-intel-internal-testing/tiny-random-DeiTModel", "convnext": "optimum-intel-internal-testing/tiny-random-convnext", "convnextv2": "optimum-intel-internal-testing/tiny-random-ConvNextV2Model", From 0d16b4f41e2bd87f72ded8d066a61e5a3cf417ef Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Wed, 4 Mar 2026 11:15:51 +0200 Subject: [PATCH 04/39] add tests and fix issues --- optimum/exporters/openvino/model_configs.py | 3 +-- optimum/exporters/openvino/model_patcher.py | 1 + tests/openvino/test_decoder.py | 20 +++++++++++--------- tests/openvino/test_export.py | 3 +++ tests/openvino/test_exporters_cli.py | 16 +++++++++++++--- tests/openvino/utils_tests.py | 1 + 6 files changed, 30 insertions(+), 14 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 7ffe158396..d295f9b266 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -4080,8 +4080,7 @@ class M2M100OpenVINOConfig(BartOpenVINOConfig): ) @register_in_tasks_manager("deepseek", *["text-generation", 
"text-generation-with-past"], library_name="transformers") class DeepseekOpenVINOConfig(MiniCPM3OpenVINOConfig): - MIN_TRANSFORMERS_VERSION = "4.46.0" - MAX_TRANSFORMERS_VERSION = "4.53.3" + MIN_TRANSFORMERS_VERSION = "4.53.0" _MODEL_PATCHER = DeepseekPatcher diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 3ff17e85b6..a1e9d111f1 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3835,6 +3835,7 @@ def eager_attention_forward(module, query, key, value, attention_mask, scaling, attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) attn_output = torch.matmul(attn_weights, value) return attn_output.transpose(1, 2).contiguous(), attn_weights + else: raise ValueError(f"Unsupported DeepSeek version: {version}") diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 12443d1465..d3cb6eac72 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -116,13 +116,13 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.46.0"): SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "phimoe") - if is_transformers_version("<", "4.54.0"): - SUPPORTED_ARCHITECTURES += ("deepseek", "gigachat3") - # gptq and awq install disabled for windows test environment if platform.system() != "Windows" and is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("opt_gptq", "mixtral_awq") + if is_transformers_version(">=", "4.53.0"): + SUPPORTED_ARCHITECTURES += ("deepseek", "gigachat3") + if is_transformers_version(">", "4.47"): SUPPORTED_ARCHITECTURES += ("olmo2",) @@ -384,13 +384,15 @@ def test_compare_to_transformers(self, model_arch): transformers_outputs = transformers_model(**tokens) # Compare tensor outputs - atol = 3e-2 if model_arch in ["deepseek", "gigachat3"] else 3e-3 if model_arch in ["minicpm", "qwen2-moe"] else 1e-4 + atol_by_arch = { + "deepseek": 3e-2, + "gigachat3": 3e-2, + "minicpm": 3e-3, + "qwen2-moe": 3e-3, + } + atol = atol_by_arch.get(model_arch, 1e-4) # quantized models have different logits value range if "awq" not in model_arch and "gptq" not in model_arch: - diff = torch.abs(ov_outputs.logits - transformers_outputs.logits) - print(f"\nMax diff: {diff.max()}, Mean diff: {diff.mean()}, aftol: {atol}") - print(f"OV logits sample: {ov_outputs.logits[0, 0, :5]}") - print(f"TF logits sample: {transformers_outputs.logits[0, 0, :5]}") self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, equal_nan=True, atol=atol)) # Qwen tokenizer does not support padding @@ -668,7 +670,7 @@ def test_beam_search(self, model_arch): return # TODO: add back once https://huggingface.co/katuni4ka/tiny-random-minicpm3/discussions/1 merged (for all models) as current modeling incompatible with transformers >= v4.49 - if model_arch in {"deepseek"} and is_transformers_version(">=", "4.49"): + if model_arch in {"deepseek", "gigachat3"}: self.skipTest("Incompatible modeling code") tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in REMOTE_CODE_MODELS) diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index d1c373e2bc..1cb27d2f97 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -98,6 +98,9 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "4.48.0"): SUPPORTED_ARCHITECTURES.update({"cohere2": OVModelForCausalLM}) + if 
is_transformers_version(">=", "4.53.0"): + SUPPORTED_ARCHITECTURES.update({"deepseek": OVModelForCausalLM, "gigachat3": OVModelForCausalLM}) + if is_transformers_version(">=", "4.49"): SUPPORTED_ARCHITECTURES.update({"zamba2": OVModelForCausalLM}) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index edbc01e310..c8b0eec341 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -159,6 +159,13 @@ class OVCLIExportTestCase(unittest.TestCase): ] ) + if is_transformers_version(">=", "4.53.0"): + SUPPORTED_ARCHITECTURES.extend( + [ + ("text-generation-with-past", "gigachat3"), + ] + ) + if is_transformers_version(">=", "4.57.0"): SUPPORTED_ARCHITECTURES.extend( [ @@ -197,6 +204,7 @@ class OVCLIExportTestCase(unittest.TestCase): "exaone4": 2, "bitnet": 2, "granitemoehybrid": 2, + "gigachat3": 2, } TOKENIZER_CHAT_TEMPLATE_TESTS_MODELS = { @@ -299,9 +307,11 @@ class OVCLIExportTestCase(unittest.TestCase): "whisper", "f8e4m3", "--dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code", - {"encoder": 16, "decoder": 26, "decoder_with_past": 23} - if is_transformers_version("<=", "4.45") - else {"encoder": 16, "decoder": 26, "decoder_with_past": 25}, + ( + {"encoder": 16, "decoder": 26, "decoder_with_past": 23} + if is_transformers_version("<=", "4.45") + else {"encoder": 16, "decoder": 26, "decoder_with_past": 25} + ), ( {"encoder": {"f8e4m3": 14}, "decoder": {"f8e4m3": 22}, "decoder_with_past": {"f8e4m3": 17}} if is_transformers_version("<=", "4.45") diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index f7de13a928..35d74ebcb0 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -371,6 +371,7 @@ "lfm2": {"model": 52}, "hunyuan_v1_dense": {"model": 32}, "qwen3_eagle3": {"model": 20}, + "gigachat3": {"model": 58}, } TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg" From f2a1e533759fef1b0c2be059aff1d6151010047f Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Wed, 4 Mar 2026 11:18:38 +0200 Subject: [PATCH 05/39] fix version skip test --- tests/openvino/test_decoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index d3cb6eac72..7a36240a70 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -670,7 +670,7 @@ def test_beam_search(self, model_arch): return # TODO: add back once https://huggingface.co/katuni4ka/tiny-random-minicpm3/discussions/1 merged (for all models) as current modeling incompatible with transformers >= v4.49 - if model_arch in {"deepseek", "gigachat3"}: + if model_arch in {"deepseek"} and is_transformers_version(">=", "4.49"): self.skipTest("Incompatible modeling code") tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in REMOTE_CODE_MODELS) From aac19fb9624be81bdcd62eef2fc7223a0c13d9b2 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Wed, 4 Mar 2026 12:13:11 +0200 Subject: [PATCH 06/39] add docs & modify patcher --- docs/source/openvino/models.mdx | 1 + optimum/exporters/openvino/model_patcher.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index 51200060e8..d62b9e654a 100644 --- a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -61,6 +61,7 @@ Here is the list of the supported architectures : - Falcon - Falcon-Mamba - Flaubert +- 
GigaChat3 - GLM-4 - GLM-Edge - GPT-2 diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index a1e9d111f1..b338930ce0 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3770,11 +3770,12 @@ def __enter__(self): block.self_attn.forward = types.MethodType(self_attn_fwd, block.self_attn) if hasattr(block.mlp, "moe_infer"): block.mlp._org_moe_infer = block.mlp.moe_infer + block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) elif hasattr(block.mlp, "experts"): block.mlp._org_moe_infer = None block.mlp.ep_rank = 0 block.mlp.experts_per_rank = len(block.mlp.experts) - block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) + block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) From 5c134eb352092176a75b87a63194b161d969550c Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Wed, 4 Mar 2026 12:15:31 +0200 Subject: [PATCH 07/39] modify patcher --- optimum/exporters/openvino/model_patcher.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index b338930ce0..3cb2f0f553 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3794,11 +3794,6 @@ def __exit__(self, exc_type, exc_value, traceback): def make_deepseek_attn_forward(version: int = 3): - """Return a MLA attention forward function for the given DeepSeek version. - - Args: - version: 2 for deepseek_v2 (uses freqs_cis), 3 for deepseek_v3 (uses cos/sin tuple) - """ from typing import Callable from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS From f231bca4d53adc663611064dcceda19be8d2fbed Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Wed, 4 Mar 2026 12:22:05 +0200 Subject: [PATCH 08/39] modify patcher --- optimum/exporters/openvino/model_patcher.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 3cb2f0f553..dbc17c49d2 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3769,10 +3769,10 @@ def __enter__(self): block.self_attn._orig_forward = block.self_attn.forward block.self_attn.forward = types.MethodType(self_attn_fwd, block.self_attn) if hasattr(block.mlp, "moe_infer"): - block.mlp._org_moe_infer = block.mlp.moe_infer + block.mlp._orig_moe_infer = block.mlp.moe_infer block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) elif hasattr(block.mlp, "experts"): - block.mlp._org_moe_infer = None + block.mlp._orig_moe_infer = None block.mlp.ep_rank = 0 block.mlp.experts_per_rank = len(block.mlp.experts) block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) @@ -3782,9 +3782,9 @@ def __exit__(self, exc_type, exc_value, traceback): for block in self._model.model.layers: if hasattr(block.self_attn, "_orig_forward"): block.self_attn.forward = block.self_attn._orig_forward - if hasattr(block.mlp, "_org_moe_infer"): - if block.mlp._org_moe_infer is not None: - block.mlp.moe_infer = block.mlp._org_moe_infer + if hasattr(block.mlp, "_orig_moe_infer"): + if block.mlp._orig_moe_infer is not None: + block.mlp.moe_infer = block.mlp._orig_moe_infer else: delattr(block.mlp, "moe_infer") if hasattr(block.mlp, "ep_rank"): From 
8f18ff5f4dda72fd2d73c3032262d42db50f4b98 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Thu, 5 Mar 2026 15:18:14 +0200 Subject: [PATCH 09/39] fix issues --- optimum/exporters/openvino/convert.py | 20 ++++++++++ optimum/exporters/openvino/model_configs.py | 1 + optimum/exporters/openvino/model_patcher.py | 44 ++++++++++++++++++++- tests/openvino/test_decoder.py | 23 ++++------- tests/openvino/utils_tests.py | 8 ++-- 5 files changed, 75 insertions(+), 21 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index eda3c7e01b..20f3fe12eb 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -716,7 +716,27 @@ def export_from_model( generation_config = getattr(model, "generation_config", None) if generation_config is not None: try: + # Preserve the original `transformers_version` from the source model's generation_config.json. + # Starting in transformers 4.50, _prepare_generation_config() applies model-default generation + # parameters (do_sample, temperature, top_p, …) when the user-provided GenerationConfig uses + # the global default value for those fields AND the stored `transformers_version` is >= 4.50. + # Exporting bumps the version to the current transformers release, which causes user-supplied + # params (e.g. do_sample=False) to be silently overridden by the model defaults at inference + # time. Preserving the original version keeps the OV model consistent with the PT original. + orig_transformers_version = getattr(generation_config, "transformers_version", None) generation_config.save_pretrained(output) + if orig_transformers_version is not None: + import json as _json + from pathlib import Path as _Path + + gen_cfg_path = _Path(output) / "generation_config.json" + if gen_cfg_path.exists(): + with open(gen_cfg_path, "r", encoding="utf-8") as _f: + _cfg = _json.load(_f) + if _cfg.get("transformers_version") != orig_transformers_version: + _cfg["transformers_version"] = orig_transformers_version + with open(gen_cfg_path, "w", encoding="utf-8") as _f: + _json.dump(_cfg, _f, indent=2) except Exception as exception: logger.warning( f"The generation config will not be saved, saving failed with following error:\n{exception}" diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index d295f9b266..de4916c7a4 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -4081,6 +4081,7 @@ class M2M100OpenVINOConfig(BartOpenVINOConfig): @register_in_tasks_manager("deepseek", *["text-generation", "text-generation-with-past"], library_name="transformers") class DeepseekOpenVINOConfig(MiniCPM3OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.53.0" + MAX_TRANSFORMERS_VERSION = None _MODEL_PATCHER = DeepseekPatcher diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index dbc17c49d2..40c8c606da 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3769,10 +3769,21 @@ def __enter__(self): block.self_attn._orig_forward = block.self_attn.forward block.self_attn.forward = types.MethodType(self_attn_fwd, block.self_attn) if hasattr(block.mlp, "moe_infer"): + # old interface (transformers < 4.57): moe_infer(self, x, topk_ids, topk_weight) block.mlp._orig_moe_infer = block.mlp.moe_infer + block.mlp._orig_moe = None + block.mlp.ep_rank = getattr(block.mlp, "ep_rank", 0) + block.mlp.experts_per_rank = 
getattr(block.mlp, "experts_per_rank", len(block.mlp.experts)) block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) + elif hasattr(block.mlp, "moe") and hasattr(block.mlp, "experts"): + # new interface (transformers >= 4.57): moe(self, hidden_states, topk_indices, topk_weights) + block.mlp._orig_moe = block.mlp.moe + block.mlp._orig_moe_infer = None + block.mlp.moe = types.MethodType(deepseek_moe, block.mlp) elif hasattr(block.mlp, "experts"): + # fallback: patch by injecting moe_infer with required attributes block.mlp._orig_moe_infer = None + block.mlp._orig_moe = None block.mlp.ep_rank = 0 block.mlp.experts_per_rank = len(block.mlp.experts) block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) @@ -3782,15 +3793,21 @@ def __exit__(self, exc_type, exc_value, traceback): for block in self._model.model.layers: if hasattr(block.self_attn, "_orig_forward"): block.self_attn.forward = block.self_attn._orig_forward + if hasattr(block.mlp, "_orig_moe"): + if block.mlp._orig_moe is not None: + block.mlp.moe = block.mlp._orig_moe + delattr(block.mlp, "_orig_moe") if hasattr(block.mlp, "_orig_moe_infer"): if block.mlp._orig_moe_infer is not None: block.mlp.moe_infer = block.mlp._orig_moe_infer else: - delattr(block.mlp, "moe_infer") + if hasattr(block.mlp, "moe_infer"): + delattr(block.mlp, "moe_infer") if hasattr(block.mlp, "ep_rank"): delattr(block.mlp, "ep_rank") if hasattr(block.mlp, "experts_per_rank"): delattr(block.mlp, "experts_per_rank") + delattr(block.mlp, "_orig_moe_infer") def make_deepseek_attn_forward(version: int = 3): @@ -3868,7 +3885,7 @@ def deepseek_attn_forward( else: q_rot, k_rot = apply_rotary_pos_emb(q_rot, k_rot, cos, sin) cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - kv_cache = past_key_value + kv_cache = past_key_values if past_key_values is not None else past_key_value else: q_rot, k_rot = apply_rotary_emb(q_rot, k_rot, position_embeddings.to(q_rot.device)) cache_kwargs = {"cache_position": cache_position} @@ -3949,6 +3966,29 @@ def deepseek_moe_infer(self, x, topk_ids, topk_weight): return final_out +def deepseek_moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor): + """ + Replacement for DeepseekV3MoE.moe (transformers >= 4.57). + The original skips experts with no tokens (data-dependent control flow that breaks tracing). + This version unconditionally runs all experts to produce a traceable static graph. 
+ """ + final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype) + expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=len(self.experts)) + expert_mask = expert_mask.permute(2, 0, 1) + + for expert_idx in range(len(self.experts)): + expert = self.experts[expert_idx] + mask = expert_mask[expert_idx] + token_indices, weight_indices = torch.where(mask) + expert_weights = topk_weights[token_indices, weight_indices] + expert_input = hidden_states[token_indices] + expert_output = expert(expert_input) + weighted_output = expert_output * expert_weights.unsqueeze(-1) + final_hidden_states.index_add_(0, token_indices, weighted_output) + + return final_hidden_states.type(hidden_states.dtype) + + class Qwen2VLLanguageModelPatcher(OVDecoderModelPatcher): def __init__( self, diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 7a36240a70..2110c1b191 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -24,7 +24,6 @@ from optimum.exporters.openvino.model_configs import ( BitnetOpenVINOConfig, - DeepseekOpenVINOConfig, LFM2OpenVINOConfig, Qwen3VLOpenVINOConfig, ) @@ -287,11 +286,6 @@ def test_find_untested_architectures(self): if "llama4_text" in supported_architectures: supported_architectures.remove("llama4_text") - if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MAX_TRANSFORMERS_VERSION)): - if "deepseek_v2" in supported_architectures: - supported_architectures.remove("deepseek_v2") - if "deepseek_v3" in supported_architectures: - supported_architectures.remove("deepseek_v3") if is_transformers_version("<", str(BitnetOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"bitnet"} if is_transformers_version("<", str(LFM2OpenVINOConfig.MIN_TRANSFORMERS_VERSION)): @@ -385,10 +379,9 @@ def test_compare_to_transformers(self, model_arch): # Compare tensor outputs atol_by_arch = { - "deepseek": 3e-2, - "gigachat3": 3e-2, "minicpm": 3e-3, "qwen2-moe": 3e-3, + "gigachat3": 3e-3, } atol = atol_by_arch.get(model_arch, 1e-4) # quantized models have different logits value range @@ -400,6 +393,12 @@ def test_compare_to_transformers(self, model_arch): return tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + + # Gigachat3 tokenizer add token_type_ids which DeepSeekV3 + # and similar models do not accept in generate(); strip it so both OV and PT calls succeed. 
+ if model_arch in ["gigachat3"]: + tokens.pop("token_type_ids", None) + ov_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None ov_model.config.eos_token_id = None @@ -416,10 +415,6 @@ def test_compare_to_transformers(self, model_arch): ov_outputs = ov_model.generate(**tokens, generation_config=gen_config) - # TODO: add back once https://huggingface.co/katuni4ka/tiny-random-minicpm3/discussions/1 merged (for all models) as current modeling incompatible with transformers >= v4.49 - if model_arch in {"deepseek", "gigachat3"} and is_transformers_version(">=", "4.49"): - self.skipTest("Incompatible modeling code") - additional_inputs = {} # gemma2 does not support dynamic cache, it is unfair to compare dynamic cache result vs hybrid cache, # align cache representation in torch model @@ -669,10 +664,6 @@ def test_beam_search(self, model_arch): if model_arch in ["lfm2", "granitemoehybrid"]: return - # TODO: add back once https://huggingface.co/katuni4ka/tiny-random-minicpm3/discussions/1 merged (for all models) as current modeling incompatible with transformers >= v4.49 - if model_arch in {"deepseek"} and is_transformers_version(">=", "4.49"): - self.skipTest("Incompatible modeling code") - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in REMOTE_CODE_MODELS) if model_arch == "persimmon": tokenizer.pad_token_id = tokenizer.bos_token_id diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 35d74ebcb0..4c2dbf1065 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -70,7 +70,7 @@ "deberta-v2": "optimum-intel-internal-testing/tiny-random-DebertaV2Model", "decilm": "optimum-intel-internal-testing/tiny-random-decilm", "deepseek": "optimum-intel-internal-testing/tiny-random-deepseek-v3", - "gigachat3": "mohamedahraf273/tiny-random-gigachat3", + "gigachat3": "optimum-intel-internal-testing/tiny-random-gigachat3", "deit": "optimum-intel-internal-testing/tiny-random-DeiTModel", "convnext": "optimum-intel-internal-testing/tiny-random-convnext", "convnextv2": "optimum-intel-internal-testing/tiny-random-ConvNextV2Model", @@ -551,8 +551,10 @@ def get_supported_model_for_library(library_name): if supported_model_type[model_type].get("openvino"): export_config = next(iter(supported_model_type[model_type]["openvino"].values())) - min_transformers = str(getattr(export_config.func, "MIN_TRANSFORMERS_VERSION", "0")) - max_transformers = str(getattr(export_config.func, "MAX_TRANSFORMERS_VERSION", "999")) + raw_min = getattr(export_config.func, "MIN_TRANSFORMERS_VERSION", None) + raw_max = getattr(export_config.func, "MAX_TRANSFORMERS_VERSION", None) + min_transformers = str(raw_min) if raw_min is not None else "0" + max_transformers = str(raw_max) if raw_max is not None else "999" if is_transformers_version(">=", min_transformers) and is_transformers_version("<=", max_transformers): valid_model.add(model_type) From 5049ce373aebfeedff0f12cc4be200340ca4eef9 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Thu, 5 Mar 2026 15:56:48 +0200 Subject: [PATCH 10/39] update test --- tests/openvino/test_decoder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 2110c1b191..4e2d12f56e 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -381,7 +381,6 @@ def test_compare_to_transformers(self, model_arch): atol_by_arch = { "minicpm": 3e-3, "qwen2-moe": 3e-3, - "gigachat3": 3e-3, } atol = 
atol_by_arch.get(model_arch, 1e-4) # quantized models have different logits value range From 722dc5334302cd3678ff4400bad6f74a71000845 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Fri, 6 Mar 2026 10:33:17 +0200 Subject: [PATCH 11/39] update tests --- tests/openvino/test_decoder.py | 6 +----- tests/openvino/test_exporters_cli.py | 8 +++----- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 4e2d12f56e..ee7ee6c1ea 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -378,11 +378,7 @@ def test_compare_to_transformers(self, model_arch): transformers_outputs = transformers_model(**tokens) # Compare tensor outputs - atol_by_arch = { - "minicpm": 3e-3, - "qwen2-moe": 3e-3, - } - atol = atol_by_arch.get(model_arch, 1e-4) + atol = 3e-3 if model_arch in ["minicpm", "qwen2-moe"] else 1e-4 # quantized models have different logits value range if "awq" not in model_arch and "gptq" not in model_arch: self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, equal_nan=True, atol=atol)) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index c8b0eec341..b14b4fe40a 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -307,11 +307,9 @@ class OVCLIExportTestCase(unittest.TestCase): "whisper", "f8e4m3", "--dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code", - ( - {"encoder": 16, "decoder": 26, "decoder_with_past": 23} - if is_transformers_version("<=", "4.45") - else {"encoder": 16, "decoder": 26, "decoder_with_past": 25} - ), + {"encoder": 16, "decoder": 26, "decoder_with_past": 23} + if is_transformers_version("<=", "4.45") + else {"encoder": 16, "decoder": 26, "decoder_with_past": 25}, ( {"encoder": {"f8e4m3": 14}, "decoder": {"f8e4m3": 22}, "decoder_with_past": {"f8e4m3": 17}} if is_transformers_version("<=", "4.45") From 28a6330daa3f49529bb9fa3bf553f5bc644968cf Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Fri, 6 Mar 2026 11:57:04 +0200 Subject: [PATCH 12/39] fix test issue --- tests/openvino/test_decoder.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index ee7ee6c1ea..2f47694b5f 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -24,6 +24,7 @@ from optimum.exporters.openvino.model_configs import ( BitnetOpenVINOConfig, + DeepseekOpenVINOConfig, LFM2OpenVINOConfig, Qwen3VLOpenVINOConfig, ) @@ -290,6 +291,9 @@ def test_find_untested_architectures(self): supported_architectures -= {"bitnet"} if is_transformers_version("<", str(LFM2OpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"lfm2"} + # deepseek_v2 and deepseek_v3 are aliases of the same architecture tested under "deepseek" + if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): + supported_architectures -= {"deepseek_v2", "deepseek_v3"} # qwen3_vl_text a part of qwen3_vl architecture and is tested in seq2seq group if is_transformers_version(">=", str(Qwen3VLOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): From 07efafd24bda7a2e076e5ce59430543861199e46 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Fri, 6 Mar 2026 12:00:37 +0200 Subject: [PATCH 13/39] fix test issue --- tests/openvino/test_decoder.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/openvino/test_decoder.py 
From 07efafd24bda7a2e076e5ce59430543861199e46 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Fri, 6 Mar 2026 12:00:37 +0200
Subject: [PATCH 13/39] fix test issue

---
 tests/openvino/test_decoder.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py
index 2f47694b5f..c19ef2ade9 100644
--- a/tests/openvino/test_decoder.py
+++ b/tests/openvino/test_decoder.py
@@ -287,13 +287,15 @@ def test_find_untested_architectures(self):
         if "llama4_text" in supported_architectures:
             supported_architectures.remove("llama4_text")
 
+        if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MAX_TRANSFORMERS_VERSION)):
+            if "deepseek_v2" in supported_architectures:
+                supported_architectures.remove("deepseek_v2")
+            if "deepseek_v3" in supported_architectures:
+                supported_architectures.remove("deepseek_v3")
         if is_transformers_version("<", str(BitnetOpenVINOConfig.MIN_TRANSFORMERS_VERSION)):
             supported_architectures -= {"bitnet"}
         if is_transformers_version("<", str(LFM2OpenVINOConfig.MIN_TRANSFORMERS_VERSION)):
             supported_architectures -= {"lfm2"}
-        # deepseek_v2 and deepseek_v3 are aliases of the same architecture tested under "deepseek"
-        if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MIN_TRANSFORMERS_VERSION)):
-            supported_architectures -= {"deepseek_v2", "deepseek_v3"}
 
         # qwen3_vl_text a part of qwen3_vl architecture and is tested in seq2seq group
         if is_transformers_version(">=", str(Qwen3VLOpenVINOConfig.MIN_TRANSFORMERS_VERSION)):

From c52c62a5e8f04cd80ef8c50b019cd587564631b0 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Mon, 9 Mar 2026 13:22:01 +0200
Subject: [PATCH 14/39] fix tests

---
 tests/openvino/test_decoder.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py
index c19ef2ade9..74e0673dbb 100644
--- a/tests/openvino/test_decoder.py
+++ b/tests/openvino/test_decoder.py
@@ -287,11 +287,8 @@ def test_find_untested_architectures(self):
         if "llama4_text" in supported_architectures:
             supported_architectures.remove("llama4_text")
 
-        if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MAX_TRANSFORMERS_VERSION)):
-            if "deepseek_v2" in supported_architectures:
-                supported_architectures.remove("deepseek_v2")
-            if "deepseek_v3" in supported_architectures:
-                supported_architectures.remove("deepseek_v3")
+        if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MIN_TRANSFORMERS_VERSION)):
+            supported_architectures -= {"deepseek_v2", "deepseek_v3"}
         if is_transformers_version("<", str(BitnetOpenVINOConfig.MIN_TRANSFORMERS_VERSION)):
             supported_architectures -= {"bitnet"}
         if is_transformers_version("<", str(LFM2OpenVINOConfig.MIN_TRANSFORMERS_VERSION)):
             supported_architectures -= {"lfm2"}

From a63a52d9f457d8b5beeb9950ad5ef91d02dc7abf Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Mon, 9 Mar 2026 15:26:59 +0200
Subject: [PATCH 15/39] fix conflict

---
 tests/openvino/utils_tests.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index 4c2dbf1065..455baab74d 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -173,6 +173,7 @@
     "qwen3": "optimum-intel-internal-testing/tiny-random-qwen3",
     "qwen3_moe": "optimum-intel-internal-testing/tiny-random-qwen3moe",
     "qwen3_vl": "optimum-intel-internal-testing/tiny-random-qwen3-vl",
+    "qwen3_next": "optimum-intel-internal-testing/tiny-random-qwen3-next",
     "rembert": "optimum-intel-internal-testing/tiny-random-rembert",
     "resnet": "optimum-intel-internal-testing/tiny-random-resnet",
     "roberta": "optimum-intel-internal-testing/tiny-random-roberta",
@@ -372,6 +373,7 @@
     "hunyuan_v1_dense": {"model": 32},
     "qwen3_eagle3": {"model": 20},
     "gigachat3": {"model": 58},
+    "qwen3_next": {"model": 100},
 }
 
 TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg"

From f8bdfe5591d9c35ed61cd526dfca45fc8397ee24 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Mon, 9 Mar 2026 15:38:49 +0200
Subject: [PATCH 16/39] fix conflict

---
 tests/openvino/utils_tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index 455baab74d..f53bd364e1 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -372,8 +372,8 @@
     "lfm2": {"model": 52},
     "hunyuan_v1_dense": {"model": 32},
     "qwen3_eagle3": {"model": 20},
-    "gigachat3": {"model": 58},
     "qwen3_next": {"model": 100},
+    "gigachat3": {"model": 58},
 }
 
 TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg"

From 5b32d32522dde091d73b20f697542e4e35c8a6a9 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Fri, 13 Mar 2026 16:47:36 +0200
Subject: [PATCH 17/39] revert convert.py changes

---
 optimum/exporters/openvino/convert.py | 20 --------------------
 tests/openvino/test_decoder.py        |  3 +++
 2 files changed, 3 insertions(+), 20 deletions(-)

diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index b50b990ab7..60d90f53e0 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -721,27 +721,7 @@ def export_from_model(
     generation_config = getattr(model, "generation_config", None)
     if generation_config is not None:
         try:
-            # Preserve the original `transformers_version` from the source model's generation_config.json.
-            # Starting in transformers 4.50, _prepare_generation_config() applies model-default generation
-            # parameters (do_sample, temperature, top_p, …) when the user-provided GenerationConfig uses
-            # the global default value for those fields AND the stored `transformers_version` is >= 4.50.
-            # Exporting bumps the version to the current transformers release, which causes user-supplied
-            # params (e.g. do_sample=False) to be silently overridden by the model defaults at inference
-            # time. Preserving the original version keeps the OV model consistent with the PT original.
-            orig_transformers_version = getattr(generation_config, "transformers_version", None)
             generation_config.save_pretrained(output)
-            if orig_transformers_version is not None:
-                import json as _json
-                from pathlib import Path as _Path
-
-                gen_cfg_path = _Path(output) / "generation_config.json"
-                if gen_cfg_path.exists():
-                    with open(gen_cfg_path, "r", encoding="utf-8") as _f:
-                        _cfg = _json.load(_f)
-                    if _cfg.get("transformers_version") != orig_transformers_version:
-                        _cfg["transformers_version"] = orig_transformers_version
-                        with open(gen_cfg_path, "w", encoding="utf-8") as _f:
-                            _json.dump(_cfg, _f, indent=2)
         except Exception as exception:
             logger.warning(
                 f"The generation config will not be saved, saving failed with following error:\n{exception}"
             )

diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py
index a393782973..d998bd3e40 100644
--- a/tests/openvino/test_decoder.py
+++ b/tests/openvino/test_decoder.py
@@ -401,6 +401,9 @@ def test_compare_to_transformers(self, model_arch):
         if model_arch in ["gigachat3"]:
             tokens.pop("token_type_ids", None)
 
+        if model_arch == "deepseek":
+            gen_config.do_sample = False
+
         ov_model.generation_config.eos_token_id = None
         transformers_model.generation_config.eos_token_id = None
         ov_model.config.eos_token_id = None
From 04b4d9f3ee1a49843c9a3860a2bef3036e5d8c17 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Fri, 13 Mar 2026 16:53:41 +0200
Subject: [PATCH 18/39] revert convert.py changes

---
 tests/openvino/test_decoder.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py
index d998bd3e40..653f831cb0 100644
--- a/tests/openvino/test_decoder.py
+++ b/tests/openvino/test_decoder.py
@@ -401,8 +401,6 @@ def test_compare_to_transformers(self, model_arch):
         if model_arch in ["gigachat3"]:
             tokens.pop("token_type_ids", None)
 
-        if model_arch == "deepseek":
-            gen_config.do_sample = False
 
         ov_model.generation_config.eos_token_id = None
         transformers_model.generation_config.eos_token_id = None
@@ -418,6 +416,9 @@ def test_compare_to_transformers(self, model_arch):
             do_sample=False,
         )
 
+        if model_arch == "deepseek":
+            gen_config.do_sample = False
+
         ov_outputs = ov_model.generate(**tokens, generation_config=gen_config)
 
         additional_inputs = {}
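For context on why these fixes insist on do_sample=False: with sampling enabled, identical logits from the OpenVINO and PyTorch models can still decode to different tokens, so token-level comparisons are only meaningful under greedy decoding. A toy demonstration:

    import torch

    logits = torch.tensor([[2.0, 1.9, 0.1]])
    greedy = logits.argmax(-1)                          # deterministic: token 0
    sampled = torch.multinomial(logits.softmax(-1), 1)  # varies run to run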
From c0ba5d0b77a2cf417f3f8078fa738499fcbe90fb Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Fri, 13 Mar 2026 17:08:25 +0200
Subject: [PATCH 19/39] revert convert.py changes

---
 tests/openvino/test_decoder.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py
index 653f831cb0..c5d26c8d4a 100644
--- a/tests/openvino/test_decoder.py
+++ b/tests/openvino/test_decoder.py
@@ -401,6 +401,9 @@ def test_compare_to_transformers(self, model_arch):
         if model_arch in ["gigachat3"]:
             tokens.pop("token_type_ids", None)
 
+        if model_arch == "deepseek":
+            ov_model.generation_config.do_sample = False
+            transformers_model.generation_config.do_sample = False
 
         ov_model.generation_config.eos_token_id = None
         transformers_model.generation_config.eos_token_id = None
@@ -416,9 +419,6 @@ def test_compare_to_transformers(self, model_arch):
             do_sample=False,
         )
 
-        if model_arch == "deepseek":
-            gen_config.do_sample = False
-
         ov_outputs = ov_model.generate(**tokens, generation_config=gen_config)
 
         additional_inputs = {}
@@ -778,8 +778,10 @@ def test_beam_search(self, model_arch):
         ov_model_stateless.config.eos_token_id = None
         transformers_model.config.eos_token_id = None
 
-        if is_transformers_version(">=", "4.51"):
-            additional_inputs["use_model_defaults"] = False
+        # For deepseek, sampling is enabled by default, but we need to disable it for the test
+        if model_arch == "deepseek":
+            ov_model.generation_config.do_sample = False
+            transformers_model.generation_config.do_sample = False
 
         for gen_config in gen_configs:
             if gen_config.do_sample and model_arch in ["baichuan2-13b", "olmo", "zamba2"]:

From 63a956d049303b99481d595bc2bfbfbd98cee712 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Fri, 13 Mar 2026 17:34:51 +0200
Subject: [PATCH 20/39] revert convert.py changes

---
 tests/openvino/test_decoder.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py
index c5d26c8d4a..1306ce2b47 100644
--- a/tests/openvino/test_decoder.py
+++ b/tests/openvino/test_decoder.py
@@ -778,10 +778,8 @@ def test_beam_search(self, model_arch):
         ov_model_stateless.config.eos_token_id = None
         transformers_model.config.eos_token_id = None
 
-        # For deepseek, sampling is enabled by default, but we need to disable it for the test
-        if model_arch == "deepseek":
-            ov_model.generation_config.do_sample = False
-            transformers_model.generation_config.do_sample = False
+        if is_transformers_version(">=", "4.51"):
+        additional_inputs["use_model_defaults"] = False
 
         for gen_config in gen_configs:
             if gen_config.do_sample and model_arch in ["baichuan2-13b", "olmo", "zamba2"]:

From acd8148f2134a3548c002aef289f7be1075bf5f2 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Fri, 13 Mar 2026 17:35:31 +0200
Subject: [PATCH 21/39] revert convert.py changes

---
 tests/openvino/test_decoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py
index 1306ce2b47..89bcacf9a4 100644
--- a/tests/openvino/test_decoder.py
+++ b/tests/openvino/test_decoder.py
@@ -779,7 +779,7 @@ def test_beam_search(self, model_arch):
         ov_model_stateless.config.eos_token_id = None
         transformers_model.config.eos_token_id = None
 
         if is_transformers_version(">=", "4.51"):
-        additional_inputs["use_model_defaults"] = False
+            additional_inputs["use_model_defaults"] = False
 
         for gen_config in gen_configs:
             if gen_config.do_sample and model_arch in ["baichuan2-13b", "olmo", "zamba2"]:
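Patch 22 below replaces the data-dependent expert loop in deepseek_moe (torch.where over a one-hot mask) with a dense formulation that traces to a static graph. The trick, reduced to toy shapes as a standalone sketch (all names here are illustrative, not the patched module's):

    import torch

    tokens, hidden, experts, topk = 4, 8, 3, 2
    x = torch.randn(tokens, hidden)
    topk_ids = torch.randint(0, experts, (tokens, topk))
    topk_w = torch.rand(tokens, topk)

    routing = torch.zeros(tokens, experts)
    routing.scatter_(1, topk_ids, topk_w)               # dense (tokens, experts) weights

    w = torch.randn(experts, hidden, hidden)            # stand-in for stacked expert weights
    h = x.unsqueeze(0).expand(experts, tokens, hidden)  # every expert sees every token
    y = torch.bmm(h, w)                                 # (experts, tokens, hidden)
    out = (y * routing.t().unsqueeze(-1)).sum(dim=0)    # weight and reduce over experts

Every expert runs on every token, so the graph shape no longer depends on the routing decisions, which is what the tracer needs.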
From dbc143293bf9e2ef8c1138b66a615c92cc3c3a4a Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Sun, 15 Mar 2026 15:56:58 +0200
Subject: [PATCH 22/39] update deepseek's patcher

---
 optimum/exporters/openvino/model_patcher.py | 70 ++++++++++++++++-----
 1 file changed, 55 insertions(+), 15 deletions(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index dc600868cf..6d5e1f19c8 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -3781,6 +3781,34 @@ def __enter__(self):
                 # new interface (transformers >= 4.57): moe(self, hidden_states, topk_indices, topk_weights)
                 block.mlp._orig_moe = block.mlp.moe
                 block.mlp._orig_moe_infer = None
+                num_experts = len(block.mlp.experts)
+                block.mlp.gate_projs = (
+                    torch.concat(
+                        tuple(block.mlp.experts[i].gate_proj.weight.unsqueeze(0) for i in range(num_experts)),
+                        dim=0,
+                    )
+                    .transpose(1, 2)
+                    .float()
+                )
+
+                block.mlp.up_projs = (
+                    torch.concat(
+                        tuple(block.mlp.experts[i].up_proj.weight.unsqueeze(0) for i in range(num_experts)),
+                        dim=0,
+                    )
+                    .transpose(1, 2)
+                    .float()
+                )
+
+                block.mlp.down_projs = (
+                    torch.concat(
+                        tuple(block.mlp.experts[i].down_proj.weight.unsqueeze(0) for i in range(num_experts)),
+                        dim=0,
+                    )
+                    .transpose(1, 2)
+                    .float()
+                )
+
                 block.mlp.moe = types.MethodType(deepseek_moe, block.mlp)
             elif hasattr(block.mlp, "experts"):
                 # fallback: patch by injecting moe_infer with required attributes
@@ -3798,6 +3826,12 @@ def __exit__(self, exc_type, exc_value, traceback):
             if hasattr(block.mlp, "_orig_moe"):
                 if block.mlp._orig_moe is not None:
                     block.mlp.moe = block.mlp._orig_moe
+                    if hasattr(block.mlp, "gate_projs"):
+                        del block.mlp.gate_projs
+                    if hasattr(block.mlp, "up_projs"):
+                        del block.mlp.up_projs
+                    if hasattr(block.mlp, "down_projs"):
+                        del block.mlp.down_projs
                 delattr(block.mlp, "_orig_moe")
             if hasattr(block.mlp, "_orig_moe_infer"):
                 if block.mlp._orig_moe_infer is not None:
@@ -3974,21 +4008,27 @@ def deepseek_moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor,
     The original skips experts with no tokens (data-dependent control flow that breaks tracing).
     This version unconditionally runs all experts to produce a traceable static graph.
     """
-    final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype)
-    expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=len(self.experts))
-    expert_mask = expert_mask.permute(2, 0, 1)
-
-    for expert_idx in range(len(self.experts)):
-        expert = self.experts[expert_idx]
-        mask = expert_mask[expert_idx]
-        token_indices, weight_indices = torch.where(mask)
-        expert_weights = topk_weights[token_indices, weight_indices]
-        expert_input = hidden_states[token_indices]
-        expert_output = expert(expert_input)
-        weighted_output = expert_output * expert_weights.unsqueeze(-1)
-        final_hidden_states.index_add_(0, token_indices, weighted_output)
-
-    return final_hidden_states.type(hidden_states.dtype)
+    num_experts = len(self.experts)
+    batch_tokens, hidden_dim = hidden_states.shape
+
+    routing = torch.zeros(
+        batch_tokens, num_experts,
+        dtype=topk_weights.dtype,
+        device=hidden_states.device
+    )
+    routing.scatter_(1, topk_indices, topk_weights)
+
+    hidden_states = hidden_states.repeat(num_experts, 1)
+    hidden_states = hidden_states.view(num_experts, batch_tokens, hidden_dim)
+    act_fn = self.experts[0].act_fn
+    gate = torch.bmm(hidden_states, self.gate_projs)
+    up = torch.bmm(hidden_states, self.up_projs)
+    gate_up = act_fn(gate) * up
+    next_states = torch.bmm(gate_up, self.down_projs)
+    routing = routing.transpose(0, 1).unsqueeze(-1)
+    next_states = next_states * routing
+    next_states = next_states.sum(dim=0)
+    return next_states.type(hidden_states.dtype)
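One detail worth spelling out in the transpose shuffling between this patch and the next: nn.Linear stores its weight as (out_features, in_features), so a per-expert bmm needs the (in, out) orientation, whether that transpose is baked in at patch time (patch 22) or applied inside the forward (patch 23). A quick standalone check of the layout:

    import torch

    lin = torch.nn.Linear(8, 16, bias=False)
    x = torch.randn(4, 8)
    # Linear computes x @ W.T, so the stored weight must be transposed
    # before it can be used as a plain (in, out) matmul operand
    assert torch.allclose(lin(x), x @ lin.weight.transpose(0, 1), atol=1e-6)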
From 47d2910f4d89598c5e6df61287d53553524bbc30 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Tue, 17 Mar 2026 16:27:29 +0200
Subject: [PATCH 23/39] modify patcher

---
 optimum/exporters/openvino/model_patcher.py | 70 +++++++++++++--------
 1 file changed, 44 insertions(+), 26 deletions(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 6d5e1f19c8..1934e2a5ae 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -3782,33 +3782,51 @@ def __enter__(self):
                 block.mlp._orig_moe = block.mlp.moe
                 block.mlp._orig_moe_infer = None
                 num_experts = len(block.mlp.experts)
-                block.mlp.gate_projs = (
-                    torch.concat(
-                        tuple(block.mlp.experts[i].gate_proj.weight.unsqueeze(0) for i in range(num_experts)),
-                        dim=0,
-                    )
-                    .transpose(1, 2)
-                    .float()
-                )
 
-                block.mlp.up_projs = (
-                    torch.concat(
-                        tuple(block.mlp.experts[i].up_proj.weight.unsqueeze(0) for i in range(num_experts)),
-                        dim=0,
-                    )
-                    .transpose(1, 2)
-                    .float()
+                # Concatenate expert weights
+                gate_projs = torch.concat(
+                    tuple(block.mlp.experts[i].gate_proj.weight.unsqueeze(0) for i in range(num_experts)),
+                    dim=0,
                 )
+                up_projs = torch.concat(
+                    tuple(block.mlp.experts[i].up_proj.weight.unsqueeze(0) for i in range(num_experts)),
+                    dim=0,
+                )
+                down_projs = torch.concat(
+                    tuple(block.mlp.experts[i].down_proj.weight.unsqueeze(0) for i in range(num_experts)),
+                    dim=0,
+                )
 
-                block.mlp.down_projs = (
-                    torch.concat(
-                        tuple(block.mlp.experts[i].down_proj.weight.unsqueeze(0) for i in range(num_experts)),
-                        dim=0,
-                    )
-                    .transpose(1, 2)
-                    .float()
-                )
+                # Handle OpenVINO version check with proper version string parsing
+                import re
+                import warnings
+
+                from packaging import version
+
+                import openvino as ov
+
+                ov_version_str = ov.__version__
+                version_match = re.match(r'(\d+\.\d+\.\d+)', ov_version_str)
+                if version_match:
+                    ov_version = version.parse(version_match.group(1))
+                    if ov_version <= version.parse("2026.0.0"):
+                        warnings.warn(
+                            "This model works best with OpenVINO 2026.1 or later. "
+                            "Earlier versions require float() conversion for MoE weights, "
+                            "which may affect performance."
+                        )
+                        block.mlp.gate_projs = gate_projs.float()
+                        block.mlp.up_projs = up_projs.float()
+                        block.mlp.down_projs = down_projs.float()
+                    else:
+                        block.mlp.gate_projs = gate_projs
+                        block.mlp.up_projs = up_projs
+                        block.mlp.down_projs = down_projs
+                else:
+                    block.mlp.gate_projs = gate_projs
+                    block.mlp.up_projs = up_projs
+                    block.mlp.down_projs = down_projs
 
                 block.mlp.moe = types.MethodType(deepseek_moe, block.mlp)
             elif hasattr(block.mlp, "experts"):
                 # fallback: patch by injecting moe_infer with required attributes
@@ -4021,10 +4039,10 @@ def deepseek_moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor,
     hidden_states = hidden_states.repeat(num_experts, 1)
     hidden_states = hidden_states.view(num_experts, batch_tokens, hidden_dim)
     act_fn = self.experts[0].act_fn
-    gate = torch.bmm(hidden_states, self.gate_projs)
-    up = torch.bmm(hidden_states, self.up_projs)
+    gate = torch.bmm(hidden_states, self.gate_projs.transpose(1, 2))
+    up = torch.bmm(hidden_states, self.up_projs.transpose(1, 2))
     gate_up = act_fn(gate) * up
-    next_states = torch.bmm(gate_up, self.down_projs)
+    next_states = torch.bmm(gate_up, self.down_projs.transpose(1, 2))
     routing = routing.transpose(0, 1).unsqueeze(-1)
     next_states = next_states * routing
     next_states = next_states.sum(dim=0)
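The manual re + packaging dance above exists because openvino version strings can carry build metadata that packaging rejects outright; the next patch swaps it for the in-house is_openvino_version helper, which wraps the same idea. A sketch with a hypothetical build string:

    import re
    from packaging import version

    ov_version_str = "2025.3.0-12345-abcdef"  # hypothetical build string
    # version.parse() would raise InvalidVersion on the full string,
    # so only the leading release segment is compared
    m = re.match(r"(\d+\.\d+\.\d+)", ov_version_str)
    assert m is not None
    assert version.parse(m.group(1)) < version.parse("2026.1.0")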
From eb601d9a35567195f8b4fd26e7976ac451a0d118 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Fri, 20 Mar 2026 12:15:12 +0200
Subject: [PATCH 24/39] update patcher

---
 optimum/exporters/openvino/model_patcher.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 1934e2a5ae..e71a418234 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -54,7 +54,12 @@
     override_arguments,
     sdpa_mask_without_vmap,
 )
-from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version
+from optimum.intel.utils.import_utils import (
+    is_diffusers_version,
+    is_openvino_version,
+    is_torch_version,
+    is_transformers_version,
+)
 
 from ._ov_ops import convert_recurrent_attention_cell
 
@@ -3797,19 +3802,16 @@ def __enter__(self):
                     dim=0,
                 )
 
-                # Handle OpenVINO version check with proper version string parsing
+                # Handle OpenVINO version check
                 import re
                 import warnings
 
-                from packaging import version
-
                 import openvino as ov
 
                 ov_version_str = ov.__version__
                 version_match = re.match(r'(\d+\.\d+\.\d+)', ov_version_str)
                 if version_match:
-                    ov_version = version.parse(version_match.group(1))
-                    if ov_version <= version.parse("2026.0.0"):
+                    if is_openvino_version("<=", "2026.0.0"):
                         warnings.warn(
                             "This model works best with OpenVINO 2026.1 or later. "
                             "Earlier versions require float() conversion for MoE weights, "

From fe1b84a50de10837ed6d46832c09a5fe254df7a1 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Mon, 23 Mar 2026 10:47:52 +0200
Subject: [PATCH 25/39] removed unnecessary check

---
 optimum/exporters/openvino/model_patcher.py | 28 +++++++--------------
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index e71a418234..120098da4d 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -3803,27 +3803,17 @@ def __enter__(self):
                 )
 
                 # Handle OpenVINO version check
-                import re
                 import warnings
 
-                import openvino as ov
-
-                ov_version_str = ov.__version__
-                version_match = re.match(r'(\d+\.\d+\.\d+)', ov_version_str)
-                if version_match:
-                    if is_openvino_version("<=", "2026.0.0"):
-                        warnings.warn(
-                            "This model works best with OpenVINO 2026.1 or later. "
-                            "Earlier versions require float() conversion for MoE weights, "
-                            "which may affect performance."
-                        )
-                        block.mlp.gate_projs = gate_projs.float()
-                        block.mlp.up_projs = up_projs.float()
-                        block.mlp.down_projs = down_projs.float()
-                    else:
-                        block.mlp.gate_projs = gate_projs
-                        block.mlp.up_projs = up_projs
-                        block.mlp.down_projs = down_projs
+                if is_openvino_version("<=", "2026.0.0"):
+                    warnings.warn(
+                        "This model works best with OpenVINO 2026.1 or later. "
+                        "Earlier versions require float() conversion for MoE weights, "
+                        "which may affect performance."
+                    )
+                    block.mlp.gate_projs = gate_projs.float()
+                    block.mlp.up_projs = up_projs.float()
+                    block.mlp.down_projs = down_projs.float()
                 else:
                     block.mlp.gate_projs = gate_projs
                     block.mlp.up_projs = up_projs
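The condition change in the next patch is not cosmetic: a "<=" gate against 2026.0.0 lets a hypothetical 2026.0.1 bugfix release skip the float() path, while "<" against 2026.1.0 keeps every pre-2026.1 build on it. The comparisons themselves, verifiable standalone:

    from packaging import version

    assert version.parse("2026.0.1") > version.parse("2026.0.0")       # escapes a "<=" 2026.0.0 gate
    assert version.parse("2026.0.1") < version.parse("2026.1.0")       # caught by "<" 2026.1.0
    assert version.parse("2026.1.0.dev0") < version.parse("2026.1.0")  # dev builds also stay covered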
From e86e7d9a41f63836c5402fde6d73215b180ceda9 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Mon, 23 Mar 2026 11:15:43 +0200
Subject: [PATCH 26/39] fix patcher

---
 optimum/exporters/openvino/model_patcher.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 120098da4d..5a91ea981f 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -3805,11 +3805,12 @@ def __enter__(self):
                 # Handle OpenVINO version check
                 import warnings
 
-                if is_openvino_version("<=", "2026.0.0"):
+                if is_openvino_version("<", "2026.1.0"):
                     warnings.warn(
                         "This model works best with OpenVINO 2026.1 or later. "
                         "Earlier versions require float() conversion for MoE weights, "
-                        "which may affect performance."
+                        "which may affect performance. "
+                        "OpenVINO 2026.1 includes a fix for torch.bmm dtype handling."
                     )
                     block.mlp.gate_projs = gate_projs.float()
                     block.mlp.up_projs = up_projs.float()
                     block.mlp.down_projs = down_projs.float()

From cbc2005b6fa16c0ae59ca806023cfc91dca845e6 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Mon, 23 Mar 2026 11:18:43 +0200
Subject: [PATCH 27/39] fix version

---
 optimum/exporters/openvino/model_patcher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 5a91ea981f..bf99733807 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -3807,7 +3807,7 @@ def __enter__(self):
                 if is_openvino_version("<", "2026.1.0"):
                     warnings.warn(
-                        "This model works best with OpenVINO 2026.1 or later. "
+                        "This model works best with OpenVINO 2026.0 or later. "
                         "Earlier versions require float() conversion for MoE weights, "
                         "which may affect performance. "
                         "OpenVINO 2026.1 includes a fix for torch.bmm dtype handling."

From 199da923b72b4c7f4d4c31faf3cc27746b379129 Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Mon, 23 Mar 2026 11:19:13 +0200
Subject: [PATCH 28/39] fix version

---
 optimum/exporters/openvino/model_patcher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index bf99733807..5a91ea981f 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -3807,7 +3807,7 @@ def __enter__(self):
                 if is_openvino_version("<", "2026.1.0"):
                     warnings.warn(
-                        "This model works best with OpenVINO 2026.0 or later. "
+                        "This model works best with OpenVINO 2026.1 or later. "
                         "Earlier versions require float() conversion for MoE weights, "
                         "which may affect performance. "
                         "OpenVINO 2026.1 includes a fix for torch.bmm dtype handling."
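Patch 29 below reverts to explicit per-version attention forwards for DeepSeek. As background for reading them, a shape-level sketch of the multi-head latent attention both forwards implement: keys and values are jointly down-projected into a low-rank latent plus a small shared rotary slice, then up-projected per head. Dimensions here are toy stand-ins, not real config values:

    import torch

    bsz, q_len, heads, hidden = 1, 4, 2, 32
    kv_lora_rank, qk_nope, qk_rope, v_dim = 16, 8, 4, 8

    x = torch.randn(bsz, q_len, hidden)
    kv_a = torch.nn.Linear(hidden, kv_lora_rank + qk_rope, bias=False)  # joint down-projection
    compressed = kv_a(x)
    k_pass, k_rot = torch.split(compressed, [kv_lora_rank, qk_rope], dim=-1)

    # k_rot is a single rotary component broadcast across all heads;
    # k_pass is up-projected per head into no-RoPE key halves and values
    kv_b = torch.nn.Linear(kv_lora_rank, heads * (qk_nope + v_dim), bias=False)
    kv = kv_b(k_pass).view(bsz, q_len, heads, qk_nope + v_dim).transpose(1, 2)
    k_nope, v = torch.split(kv, [qk_nope, v_dim], dim=-1)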
From 1dee64cc03c0cf446115b468d3fd1441aef13841 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Mon, 23 Mar 2026 23:12:54 +0200 Subject: [PATCH 29/39] revert refactoring --- optimum/exporters/openvino/model_configs.py | 4 +- optimum/exporters/openvino/model_patcher.py | 377 +++++++++++++------- tests/openvino/test_decoder.py | 25 +- tests/openvino/test_export.py | 2 +- tests/openvino/test_exporters_cli.py | 2 +- tests/openvino/utils_tests.py | 7 +- 6 files changed, 274 insertions(+), 143 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 35f33a793f..0624624a77 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -4081,8 +4081,8 @@ class M2M100OpenVINOConfig(BartOpenVINOConfig): ) @register_in_tasks_manager("deepseek", *["text-generation", "text-generation-with-past"], library_name="transformers") class DeepseekOpenVINOConfig(MiniCPM3OpenVINOConfig): - MIN_TRANSFORMERS_VERSION = "4.53.0" - MAX_TRANSFORMERS_VERSION = None + MIN_TRANSFORMERS_VERSION = "4.46.0" + MAX_TRANSFORMERS_VERSION = "4.53.3" _MODEL_PATCHER = DeepseekPatcher diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 5a91ea981f..26ec653030 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3765,30 +3765,32 @@ class DeepseekPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() self_attn = { - "deepseek_v3": make_deepseek_attn_forward(version=3), - "deepseek_v2": make_deepseek_attn_forward(version=2), + "deepseek_v3": deepseek_v3_attn_forward, + "deepseek_v2": deepseek_v2_attn_forward, "deepseek": minicpm3_attn_forward, } self_attn_fwd = self_attn.get(self._model.config.model_type) for block in self._model.model.layers: + # Patch attention if self_attn_fwd is not None: block.self_attn._orig_forward = block.self_attn.forward block.self_attn.forward = types.MethodType(self_attn_fwd, block.self_attn) + + # Patch MoE if hasattr(block.mlp, "moe_infer"): - # old interface (transformers < 4.57): moe_infer(self, x, topk_ids, topk_weight) block.mlp._orig_moe_infer = block.mlp.moe_infer block.mlp._orig_moe = None block.mlp.ep_rank = getattr(block.mlp, "ep_rank", 0) block.mlp.experts_per_rank = getattr(block.mlp, "experts_per_rank", len(block.mlp.experts)) block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) + elif hasattr(block.mlp, "moe") and hasattr(block.mlp, "experts"): - # new interface (transformers >= 4.57): moe(self, hidden_states, topk_indices, topk_weights) block.mlp._orig_moe = block.mlp.moe block.mlp._orig_moe_infer = None - num_experts = len(block.mlp.experts) - # Concatenate expert weights + # Pre-concatenate expert weights for vectorized computation + num_experts = len(block.mlp.experts) gate_projs = torch.concat( tuple(block.mlp.experts[i].gate_proj.weight.unsqueeze(0) for i in range(num_experts)), dim=0, @@ -3802,16 +3804,7 @@ def __enter__(self): dim=0, ) - # Handle OpenVINO version check - import warnings - if is_openvino_version("<", "2026.1.0"): - warnings.warn( - "This model works best with OpenVINO 2026.1 or later. " - "Earlier versions require float() conversion for MoE weights, " - "which may affect performance. " - "OpenVINO 2026.1 includes a fix for torch.bmm dtype handling." 
- ) block.mlp.gate_projs = gate_projs.float() block.mlp.up_projs = up_projs.float() block.mlp.down_projs = down_projs.float() @@ -3821,19 +3814,15 @@ def __enter__(self): block.mlp.down_projs = down_projs block.mlp.moe = types.MethodType(deepseek_moe, block.mlp) - elif hasattr(block.mlp, "experts"): - # fallback: patch by injecting moe_infer with required attributes - block.mlp._orig_moe_infer = None - block.mlp._orig_moe = None - block.mlp.ep_rank = 0 - block.mlp.experts_per_rank = len(block.mlp.experts) - block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) for block in self._model.model.layers: + # Restore attention if hasattr(block.self_attn, "_orig_forward"): block.self_attn.forward = block.self_attn._orig_forward + + # Restore MoE - handle both interfaces if hasattr(block.mlp, "_orig_moe"): if block.mlp._orig_moe is not None: block.mlp.moe = block.mlp._orig_moe @@ -3844,6 +3833,7 @@ def __exit__(self, exc_type, exc_value, traceback): if hasattr(block.mlp, "down_projs"): del block.mlp.down_projs delattr(block.mlp, "_orig_moe") + if hasattr(block.mlp, "_orig_moe_infer"): if block.mlp._orig_moe_infer is not None: block.mlp.moe_infer = block.mlp._orig_moe_infer @@ -3857,126 +3847,261 @@ def __exit__(self, exc_type, exc_value, traceback): delattr(block.mlp, "_orig_moe_infer") -def make_deepseek_attn_forward(version: int = 3): - from typing import Callable +def deepseek_v3_attn_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value=None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, # ← ADD THIS + **kwargs, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # modified from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/modeling_deepseek.py#L751 + def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + orig_dtype = k.dtype + cos = cos[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] + q_fp32 = q.to(dtype=torch.float32, device=q.device) + k_fp32 = k.to(dtype=torch.float32, device=k.device) + q_embed = (q_fp32 * cos) + (rotate_half(q_fp32) * sin) + k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin) + return q_embed.to(dtype=orig_dtype), k_embed.to(dtype=orig_dtype) - from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS + if not hasattr(self, 'q_head_dim'): + self.q_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim - if version == 3: - from transformers.models.deepseek_v3.modeling_deepseek_v3 import ( - apply_rotary_pos_emb, - apply_rotary_pos_emb_interleave, - eager_attention_forward, + if output_attentions: + return self._orig_forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, ) - elif version == 2: - - def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - batch, 
num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - def apply_rotary_emb(xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor): - xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) - xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) - freqs_cis = freqs_cis.unsqueeze(1).to(xq_.device) - xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).type_as(xq) - xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).type_as(xk) - return xq_out, xk_out - - def eager_attention_forward(module, query, key, value, attention_mask, scaling, dropout=0.0, **kwargs): - key = repeat_kv(key, module.num_key_value_groups) - value = repeat_kv(value, module.num_key_value_groups) - attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling - if attention_mask is not None: - attn_weights = attn_weights + attention_mask - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) - attn_output = torch.matmul(attn_weights, value) - return attn_output.transpose(1, 2).contiguous(), attn_weights + bsz, q_len, _ = hidden_states.size() + + if self.q_lora_rank is None: + q = self.q_proj(hidden_states) else: - raise ValueError(f"Unsupported DeepSeek version: {version}") + q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) + q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2) + q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) - def deepseek_attn_forward( - self, - hidden_states: torch.Tensor, - position_embeddings, - attention_mask: Optional[torch.Tensor], - past_key_value: Optional[Cache] = None, - past_key_values=None, - cache_position: Optional[torch.LongTensor] = None, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - batch_size, seq_length = hidden_states.shape[:-1] + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) + compressed_kv, k_pe = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2) + kv = ( + self.kv_b_proj(self.kv_a_layernorm(compressed_kv)) + .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) + .transpose(1, 2) + ) - if self.q_lora_rank is None: - q_states = self.q_proj(hidden_states) - else: - q_states = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) - q_states = q_states.view(batch_size, seq_length, -1, self.qk_head_dim).transpose(1, 2) - q_pass, q_rot = torch.split(q_states, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) - - compressed_kv = self.kv_a_proj_with_mqa(hidden_states) - k_pass, k_rot = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - k_pass = self.kv_b_proj(self.kv_a_layernorm(k_pass)) - k_pass = k_pass.view(batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim).transpose(1, 2) - k_pass, value_states = torch.split(k_pass, [self.qk_nope_head_dim, self.v_head_dim], dim=-1) - k_rot = k_rot.view(batch_size, 1, seq_length, self.qk_rope_head_dim) - - if version == 3: - cos, sin = position_embeddings - if self.config.rope_interleave: - q_rot, k_rot = apply_rotary_pos_emb_interleave(q_rot, k_rot, cos, 
sin) - else: - q_rot, k_rot = apply_rotary_pos_emb(q_rot, k_rot, cos, sin) - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - kv_cache = past_key_values if past_key_values is not None else past_key_value - else: - q_rot, k_rot = apply_rotary_emb(q_rot, k_rot, position_embeddings.to(q_rot.device)) - cache_kwargs = {"cache_position": cache_position} - kv_cache = past_key_values + k_nope, value_states = torch.split(kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1) + kv_seq_len = value_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - k_rot = k_rot.expand(*k_pass.shape[:-1], -1) - query_states = torch.cat((q_pass, q_rot), dim=-1) - key_states = torch.cat((k_pass, k_rot), dim=-1) + new_interface = False # Set to True if using new rotary embedding interface + if hasattr(self, 'rotary_emb'): + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids) + else: + from transformers.models.deepseek_v3.modeling_deepseek_v3 import apply_rotary_pos_emb - if kv_cache is not None: - key_states, value_states = kv_cache.update(key_states, value_states, self.layer_idx, cache_kwargs) + cos, sin = position_embeddings + q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin) + new_interface = True - is_flash_attn = "flash" in self.config._attn_implementation - if is_flash_attn and self.qk_head_dim != self.v_head_dim: - value_states = F.pad(value_states, [0, self.qk_head_dim - self.v_head_dim]) - attention_interface: Callable = eager_attention_forward - if self.config._attn_implementation != "eager": - if version == 2: - attention_interface = ALL_ATTENTION_FUNCTIONS.get_interface( - self.config._attn_implementation, eager_attention_forward - ) - else: - attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids) - attn_output, attn_weights = attention_interface( - self, - query_states, - key_states, - value_states, - attention_mask, - dropout=0.0 if not self.training else self.attention_dropout, - scaling=self.scaling, - **kwargs, + # Difference with original code, k_pe.new_empty create constant tensor in torchscript + query_states = torch.concat([q_nope, q_pe], dim=-1) + # query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) + # query_states[:, :, :, : self.qk_nope_head_dim] = q_nope + # query_states[:, :, :, self.qk_nope_head_dim :] = q_pe + key_states = torch.concat([k_nope, k_pe.expand(-1, self.num_heads, -1, -1)], dim=-1) + # key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) + # key_states[:, :, :, : self.qk_nope_head_dim] = k_nope + # key_states[:, :, :, self.qk_nope_head_dim :] = k_pe + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA 
with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) + + attn_output = self.o_proj(attn_output) + + if new_interface: + return attn_output, None + + return attn_output, None, past_key_value + + +def deepseek_v2_attn_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value=None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # modified from https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite/blob/main/modeling_deepseek.py#L806 + def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + + b, h, s, d = q.shape + q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d) + + b, h, s, d = k.shape + k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d) + + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + if output_attentions: + return self._orig_forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, ) - if is_flash_attn and self.qk_head_dim != self.v_head_dim: - attn_output = attn_output[:, :, :, : self.v_head_dim] + bsz, q_len, _ = hidden_states.shape - attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous() - attn_output = self.o_proj(attn_output) - return attn_output, attn_weights + if self.q_lora_rank is None: + q = self.q_proj(hidden_states) + else: + q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) + q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2) + q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) + compressed_kv, k_pe = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2) + kv = ( + self.kv_b_proj(self.kv_a_layernorm(compressed_kv)) + .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) + .transpose(1, 2) + ) + + k_nope, value_states = torch.split(kv, [self.qk_nope_head_dim, self.v_head_dim], 
dim=-1) + kv_seq_len = value_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids) + + # Difference with original code, k_pe.new_empty create constant tensor in torchscript + query_states = torch.concat([q_nope, q_pe], dim=-1) + # query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) + # query_states[:, :, :, : self.qk_nope_head_dim] = q_nope + # query_states[:, :, :, self.qk_nope_head_dim :] = q_pe + key_states = torch.concat([k_nope, k_pe.expand(-1, self.num_heads, -1, -1)], dim=-1) + # key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) + # key_states[:, :, :, : self.qk_nope_head_dim] = k_nope + # key_states[:, :, :, self.qk_nope_head_dim :] = k_pe + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) - return deepseek_attn_forward + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
+ is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value def deepseek_moe_infer(self, x, topk_ids, topk_weight): diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 89bcacf9a4..a5df1f1cb2 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -119,13 +119,13 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.46.0"): SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "phimoe") + if is_transformers_version("<", "4.54.0"): + SUPPORTED_ARCHITECTURES += ("deepseek", "gigachat3",) + # gptq and awq install disabled for windows test environment if platform.system() != "Windows" and is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("opt_gptq", "mixtral_awq") - if is_transformers_version(">=", "4.53.0"): - SUPPORTED_ARCHITECTURES += ("deepseek", "gigachat3") - if is_transformers_version(">", "4.47"): SUPPORTED_ARCHITECTURES += ("olmo2",) @@ -291,8 +291,11 @@ def test_find_untested_architectures(self): if "llama4_text" in supported_architectures: supported_architectures.remove("llama4_text") - if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): - supported_architectures -= {"deepseek_v2", "deepseek_v3"} + if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MAX_TRANSFORMERS_VERSION)): + if "deepseek_v2" in supported_architectures: + supported_architectures.remove("deepseek_v2") + if "deepseek_v3" in supported_architectures: + supported_architectures.remove("deepseek_v3") if is_transformers_version("<", str(BitnetOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"bitnet"} if is_transformers_version("<", str(LFM2OpenVINOConfig.MIN_TRANSFORMERS_VERSION)): @@ -401,10 +404,6 @@ def test_compare_to_transformers(self, model_arch): if model_arch in ["gigachat3"]: tokens.pop("token_type_ids", None) - if model_arch == "deepseek": - ov_model.generation_config.do_sample = False - transformers_model.generation_config.do_sample = False - ov_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None ov_model.config.eos_token_id = None @@ -421,6 +420,10 @@ def test_compare_to_transformers(self, model_arch): ov_outputs = ov_model.generate(**tokens, generation_config=gen_config) + # TODO: add back once https://huggingface.co/katuni4ka/tiny-random-minicpm3/discussions/1 merged (for all models) as current modeling incompatible with transformers >= v4.49 + if model_arch in {"deepseek"} and is_transformers_version(">=", "4.49"): + self.skipTest("Incompatible modeling code") + additional_inputs = {} # gemma2 does not support dynamic cache, it is unfair to compare dynamic cache result vs hybrid cache, # align cache representation in torch model @@ -670,6 +673,10 @@ def test_beam_search(self, model_arch): if model_arch in ["lfm2", "granitemoehybrid"]: return + # TODO: add back once https://huggingface.co/katuni4ka/tiny-random-minicpm3/discussions/1 merged (for all models) as current modeling incompatible with transformers >= v4.49 + if model_arch in {"deepseek"} and is_transformers_version(">=", "4.49"): + self.skipTest("Incompatible modeling code") + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in REMOTE_CODE_MODELS) if 
model_arch == "persimmon": tokenizer.pad_token_id = tokenizer.bos_token_id diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 16b5d528cc..90332cd397 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -98,7 +98,7 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "4.48.0"): SUPPORTED_ARCHITECTURES.update({"cohere2": OVModelForCausalLM}) - if is_transformers_version(">=", "4.53.0"): + if is_transformers_version(">=", "4.46.0") and is_transformers_version("<=", "4.53.3"): SUPPORTED_ARCHITECTURES.update({"deepseek": OVModelForCausalLM, "gigachat3": OVModelForCausalLM}) if is_transformers_version(">=", "4.49"): diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index ad649baecf..61a1fce622 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -159,7 +159,7 @@ class OVCLIExportTestCase(unittest.TestCase): ] ) - if is_transformers_version(">=", "4.53.0"): + if is_transformers_version(">=", "4.46.0") and is_transformers_version("<=", "4.53.3"): SUPPORTED_ARCHITECTURES.extend( [ ("text-generation-with-past", "gigachat3"), diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index f53bd364e1..6bd099c4cd 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -399,6 +399,7 @@ "exaone4", "decilm", "minicpm3", + "deepseek", "qwen3_eagle3", ) @@ -553,10 +554,8 @@ def get_supported_model_for_library(library_name): if supported_model_type[model_type].get("openvino"): export_config = next(iter(supported_model_type[model_type]["openvino"].values())) - raw_min = getattr(export_config.func, "MIN_TRANSFORMERS_VERSION", None) - raw_max = getattr(export_config.func, "MAX_TRANSFORMERS_VERSION", None) - min_transformers = str(raw_min) if raw_min is not None else "0" - max_transformers = str(raw_max) if raw_max is not None else "999" + min_transformers = str(getattr(export_config.func, "MIN_TRANSFORMERS_VERSION", "0")) + max_transformers = str(getattr(export_config.func, "MAX_TRANSFORMERS_VERSION", "999")) if is_transformers_version(">=", min_transformers) and is_transformers_version("<=", max_transformers): valid_model.add(model_type) From 225aed31af4067d76cc720de2211f43a4965cd17 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Tue, 24 Mar 2026 00:09:37 +0200 Subject: [PATCH 30/39] update doc --- optimum/exporters/openvino/model_patcher.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 26ec653030..00dbf68ae1 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4140,9 +4140,7 @@ def deepseek_moe_infer(self, x, topk_ids, topk_weight): def deepseek_moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor): """ - Replacement for DeepseekV3MoE.moe (transformers >= 4.57). - The original skips experts with no tokens (data-dependent control flow that breaks tracing). - This version unconditionally runs all experts to produce a traceable static graph. + Vectorized MoE forward for DeepSeek-V3. 
""" num_experts = len(self.experts) batch_tokens, hidden_dim = hidden_states.shape From 4174478f9d76e86fc490a1add385d2a7b760bfc6 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Tue, 24 Mar 2026 23:25:47 +0200 Subject: [PATCH 31/39] modify based on review --- optimum/exporters/openvino/model_patcher.py | 17 +++++++---------- tests/openvino/test_decoder.py | 5 ++++- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 00dbf68ae1..b14897a0fb 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3856,7 +3856,7 @@ def deepseek_v3_attn_forward( past_key_value=None, output_attentions: bool = False, use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, # ← ADD THIS + cache_position: Optional[torch.LongTensor] = None, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: # modified from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/modeling_deepseek.py#L751 @@ -3876,7 +3876,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin) return q_embed.to(dtype=orig_dtype), k_embed.to(dtype=orig_dtype) - if not hasattr(self, 'q_head_dim'): + if not hasattr(self, "q_head_dim"): self.q_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim if output_attentions: @@ -3920,8 +3920,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): ) kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - new_interface = False # Set to True if using new rotary embedding interface - if hasattr(self, 'rotary_emb'): + new_interface = False # Set to True if using new rotary embedding interface + if hasattr(self, "rotary_emb"): cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids) else: @@ -3931,7 +3931,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin) new_interface = True - q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids) # Difference with original code, k_pe.new_empty create constant tensor in torchscript @@ -3977,6 +3976,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): attn_output = self.o_proj(attn_output) if new_interface: + # Some models (e.g. 
gigachat3) expect 2-tuple return (attn_output, attn_weights) + # Returning 3-tuple breaks tracing with "too many values to unpack" return attn_output, None return attn_output, None, past_key_value @@ -4145,11 +4146,7 @@ def deepseek_moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, num_experts = len(self.experts) batch_tokens, hidden_dim = hidden_states.shape - routing = torch.zeros( - batch_tokens, num_experts, - dtype=topk_weights.dtype, - device=hidden_states.device - ) + routing = torch.zeros(batch_tokens, num_experts, dtype=topk_weights.dtype, device=hidden_states.device) routing.scatter_(1, topk_indices, topk_weights) hidden_states = hidden_states.repeat(num_experts, 1) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index a5df1f1cb2..4dfdea8647 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -120,7 +120,10 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "phimoe") if is_transformers_version("<", "4.54.0"): - SUPPORTED_ARCHITECTURES += ("deepseek", "gigachat3",) + SUPPORTED_ARCHITECTURES += ( + "deepseek", + "gigachat3", + ) # gptq and awq install disabled for windows test environment if platform.system() != "Windows" and is_transformers_version("<", "4.56.0"): From 9ae11622b641f34801d454a32b11f5cafd884ef6 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Thu, 26 Mar 2026 02:39:39 +0200 Subject: [PATCH 32/39] fix issues --- optimum/exporters/openvino/model_configs.py | 4 +- optimum/exporters/openvino/model_patcher.py | 180 +++++++++++--------- tests/openvino/test_decoder.py | 28 +-- tests/openvino/test_export.py | 2 +- tests/openvino/test_exporters_cli.py | 2 +- tests/openvino/utils_tests.py | 7 +- 6 files changed, 120 insertions(+), 103 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 0624624a77..35f33a793f 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -4081,8 +4081,8 @@ class M2M100OpenVINOConfig(BartOpenVINOConfig): ) @register_in_tasks_manager("deepseek", *["text-generation", "text-generation-with-past"], library_name="transformers") class DeepseekOpenVINOConfig(MiniCPM3OpenVINOConfig): - MIN_TRANSFORMERS_VERSION = "4.46.0" - MAX_TRANSFORMERS_VERSION = "4.53.3" + MIN_TRANSFORMERS_VERSION = "4.53.0" + MAX_TRANSFORMERS_VERSION = None _MODEL_PATCHER = DeepseekPatcher diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index b14897a0fb..f20d3bf38c 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3772,25 +3772,23 @@ def __enter__(self): self_attn_fwd = self_attn.get(self._model.config.model_type) for block in self._model.model.layers: - # Patch attention if self_attn_fwd is not None: block.self_attn._orig_forward = block.self_attn.forward block.self_attn.forward = types.MethodType(self_attn_fwd, block.self_attn) - - # Patch MoE if hasattr(block.mlp, "moe_infer"): + # old interface (transformers < 4.57): moe_infer(self, x, topk_ids, topk_weight) block.mlp._orig_moe_infer = block.mlp.moe_infer block.mlp._orig_moe = None block.mlp.ep_rank = getattr(block.mlp, "ep_rank", 0) block.mlp.experts_per_rank = getattr(block.mlp, "experts_per_rank", len(block.mlp.experts)) block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) - elif hasattr(block.mlp, "moe") and hasattr(block.mlp, 
"experts"): + # new interface (transformers >= 4.57): moe(self, hidden_states, topk_indices, topk_weights) block.mlp._orig_moe = block.mlp.moe block.mlp._orig_moe_infer = None - - # Pre-concatenate expert weights for vectorized computation num_experts = len(block.mlp.experts) + + # Concatenate expert weights gate_projs = torch.concat( tuple(block.mlp.experts[i].gate_proj.weight.unsqueeze(0) for i in range(num_experts)), dim=0, @@ -3805,6 +3803,12 @@ def __enter__(self): ) if is_openvino_version("<", "2026.1.0"): + logger.warning( + "This model works best with OpenVINO 2026.1 or later. " + "Earlier versions require float() conversion for MoE weights, " + "which may affect performance. " + "OpenVINO 2026.1 includes a fix for torch.bmm dtype handling." + ) block.mlp.gate_projs = gate_projs.float() block.mlp.up_projs = up_projs.float() block.mlp.down_projs = down_projs.float() @@ -3814,15 +3818,19 @@ def __enter__(self): block.mlp.down_projs = down_projs block.mlp.moe = types.MethodType(deepseek_moe, block.mlp) + elif hasattr(block.mlp, "experts"): + # fallback: patch by injecting moe_infer with required attributes + block.mlp._orig_moe_infer = None + block.mlp._orig_moe = None + block.mlp.ep_rank = 0 + block.mlp.experts_per_rank = len(block.mlp.experts) + block.mlp.moe_infer = types.MethodType(deepseek_moe_infer, block.mlp) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) for block in self._model.model.layers: - # Restore attention if hasattr(block.self_attn, "_orig_forward"): block.self_attn.forward = block.self_attn._orig_forward - - # Restore MoE - handle both interfaces if hasattr(block.mlp, "_orig_moe"): if block.mlp._orig_moe is not None: block.mlp.moe = block.mlp._orig_moe @@ -3833,7 +3841,6 @@ def __exit__(self, exc_type, exc_value, traceback): if hasattr(block.mlp, "down_projs"): del block.mlp.down_projs delattr(block.mlp, "_orig_moe") - if hasattr(block.mlp, "_orig_moe_infer"): if block.mlp._orig_moe_infer is not None: block.mlp.moe_infer = block.mlp._orig_moe_infer @@ -3850,45 +3857,44 @@ def __exit__(self, exc_type, exc_value, traceback): def deepseek_v3_attn_forward( self, hidden_states: torch.Tensor, + position_embeddings=None, attention_mask: Optional[torch.Tensor] = None, - position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value=None, + past_key_values=None, + cache_position: Optional[torch.LongTensor] = None, output_attentions: bool = False, use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: # modified from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/modeling_deepseek.py#L751 def rotate_half(x): - """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] x2 = x[..., x.shape[-1] // 2 :] return torch.cat((-x2, x1), dim=-1) def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): orig_dtype = k.dtype - cos = cos[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] - q_fp32 = q.to(dtype=torch.float32, device=q.device) - k_fp32 = k.to(dtype=torch.float32, device=k.device) + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_fp32 = q.to(dtype=torch.float32) + k_fp32 = k.to(dtype=torch.float32) q_embed = (q_fp32 * cos) + (rotate_half(q_fp32) * sin) 
k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin)
         return q_embed.to(dtype=orig_dtype), k_embed.to(dtype=orig_dtype)
 
-    if not hasattr(self, "q_head_dim"):
-        self.q_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim
-
     if output_attentions:
         return self._orig_forward(
             hidden_states=hidden_states,
             attention_mask=attention_mask,
             position_ids=position_ids,
+            position_embeddings=position_embeddings,
             past_key_value=past_key_value,
+            past_key_values=past_key_values,
             output_attentions=output_attentions,
             use_cache=use_cache,
             cache_position=cache_position,
             **kwargs,
         )
 
     bsz, q_len, _ = hidden_states.size()
@@ -3897,60 +3903,84 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
         q = self.q_proj(hidden_states)
     else:
         q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
-    q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
-    q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+    q = q.view(bsz, q_len, self.num_heads, self.qk_head_dim).transpose(1, 2)
 
+    q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
     compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
-    compressed_kv, k_pe = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
-    k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
-    kv = (
-        self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
-        .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
-        .transpose(1, 2)
+
+    k_pass, k_rot = torch.split(
+        compressed_kv,
+        [self.kv_lora_rank, self.qk_rope_head_dim],
+        dim=-1,
     )
 
-    k_nope, value_states = torch.split(kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
-    kv_seq_len = value_states.shape[-2]
-    if past_key_value is not None:
-        if self.layer_idx is None:
-            raise ValueError(
-                f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
-                "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
-                "with a layer index."
-            )
-        kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+    k_pass = self.kv_b_proj(self.kv_a_layernorm(k_pass))
+    k_pass = k_pass.view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim).transpose(1, 2)
 
-    new_interface = False # Set to True if using new rotary embedding interface
-    if hasattr(self, "rotary_emb"):
-        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
-    else:
-        from transformers.models.deepseek_v3.modeling_deepseek_v3 import apply_rotary_pos_emb
+    k_pass, value_states = torch.split(
+        k_pass,
+        [self.qk_nope_head_dim, self.v_head_dim],
+        dim=-1,
+    )
+
+    k_rot = k_rot.view(bsz, 1, q_len, self.qk_rope_head_dim)
+
+    new_interface = position_embeddings is not None and not hasattr(self, "rotary_emb")
+
+    if new_interface:
+        from transformers.models.deepseek_v3.modeling_deepseek_v3 import apply_rotary_pos_emb_interleave
 
         cos, sin = position_embeddings
-        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin)
-        new_interface = True
-    q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
+        if getattr(self.config, "rope_interleave", False):
+            try:
+                q_pe, k_rot = apply_rotary_pos_emb_interleave(q_pe, k_rot, cos, sin)
+            except Exception as e:
+                raise RuntimeError(
+                    "Failed to apply interleaved rotary position embeddings, "
+                    f"which may be due to an incompatible transformers version; try `pip install transformers>=4.57.1`: {e}"
+                )
+        else:
+            q_pe, k_rot = apply_rotary_pos_emb(q_pe, k_rot, cos, sin)
 
-    # Difference with original code, k_pe.new_empty create constant tensor in torchscript
-    query_states = torch.concat([q_nope, q_pe], dim=-1)
-    # query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
-    # query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
-    # query_states[:, :, :, self.qk_nope_head_dim :] = q_pe
-    key_states = torch.concat([k_nope, k_pe.expand(-1, self.num_heads, -1, -1)], dim=-1)
-    # key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
-    # key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
-    # key_states[:, :, :, self.qk_nope_head_dim :] = k_pe
-    if past_key_value is not None:
-        cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
-        key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        kv_cache = past_key_values if past_key_values is not None else past_key_value
 
-    if attention_mask is not None:
-        if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
-            raise ValueError(
-                f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
-            )
+    else:
+        kv_seq_len = value_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+        kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        q_pe, k_rot = apply_rotary_pos_emb(q_pe, k_rot, cos, sin, position_ids)
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+
+        kv_cache = past_key_value
+
+    k_rot = k_rot.expand(*k_pass.shape[:-1], -1)
+    query_states = torch.cat((q_nope, q_pe), dim=-1)
+    key_states = torch.cat((k_pass, k_rot), dim=-1)
+
+    if kv_cache is not None:
+        if new_interface:
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = kv_cache.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+            if attention_mask is not None:
+                attention_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+
+        else:
+            cache_kwargs = {"sin": sin, "cos": cos}
+            key_states, value_states = kv_cache.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
     # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
     # Reference: https://github.com/pytorch/pytorch/issues/112577.
@@ -3967,8 +3997,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
         dropout_p=self.attention_dropout if self.training else 0.0,
         # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
         is_causal=self.is_causal and attention_mask is None and q_len > 1,
+        scale=None if not new_interface else self.scaling,
     )
-
     attn_output = attn_output.transpose(1, 2).contiguous()
 
     attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
@@ -3976,8 +4006,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     attn_output = self.o_proj(attn_output)
 
     if new_interface:
-        # Some models (e.g. gigachat3) expect 2-tuple return (attn_output, attn_weights)
-        # Returning 3-tuple breaks tracing with "too many values to unpack"
        return attn_output, None
 
     return attn_output, None, past_key_value
@@ -4141,25 +4169,23 @@ def deepseek_moe_infer(self, x, topk_ids, topk_weight):
 
 def deepseek_moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor):
     """
-    Vectorized MoE forward for DeepSeek-V3.
+    Vectorized MoE forward that matches the original per-expert behavior.
""" + orig_dtype = hidden_states.dtype num_experts = len(self.experts) - batch_tokens, hidden_dim = hidden_states.shape - + batch_tokens, _ = hidden_states.shape routing = torch.zeros(batch_tokens, num_experts, dtype=topk_weights.dtype, device=hidden_states.device) routing.scatter_(1, topk_indices, topk_weights) - - hidden_states = hidden_states.repeat(num_experts, 1) - hidden_states = hidden_states.view(num_experts, batch_tokens, hidden_dim) + expanded = hidden_states.unsqueeze(0).expand(num_experts, -1, -1) act_fn = self.experts[0].act_fn - gate = torch.bmm(hidden_states, self.gate_projs.transpose(1, 2)) - up = torch.bmm(hidden_states, self.up_projs.transpose(1, 2)) + gate = torch.bmm(expanded, self.gate_projs.transpose(1, 2)) + up = torch.bmm(expanded, self.up_projs.transpose(1, 2)) gate_up = act_fn(gate) * up next_states = torch.bmm(gate_up, self.down_projs.transpose(1, 2)) routing = routing.transpose(0, 1).unsqueeze(-1) next_states = next_states * routing next_states = next_states.sum(dim=0) - return next_states.type(hidden_states.dtype) + return next_states.to(orig_dtype) class Qwen2VLLanguageModelPatcher(OVDecoderModelPatcher): diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 4dfdea8647..89bcacf9a4 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -119,16 +119,13 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.46.0"): SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "phimoe") - if is_transformers_version("<", "4.54.0"): - SUPPORTED_ARCHITECTURES += ( - "deepseek", - "gigachat3", - ) - # gptq and awq install disabled for windows test environment if platform.system() != "Windows" and is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("opt_gptq", "mixtral_awq") + if is_transformers_version(">=", "4.53.0"): + SUPPORTED_ARCHITECTURES += ("deepseek", "gigachat3") + if is_transformers_version(">", "4.47"): SUPPORTED_ARCHITECTURES += ("olmo2",) @@ -294,11 +291,8 @@ def test_find_untested_architectures(self): if "llama4_text" in supported_architectures: supported_architectures.remove("llama4_text") - if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MAX_TRANSFORMERS_VERSION)): - if "deepseek_v2" in supported_architectures: - supported_architectures.remove("deepseek_v2") - if "deepseek_v3" in supported_architectures: - supported_architectures.remove("deepseek_v3") + if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): + supported_architectures -= {"deepseek_v2", "deepseek_v3"} if is_transformers_version("<", str(BitnetOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"bitnet"} if is_transformers_version("<", str(LFM2OpenVINOConfig.MIN_TRANSFORMERS_VERSION)): @@ -407,6 +401,10 @@ def test_compare_to_transformers(self, model_arch): if model_arch in ["gigachat3"]: tokens.pop("token_type_ids", None) + if model_arch == "deepseek": + ov_model.generation_config.do_sample = False + transformers_model.generation_config.do_sample = False + ov_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None ov_model.config.eos_token_id = None @@ -423,10 +421,6 @@ def test_compare_to_transformers(self, model_arch): ov_outputs = ov_model.generate(**tokens, generation_config=gen_config) - # TODO: add back once https://huggingface.co/katuni4ka/tiny-random-minicpm3/discussions/1 merged (for all models) as current modeling incompatible with transformers >= v4.49 - if 
model_arch in {"deepseek"} and is_transformers_version(">=", "4.49"): - self.skipTest("Incompatible modeling code") - additional_inputs = {} # gemma2 does not support dynamic cache, it is unfair to compare dynamic cache result vs hybrid cache, # align cache representation in torch model @@ -676,10 +670,6 @@ def test_beam_search(self, model_arch): if model_arch in ["lfm2", "granitemoehybrid"]: return - # TODO: add back once https://huggingface.co/katuni4ka/tiny-random-minicpm3/discussions/1 merged (for all models) as current modeling incompatible with transformers >= v4.49 - if model_arch in {"deepseek"} and is_transformers_version(">=", "4.49"): - self.skipTest("Incompatible modeling code") - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in REMOTE_CODE_MODELS) if model_arch == "persimmon": tokenizer.pad_token_id = tokenizer.bos_token_id diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 90332cd397..16b5d528cc 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -98,7 +98,7 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "4.48.0"): SUPPORTED_ARCHITECTURES.update({"cohere2": OVModelForCausalLM}) - if is_transformers_version(">=", "4.46.0") and is_transformers_version("<=", "4.53.3"): + if is_transformers_version(">=", "4.53.0"): SUPPORTED_ARCHITECTURES.update({"deepseek": OVModelForCausalLM, "gigachat3": OVModelForCausalLM}) if is_transformers_version(">=", "4.49"): diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 61a1fce622..ad649baecf 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -159,7 +159,7 @@ class OVCLIExportTestCase(unittest.TestCase): ] ) - if is_transformers_version(">=", "4.46.0") and is_transformers_version("<=", "4.53.3"): + if is_transformers_version(">=", "4.53.0"): SUPPORTED_ARCHITECTURES.extend( [ ("text-generation-with-past", "gigachat3"), diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 6bd099c4cd..f53bd364e1 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -399,7 +399,6 @@ "exaone4", "decilm", "minicpm3", - "deepseek", "qwen3_eagle3", ) @@ -554,8 +553,10 @@ def get_supported_model_for_library(library_name): if supported_model_type[model_type].get("openvino"): export_config = next(iter(supported_model_type[model_type]["openvino"].values())) - min_transformers = str(getattr(export_config.func, "MIN_TRANSFORMERS_VERSION", "0")) - max_transformers = str(getattr(export_config.func, "MAX_TRANSFORMERS_VERSION", "999")) + raw_min = getattr(export_config.func, "MIN_TRANSFORMERS_VERSION", None) + raw_max = getattr(export_config.func, "MAX_TRANSFORMERS_VERSION", None) + min_transformers = str(raw_min) if raw_min is not None else "0" + max_transformers = str(raw_max) if raw_max is not None else "999" if is_transformers_version(">=", min_transformers) and is_transformers_version("<=", max_transformers): valid_model.add(model_type) From 5dbc1c8c22e2a60a39f3d8b115740a485234efa3 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Thu, 26 Mar 2026 02:41:54 +0200 Subject: [PATCH 33/39] fix issues --- optimum/exporters/openvino/model_patcher.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index f20d3bf38c..a86be32aab 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ 
b/optimum/exporters/openvino/model_patcher.py @@ -3869,16 +3869,17 @@ def deepseek_v3_attn_forward( ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: # modified from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/modeling_deepseek.py#L751 def rotate_half(x): + """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] x2 = x[..., x.shape[-1] // 2 :] return torch.cat((-x2, x1), dim=-1) def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): orig_dtype = k.dtype - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_fp32 = q.to(dtype=torch.float32) - k_fp32 = k.to(dtype=torch.float32) + cos = cos[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] + q_fp32 = q.to(dtype=torch.float32, device=q.device) + k_fp32 = k.to(dtype=torch.float32, device=k.device) q_embed = (q_fp32 * cos) + (rotate_half(q_fp32) * sin) k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin) return q_embed.to(dtype=orig_dtype), k_embed.to(dtype=orig_dtype) From f7043c717aea9f7bf8ba2622a5e78d13d247870e Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Thu, 26 Mar 2026 02:43:31 +0200 Subject: [PATCH 34/39] fix issues --- optimum/exporters/openvino/model_patcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index a86be32aab..029ed27679 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3904,9 +3904,9 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): q = self.q_proj(hidden_states) else: q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) - q = q.view(bsz, q_len, self.num_heads, self.qk_head_dim).transpose(1, 2) q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) k_pass, k_rot = torch.split( From 2877a8e9be7974ae9437bb126e34dbbf0aa0809f Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Thu, 26 Mar 2026 02:50:07 +0200 Subject: [PATCH 35/39] fix issues --- optimum/exporters/openvino/model_patcher.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 029ed27679..d6c898fbbb 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4000,6 +4000,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): is_causal=self.is_causal and attention_mask is None and q_len > 1, scale=None if not new_interface else self.scaling, ) + attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) From 2c2d31bf7d8f7cd76a2d728a452b64debef5be01 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Thu, 26 Mar 2026 15:51:16 +0200 Subject: [PATCH 36/39] fix issues --- optimum/exporters/openvino/model_patcher.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index d6c898fbbb..4f1853d0a8 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3929,20 +3929,23 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, 
unsqueeze_dim=1):
     new_interface = position_embeddings is not None and not hasattr(self, "rotary_emb")
 
     if new_interface:
-        from transformers.models.deepseek_v3.modeling_deepseek_v3 import apply_rotary_pos_emb_interleave
+        from transformers.models.deepseek_v3.modeling_deepseek_v3 import (
+            apply_rotary_pos_emb as deepseek_v3_apply_rotary_pos_emb,
+            apply_rotary_pos_emb_interleave as deepseek_v3_apply_rotary_pos_emb_interleave,
+        )
 
         cos, sin = position_embeddings
 
         if getattr(self.config, "rope_interleave", False):
             try:
-                q_pe, k_rot = apply_rotary_pos_emb_interleave(q_pe, k_rot, cos, sin)
+                q_pe, k_rot = deepseek_v3_apply_rotary_pos_emb_interleave(q_pe, k_rot, cos, sin)
             except Exception as e:
                 raise RuntimeError(
                     "Failed to apply interleaved rotary position embeddings, "
                     f"which may be due to an incompatible transformers version; try `pip install transformers>=4.57.1`: {e}"
                 )
         else:
-            q_pe, k_rot = apply_rotary_pos_emb(q_pe, k_rot, cos, sin)
+            q_pe, k_rot = deepseek_v3_apply_rotary_pos_emb(q_pe, k_rot, cos, sin)

From 5cb2d8b17275f8de37601f02ae96e1b89c9d099d Mon Sep 17 00:00:00 2001
From: Mohamed-Ashraf273
Date: Thu, 26 Mar 2026 20:01:11 +0200
Subject: [PATCH 37/39] fix issues

---
 optimum/exporters/openvino/model_patcher.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 4f1853d0a8..50056b479c 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -3931,6 +3931,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     if new_interface:
         from transformers.models.deepseek_v3.modeling_deepseek_v3 import (
             apply_rotary_pos_emb as deepseek_v3_apply_rotary_pos_emb,
+        )
+        from transformers.models.deepseek_v3.modeling_deepseek_v3 import (
             apply_rotary_pos_emb_interleave as deepseek_v3_apply_rotary_pos_emb_interleave,
         )
 
@@ -4179,14 +4181,15 @@ def deepseek_moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor,
     orig_dtype = hidden_states.dtype
     num_experts = len(self.experts)
     batch_tokens, _ = hidden_states.shape
-    routing = torch.zeros(batch_tokens, num_experts, dtype=topk_weights.dtype, device=hidden_states.device)
-    routing.scatter_(1, topk_indices, topk_weights)
-    expanded = hidden_states.unsqueeze(0).expand(num_experts, -1, -1)
+    compute_dtype = torch.promote_types(hidden_states.dtype, self.gate_projs.dtype)
+    routing = torch.zeros(batch_tokens, num_experts, dtype=compute_dtype, device=hidden_states.device)
+    routing.scatter_(1, topk_indices, topk_weights.to(dtype=compute_dtype))
+    expanded = hidden_states.to(dtype=compute_dtype).unsqueeze(0).expand(num_experts, -1, -1)
     act_fn = self.experts[0].act_fn
-    gate = torch.bmm(expanded, self.gate_projs.transpose(1, 2))
-    up = torch.bmm(expanded, self.up_projs.transpose(1, 2))
+    gate = torch.bmm(expanded, self.gate_projs.to(dtype=compute_dtype).transpose(1, 2))
+    up = torch.bmm(expanded, self.up_projs.to(dtype=compute_dtype).transpose(1, 2))
     gate_up = act_fn(gate) * up
-    next_states = torch.bmm(gate_up, self.down_projs.transpose(1, 2))
+    next_states = torch.bmm(gate_up, self.down_projs.to(dtype=compute_dtype).transpose(1, 2))
     routing = routing.transpose(0, 1).unsqueeze(-1)
     next_states = next_states * routing
     next_states = next_states.sum(dim=0)

From 39e770cd612ddfef1cdca8dd13f5bc544e9e166f Mon Sep 17 00:00:00 2001
From: Mohamed Ashraf 
<117025882+Mohamed-Ashraf273@users.noreply.github.com> Date: Mon, 30 Mar 2026 16:29:00 +0200 Subject: [PATCH 38/39] Remove Flaubert and add GigaChat3 to models list --- docs/source/openvino/models.mdx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index 03ec2999e8..041f07a9a9 100644 --- a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -60,9 +60,8 @@ Here is the list of the supported architectures : - EXAONE 4 - Falcon - Falcon-Mamba -- Flaubert -- GigaChat3 - FlauBERT +- GigaChat3 - GLM-4 - GLM-Edge - GPT-2 @@ -187,4 +186,4 @@ Here is the list of the supported architectures : - All Transformer and CLIP-based models. ## [OpenCLIP](https://github.com/mlfoundations/open_clip) -- All CLIP-based models \ No newline at end of file +- All CLIP-based models From aec4a90159ead91bf4d55167febb706775db33f7 Mon Sep 17 00:00:00 2001 From: Mohamed-Ashraf273 Date: Mon, 30 Mar 2026 16:38:14 +0200 Subject: [PATCH 39/39] update docs --- docs/source/openvino/models.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index 041f07a9a9..6339ade837 100644 --- a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -186,4 +186,4 @@ Here is the list of the supported architectures : - All Transformer and CLIP-based models. ## [OpenCLIP](https://github.com/mlfoundations/open_clip) -- All CLIP-based models +- All CLIP-based models \ No newline at end of file
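
Note on the vectorized MoE rewrite: the batched-matmul dispatch that deepseek_moe
uses in the patches above can be sanity-checked against a plain per-expert loop
with a small standalone script. The sketch below is illustrative only and is not
part of the patch series: the sizes are made up, and it assumes SiLU as the expert
activation (the act_fn the patcher reads from self.experts[0]) and distinct top-k
expert indices per token, since routing.scatter_ would silently overwrite
duplicate indices.

    import torch

    torch.manual_seed(0)

    # Made-up sizes; the real values come from the model config.
    num_experts, top_k = 4, 2
    batch_tokens, hidden_dim, moe_inter_dim = 3, 8, 16

    # Stacked per-expert weights, analogous to the pre-concatenated
    # gate_projs / up_projs / down_projs built in DeepseekPatcher.__enter__.
    gate_projs = torch.randn(num_experts, moe_inter_dim, hidden_dim)
    up_projs = torch.randn(num_experts, moe_inter_dim, hidden_dim)
    down_projs = torch.randn(num_experts, hidden_dim, moe_inter_dim)

    hidden_states = torch.randn(batch_tokens, hidden_dim)
    topk_indices = torch.stack([torch.randperm(num_experts)[:top_k] for _ in range(batch_tokens)])
    topk_weights = torch.rand(batch_tokens, top_k)

    # Dense [tokens, experts] routing matrix, zero for unselected experts.
    routing = torch.zeros(batch_tokens, num_experts)
    routing.scatter_(1, topk_indices, topk_weights)

    # Vectorized path: run every token through every expert with three
    # batched matmuls, then mask and sum with the routing weights.
    expanded = hidden_states.unsqueeze(0).expand(num_experts, -1, -1)
    gate = torch.bmm(expanded, gate_projs.transpose(1, 2))
    up = torch.bmm(expanded, up_projs.transpose(1, 2))
    per_expert = torch.bmm(torch.nn.functional.silu(gate) * up, down_projs.transpose(1, 2))
    vectorized = (per_expert * routing.t().unsqueeze(-1)).sum(dim=0)

    # Reference path: explicit loop over each token's selected experts.
    reference = torch.zeros_like(hidden_states)
    for t in range(batch_tokens):
        for i, w in zip(topk_indices[t].tolist(), topk_weights[t]):
            g = gate_projs[i] @ hidden_states[t]
            u = up_projs[i] @ hidden_states[t]
            reference[t] += w * (down_projs[i] @ (torch.nn.functional.silu(g) * u))

    assert torch.allclose(vectorized, reference, atol=1e-5)
    print("vectorized MoE matches the per-expert loop")

Running every token through every expert and masking afterwards spends extra FLOPs,
but it removes the data-dependent control flow of the original expert loop, which is
what keeps the traced graph static for export; that is also why the patcher
pre-concatenates the expert weights once in __enter__ instead of indexing them per
token.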