Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions optimum/exporters/openvino/model_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@
InternVLChatImageEmbeddingModelPatcher,
JaisModelPatcher,
Lfm2ModelPatcher,
Lfm2MoeModelPatcher,
Llama4ImageEmbeddingsModelPatcher,
Llama4TextModelPatcher,
LlavaImageEmbeddingModelPatcher,
Expand Down Expand Up @@ -5499,3 +5500,16 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
)

return dummy_inputs


@register_in_tasks_manager(
    "lfm2_moe",
    "text-generation",
    "text-generation-with-past",
    library_name="transformers",
)
class LFM2MoeOpenVINOConfig(LFM2OpenVINOConfig):
    """OpenVINO export config for LFM2-MoE causal-LM models.

    Inherits the hybrid (attention + short-conv) export handling from
    ``LFM2OpenVINOConfig`` and swaps in the MoE-aware model patcher.
    """

    # transformers gained the lfm2_moe architecture in 5.1
    MIN_TRANSFORMERS_VERSION = "5.1"
    _MODEL_PATCHER = Lfm2MoeModelPatcher
76 changes: 76 additions & 0 deletions optimum/exporters/openvino/model_patcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -8422,3 +8422,79 @@ def __exit__(self, exc_type, exc_value, traceback):
sparse_moe_block = decoder_layer.mlp
decoder_layer.mlp.forward = decoder_layer.mlp._orig_forward
del sparse_moe_block.down_projs, sparse_moe_block.gate_projs, sparse_moe_block.up_projs


def lfm2_moe_experts_forward(
    self,
    hidden_states: torch.Tensor,
    top_k_index: torch.Tensor,
    top_k_weights: torch.Tensor,
) -> torch.Tensor:
    """Dense, trace-friendly MoE experts forward.

    Every expert processes every token via batched matmuls, and each expert's
    output is then scaled per token by the routing weight scattered into a
    dense ``(num_tokens, num_experts)`` matrix (zero for unrouted pairs).
    Avoiding data-dependent control flow keeps the graph exportable.

    Args:
        hidden_states: ``(num_tokens, hidden_dim)`` token activations.
        top_k_index: ``(num_tokens, k)`` selected expert ids per token.
        top_k_weights: ``(num_tokens, k)`` routing weights for those experts.

    Returns:
        ``(num_tokens, hidden_dim)`` mixture of expert outputs.
    """
    n_tokens, h_dim = hidden_states.shape
    n_experts = self.num_experts

    # Scatter the sparse top-k routing weights into a dense matrix; experts a
    # token was not routed to keep weight 0 and contribute nothing below.
    dense_weights = torch.zeros(
        n_tokens,
        n_experts,
        device=hidden_states.device,
        dtype=hidden_states.dtype,
    )
    dense_weights.scatter_(dim=1, index=top_k_index, src=top_k_weights.to(hidden_states.dtype))

    # Replicate the token batch once per expert -> (n_experts, n_tokens, h_dim).
    tokens_per_expert = hidden_states.repeat(n_experts, 1).view(n_experts, -1, h_dim)

    # Fused gate/up weights are stacked along dim -2; split them apart.
    w_gate, w_up = self.gate_up_proj.chunk(2, dim=-2)

    gated = self.act_fn(torch.bmm(tokens_per_expert, w_gate.transpose(1, 2)))
    up = torch.bmm(tokens_per_expert, w_up.transpose(1, 2))
    expert_out = torch.bmm(gated * up, self.down_proj.transpose(1, 2))

    # Per-token, per-expert scaling followed by a reduction over experts.
    expert_out = expert_out.view(n_experts, n_tokens, h_dim)
    mix = dense_weights.transpose(0, 1).view(n_experts, n_tokens)[..., None]
    return (expert_out * mix).sum(dim=0)


class Lfm2MoeModelPatcher(Lfm2ModelPatcher):
    """Export patcher for LFM2-MoE models.

    Extends the base LFM2 patching with:
    - replacing each short-conv layer's ``slow_forward`` with the
      export-friendly ``lfm2_short_conv_forward_patched``;
    - replacing the sparse MoE experts ``forward`` with the dense,
      trace-friendly ``lfm2_moe_experts_forward``.
    Both patches are reverted in ``__exit__``.
    """

    def __enter__(self):
        from transformers.models.lfm2_moe.modeling_lfm2_moe import (
            Lfm2MoeDecoderLayer,
            Lfm2MoeExperts,
            Lfm2MoeShortConv,
            Lfm2MoeSparseMoeBlock,
        )

        # Bug fix: the parent patcher must be entered exactly once. Entering
        # it twice would record the already-patched forward as the "original"
        # and break restoration in __exit__.
        super().__enter__()
        setattr(self._model, self.orig_forward_name, self.patched_forward)

        for layer in self._model.model.layers:
            # Swap the short-conv slow path for the export-friendly variant.
            if hasattr(layer, "conv") and isinstance(layer.conv, Lfm2MoeShortConv):
                conv_layer = layer.conv
                conv_layer._orig_forward = conv_layer.slow_forward
                conv_layer.slow_forward = types.MethodType(lfm2_short_conv_forward_patched, conv_layer)

            # Swap the sparse MoE experts forward for the dense variant.
            if isinstance(layer, Lfm2MoeDecoderLayer) and isinstance(layer.feed_forward, Lfm2MoeSparseMoeBlock):
                sparse_moe_block = layer.feed_forward
                if isinstance(sparse_moe_block.experts, Lfm2MoeExperts):
                    lfm2_moe_experts = sparse_moe_block.experts
                    lfm2_moe_experts._orig_forward = lfm2_moe_experts.forward
                    lfm2_moe_experts.forward = types.MethodType(lfm2_moe_experts_forward, lfm2_moe_experts)

    def __exit__(self, exc_type, exc_value, traceback):
        from transformers.models.lfm2_moe.modeling_lfm2_moe import (
            Lfm2MoeDecoderLayer,
            Lfm2MoeExperts,
            Lfm2MoeShortConv,
            Lfm2MoeSparseMoeBlock,
        )

        super().__exit__(exc_type, exc_value, traceback)
        setattr(self._model, self.orig_forward_name, self.model_orig_forward)

        for layer in self._model.model.layers:
            # Restore the original short-conv slow path.
            if hasattr(layer, "conv") and isinstance(layer.conv, Lfm2MoeShortConv):
                conv_layer = layer.conv
                conv_layer.slow_forward = conv_layer._orig_forward

            # Restore the original sparse MoE experts forward.
            if isinstance(layer, Lfm2MoeDecoderLayer) and isinstance(layer.feed_forward, Lfm2MoeSparseMoeBlock):
                sparse_moe_block = layer.feed_forward
                if isinstance(sparse_moe_block.experts, Lfm2MoeExperts):
                    lfm2_moe_experts = sparse_moe_block.experts
                    lfm2_moe_experts.forward = lfm2_moe_experts._orig_forward
2 changes: 1 addition & 1 deletion optimum/exporters/openvino/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ def get_submodels(model):
"minicpmo",
]

SSM_MODELS = ["mamba", "falcon_mamba", "zamba2", "lfm2", "granitemoehybrid", "qwen3_next"]
SSM_MODELS = ["mamba", "falcon_mamba", "zamba2", "lfm2", "lfm2_moe", "granitemoehybrid", "qwen3_next"]

# All transformers, diffusers, timm and sentence transformers models that are supported via optimum-onnx OnnxConfigs but that have currently no test
# TODO: add tests for all models that are compatible and remove support for all others
Expand Down
2 changes: 1 addition & 1 deletion optimum/intel/openvino/modeling_decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -1449,7 +1449,7 @@ def prepare_inputs_for_generation(
# decoding stage so it takes the last token
input_ids = input_ids[:, -1].unsqueeze(-1)

if self.config.model_type not in ["lfm2", "granitemoehybrid", "qwen3_next"]:
if self.config.model_type not in ["lfm2", "lfm2_moe", "granitemoehybrid", "qwen3_next"]:
# LFM2, GraniteMoeHybrid (Granite-4.0), and Qwen3-Next require the attention mask
# to be the length of the full context, so default mask from OVModelForCausalLM needs to be used.
# Other models like Mamba typically do not require an attention_mask
Expand Down
8 changes: 6 additions & 2 deletions tests/openvino/test_decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,9 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
if is_transformers_version(">=", "4.57.0") and is_transformers_version("<", "5"):
SUPPORTED_SSM_ARCHITECTURES += ("qwen3_next",)

if is_transformers_version(">=", "5.1.0"):
SUPPORTED_SSM_ARCHITECTURES += ("lfm2_moe",)

SUPPORTED_ARCHITECTURES += SUPPORTED_SSM_ARCHITECTURES

if is_transformers_version(">=", "4.48.0"):
Expand Down Expand Up @@ -200,6 +203,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
"gpt_neo": 4,
"gpt_neox": 5,
"lfm2": 1,
"lfm2_moe": 2,
"llama": 2,
"llama4": 5,
"marian": 2,
Expand Down Expand Up @@ -384,7 +388,7 @@ def test_compare_to_transformers(self, model_arch):
self.assertIsInstance(ov_outputs.cache_params.conv_states, list)
self.assertIsInstance(ov_outputs.cache_params.ssm_states, list)
self.assertTrue(len(ov_outputs.cache_params.conv_states) > 0)
if model_arch != "lfm2":
if model_arch not in ["lfm2", "lfm2_moe"]:
self.assertTrue(len(ov_outputs.cache_params.ssm_states) > 0)
else:
self.assertTrue("past_key_values" in ov_outputs)
Expand Down Expand Up @@ -707,7 +711,7 @@ def test_beam_search(self, model_arch):
# LFM2 fails with beam search, issue link: https://github.com/huggingface/transformers/issues/42257
# CVS-177964 GraniteMoeHybrid fails due to lack support of Beam search for hybrid models in OpenVINO
# For this support, we expect changes in IRs to have connected beam_idx with Mamba/Linear attention states
if model_arch in ["lfm2", "granitemoehybrid"]:
if model_arch in ["lfm2", "lfm2_moe", "granitemoehybrid"]:
return

# TODO: add back once https://huggingface.co/katuni4ka/tiny-random-minicpm3/discussions/1 merged (for all models) as current modeling incompatible with transformers >= v4.49
Expand Down
3 changes: 3 additions & 0 deletions tests/openvino/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ class ExportModelTest(unittest.TestCase):
if is_transformers_version(">=", "4.57.0") and is_transformers_version("<", "5"):
SUPPORTED_ARCHITECTURES.update({"qwen3_next": OVModelForCausalLM})

if is_transformers_version(">=", "5.1"):
SUPPORTED_ARCHITECTURES.update({"lfm2_moe": OVModelForCausalLM})

EXPECTED_DIFFUSERS_SCALE_FACTORS = {
"stable-diffusion-xl": {"vae_encoder": "128.0", "vae_decoder": "128.0"},
"stable-diffusion-3": {"text_encoder_3": "8.0"},
Expand Down
9 changes: 9 additions & 0 deletions tests/openvino/test_exporters_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,14 @@ class OVCLIExportTestCase(unittest.TestCase):
]
)

if is_transformers_version(">=", "5.1"):
SUPPORTED_ARCHITECTURES.extend(
[
("text-generation", "lfm2_moe"),
("text-generation-with-past", "lfm2_moe"),
]
)

EXPECTED_NUMBER_OF_TOKENIZER_MODELS = {
"gpt2": 2,
"t5": 2,
Expand All @@ -197,6 +205,7 @@ class OVCLIExportTestCase(unittest.TestCase):
"lfm2": 2
if is_openvino_version(">=", "2026.0")
else 0, # Tokenizers fail to convert on 2025.4, ticket: CVS-176880
"lfm2_moe": 2,
"llava": 2,
"sana": 2,
"ltx-video": 2,
Expand Down
2 changes: 2 additions & 0 deletions tests/openvino/utils_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@
"jais": "optimum-intel-internal-testing/tiny-random-jais",
"levit": "optimum-intel-internal-testing/tiny-random-LevitModel",
"lfm2": "optimum-intel-internal-testing/tiny-random-lfm2",
"lfm2_moe": "optimum-intel-internal-testing/tiny-random-lfm2-moe",
"longt5": "optimum-intel-internal-testing/tiny-random-longt5",
"llama": "optimum-intel-internal-testing/tiny-random-LlamaForCausalLM",
"llama_awq": "optimum-intel-internal-testing/tiny-random-LlamaForCausalLM",
Expand Down Expand Up @@ -369,6 +370,7 @@
"zamba2": {"model": 44},
"exaone4": {"model": 16},
"lfm2": {"model": 52 if is_transformers_version("<", "5") else 54},
"lfm2_moe": {"model": 54},
"hunyuan_v1_dense": {"model": 32},
"qwen3_eagle3": {"model": 20},
"qwen3_next": {"model": 100},
Expand Down