Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion docs/source/openvino/models.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ Here is the list of the supported architectures :
- Falcon-Mamba
- FlauBERT
- GLM-4
- GLM-4.7-Flash
- GLM-Edge
- GPT-2
- GPT-BigCode
Expand Down Expand Up @@ -168,6 +169,7 @@ Here is the list of the supported architectures :
- Zamba2

## [Diffusers](https://huggingface.co/docs/diffusers/index)

- Stable Diffusion
- Stable Diffusion XL
- Latent Consistency
Expand All @@ -178,11 +180,14 @@ Here is the list of the supported architectures :
- LTX

## [Timm](https://huggingface.co/docs/timm/index)

- PiT
- ViT

## [Sentence Transformers](https://github.com/UKPLab/sentence-transformers)

- All Transformer and CLIP-based models.

## [OpenCLIP](https://github.com/mlfoundations/open_clip)
- All CLIP-based models

- All CLIP-based models
18 changes: 18 additions & 0 deletions optimum/exporters/openvino/model_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@
BlenderbotSmallModelPatcher,
BloomModelPatcher,
ChatGLMModelPatcher,
Glm4MoeLitePatcher,
CodeGenModelPatcher,
CommonImageEmbeddingsModelPatcher,
DBRXModelPatcher,
Expand Down Expand Up @@ -3976,6 +3977,23 @@ class GLM4OpenVINOConfig(LlamaOpenVINOConfig):
MIN_TRANSFORMERS_VERSION = "4.51.3"


@register_in_tasks_manager(
    "glm4_moe_lite",
    "text-generation",
    "text-generation-with-past",
    library_name="transformers",
)
class Glm4MoeLiteOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
    """OpenVINO export configuration for GLM-4 MoE Lite decoder models.

    Registers the ``glm4_moe_lite`` architecture for text generation (with and
    without KV cache) in the tasks manager. Requires transformers >= 5.0.0,
    which is where this architecture was introduced.
    """

    MIN_TRANSFORMERS_VERSION = "5.0.0"
    DEFAULT_ONNX_OPSET = 14
    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
    # Dummy-input generation: text inputs plus the MiniCPM3-style past-KV
    # generator — presumably because this model's KV-cache layout matches it;
    # TODO(review) confirm against the transformers cache implementation.
    DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, OVMiniCPM3DummyPastKeyValuesGenerator)
    DUMMY_PKV_GENERATOR_CLASS = OVMiniCPM3DummyPastKeyValuesGenerator
    # Patcher that switches the MoE experts to a trace-safe implementation.
    _MODEL_PATCHER = Glm4MoeLitePatcher


@register_in_tasks_manager(
"granite",
*[
Expand Down
12 changes: 12 additions & 0 deletions optimum/exporters/openvino/model_patcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -7744,6 +7744,18 @@ def __exit__(self, exc_type, exc_value, traceback):
del afmoe_moe.down_projs, afmoe_moe.gate_projs, afmoe_moe.up_projs


# Patch MoE implementation for glm4_moe_lite to enable correct Torch tracing.
# The default "eager" experts implementation in Glm4MoeLiteNaiveMoe.forward() uses
# torch.no_grad(), nonzero(), and a data-dependent loop over experts, producing
# inconsistent graphs for different inputs during torch.jit.trace.
# The transformers built-in "batched_mm" implementation uses index-based operations
# and batched matrix multiplications that are fully trace-safe.
class Glm4MoeLitePatcher(OVDecoderModelPatcher):
    """Decoder patcher that switches GLM-4 MoE Lite MoE experts to the
    trace-safe ``batched_mm`` implementation before export (see the module
    comment above for why the default ``eager`` path cannot be traced).
    """

    def __enter__(self):
        super().__enter__()
        # NOTE(review): the experts implementation is switched here but never
        # restored — there is no __exit__ override undoing this call, so the
        # in-memory model keeps "batched_mm" after the export context closes.
        # Other patchers in this file (e.g. the AFMoE one) revert their
        # changes on exit; confirm leaving this permanent is intentional.
        self._model.set_experts_implementation("batched_mm")


# adopted from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/models/llama/modeling_llama.py#L197
class LlamaEagle3Attention(LlamaAttention):
"""
Expand Down
4 changes: 4 additions & 0 deletions tests/openvino/test_decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,9 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
if is_transformers_version(">=", "4.57.0"):
SUPPORTED_ARCHITECTURES += ("hunyuan_v1_dense",)

if is_transformers_version(">=", "5.0.0"):
SUPPORTED_ARCHITECTURES += ("glm4_moe_lite",)

if is_transformers_version("<", "4.56.0"):
SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4")

Expand Down Expand Up @@ -262,6 +265,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
"zamba2": 1,
"bitnet": 6,
"hunyuan_v1_dense": 2,
"glm4_moe_lite": 2,
"qwen3_next": 1,
}
TASK = "text-generation"
Expand Down
3 changes: 3 additions & 0 deletions tests/openvino/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@ class ExportModelTest(unittest.TestCase):
if is_transformers_version(">=", "4.57.0"):
SUPPORTED_ARCHITECTURES.update({"hunyuan_v1_dense": OVModelForCausalLM})

if is_transformers_version(">=", "5.0.0"):
SUPPORTED_ARCHITECTURES.update({"glm4_moe_lite": OVModelForCausalLM})

if is_transformers_version(">=", "4.57.0") and is_transformers_version("<", "5"):
SUPPORTED_ARCHITECTURES.update({"qwen3_next": OVModelForCausalLM})

Expand Down
7 changes: 7 additions & 0 deletions tests/openvino/test_exporters_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,13 @@ class OVCLIExportTestCase(unittest.TestCase):
]
)

if is_transformers_version(">=", "5.0.0"):
SUPPORTED_ARCHITECTURES.extend(
[
("text-generation-with-past", "glm4_moe_lite"),
]
)

if is_transformers_version(">=", "4.57.0") and is_transformers_version("<", "5"):
SUPPORTED_ARCHITECTURES.extend(
[
Expand Down
3 changes: 3 additions & 0 deletions tests/openvino/test_quantization.py
Original file line number Diff line number Diff line change
Expand Up @@ -1081,6 +1081,9 @@ class OVWeightCompressionTest(unittest.TestCase):
SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "qwen3_vl", False))
SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForCausalLM, "hunyuan_v1_dense", False))

if is_transformers_version(">=", "5.0.0"):
SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForCausalLM, "glm4_moe_lite", False))

if is_transformers_version("<", "5"):
SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.extend(
[
Expand Down
2 changes: 2 additions & 0 deletions tests/openvino/utils_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@
"xglm": "optimum-intel-internal-testing/tiny-random-XGLMForCausalLM",
"xverse": "optimum-intel-internal-testing/tiny-random-xverse",
"glm4": "optimum-intel-internal-testing/tiny-random-glm4",
"glm4_moe_lite": "zai-org/GLM-4.7-Flash",
"glm": "optimum-intel-internal-testing/tiny-random-glm-edge",
"open-clip": "optimum-intel-internal-testing/tiny-open-clip-model",
"open-clip-ov": "optimum-intel-internal-testing/tiny-open-clip-model",
Expand Down Expand Up @@ -372,6 +373,7 @@
"hunyuan_v1_dense": {"model": 32},
"qwen3_eagle3": {"model": 20},
"qwen3_next": {"model": 100},
"glm4_moe_lite": {"model": 16},
}

TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg"
Expand Down
Loading