
Commit 683c627

suiyoubi authored and guyueh1 committed

Llama4 Export: Remove outdated MLP weight transform (NVIDIA-NeMo#14297)

* Update HFLlamaExporter to remove outdated MLP weight transform for Llama4 model

  Signed-off-by: Ao Tang <[email protected]>

* Apply isort and black reformatting

  Signed-off-by: suiyoubi <[email protected]>

---------

Signed-off-by: Ao Tang <[email protected]>
Signed-off-by: suiyoubi <[email protected]>
Co-authored-by: suiyoubi <[email protected]>
Signed-off-by: Guyue Huang <[email protected]>

1 parent 8755691 commit 683c627

File tree

1 file changed: +7, -0 lines changed

nemo/collections/llm/gpt/model/llama.py

Lines changed: 7 additions & 0 deletions
@@ -903,6 +903,13 @@ def convert_state(self, source, target, source_config=None):
                 "decoder.layers.*.mlp.experts.linear_fc1.weight": "model.layers.*.feed_forward.experts.gate_up_proj",
             }
         )
+
+        # Remove the transform with source_key "decoder.layers.*.mlp.linear_fc1.weight" from transforms:
+        # Llama4's HF model has a different mapping for the MLP weights (feed_forward instead of mlp)
+        transforms = [
+            t for t in transforms if getattr(t, "source_key", None) != "decoder.layers.*.mlp.linear_fc1.weight"
+        ]
+
         transforms.extend(
             [
                 io.state_transform(
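The change above boils down to a list-comprehension filter that drops any transform whose `source_key` matches the outdated MLP mapping, while keeping everything else (including the experts mapping added just before it). A minimal runnable sketch of that pattern, using a hypothetical `Transform` dataclass as a stand-in for NeMo's `io.state_transform` objects:

```python
from dataclasses import dataclass


# Hypothetical stand-in for NeMo's io.state_transform objects, which
# carry a source_key (Megatron-side name) and target_key (HF-side name).
@dataclass
class Transform:
    source_key: str
    target_key: str


OUTDATED_KEY = "decoder.layers.*.mlp.linear_fc1.weight"

transforms = [
    # Outdated dense-MLP mapping that no longer applies to Llama4's HF layout
    Transform(OUTDATED_KEY, "model.layers.*.mlp.gate_up_proj"),
    # Llama4 expert mapping from the commit, which must be kept
    Transform(
        "decoder.layers.*.mlp.experts.linear_fc1.weight",
        "model.layers.*.feed_forward.experts.gate_up_proj",
    ),
]

# getattr with a None default tolerates entries that have no source_key
# attribute at all, exactly as in the committed filter.
transforms = [
    t for t in transforms if getattr(t, "source_key", None) != OUTDATED_KEY
]

print([t.source_key for t in transforms])
```

The `getattr(t, "source_key", None)` guard matters because the transform list can mix plain mapping entries with callable transform objects, and only some of them expose a `source_key`.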
