Commit 54e44ad

mikeiovine authored and dominicshanshan committed
[https://nvbugs/5455836][fix] Fix llama 4 FP4 (NVIDIA#6911)
Signed-off-by: Mike Iovine <[email protected]> Signed-off-by: Wangshanshan <[email protected]>
1 parent 9df91e5 commit 54e44ad

File tree

1 file changed: +3 −0 lines changed


tensorrt_llm/_torch/models/modeling_llama.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -183,6 +183,9 @@ def _forward_nope(
                                   mrope_config,
                                   attention_sinks=None)

+        if isinstance(attn_output, tuple):
+            attn_output = Fp4QuantizedTensor(attn_output[0], attn_output[1])
+
         attn_output = self.o_proj(attn_output,
                                   all_reduce_params=all_reduce_params)
```
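The fix normalizes the attention output before it reaches `o_proj`: when the attention backend emits FP4-quantized output, it returns a `(data, scale)` tuple rather than a plain tensor, and the tuple must be wrapped so the projection layer receives a single typed object. A minimal sketch of that pattern, using a simplified stand-in for TensorRT-LLM's `Fp4QuantizedTensor` (the real class lives inside `tensorrt_llm._torch`; the field names and the helper below are illustrative assumptions, not the library's API):

```python
from dataclasses import dataclass


@dataclass
class Fp4QuantizedTensor:
    # Stand-in for the real TensorRT-LLM class: packed FP4 payload
    # plus its per-block scaling factors.
    fp4_tensor: object
    scaling_factor: object


def normalize_attn_output(attn_output):
    """Wrap a (fp4_data, scales) pair into one object; pass plain tensors through."""
    if isinstance(attn_output, tuple):
        attn_output = Fp4QuantizedTensor(attn_output[0], attn_output[1])
    return attn_output


# A plain tensor is returned unchanged; a tuple is wrapped.
plain = normalize_attn_output("tensor")
wrapped = normalize_attn_output(("fp4_data", "scales"))
```

This keeps the downstream `o_proj` call shape-agnostic: it always receives one argument, whether or not FP4 quantization was active in the attention path.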
