Skip to content

Commit 271d4b0

Browse files
2ez4bzNV authored and Shreyas committed
[fix] Fix Mistral3VLM weight-loading & enable in pre-merge (NVIDIA#6105)
Signed-off-by: William Zhang <[email protected]> Signed-off-by: Shreyas Misra <[email protected]>
1 parent 35ee1a5 commit 271d4b0

File tree

4 files changed

+17
-7
lines changed

4 files changed

+17
-7
lines changed

tensorrt_llm/_torch/models/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from .modeling_hyperclovax import HCXVisionForCausalLM
1111
from .modeling_llama import LlamaForCausalLM
1212
from .modeling_llava_next import LlavaNextModel
13-
from .modeling_mistral import MistralForCausalLM
13+
from .modeling_mistral import Mistral3VLM, MistralForCausalLM
1414
from .modeling_mixtral import MixtralForCausalLM
1515
from .modeling_nemotron import NemotronForCausalLM
1616
from .modeling_nemotron_h import NemotronHForCausalLM
@@ -39,6 +39,7 @@
3939
"HCXVisionForCausalLM",
4040
"LlamaForCausalLM",
4141
"LlavaNextModel",
42+
"Mistral3VLM",
4243
"MistralForCausalLM",
4344
"MixtralForCausalLM",
4445
"NemotronForCausalLM",

tensorrt_llm/_torch/models/modeling_mistral.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,8 @@ def __init__(
296296

297297
llm_model_config = self._get_sub_model_config(model_config,
298298
"text_config")
299+
# This is necessary for the auto weight mapper to figure out what it needs.
300+
llm_model_config.pretrained_config.architectures = config.architectures
299301
self.llm = MistralForCausalLM(llm_model_config)
300302

301303
self._device = "cuda"

tests/integration/defs/local_venv.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"""
55
import copy
66
import os
7+
import shlex
78
import subprocess
89
import tempfile
910
import textwrap as tw
@@ -116,12 +117,17 @@ def run_cmd(self,
116117
new_env = os.environ
117118

118119
if caller.__name__ == 'check_output':
119-
result = subprocess.run(call_args,
120-
env=new_env,
121-
check=True,
122-
capture_output=True,
123-
**kwargs)
124-
return result.stdout.decode('utf-8')
120+
try:
121+
result = subprocess.run(call_args,
122+
env=new_env,
123+
check=True,
124+
capture_output=True,
125+
**kwargs)
126+
return result.stdout.decode('utf-8')
127+
except subprocess.CalledProcessError as e:
128+
raise RuntimeError(f"Failed to run `{shlex.join(e.cmd)}`:\n"
129+
f"Stdout: {e.stdout.decode()}\n"
130+
f"Stderr: {e.stderr.decode()}\n")
125131
else:
126132
print(f"Start subprocess with {caller}({call_args}, env={new_env})")
127133
return caller(call_args, env=new_env, **kwargs)

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ l0_h100:
193193
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency]
194194
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance]
195195
- test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-True]
196+
- test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
196197
- condition:
197198
ranges:
198199
system_gpu_count:

0 commit comments

Comments (0)