Commit aca9493
Fix speculator model integration by detecting speculators before ModelConfig creation
When using `vllm serve` with a speculator model path directly (e.g., RedHatAI/Llama-3.1-8B-Instruct-speculator.eagle3), tokenizer loading failed because ModelConfig was created with the speculator path before maybe_override_with_speculators() could swap it to the target model path.

This fix moves the maybe_override_with_speculators() call to BEFORE create_model_config(), ensuring that:

1. Speculator models are detected early
2. The target model path is extracted from the speculators config
3. ModelConfig is created with the correct target model path
4. The tokenizer loads successfully from the target model

Signed-off-by: Rahul Tuli <[email protected]>
1 parent f177da1 commit aca9493
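The reordering can be sketched as a minimal, self-contained simulation. The SPECULATORS mapping, the target model path, and the dict-based stand-ins for ModelConfig are illustrative assumptions for this sketch, not vLLM's actual API:

```python
# Hypothetical sketch of the ordering fix: run maybe_override_with_speculators()
# BEFORE building the model config, so the config (and hence the tokenizer)
# points at the target model rather than the speculator checkpoint.
# The mapping and config objects below are simulated, not vLLM internals.

# Simulated speculators config: speculator checkpoint -> target model path.
SPECULATORS = {
    "RedHatAI/Llama-3.1-8B-Instruct-speculator.eagle3":
        "meta-llama/Llama-3.1-8B-Instruct",  # assumed target for illustration
}

def maybe_override_with_speculators(model: str, tokenizer: str):
    """If `model` is a speculator checkpoint, swap in the target model path."""
    target = SPECULATORS.get(model)
    if target is None:
        return model, tokenizer, None  # not a speculator: nothing changes
    speculative_config = {"method": "eagle3", "model": model}
    return target, target, speculative_config

def create_model_config(model: str, tokenizer: str) -> dict:
    # Stand-in for ModelConfig: records the paths the tokenizer loads from.
    return {"model": model, "tokenizer": tokenizer}

def create_engine_config(model: str) -> dict:
    tokenizer = model
    # Fixed ordering: detect speculators first, then build the model config.
    model, tokenizer, speculative_config = maybe_override_with_speculators(
        model, tokenizer)
    model_config = create_model_config(model, tokenizer)
    model_config["speculative_config"] = speculative_config
    return model_config

cfg = create_engine_config("RedHatAI/Llama-3.1-8B-Instruct-speculator.eagle3")
print(cfg["tokenizer"])  # the target path, so tokenizer loading succeeds
```

With the old ordering, `create_model_config` would have run first and captured the speculator path, which has no usable tokenizer; detecting the speculator up front means every downstream consumer of the config sees the target model.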

File tree

1 file changed: +6 −4 lines changed

vllm/engine/arg_utils.py

Lines changed: 6 additions & 4 deletions
```diff
@@ -1275,10 +1275,8 @@ def create_engine_config(
 
         device_config = DeviceConfig(device=cast(Device, current_platform.device_type))
 
-        model_config = self.create_model_config()
-        self.model = model_config.model
-        self.tokenizer = model_config.tokenizer
-
+        # Check if the model is a speculator and override model/tokenizer/config
+        # BEFORE creating ModelConfig, so the config is created with the target model
         (self.model, self.tokenizer, self.speculative_config) = (
             maybe_override_with_speculators(
                 model=self.model,
@@ -1289,6 +1287,10 @@ def create_engine_config(
             )
         )
 
+        model_config = self.create_model_config()
+        self.model = model_config.model
+        self.tokenizer = model_config.tokenizer
+
         # * If VLLM_USE_V1 is unset, we enable V1 for "supported features"
         # and fall back to V0 for experimental or unsupported features.
         # * If VLLM_USE_V1=1, we enable V1 for supported + experimental
```
