[Bugfix] Support EAGLE with LoRA CUDA graph specialization

gnovack · gnovack · commit a6ac802cf098 · 2025-11-07T18:48:01.000Z
Signed-off-by: gnovack <gnovack@amazon.com>
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
@@ -15,7 +15,7 @@
     get_layers_from_vllm_config,
 )
 from vllm.distributed.parallel_state import get_pp_group
-from vllm.forward_context import set_forward_context
+from vllm.forward_context import BatchDescriptor, set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.model_loader import get_model
@@ -1055,6 +1055,7 @@ def dummy_run(
         self,
         num_tokens: int,
         use_cudagraphs=True,
+        batch_descriptor: BatchDescriptor | None = None,
     ) -> None:
         # Determine if CUDA graphs should be used for this run.
         cudagraphs_enabled = use_cudagraphs and self.use_cuda_graph
@@ -1065,6 +1066,7 @@ def dummy_run(
             None,
             self.vllm_config,
             num_tokens=num_tokens,
+            batch_descriptor=batch_descriptor,
             cudagraph_runtime_mode=(
                 CUDAGraphMode.PIECEWISE if cudagraphs_enabled else CUDAGraphMode.NONE
             ),
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
@@ -3585,7 +3585,11 @@ def _dummy_run(
                     cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
                     and not self.speculative_config.enforce_eager
                 )
-                self.drafter.dummy_run(num_tokens, use_cudagraphs=use_cudagraphs)
+                self.drafter.dummy_run(
+                    num_tokens,
+                    use_cudagraphs=use_cudagraphs,
+                    batch_descriptor=batch_descriptor,
+                )
 
         # This is necessary to avoid blocking DP.
         # For dummy runs, we typically skip EPLB since we don't have any real