2 files changed: +8 -2 lines changed
Original file line number | Diff line number | Diff line change
1515 get_layers_from_vllm_config ,
1616)
1717from vllm .distributed .parallel_state import get_pp_group
18- from vllm .forward_context import set_forward_context
18+ from vllm .forward_context import BatchDescriptor , set_forward_context
1919from vllm .logger import init_logger
2020from vllm .model_executor .layers .attention_layer_base import AttentionLayerBase
2121from vllm .model_executor .model_loader import get_model
@@ -1055,6 +1055,7 @@ def dummy_run(
10551055 self ,
10561056 num_tokens : int ,
10571057 use_cudagraphs = True ,
1058+ batch_descriptor : BatchDescriptor | None = None ,
10581059 ) -> None :
10591060 # Determine if CUDA graphs should be used for this run.
10601061 cudagraphs_enabled = use_cudagraphs and self .use_cuda_graph
@@ -1065,6 +1066,7 @@ def dummy_run(
10651066 None ,
10661067 self .vllm_config ,
10671068 num_tokens = num_tokens ,
1069+ batch_descriptor = batch_descriptor ,
10681070 cudagraph_runtime_mode = (
10691071 CUDAGraphMode .PIECEWISE if cudagraphs_enabled else CUDAGraphMode .NONE
10701072 ),
Original file line number | Diff line number | Diff line change
@@ -3585,7 +3585,11 @@ def _dummy_run(
35853585 cudagraph_runtime_mode == CUDAGraphMode .PIECEWISE
35863586 and not self .speculative_config .enforce_eager
35873587 )
3588- self .drafter .dummy_run (num_tokens , use_cudagraphs = use_cudagraphs )
3588+ self .drafter .dummy_run (
3589+ num_tokens ,
3590+ use_cudagraphs = use_cudagraphs ,
3591+ batch_descriptor = batch_descriptor ,
3592+ )
35893593
35903594 # This is necessary to avoid blocking DP.
35913595 # For dummy runs, we typically skip EPLB since we don't have any real
You can’t perform that action at this time.
0 commit comments