26 changes: 18 additions & 8 deletions fastdeploy/spec_decode/mtp.py
@@ -636,7 +636,7 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests:
self.model_inputs["not_need_stop"][0] = True
self.model_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer

def _initialize_forward_meta(self, step_use_cudagraph: bool = False):
def _initialize_forward_meta(self, step_use_cudagraph: bool = False, is_dummy_run: bool = False, substep: int = 0):
"""
Initialize forward meta and attention meta data
"""
@@ -672,7 +672,12 @@ def _initialize_forward_meta(self, step_use_cudagraph: bool = False):
for attn_backend in self.attn_backends:
attn_backend.init_attention_metadata(self.forward_meta)

self.forward_meta.step_use_cudagraph = step_use_cudagraph and self.draft_model_use_cudagraph
# Notes(liuzichang):
# 1. CUDA Graph capture sizes must be recorded in descending order (large → small).
# 2. In multi-step execution, only the first step should be captured.
self.forward_meta.step_use_cudagraph = (
step_use_cudagraph and self.draft_model_use_cudagraph and not (substep > 0 and is_dummy_run)
)
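
A minimal sketch of the gating rule the note above describes, lifted out of context for clarity (illustrative only, not part of this diff; the helper name is hypothetical, the argument names mirror the ones added here):

def should_capture_substep(step_use_cudagraph: bool,
                           draft_model_use_cudagraph: bool,
                           substep: int,
                           is_dummy_run: bool) -> bool:
    # Per the note above: during a dummy (capture) run, only substep 0 is
    # eligible for CUDA Graph capture; later substeps run eagerly.
    if is_dummy_run and substep > 0:
        return False
    # Otherwise the step uses CUDA Graph only when both the per-step flag and
    # the draft-model config flag allow it.
    return step_use_cudagraph and draft_model_use_cudagraph

The descending-order requirement from the same note shows up later in capture_model, where capture sizes are iterated with sorted(capture_sizes, reverse=True).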

def exist_prefill(self):
"""
@@ -774,7 +779,7 @@ def _post_process(self, sampled_token_ids):
self.model_inputs["step_idx"],
)

def _propose(self, step_use_cudagraph: bool = False):
def _propose(self, step_use_cudagraph: bool = False, is_dummy_run=False):
"""
Main process for MTP inference.
Args:
@@ -827,7 +832,9 @@ def _propose(self, step_use_cudagraph: bool = False):
self.model_inputs["output_padding_offset"].copy_(output_padding_offset, False)

# Initialize forward meta data
self._initialize_forward_meta(step_use_cudagraph=step_use_cudagraph)
self._initialize_forward_meta(
step_use_cudagraph=step_use_cudagraph, is_dummy_run=is_dummy_run, substep=substep
Copilot AI commented on Jan 6, 2026:

The variable 'is_dummy_run' is used but not defined in the _propose method. This will cause a NameError at runtime when _initialize_forward_meta is called. The _propose method signature only includes 'step_use_cudagraph' as a parameter, but 'is_dummy_run' is being passed to _initialize_forward_meta. You need to either add 'is_dummy_run' as a parameter to the _propose method or determine it from existing state/attributes.

Suggested change
step_use_cudagraph=step_use_cudagraph, is_dummy_run=is_dummy_run, substep=substep
step_use_cudagraph=step_use_cudagraph, substep=substep
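
For reference, a minimal sketch of the parameter-threading option the comment mentions, which is the route the updated _propose signature in this diff takes (only the relevant lines are shown; other arguments are elided, and the shape of the multi-step loop over substep is assumed):

def _run_impl(self, full_hidden_states, step_use_cudagraph: bool = False, is_dummy_run: bool = False):
    self._propose(step_use_cudagraph=step_use_cudagraph, is_dummy_run=is_dummy_run)

def _propose(self, step_use_cudagraph: bool = False, is_dummy_run: bool = False):
    for substep in range(self.num_model_steps):  # assumed multi-step loop
        self._initialize_forward_meta(
            step_use_cudagraph=step_use_cudagraph, is_dummy_run=is_dummy_run, substep=substep
        )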

)
self.forward_meta.batch_id_per_token.copy_(batch_id_per_token, False)

# Padding inputs for cuda graph
@@ -852,9 +859,10 @@
top_p_normalized_logprobs=self.model_inputs["top_p_normalized_logprobs"],
share_inputs=self.model_inputs,
)

# Note(liuzichang):
# paddle.clone would raise error 700 in cudaGraph mode
if self.num_model_steps > 1:
self.last_seq_lens_this_time = paddle.clone(self.model_inputs["seq_lens_this_time"])
self.last_seq_lens_this_time.copy_(self.model_inputs["seq_lens_this_time"], False)
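                # Illustration, not part of this diff: the in-place copy_ relies on the
                # destination buffer having been allocated once outside the captured
                # region (assumed here), e.g. something along the lines of
                #     self.last_seq_lens_this_time = paddle.full_like(
                #         self.model_inputs["seq_lens_this_time"], fill_value=-1
                #     )
                # so each step updates an existing tensor instead of allocating a new
                # one via paddle.clone, which (per the note above) raises error 700
                # under CUDA Graph.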

model_output = self.model(
ids_remove_padding=self.model_inputs["ids_remove_padding"],
@@ -1017,10 +1025,12 @@ def _extend_draft_token_with_ngram_match(self):
self.target_model_inputs["draft_tokens"][:] = draft_tokens.cuda()
self.target_model_inputs["seq_lens_this_time"][:] = seq_lens_this_time.cuda()

def _run_impl(self, full_hidden_states: paddle.Tensor, step_use_cudagraph: bool = False):
def _run_impl(
self, full_hidden_states: paddle.Tensor, step_use_cudagraph: bool = False, is_dummy_run: bool = False
):
"""Execute Draft Model"""
self._prepare_inputs(full_hidden_states)
self._propose(step_use_cudagraph=step_use_cudagraph)
self._propose(step_use_cudagraph=step_use_cudagraph, is_dummy_run=is_dummy_run)
self._update_status()
if self.hybrid_mode:
self._extend_draft_token_with_ngram_match()
43 changes: 2 additions & 41 deletions fastdeploy/worker/gpu_model_runner.py
@@ -1950,51 +1950,12 @@ def capture_model(self) -> None:
),
batch_size=int(capture_size / (self.speculative_config.num_speculative_tokens + 1)),
in_capturing=True,
expected_decode_len=self.speculative_config.num_speculative_tokens,
expected_decode_len=self.speculative_config.num_speculative_tokens * 2 + 1,
accept_all_drafts=True,
)
logger.info(
f"Warm up the Target model with the num_tokens:{capture_size}, expected_decode_len:{self.speculative_config.num_speculative_tokens}"
f"Warm up the model with the num_tokens:{capture_size}, expected_decode_len:{self.speculative_config.num_speculative_tokens}"
Comment on lines 1956 to +1957
Copilot AI commented on Jan 6, 2026:

The log message is inconsistent with the actual expected_decode_len value being passed to _dummy_run. The log says 'expected_decode_len:{self.speculative_config.num_speculative_tokens}' but the actual parameter passed on line 1953 is 'self.speculative_config.num_speculative_tokens * 2 + 1'. The log message should reflect the actual value being used to avoid confusion during debugging.
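
One way to keep the call and the log in sync (a sketch, not part of this PR; the local variable is illustrative, and num_tokens is left as in the surrounding call):

expected_decode_len = self.speculative_config.num_speculative_tokens * 2 + 1
self._dummy_run(
    num_tokens=...,  # unchanged from the surrounding call
    batch_size=int(capture_size / (self.speculative_config.num_speculative_tokens + 1)),
    in_capturing=True,
    expected_decode_len=expected_decode_len,
    accept_all_drafts=True,
)
logger.info(
    f"Warm up the model with the num_tokens:{capture_size}, expected_decode_len:{expected_decode_len}"
)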

)
if self.graph_opt_config.draft_model_use_cudagraph:
# Capture Draft Model without bsz 1
# NOTE(liujundong): expected_decode_len = 1, will affect mtp capture in cudagraph
for batch_size in sorted(capture_sizes, reverse=True):
if batch_size == 1:
logger.info("Skip token_num = 1, when capture Draft model for mtp")
else:
assert batch_size % 2 == 0
self._dummy_run(
num_tokens=(
self.scheduler_config.max_num_seqs
if self.scheduler_config.splitwise_role == "decode"
else self.scheduler_config.max_num_batched_tokens
),
batch_size=int(batch_size / 2),
in_capturing=True,
expected_decode_len=3,
accept_all_drafts=True,
)
logger.info(
f"Warm up the Draft model with the num_tokens:{batch_size}, expected_decode_len:{3}"
)
# Capture Draft Model with bsz 1
if 1 in capture_sizes:
self._dummy_run(
num_tokens=(
self.scheduler_config.max_num_seqs
if self.scheduler_config.splitwise_role == "decode"
else self.scheduler_config.max_num_batched_tokens
),
batch_size=int(1),
in_capturing=True,
expected_decode_len=3,
accept_all_drafts=False,
reject_all_drafts=True,
)
logger.info(
f"Warm up the Draft model with the num_tokens:{batch_size}, expected_decode_len:{3}"
)
else:
for batch_size in sorted(capture_sizes, reverse=True):
self._dummy_run(