From 66e4d6dc14aefe5f67e4e4f5f142d57faaa3768d Mon Sep 17 00:00:00 2001
From: baonudesifeizhai
Date: Thu, 18 Sep 2025 17:13:34 -0400
Subject: [PATCH 01/10] fix

Signed-off-by: baonudesifeizhai
---
 vllm/attention/layers/cross_attention.py | 2 ++
 vllm/v1/worker/gpu_model_runner.py       | 6 +++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/vllm/attention/layers/cross_attention.py b/vllm/attention/layers/cross_attention.py
index 9400c5bffa38..8a538cc44211 100644
--- a/vllm/attention/layers/cross_attention.py
+++ b/vllm/attention/layers/cross_attention.py
@@ -37,6 +37,8 @@ def _get_cross_slot_mapping(encoder_seq_lens: np.ndarray,
                             device: torch.device) -> torch.Tensor:
     """Get cross-attention slot mappings."""
+    encoder_seq_lens = np.atleast_1d(encoder_seq_lens)
+
     block_size = kv_cache_spec.block_size

     slot_mappings = []

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 053e8f0537ed..d97eb84de931 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -886,7 +886,11 @@ def _get_encoder_seq_lens(

         # Build encoder_seq_lens array mapping request indices to
         # encoder lengths for inputs scheduled in this batch
-        encoder_seq_lens = np.zeros(num_reqs, dtype=np.int32)
+        num_reqs_int = int(num_reqs)
+        if num_reqs_int == 0:
+            return np.zeros((0,), dtype=np.int32)
+
+        encoder_seq_lens = np.zeros((num_reqs_int,), dtype=np.int32)
         for req_id in scheduler_output.scheduled_encoder_inputs:
             req_index = self.input_batch.req_id_to_index[req_id]
             encoder_seq_lens[req_index] = self.max_encoder_len

From 1b215ee1d6eb4b9e2ea5fba4f5b9cb9e9c178243 Mon Sep 17 00:00:00 2001
From: baonudesifeizhai
Date: Thu, 18 Sep 2025 17:29:39 -0400
Subject: [PATCH 02/10] yapf

Signed-off-by: baonudesifeizhai
---
 vllm/v1/worker/gpu_model_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index d97eb84de931..c851260ecb5e 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -888,9 +888,9 @@ def _get_encoder_seq_lens(
         # encoder lengths for inputs scheduled in this batch
         num_reqs_int = int(num_reqs)
         if num_reqs_int == 0:
-            return np.zeros((0,), dtype=np.int32)
+            return np.zeros((0, ), dtype=np.int32)

-        encoder_seq_lens = np.zeros((num_reqs_int,), dtype=np.int32)
+        encoder_seq_lens = np.zeros((num_reqs_int, ), dtype=np.int32)
         for req_id in scheduler_output.scheduled_encoder_inputs:
             req_index = self.input_batch.req_id_to_index[req_id]
             encoder_seq_lens[req_index] = self.max_encoder_len
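
Note on PATCH 01: the np.atleast_1d guard protects _get_cross_slot_mapping when
encoder_seq_lens arrives as a 0-d array or NumPy scalar, which cannot be iterated
or indexed the way a 1-D array can. The snippet below is a standalone illustration
of that failure mode, not vLLM code; the variable names are made up.

import numpy as np

seq_lens = np.int32(128)            # 0-d scalar, e.g. a single encoder input
# len(seq_lens) or `for x in seq_lens:` would raise TypeError at this point.
seq_lens = np.atleast_1d(seq_lens)  # -> array([128], dtype=int32), shape (1,)
for seq_len in seq_lens:            # now behaves like the multi-request case
    print(int(seq_len))
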
From 4fda4fcc3912d398723c0fca2820af23d37e9123 Mon Sep 17 00:00:00 2001
From: baonudesifeizhai <85092850+baonudesifeizhai@users.noreply.github.com>
Date: Thu, 18 Sep 2025 17:49:08 -0400
Subject: [PATCH 03/10] Update vllm/v1/worker/gpu_model_runner.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: baonudesifeizhai <85092850+baonudesifeizhai@users.noreply.github.com>
---
 vllm/v1/worker/gpu_model_runner.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index c851260ecb5e..9d932c7cb320 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -886,11 +886,7 @@ def _get_encoder_seq_lens(

         # Build encoder_seq_lens array mapping request indices to
         # encoder lengths for inputs scheduled in this batch
-        num_reqs_int = int(num_reqs)
-        if num_reqs_int == 0:
-            return np.zeros((0, ), dtype=np.int32)
-
-        encoder_seq_lens = np.zeros((num_reqs_int, ), dtype=np.int32)
+        encoder_seq_lens = np.zeros((int(num_reqs),), dtype=np.int32)
         for req_id in scheduler_output.scheduled_encoder_inputs:
             req_index = self.input_batch.req_id_to_index[req_id]
             encoder_seq_lens[req_index] = self.max_encoder_len

From e8d12d7c41b19cabe0563aa58ed79096eff77781 Mon Sep 17 00:00:00 2001
From: baonudesifeizhai
Date: Thu, 18 Sep 2025 18:00:31 -0400
Subject: [PATCH 04/10] yapf

---
 vllm/v1/worker/gpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 9d932c7cb320..958e9b3e0330 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -886,7 +886,7 @@ def _get_encoder_seq_lens(

         # Build encoder_seq_lens array mapping request indices to
         # encoder lengths for inputs scheduled in this batch
-        encoder_seq_lens = np.zeros((int(num_reqs),), dtype=np.int32)
+        encoder_seq_lens = np.zeros((int(num_reqs), ), dtype=np.int32)
         for req_id in scheduler_output.scheduled_encoder_inputs:
             req_index = self.input_batch.req_id_to_index[req_id]
             encoder_seq_lens[req_index] = self.max_encoder_len

From 88059d3651db38e5ba1cd6c548a18b601ea827a6 Mon Sep 17 00:00:00 2001
From: baonudesifeizhai
Date: Thu, 18 Sep 2025 23:01:59 -0400
Subject: [PATCH 05/10] fix

---
 vllm/v1/worker/gpu_model_runner.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 958e9b3e0330..80d64d22e7e5 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -3474,6 +3474,21 @@ def initialize_cudagraph_capture(self) -> None:
                 CUDAGraphMode.NONE
             logger.warning(msg)

+        if self.model_config.is_encoder_decoder:
+            if cudagraph_mode in (CUDAGraphMode.FULL,
+                                  CUDAGraphMode.FULL_AND_PIECEWISE):
+                logger.warning(
+                    "CUDA graph decode-only mode required for encoder-decoder "
+                    "models; setting cudagraph_mode=FULL_DECODE_ONLY")
+                cudagraph_mode = self.compilation_config.cudagraph_mode = \
+                    CUDAGraphMode.FULL_DECODE_ONLY
+            elif cudagraph_mode == CUDAGraphMode.PIECEWISE:
+                logger.warning(
+                    "Encoder-decoder models do not support cudagraph prefill "
+                    "capture; setting cudagraph_mode=NONE")
+                cudagraph_mode = self.compilation_config.cudagraph_mode = \
+                    CUDAGraphMode.NONE
+
         # double check that we can support full cudagraph if they are requested
         # even after automatic downgrades
         if cudagraph_mode.has_full_cudagraphs() \

From 28800dc785d7f882d33bbd814c06029785d2c7d4 Mon Sep 17 00:00:00 2001
From: baonudesifeizhai
Date: Thu, 18 Sep 2025 23:38:54 -0400
Subject: [PATCH 06/10] fix and debug

---
 vllm/v1/worker/gpu_model_runner.py | 66 +++++++++++++++++++++++-------
 1 file changed, 52 insertions(+), 14 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 80d64d22e7e5..902603505869 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -257,6 +257,8 @@ def __init__(

         # mm_hash -> encoder_output
         self.encoder_cache: dict[str, torch.Tensor] = {}
+        self._encoder_cudagraph_buffers: dict[tuple[Any, ...],
+                                              torch.Tensor] = {}

         self.use_aux_hidden_state_outputs = False
         # Set up speculative decoding.
@@ -1569,8 +1571,58 @@ def _extract_encoder_inputs(
             #   input_features=...)
             encoder_features.update(mm_kwargs_group)

+        if self._should_use_encoder_cudagraph_buffers():
+            encoder_features = self._prepare_encoder_inputs_for_cudagraph(
+                encoder_features)
+
         return encoder_features

+    def _should_use_encoder_cudagraph_buffers(self) -> bool:
+        return self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+
+    def _prepare_encoder_inputs_for_cudagraph(
+        self,
+        encoder_inputs: dict[str, Any],
+    ) -> dict[str, Any]:
+        input_features = encoder_inputs.get("input_features")
+        if input_features is None:
+            return encoder_inputs
+
+        encoder_inputs["input_features"] = self._copy_to_cudagraph_buffer(
+            ("input_features", ), input_features)
+        return encoder_inputs
+
+    def _copy_to_cudagraph_buffer(
+        self,
+        key_prefix: tuple[Any, ...],
+        value: Any,
+    ) -> Any:
+        if isinstance(value, torch.Tensor):
+            key = key_prefix + (tuple(value.shape), value.dtype, value.device)
+            buffer = self._encoder_cudagraph_buffers.get(key)
+            if buffer is None:
+                buffer = torch.empty_like(value)
+                self._encoder_cudagraph_buffers[key] = buffer
+            else:
+                assert (buffer.shape == value.shape
+                        and buffer.dtype == value.dtype
+                        and buffer.device == value.device), (
+                            "CUDAGraph buffer mismatch for encoder inputs.")
+            buffer.copy_(value)
+            return buffer
+
+        if isinstance(value, list):
+            return [
+                self._copy_to_cudagraph_buffer(key_prefix + (idx, ), item)
+                for idx, item in enumerate(value)
+            ]
+        if isinstance(value, tuple):
+            return tuple(
+                self._copy_to_cudagraph_buffer(key_prefix + (idx, ), item)
+                for idx, item in enumerate(value))
+
+        return value
+
     def get_model(self) -> nn.Module:
         # get raw model out of the cudagraph wrapper.
         if isinstance(self.model, (CUDAGraphWrapper, UBatchWrapper)):
@@ -3474,20 +3526,6 @@ def initialize_cudagraph_capture(self) -> None:
                 CUDAGraphMode.NONE
             logger.warning(msg)

-        if self.model_config.is_encoder_decoder:
-            if cudagraph_mode in (CUDAGraphMode.FULL,
-                                  CUDAGraphMode.FULL_AND_PIECEWISE):
-                logger.warning(
-                    "CUDA graph decode-only mode required for encoder-decoder "
-                    "models; setting cudagraph_mode=FULL_DECODE_ONLY")
-                cudagraph_mode = self.compilation_config.cudagraph_mode = \
-                    CUDAGraphMode.FULL_DECODE_ONLY
-            elif cudagraph_mode == CUDAGraphMode.PIECEWISE:
-                logger.warning(
-                    "Encoder-decoder models do not support cudagraph prefill "
-                    "capture; setting cudagraph_mode=NONE")
-                cudagraph_mode = self.compilation_config.cudagraph_mode = \
-                    CUDAGraphMode.NONE
-
         # double check that we can support full cudagraph if they are requested
         # even after automatic downgrades
         if cudagraph_mode.has_full_cudagraphs() \

From 51e8742bb809529b7ed6cf1de5f0959b49ff067c Mon Sep 17 00:00:00 2001
From: baonudesifeizhai
Date: Fri, 19 Sep 2025 00:36:51 -0400
Subject: [PATCH 07/10] yapf fix

---
 vllm/v1/worker/gpu_model_runner.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 902603505869..5a016193b745 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -3526,7 +3526,6 @@ def initialize_cudagraph_capture(self) -> None:
                 CUDAGraphMode.NONE
             logger.warning(msg)

-
         # double check that we can support full cudagraph if they are requested
         # even after automatic downgrades
         if cudagraph_mode.has_full_cudagraphs() \
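
Note on PATCH 06: _copy_to_cudagraph_buffer exists because CUDA graph replay reads
its inputs from fixed device addresses, so per-step encoder tensors have to be
copied into persistent buffers rather than passed through directly. Below is a
minimal, standalone sketch of that pattern only; the names are made up and it is
not the vLLM implementation.

import torch

_buffers: dict[tuple, torch.Tensor] = {}

def stage_for_graph(name: str, value: torch.Tensor) -> torch.Tensor:
    # One persistent buffer per (name, shape, dtype, device) combination.
    key = (name, tuple(value.shape), value.dtype, value.device)
    buf = _buffers.get(key)
    if buf is None:
        buf = _buffers[key] = torch.empty_like(value)  # allocated once; stable address
    buf.copy_(value)  # refresh the contents in place on every step
    return buf
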
From 1195357cb593065689d8c2922e8c792c4efdd3df Mon Sep 17 00:00:00 2001
From: baonudesifeizhai
Date: Fri, 19 Sep 2025 17:31:54 -0400
Subject: [PATCH 08/10] add log

---
 vllm/v1/worker/gpu_model_runner.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 5a016193b745..015d08df58a1 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1860,6 +1860,7 @@ def _preprocess(
                Optional[torch.Tensor], torch.Tensor,
                Optional[IntermediateTensors], dict[str, Any]]:

+        preprocess_start = time.perf_counter()
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         if ubatch_slices:
             assert num_tokens_after_padding is not None
@@ -1921,7 +1922,7 @@ def _preprocess(
             encoder_inputs = self._extract_encoder_inputs(scheduler_output)
             model_kwargs.update(encoder_inputs)

-        return (
+        preprocess_result = (
             num_scheduled_tokens,
             num_input_tokens,
             num_tokens_after_padding,
@@ -1932,11 +1933,17 @@ def _preprocess(
             model_kwargs,
         )

+        logger.info("[perf] preprocess took %.2f ms",
+                    (time.perf_counter() - preprocess_start) * 1e3)
+
+        return preprocess_result
+
     def _sample(
             self, logits: Optional[torch.Tensor],
             spec_decode_metadata: Optional[SpecDecodeMetadata]
     ) -> SamplerOutput:
         # Sample the next token and get logprobs if needed.
+        sample_start = time.perf_counter()
         sampling_metadata = self.input_batch.sampling_metadata
         if spec_decode_metadata is None:
             sampler_output = self.sampler(
@@ -1970,6 +1977,9 @@ def _sample(
             sampler_output.sampled_token_ids = output_token_ids
             self._update_states_after_model_execute(output_token_ids)

+        logger.info("[perf] sampling took %.2f ms",
+                    (time.perf_counter() - sample_start) * 1e3)
+
         return sampler_output

     def _bookkeeping_sync(
@@ -2163,6 +2173,7 @@ def execute_model(
                 ), record_function_or_nullcontext("Forward"),
                 self.maybe_get_kv_connector_output(scheduler_output) as
                 kv_connector_output):
+            forward_start = time.perf_counter()
             model_output = self.model(
                 input_ids=input_ids,
                 positions=positions,
@@ -2170,6 +2181,8 @@ def execute_model(
                 inputs_embeds=inputs_embeds,
                 **model_kwargs,
             )
+            logger.info("[perf] model forward took %.2f ms",
+                        (time.perf_counter() - forward_start) * 1e3)

         with record_function_or_nullcontext("Postprocess"):
             if self.use_aux_hidden_state_outputs:
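
Note on PATCH 08: the [perf] logging added here looks like debug instrumentation and
is dropped again in PATCH 09. One caveat with this style of measurement: CUDA kernels
launch asynchronously, so a host-side time.perf_counter() around the forward call
largely measures launch overhead unless the device is synchronized first. A hedged
sketch of device-side timing with CUDA events follows; it is illustrative only and
not part of this series, and it requires a CUDA build.

import torch

def timed_forward(model, *args, **kwargs):
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    out = model(*args, **kwargs)
    end.record()
    torch.cuda.synchronize()  # wait for the recorded work to finish
    print(f"forward took {start.elapsed_time(end):.2f} ms")
    return out
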
From b28a68a2ca19452076d5f1688bac56fe835bc239 Mon Sep 17 00:00:00 2001
From: baonudesifeizhai
Date: Fri, 19 Sep 2025 18:27:07 -0400
Subject: [PATCH 09/10] remove logger

---
 vllm/v1/worker/gpu_model_runner.py | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 6a64386cfd6e..1ae2021e78d2 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1918,7 +1918,6 @@ def _preprocess(
                Optional[torch.Tensor], torch.Tensor,
                Optional[IntermediateTensors], dict[str, Any]]:

-        preprocess_start = time.perf_counter()
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         if ubatch_slices:
             assert num_tokens_after_padding is not None
@@ -2006,7 +2005,7 @@ def _preprocess(
             encoder_inputs = self._extract_encoder_inputs(scheduler_output)
             model_kwargs.update(encoder_inputs)

-        preprocess_result = (
+        return (
            num_scheduled_tokens,
            num_input_tokens,
            num_tokens_after_padding,
@@ -2017,17 +2016,11 @@ def _preprocess(
             model_kwargs,
         )

-        logger.info("[perf] preprocess took %.2f ms",
-                    (time.perf_counter() - preprocess_start) * 1e3)
-
-        return preprocess_result
-
     def _sample(
             self, logits: Optional[torch.Tensor],
             spec_decode_metadata: Optional[SpecDecodeMetadata]
     ) -> SamplerOutput:
         # Sample the next token and get logprobs if needed.
-        sample_start = time.perf_counter()
         sampling_metadata = self.input_batch.sampling_metadata
         if spec_decode_metadata is None:
             sampler_output = self.sampler(
@@ -2061,9 +2054,6 @@ def _sample(
             sampler_output.sampled_token_ids = output_token_ids
             self._update_states_after_model_execute(output_token_ids)

-        logger.info("[perf] sampling took %.2f ms",
-                    (time.perf_counter() - sample_start) * 1e3)
-
         return sampler_output

     def _bookkeeping_sync(
@@ -2258,7 +2248,6 @@ def execute_model(
                 ), record_function_or_nullcontext("Forward"),
                 self.maybe_get_kv_connector_output(scheduler_output) as
                 kv_connector_output):
-            forward_start = time.perf_counter()
             model_output = self.model(
                 input_ids=input_ids,
                 positions=positions,
@@ -2266,8 +2255,6 @@ def execute_model(
                 inputs_embeds=inputs_embeds,
                 **model_kwargs,
             )
-            logger.info("[perf] model forward took %.2f ms",
-                        (time.perf_counter() - forward_start) * 1e3)

         with record_function_or_nullcontext("Postprocess"):
             if self.use_aux_hidden_state_outputs:
From 2f4e2303a3900258cbb8ae939221c87f8773f802 Mon Sep 17 00:00:00 2001
From: baonudesifeizhai
Date: Sat, 11 Oct 2025 18:08:18 -0400
Subject: [PATCH 10/10] fix format error and add tracker in encoder lengths

---
 vllm/v1/worker/gpu_model_runner.py | 43 +++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 330b677fd55e..d90f8c8ff48a 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -312,8 +312,7 @@ def __init__(

         # mm_hash -> encoder_output
         self.encoder_cache: dict[str, torch.Tensor] = {}
-        self._encoder_cudagraph_buffers: dict[tuple[Any, ...],
-                                              torch.Tensor] = {}
+        self._encoder_cudagraph_buffers: dict[tuple[Any, ...], torch.Tensor] = {}

         self.use_aux_hidden_state_outputs = False
         # Set up speculative decoding.
@@ -1031,11 +1030,19 @@ def _get_encoder_seq_lens(
             return None

         # Build encoder_seq_lens array mapping request indices to
-        # encoder lengths for inputs scheduled in this batch
-        encoder_seq_lens = np.zeros((int(num_reqs), ), dtype=np.int32)
-        for req_id in scheduler_output.scheduled_encoder_inputs:
-            req_index = self.input_batch.req_id_to_index[req_id]
-            encoder_seq_lens[req_index] = self.max_encoder_len
+        # encoder lengths for all requests with encoder inputs.
+        # Note: This must include ALL requests with encoder features,
+        # not just those being scheduled in this step, because cross-attention
+        # needs encoder lengths during decode phase for CUDA graph compatibility.
+        encoder_seq_lens = np.zeros((int(num_reqs),), dtype=np.int32)
+
+        # Iterate through all active requests in the batch
+        for req_id in self.input_batch.req_ids[:num_reqs]:
+            req_state = self.requests.get(req_id)
+            # Check if this request has encoder inputs (multimodal features)
+            if req_state and req_state.mm_features:
+                req_index = self.input_batch.req_id_to_index[req_id]
+                encoder_seq_lens[req_index] = self.max_encoder_len

         return encoder_seq_lens

@@ -1902,7 +1909,8 @@ def _extract_encoder_inputs(

         if self._should_use_encoder_cudagraph_buffers():
             encoder_features = self._prepare_encoder_inputs_for_cudagraph(
-                encoder_features)
+                encoder_features
+            )

         return encoder_features

@@ -1918,7 +1926,8 @@ def _prepare_encoder_inputs_for_cudagraph(
             return encoder_inputs

         encoder_inputs["input_features"] = self._copy_to_cudagraph_buffer(
-            ("input_features", ), input_features)
+            ("input_features",), input_features
+        )
         return encoder_inputs

     def _copy_to_cudagraph_buffer(
@@ -1933,22 +1942,24 @@ def _copy_to_cudagraph_buffer(
                 buffer = torch.empty_like(value)
                 self._encoder_cudagraph_buffers[key] = buffer
             else:
-                assert (buffer.shape == value.shape
-                        and buffer.dtype == value.dtype
-                        and buffer.device == value.device), (
-                            "CUDAGraph buffer mismatch for encoder inputs.")
+                assert (
+                    buffer.shape == value.shape
+                    and buffer.dtype == value.dtype
+                    and buffer.device == value.device
+                ), "CUDAGraph buffer mismatch for encoder inputs."
             buffer.copy_(value)
             return buffer

         if isinstance(value, list):
             return [
-                self._copy_to_cudagraph_buffer(key_prefix + (idx, ), item)
+                self._copy_to_cudagraph_buffer(key_prefix + (idx,), item)
                 for idx, item in enumerate(value)
             ]
         if isinstance(value, tuple):
             return tuple(
-                self._copy_to_cudagraph_buffer(key_prefix + (idx, ), item)
-                for idx, item in enumerate(value))
+                self._copy_to_cudagraph_buffer(key_prefix + (idx,), item)
+                for idx, item in enumerate(value)
+            )

         return value
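
Closing note on PATCH 10: during decode steps scheduler_output.scheduled_encoder_inputs
is empty, yet cross-attention still needs an encoder length for every request that
carries multimodal features, and CUDA graph capture wants that metadata to keep a
stable shape. The toy example below contrasts the old and new ways of filling
encoder_seq_lens; the request and scheduler objects are hypothetical stand-ins, not
vLLM types.

import numpy as np
from dataclasses import dataclass, field

@dataclass
class Req:
    mm_features: list = field(default_factory=list)  # encoder (audio/image) inputs

MAX_ENCODER_LEN = 1500
requests = {"a": Req(["audio"]), "b": Req(["audio"])}
req_ids = ["a", "b"]
scheduled_encoder_inputs = {"a": [0]}  # only "a" runs its encoder this step

# Old behaviour: only inputs scheduled this step get a length, so request "b",
# which is already decoding, ends up with encoder_seq_len == 0.
old = np.zeros(len(req_ids), dtype=np.int32)
for i, rid in enumerate(req_ids):
    if rid in scheduled_encoder_inputs:
        old[i] = MAX_ENCODER_LEN
print(old.tolist())  # [1500, 0]

# New behaviour: every active request that still holds encoder features keeps its
# length, so decode-only steps see consistent cross-attention metadata.
new = np.zeros(len(req_ids), dtype=np.int32)
for i, rid in enumerate(req_ids):
    if requests[rid].mm_features:
        new[i] = MAX_ENCODER_LEN
print(new.tolist())  # [1500, 1500]
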