Skip to content

Commit 324fcec

Browse files
committed
compute last
1 parent d57951f commit 324fcec

File tree

1 file changed

+5
-7
lines changed

1 file changed

+5
-7
lines changed

vllm/worker/model_runner.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -531,18 +531,16 @@ def _compute_for_prefix_cache_hit(
531531
inter_data.query_lens[
532532
seq_idx] = inter_data.seq_lens[seq_idx] - context_len
533533
elif seq_len <= prefix_cache_len:
534-
# Full hit. Only compute the last block to avoid
534+
# Full hit. Only compute the last token to avoid
535535
# erroneous behavior. FIXME: Ideally we should directly
536536
# mark all tokens as computed in the scheduler and do not
537537
# schedule this sequence, so this case should not happen.
538-
block_size = self.block_size
539538
inter_data.input_tokens[seq_idx] = inter_data.input_tokens[
540-
seq_idx][-block_size:]
539+
seq_idx][-1:]
541540
inter_data.input_positions[seq_idx] = inter_data.input_positions[
542-
seq_idx][-block_size:]
543-
inter_data.query_lens[seq_idx] = block_size
544-
inter_data.context_lens[seq_idx] = inter_data.seq_lens[
545-
seq_idx] - inter_data.query_lens[seq_idx]
541+
seq_idx][-1:]
542+
inter_data.query_lens[seq_idx] = 1
543+
inter_data.context_lens[seq_idx] = inter_data.seq_lens[seq_idx] - 1
546544

547545
def _compute_for_sliding_window(self, inter_data: InterDataForSeqGroup,
548546
seq_idx: int,

0 commit comments

Comments
 (0)