-
-
Notifications
You must be signed in to change notification settings - Fork 11.7k
[core] Sampling controller interface #6273
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5877a7a
0190aef
3c6723c
80c5091
1273203
3744143
a70c68e
039db20
a4db333
e0eb2da
6fec2b0
59f2e5e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,6 +14,7 @@ | |
|
|
||
| if TYPE_CHECKING: | ||
| from vllm.inputs import LLMInputs | ||
| from vllm.model_executor.sampling_metadata import SamplingMetadata | ||
| from vllm.multimodal import MultiModalDataDict | ||
| from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics | ||
|
|
||
|
|
@@ -185,9 +186,14 @@ def get_num_computed_tokens(self) -> int: | |
|
|
||
| def update_num_computed_tokens(self, num_new_computed_tokens: int): | ||
| """Update number of tokens computed so far.""" | ||
| seq_len = self.get_len() | ||
| self._num_computed_tokens += num_new_computed_tokens | ||
| assert self._num_computed_tokens <= self.get_len(), ( | ||
| self._num_computed_tokens, self.get_len()) | ||
| # We can overflow by 1 if previous sampling was updated by | ||
| # SamplingController to generate an empty sequence of tokens. | ||
| if self._num_computed_tokens == seq_len + 1: | ||
| self._num_computed_tokens = seq_len | ||
| assert self._num_computed_tokens <= seq_len, ( | ||
| self._num_computed_tokens, seq_len) | ||
| # If all tokens are computed, it means it is in decoding phase. | ||
| if self.get_num_uncomputed_tokens() == 0: | ||
| self._stage = SequenceStage.DECODE | ||
|
|
@@ -468,8 +474,8 @@ def lora_int_id(self) -> int: | |
|
|
||
| def get_last_latency(self, now: float) -> Optional[float]: | ||
| """Sets the last token time for Request level timings.""" | ||
| # If still in prefill phase, raise Error. | ||
| if self.is_prefill(): | ||
| # If still in initial prefill phase, raise Error. | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what's the case it is not "initial"?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This happens for fast-forward tokens - if there is more than one, we switch to prefill mode. |
||
| if self.is_prefill() and self.get_seqs()[0].get_output_len() == 0: | ||
| raise ValueError( | ||
| "seq_group.get_last_latency() should not be called " | ||
| "if the seq_group is in prefill phase.") | ||
|
|
@@ -701,6 +707,36 @@ def __init__( | |
| self.parent_seq_id = parent_seq_id | ||
| self.output_token = output_token | ||
| self.logprobs = logprobs | ||
| # If present, these tokens should be appended to the output | ||
| # instead of output_token. | ||
| self.fast_forward_tokens: Optional[List[int]] = None | ||
|
|
||
| def append_to(self, seq: Sequence) -> None: | ||
mmoskal marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| """ | ||
| Append the sampling output to the sequence. | ||
|
|
||
| If fast forward tokens are set, this appends them, generating appropriate | ||
| Logprobs, and switching the sequence to PREFILL if needed. | ||
| Otherwise, just the output token is appended. | ||
| """ | ||
| if self.fast_forward_tokens is not None: | ||
| logprobs = self.logprobs | ||
| for token in self.fast_forward_tokens: | ||
| # On first iteration, use the existing self.logprobs, provided | ||
| # they contain the token. | ||
| if token not in logprobs: | ||
| logprobs = { | ||
| token: Logprob(logprob=0.0, rank=1, decoded_token=None) | ||
| } | ||
| seq.append_token_id(token, logprobs) | ||
| # On subsequent iterations always use artificially created | ||
| # logprobs. | ||
| logprobs = {} | ||
| # If more than one token was appended, switch to prefill stage. | ||
| if seq.data.get_num_uncomputed_tokens() > 1: | ||
| seq.data._stage = SequenceStage.PREFILL | ||
| else: | ||
| seq.append_token_id(self.output_token, self.logprobs) | ||
|
|
||
| def __repr__(self) -> str: | ||
| return (f"SequenceOutput(parent_seq_id={self.parent_seq_id}, " | ||
|
|
@@ -912,6 +948,53 @@ def prune(self, | |
| self.seq_ids = seq_ids | ||
|
|
||
|
|
||
| class SamplingController: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Make it an abstract class?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also in the docstring, we should probably mention this class is a singleton and stateful. And prepare & transform logits need to be called 1 to 1
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added docstrings. It seems python doesn't have abstract classes, only abstract methods. I can imagine use cases where you only override some of these methods, and none of them you have to override, so I don't think it makes sense to make any of them abstract. One thing we could do is to make the engine.sampling_controller field non-optional and use this base class as a no-op implementation. Not sure if that would be cleaner though? |
||
| """ | ||
| This is used to modify the sampling process for a given LLMEngine. | ||
| There is only one instance of this class per LLMEngine. | ||
|
|
||
| In each generation step, one of the following things can happen: | ||
|
|
||
| There are no sequences to run, and empty_step() is called; | ||
| this can be used to run actions that normally run in sync with step, | ||
| when there are no sequences to run | ||
|
|
||
| Otherwise (normal case), the following methods are run in this exact order: | ||
| - prepare() causes the sampling controller to start logit bias preparation | ||
| for the sequences that will be run; typically the logit indices from | ||
| sampling_metadata will have to be stored in the sampling controller | ||
| - forward pass is started | ||
| - transform_logits() is called after the forward pass has finished, to | ||
| modify the logits | ||
| - sampling happens on biased logits | ||
| - transform_sampler_output() is called to modify the sampler output | ||
|
|
||
| This class does nothing for each of these steps. Subclasses can override | ||
| any and each of these methods to modify the sampling process; they will | ||
| be stateful. | ||
|
|
||
| Currently, you just have to assign an instance of your subclass to | ||
| engine.sampling_controller to use it. | ||
| """ | ||
|
|
||
| def prepare(self, sampling_metadata: "SamplingMetadata"): | ||
| """Prepare the sampling controller for the next step.""" | ||
| pass | ||
|
|
||
| def empty_step(self): | ||
| """Called instead of prepare() when the scheduler found no sequences | ||
| to run.""" | ||
| pass | ||
|
|
||
| def transform_logits(self, logits: torch.Tensor) -> torch.Tensor: | ||
| """Apply the sampling controller to the logits.""" | ||
| return logits | ||
|
|
||
| def transform_sampler_output(self, output: SamplerOutput) -> SamplerOutput: | ||
| """Apply the sampling controller to the sampler output.""" | ||
| return output | ||
|
|
||
|
|
||
| @dataclass | ||
| class ExecuteModelRequest: | ||
| """The model execution request, containing CPU metadata only. The LLM | ||
|
|
@@ -936,6 +1019,8 @@ class ExecuteModelRequest: | |
| num_steps: int = 1 | ||
| # Finished request ids since last step. | ||
| finished_requests_ids: List[str] = field(default_factory=list) | ||
| # Sampling controller to use for this step. | ||
| sampling_controller: Optional[SamplingController] = None | ||
|
|
||
| def clone( | ||
| self, seq_group_metadata_list: List[SequenceGroupMetadata] | ||
|
|
@@ -951,4 +1036,5 @@ def clone( | |
| running_queue_size=self.running_queue_size, | ||
| previous_hidden_states=self.previous_hidden_states, | ||
| num_steps=self.num_steps, | ||
| finished_requests_ids=self.finished_requests_ids) | ||
| finished_requests_ids=self.finished_requests_ids, | ||
| sampling_controller=self.sampling_controller) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1240,6 +1240,11 @@ def execute_model( | |
| "finished_requests_ids": model_input.finished_requests_ids, | ||
| "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids, | ||
| } if self.has_seqlen_agnostic else {} | ||
|
|
||
| if (ctrl := model_input.sampling_controller) is not None: | ||
| assert model_input.sampling_metadata is not None | ||
| ctrl.prepare(model_input.sampling_metadata) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This has been changed from RFC (we pass seq group metadata, here, we are using sampling params). Is there any way to hook this with seq group metadata?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see. yeah I guess what we have in sampling metdata is probably sufficient. let me get back to you soon about this. |
||
|
|
||
| hidden_or_intermediate_states = model_executable( | ||
| input_ids=model_input.input_tokens, | ||
| positions=model_input.input_positions, | ||
|
|
@@ -1259,12 +1264,18 @@ def execute_model( | |
| if not self.is_driver_worker: | ||
| return [] | ||
|
|
||
| if ctrl is not None: | ||
| logits = ctrl.transform_logits(logits) | ||
|
|
||
| # Sample the next token. | ||
| output: SamplerOutput = self.model.sample( | ||
| logits=logits, | ||
| sampling_metadata=model_input.sampling_metadata, | ||
| ) | ||
|
|
||
| if ctrl is not None: | ||
| output = ctrl.transform_sampler_output(output) | ||
|
|
||
| if self.return_hidden_states: | ||
| # we only need to pass hidden states of most recent token | ||
| assert model_input.sampling_metadata is not None | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
QQ: is this change related to this PR?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, this happens when the "fast forward" is 0-tokens long - see comment in PR description about
BlockTable.append_token_ids