From 455e27d57370e0dd2ee3c5438d6543b01910f940 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 6 Mar 2025 12:32:57 +0100
Subject: [PATCH 1/2] Reinstate `best_of` for V0

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/v1/sample/test_sampling_params_e2e.py   |  8 ++++++++
 vllm/entrypoints/llm.py                       |  6 +++++-
 vllm/entrypoints/openai/protocol.py           |  4 ++++
 vllm/entrypoints/openai/serving_completion.py |  8 ++++++--
 vllm/sampling_params.py                       | 24 ++++++++++++++++++++
 vllm/v1/engine/processor.py                   |  3 +++
 6 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/tests/v1/sample/test_sampling_params_e2e.py b/tests/v1/sample/test_sampling_params_e2e.py
index 4e88feae44dd..2ea01e667c6b 100644
--- a/tests/v1/sample/test_sampling_params_e2e.py
+++ b/tests/v1/sample/test_sampling_params_e2e.py
@@ -25,6 +25,14 @@ def test_n_gt_1(model):
     assert len(outputs[0].outputs) == 3


+def test_best_of(model):
+    """Raise a ValueError since best_of is deprecated."""
+
+    params = SamplingParams(n=2, best_of=3)
+    with pytest.raises(ValueError):
+        _ = model.generate(PROMPT, params)
+
+
 def test_penalties(model):
     """Check that we do not get errors if applied."""

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index dd46a1376adf..4be1a532ee74 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -97,7 +97,11 @@ class LLM:
             throughput. However, if the value is too high, it may cause out-of-
             memory (OOM) errors.
         swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
-            Too small values may cause out-of-memory (OOM) errors.
+            This can be used for temporarily storing the states of the requests
+            when their `best_of` sampling parameters are larger than 1. If all
+            requests will have `best_of=1`, you can safely set this to 0.
+            Note that `best_of` is only supported in V0. Otherwise, too small
+            values may cause out-of-memory (OOM) errors.
         cpu_offload_gb: The size (GiB) of CPU memory to use for offloading
             the model weights. This virtually increases the GPU memory space
             you can use to hold the model weights, at the cost of CPU-GPU data
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 4c4d86fddb59..2c740caf20fb 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -242,6 +242,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
     user: Optional[str] = None

     # doc: begin-chat-completion-sampling-params
+    best_of: Optional[int] = None
     use_beam_search: bool = False
     top_k: Optional[int] = None
     min_p: Optional[float] = None
@@ -478,6 +479,7 @@ def to_sampling_params(

         return SamplingParams.from_optional(
             n=self.n,
+            best_of=self.best_of,
             presence_penalty=self.presence_penalty,
             frequency_penalty=self.frequency_penalty,
             repetition_penalty=repetition_penalty,
@@ -648,6 +650,7 @@ class CompletionRequest(OpenAIBaseModel):
     # https://platform.openai.com/docs/api-reference/completions/create
     model: Optional[str] = None
     prompt: Union[list[int], list[list[int]], str, list[str]]
+    best_of: Optional[int] = None
     echo: Optional[bool] = False
     frequency_penalty: Optional[float] = 0.0
     logit_bias: Optional[dict[str, float]] = None
@@ -845,6 +848,7 @@ def to_sampling_params(

         return SamplingParams.from_optional(
             n=self.n,
+            best_of=self.best_of,
             presence_penalty=self.presence_penalty,
             frequency_penalty=self.frequency_penalty,
             repetition_penalty=repetition_penalty,
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 592f213b6f5e..667ff448e041 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -168,8 +168,12 @@ async def create_completion(
         model_name = self._get_model_name(request.model, lora_request)
         num_prompts = len(engine_prompts)

-        # We do not stream the results when use beam search.
-        stream = (request.stream and not request.use_beam_search)
+        # Similar to the OpenAI API, when n != best_of, we do not stream the
+        # results. Note that best_of is only supported in V0. In addition,
+        # we do not stream the results when using beam search.
+        stream = (request.stream
+                  and (request.best_of is None or request.n == request.best_of)
+                  and not request.use_beam_search)

         # Streaming response
         if stream:
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 599d52ee670b..1848fd1de5cf 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -116,6 +116,10 @@ class SamplingParams(

     Args:
         n: Number of output sequences to return for the given prompt.
+        best_of: Number of output sequences that are generated from the prompt.
+            From these `best_of` sequences, the top `n` sequences are returned.
+            `best_of` must be greater than or equal to `n`. By default,
+            `best_of` is set to `n`. Warning: this is only supported in V0.
         presence_penalty: Float that penalizes new tokens based on whether they
             appear in the generated text so far. Values > 0 encourage the model
             to use new tokens, while values < 0 encourage the model to repeat
@@ -183,6 +187,7 @@ class SamplingParams(
     """

     n: int = 1
+    best_of: Optional[int] = None
     _real_n: Optional[int] = None
     presence_penalty: float = 0.0
     frequency_penalty: float = 0.0
@@ -226,6 +231,7 @@ class SamplingParams(
     @staticmethod
     def from_optional(
         n: Optional[int] = 1,
+        best_of: Optional[int] = None,
         presence_penalty: Optional[float] = 0.0,
         frequency_penalty: Optional[float] = 0.0,
         repetition_penalty: Optional[float] = 1.0,
@@ -264,6 +270,7 @@ def from_optional(

         return SamplingParams(
             n=1 if n is None else n,
+            best_of=best_of,
             presence_penalty=0.0
             if presence_penalty is None else presence_penalty,
             frequency_penalty=0.0
@@ -296,6 +303,20 @@ def from_optional(
         )

     def __post_init__(self) -> None:
+        # How we deal with `best_of`:
+        # if `best_of` is not set, we default it to `n`;
+        # if `best_of` is set, we set `n` to `best_of`
+        # and set `_real_n` to the original `n`.
+        # When we return the result, we will check
+        # whether we need to return `n` or `_real_n` results.
+        if self.best_of:
+            if self.best_of < self.n:
+                raise ValueError(
+                    f"best_of must be greater than or equal to n, "
+                    f"got n={self.n} and best_of={self.best_of}.")
+            if not self._real_n:
+                self._real_n = self.n
+            self.n = self.best_of
         if 0 < self.temperature < _MAX_TEMP:
             logger.warning(
@@ -402,6 +423,9 @@ def _verify_args(self) -> None:
             raise ValueError(
                 "stop strings are only supported when detokenize is True. "
                 "Set detokenize=True to use stop.")
+        if self.best_of != self._real_n and self.output_kind == (
+                RequestOutputKind.DELTA):
+            raise ValueError("best_of must equal n to use output_kind=DELTA")

     def _verify_greedy_sampling(self) -> None:
         if self.n > 1:
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 6a2c1c545f1b..713a5d38dfdd 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -93,6 +93,9 @@ def _validate_supported_sampling_params(
         self,
         params: SamplingParams,
     ) -> None:
+        # Best of not yet supported.
+        if params.best_of:
+            raise ValueError("VLLM V1 does not yet support best_of.")
         # Bad words not yet supported.
         if params.bad_words:
             raise ValueError("VLLM V1 does not yet support bad_words.")

From 589e7b05dfb71ba3cad72882f5ec94b09440394d Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 6 Mar 2025 12:40:53 +0100
Subject: [PATCH 2/2] Silent ignore if `best_of` is 1

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/v1/engine/processor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 713a5d38dfdd..a75f0946b4ce 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -94,7 +94,7 @@ def _validate_supported_sampling_params(
         params: SamplingParams,
     ) -> None:
         # Best of not yet supported.
-        if params.best_of:
+        if params.best_of is not None and params.best_of > 1:
             raise ValueError("VLLM V1 does not yet support best_of.")
         # Bad words not yet supported.
         if params.bad_words:
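Illustrative note (not part of the patch): the snippet below is a minimal sketch of how the reinstated `best_of` behaves once these commits are applied, assuming a vLLM build that includes them. On V0, `SamplingParams.__post_init__` remembers the requested `n` in `_real_n` and bumps `n` to `best_of`; on V1, the processor rejects any `best_of` greater than 1 (silently accepting `best_of=1`), as the test above exercises.

    # Sketch only: assumes a vLLM build with this patch applied (V0 semantics).
    from vllm import SamplingParams

    params = SamplingParams(n=2, best_of=4)
    # __post_init__ stores the requested n in _real_n and sets n = best_of,
    # so the engine samples 4 candidates and returns the top 2.
    assert params.n == 4
    assert params._real_n == 2

    # best_of smaller than n is rejected up front.
    try:
        SamplingParams(n=4, best_of=2)
    except ValueError as err:
        print(err)  # "best_of must be greater than or equal to n, ..."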