diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 72020a8ccf96..3d7d28055dd0 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -6,12 +6,10 @@
 import time
 import weakref
 from functools import partial
-from typing import (Any, AsyncGenerator, Callable, Coroutine, Dict, Iterable,
-                    List, Mapping, Optional, Set, Tuple, Type, Union, overload)
+from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List,
+                    Mapping, Optional, Set, Tuple, Type, Union)
 from weakref import ReferenceType
 
-from typing_extensions import deprecated
-
 import vllm.envs as envs
 from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, VllmConfig)
@@ -36,7 +34,7 @@
 from vllm.sequence import ExecuteModelRequest
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import Device, deprecate_kwargs, weak_bind
+from vllm.utils import Device, weak_bind
 
 logger = init_logger(__name__)
 ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
@@ -429,24 +427,6 @@ async def get_tokenizer_async(self,
         return await (
             self.get_tokenizer_group().get_lora_tokenizer_async(lora_request))
 
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    async def add_request_async(
-        self,
-        request_id: str,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-    ) -> None:
-        ...
-
-    @overload
     async def add_request_async(
         self,
         request_id: str,
@@ -459,32 +439,10 @@ async def add_request_async(
         priority: int = 0,
         data_parallel_rank: Optional[int] = None,
     ) -> None:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    async def add_request_async(
-        self,
-        request_id: str,
-        prompt: Optional[PromptType] = None,
-        params: Optional[Union[SamplingParams, PoolingParams]] = None,
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-        *,
-        inputs: Optional[PromptType] = None,  # DEPRECATED
-    ) -> None:
-        """Async version of
-        [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request]."""
-        if inputs is not None:
-            prompt = inputs
-        assert prompt is not None and params is not None
-
+        """
+        Async version of
+        [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request].
+        """
         if lora_request is not None and not self.lora_config:
             raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                              "not enabled!")
@@ -521,8 +479,7 @@ async def add_request_async(
             params = await build_guided_decoding_logits_processor_async(
                 sampling_params=params,
                 tokenizer=await self.get_tokenizer_async(lora_request),
-                default_guided_backend=self.decoding_config.
-                guided_decoding_backend,
+                default_guided_backend=self.decoding_config.backend,
                 reasoning_backend=self.decoding_config.reasoning_backend,
                 model_config=self.model_config)
 
@@ -894,28 +851,7 @@ async def run_engine_loop(engine_ref: ReferenceType):
                 raise
             await asyncio.sleep(0)
 
-    # This method does not need to be async, but kept that way
-    # for backwards compatibility.
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def add_request(
-        self,
-        request_id: str,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-    ) -> Coroutine[None, None, AsyncGenerator[Union[
-            RequestOutput, PoolingRequestOutput], None]]:
-        ...
-
-    @overload
-    def add_request(
+    async def add_request(
         self,
         request_id: str,
         prompt: PromptType,
@@ -926,32 +862,7 @@ def add_request(
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
         data_parallel_rank: Optional[int] = None,
-    ) -> Coroutine[None, None, AsyncGenerator[Union[
-            RequestOutput, PoolingRequestOutput], None]]:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    async def add_request(
-        self,
-        request_id: str,
-        prompt: Optional[PromptType] = None,
-        params: Optional[Union[SamplingParams, PoolingParams]] = None,
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-        *,
-        inputs: Optional[PromptType] = None,  # DEPRECATED
     ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
-        if inputs is not None:
-            prompt = inputs
-        assert prompt is not None and params is not None
-
         if not self.is_running:
             if self.start_engine_loop:
                 self.start_background_loop()
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index dbcf78f02361..8fccf9bd2aa0 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -11,10 +11,10 @@
 from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
                     Iterable, List, Literal, Mapping, NamedTuple, Optional)
 from typing import Sequence as GenericSequence
-from typing import Set, Type, Union, cast, overload
+from typing import Set, Type, Union, cast
 
 import torch
-from typing_extensions import TypeVar, deprecated
+from typing_extensions import TypeVar
 
 import vllm.envs as envs
 from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
@@ -58,8 +58,7 @@
     TokenizerGroup, init_tokenizer_from_configs)
 from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
                                   usage_message)
-from vllm.utils import (Counter, Device, deprecate_kwargs,
-                        resolve_obj_by_qualname, weak_bind)
+from vllm.utils import Counter, Device, resolve_obj_by_qualname, weak_bind
 from vllm.version import __version__ as VLLM_VERSION
 from vllm.worker.model_runner_base import InputProcessingError
 
@@ -629,7 +628,6 @@ def _add_processed_request(
     def stop_remote_worker_execution_loop(self) -> None:
         self.model_executor.stop_remote_worker_execution_loop()
 
-    @overload
     def add_request(
         self,
         request_id: str,
@@ -641,42 +639,6 @@ def add_request(
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
-    ) -> None:
-        ...
-
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def add_request(
-        self,
-        request_id: str,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-    ) -> None:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    def add_request(
-        self,
-        request_id: str,
-        prompt: Optional[PromptType] = None,
-        params: Optional[Union[SamplingParams, PoolingParams]] = None,
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        *,
-        inputs: Optional[PromptType] = None,  # DEPRECATED
     ) -> None:
         """Add a request to the engine's request pool.
 
@@ -725,10 +687,6 @@ def add_request(
         >>> # continue the request processing
         >>> ...
         """
-        if inputs is not None:
-            prompt = inputs
-        assert prompt is not None and params is not None
-
         if lora_request is not None and not self.lora_config:
             raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                              "not enabled!")
diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py
index bf9f669031cb..db968cd6b5d8 100644
--- a/vllm/engine/multiprocessing/__init__.py
+++ b/vllm/engine/multiprocessing/__init__.py
@@ -4,9 +4,7 @@
 import uuid
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import List, Mapping, Optional, Union, overload
-
-from typing_extensions import deprecated
+from typing import List, Mapping, Optional, Union
 
 from vllm import PoolingParams
 from vllm.inputs import PromptType
@@ -14,7 +12,7 @@
 from vllm.outputs import RequestOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
-from vllm.utils import Device, deprecate_kwargs
+from vllm.utils import Device
 
 VLLM_RPC_SUCCESS_STR = "SUCCESS"
 
@@ -38,7 +36,6 @@ class RPCProcessRequest:
     prompt_adapter_request: Optional[PromptAdapterRequest] = None
     priority: int = 0
 
-    @overload
     def __init__(
         self,
         prompt: PromptType,
@@ -49,44 +46,6 @@ def __init__(
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
     ) -> None:
-        ...
-
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def __init__(
-        self,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-    ) -> None:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    def __init__(
-        self,
-        prompt: Optional[PromptType] = None,
-        params: Optional[Union[SamplingParams, PoolingParams]] = None,
-        request_id: Optional[str] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        *,
-        inputs: Optional[PromptType] = None,  # DEPRECATED
-    ) -> None:
-        if inputs is not None:
-            prompt = inputs
-        assert (prompt is not None and params is not None
-                and request_id is not None)
-
         super().__init__()
 
         self.prompt = prompt
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index f2f442485933..9e018ec7f344 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -6,13 +6,12 @@
 import pickle
 from contextlib import contextmanager, suppress
 from typing import (Any, AsyncGenerator, Dict, Iterator, List, Mapping,
-                    Optional, Union, cast, overload)
+                    Optional, Union, cast)
 
 import cloudpickle
 import psutil
 import zmq
 import zmq.asyncio
-from typing_extensions import deprecated
 from zmq import Frame  # type: ignore[attr-defined]
 from zmq.asyncio import Socket
 
@@ -49,7 +48,7 @@
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
-from vllm.utils import Device, deprecate_kwargs
+from vllm.utils import Device
 
 logger = init_logger(__name__)
 
@@ -442,7 +441,6 @@ def errored(self) -> bool:
     def dead_error(self) -> BaseException:
         return ENGINE_DEAD_ERROR(self._errored_with)
 
-    @overload
     def generate(
         self,
         prompt: PromptType,
@@ -452,39 +450,6 @@ def generate(
         trace_headers: Optional[Mapping[str, str]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
        priority: int = 0,
-    ) -> AsyncGenerator[RequestOutput, None]:
-        ...
-
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def generate(
-        self,
-        *,
-        inputs: PromptType,
-        sampling_params: SamplingParams,
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-    ) -> AsyncGenerator[RequestOutput, None]:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    def generate(
-        self,
-        prompt: Optional[PromptType] = None,
-        sampling_params: Optional[SamplingParams] = None,
-        request_id: Optional[str] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        *,
-        inputs: Optional[PromptType] = None  # DEPRECATED
     ) -> AsyncGenerator[RequestOutput, None]:
         """Generate outputs for a request.
 
@@ -506,16 +471,12 @@
             Any priority other than 0 will lead to an error if the
             scheduling policy is not "priority".
         """
-        if inputs is not None:
-            prompt = inputs
-        assert (prompt is not None and sampling_params is not None
-                and request_id is not None)
-
-        return self._process_request(prompt, sampling_params, request_id,
-                                     lora_request, trace_headers,
-                                     prompt_adapter_request, priority)
+        return cast(
+            AsyncGenerator[RequestOutput, None],
+            self._process_request(prompt, sampling_params, request_id,
+                                  lora_request, trace_headers,
+                                  prompt_adapter_request, priority))
 
-    @overload
     def encode(
         self,
         prompt: PromptType,
@@ -524,37 +485,6 @@ def encode(
         lora_request: Optional[LoRARequest] = None,
         trace_headers: Optional[Mapping[str, str]] = None,
         priority: int = 0,
-    ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        ...
-
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def encode(
-        self,
-        *,
-        inputs: PromptType,
-        pooling_params: PoolingParams,
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        priority: int = 0,
-    ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    def encode(
-        self,
-        prompt: Optional[PromptType] = None,
-        pooling_params: Optional[PoolingParams] = None,
-        request_id: Optional[str] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        priority: int = 0,
-        *,
-        inputs: Optional[PromptType] = None  # DEPRECATED
     ) -> AsyncGenerator[PoolingRequestOutput, None]:
         """Generate outputs for a request from a pooling model.
 
@@ -575,11 +505,6 @@ def encode(
             The output `PoolingRequestOutput` objects from the LLMEngine
             for the request.
         """
-        if inputs is not None:
-            prompt = inputs
-        assert (prompt is not None and pooling_params is not None
-                and request_id is not None)
-
         return cast(
             AsyncGenerator[PoolingRequestOutput, None],
             self._process_request(prompt,