From 037bd7f2a7d8691a5f3ba03765787ab7ec6f6cc6 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 24 Jun 2025 10:36:32 -0700 Subject: [PATCH 01/37] wip Signed-off-by: Kourosh Hakhamaneshi --- .../serve/deployments/llm/llm_server.py | 3 +- .../serve/deployments/llm/vllm/vllm_engine.py | 104 ++++++++++++------ 2 files changed, 70 insertions(+), 37 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py index d1105db5afa8..4c23716458bd 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py @@ -597,7 +597,8 @@ async def chat(self, request: ChatCompletionRequest) -> LLMChatResponse: Returns: A LLMChatResponse object. """ - return self._process_llm_request(request, is_chat=True) + # return self._process_llm_request(request, is_chat=True) + self.engine.chat(request) async def completions(self, request: CompletionRequest) -> LLMCompletionsResponse: """Runs a completion request to the LLM engine and returns the response. diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index 25d8bd2fdf75..a98af1b53024 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -177,6 +177,7 @@ def get_error(self): return self._error + class VLLMEngine(LLMEngine): def __init__( self, @@ -188,6 +189,11 @@ def __init__( llm_config: The llm configuration for this engine """ super().__init__(llm_config) + + from argparse import Namespace + # Convert this to a namespace object + vllm_cli_args = llm_config.experimental_configs.get("vllm_cli_args", {}) + self.vllm_cli_args = Namespace(**vllm_cli_args) if vllm is None: raise ImportError( @@ -267,44 +273,64 @@ async def start(self): If the engine is already running, do nothing. """ - from vllm.entrypoints.chat_utils import ( - resolve_chat_template_content_format as _resolve_chat_template_content_format, - ) - - if self.running: - # The engine is already running! - logger.info("Skipping engine restart because the engine is already running") - return - + # from vllm.entrypoints.chat_utils import ( + # resolve_chat_template_content_format as _resolve_chat_template_content_format, + # ) + + # if self.running: + # # The engine is already running! + # logger.info("Skipping engine restart because the engine is already running") + # return + + # self.engine = await self._start_engine() + # self.running = True + # self.model_config = await self.engine.get_model_config() + + # self._tokenizer = await self.engine.get_tokenizer() + + # def resolve_chat_template_content_format(model_config, **kwargs): + # try: + # return _resolve_chat_template_content_format( + # model_config=model_config, **kwargs + # ) + # except TypeError: + # # Legacy API before vLLM 0.9.0. + # # TODO(#52975): Remove this try-except once vLLM <0.9.0 is no longer supported. + # return _resolve_chat_template_content_format( + # trust_remote_code=model_config.trust_remote_code, **kwargs + # ) + + # self._resolved_content_format = resolve_chat_template_content_format( + # model_config=self.model_config, + # # Use HF to get the chat template so set it to None here. + # chat_template=None, + # # Default to None, change when it's needed. + # # vLLM does not have a high level API to support all of this. 
+ # tools=None, + # # Let vLLM decide the content format. + # given_format="auto", + # tokenizer=self._tokenizer, + # ) + + + from vllm.entrypoints.openai.api_server import init_app_state self.engine = await self._start_engine() - self.running = True - self.model_config = await self.engine.get_model_config() - - self._tokenizer = await self.engine.get_tokenizer() - - def resolve_chat_template_content_format(model_config, **kwargs): - try: - return _resolve_chat_template_content_format( - model_config=model_config, **kwargs - ) - except TypeError: - # Legacy API before vLLM 0.9.0. - # TODO(#52975): Remove this try-except once vLLM <0.9.0 is no longer supported. - return _resolve_chat_template_content_format( - trust_remote_code=model_config.trust_remote_code, **kwargs - ) - - self._resolved_content_format = resolve_chat_template_content_format( - model_config=self.model_config, - # Use HF to get the chat template so set it to None here. - chat_template=None, - # Default to None, change when it's needed. - # vLLM does not have a high level API to support all of this. - tools=None, - # Let vLLM decide the content format. - given_format="auto", - tokenizer=self._tokenizer, + + from starlette.datastructures import State + state = State() + + await init_app_state( + engine_client=self.engine, + vllm_config=self.vllm_config, + state=state, + args=self.vllm_cli_args, ) + + self.oai_serving_chat = state.openai_serving_chat + self.oai_serving_completion = state.openai_serving_completion + self.oai_serving_embedding = state.openai_serving_embedding + + self.running = True logger.info("Started vLLM engine.") @@ -587,6 +613,12 @@ def apply_hf_chat_template(model_config, **kwargs): vllm_request = VLLMGenerationRequest(**request_params) return vllm_request + async def chat(self, request: GenerationRequest) -> AsyncGenerator[LLMRawResponse, None]: + generator = self.oai_serving_chat.create_chat_completion(request) + async for response in generator: + yield response + + async def generate( self, request: GenerationRequest ) -> AsyncGenerator[LLMRawResponse, None]: From 0b0a5d8c2f5291fc9256c67a68c904a987ae9c82 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 24 Jun 2025 18:38:57 -0700 Subject: [PATCH 02/37] prototype api server Signed-off-by: Kourosh Hakhamaneshi --- .../serve/deployments/llm/llm_server.py | 7 ++- .../serve/deployments/llm/vllm/vllm_engine.py | 52 +++++++++++++++---- .../serve/deployments/llm/vllm/vllm_models.py | 27 +++++++++- .../serve/deployments/routers/router.py | 10 +++- 4 files changed, 81 insertions(+), 15 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py index 4c23716458bd..e5132d2f4a6d 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py @@ -19,7 +19,7 @@ ChatCompletionLogProb, ChatCompletionLogProbs, ChatCompletionLogProbsContent, - ChatCompletionRequest, + # ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, @@ -39,6 +39,7 @@ LLMEmbeddingsResponse, UsageInfo, ) +from vllm.entrypoints.openai.protocol import ChatCompletionRequest from ray.llm._internal.serve.configs.prompt_formats import Message, Prompt from ray.llm._internal.serve.configs.server_models import ( DiskMultiplexConfig, @@ -598,7 +599,9 @@ async def chat(self, request: ChatCompletionRequest) -> LLMChatResponse: A LLMChatResponse object. 
""" # return self._process_llm_request(request, is_chat=True) - self.engine.chat(request) + async for response in self.engine.chat(request): + logger.info(f"[Kourosh] in llm_server.chat, response: {response}") + yield response async def completions(self, request: CompletionRequest) -> LLMCompletionsResponse: """Runs a completion request to the LLM engine and returns the response. diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index a98af1b53024..67c3ad13694e 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -54,11 +54,12 @@ from ray.util import metrics from ray.util.placement_group import PlacementGroup from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy +from vllm.entrypoints.openai.cli_args import FrontendArgs +from vllm.engine.arg_utils import AsyncEngineArgs if TYPE_CHECKING: from vllm import SamplingParams as VLLMInternalSamplingParams from vllm.config import ModelConfig, VllmConfig - from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import EngineClient from vllm.outputs import PoolingRequestOutput, RequestOutput @@ -177,6 +178,15 @@ def get_error(self): return self._error +class CustomNamespace: + def __init__(self, *args): + self.classes = args + + def __getattr__(self, name): + for cls in self.classes: + if hasattr(cls, name): + return getattr(cls, name) + raise AttributeError(f"Attribute {name} not found in {self.classes}") class VLLMEngine(LLMEngine): def __init__( @@ -190,10 +200,21 @@ def __init__( """ super().__init__(llm_config) - from argparse import Namespace + # Convert this to a namespace object - vllm_cli_args = llm_config.experimental_configs.get("vllm_cli_args", {}) - self.vllm_cli_args = Namespace(**vllm_cli_args) + # TODO: How to get the args in a way that is also inherits the default values? + # vllm_cli_args = llm_config.experimental_configs.get("vllm_cli_args", {}) + # self.vllm_cli_args = CustomNamespace(**vllm_cli_args) + # self.vllm_cli_args.update( + # disable_request_logs=True, + # ) + + # filter out the llm_config.engine_kwargs to those that belong to FrontendArgs and pop them over. 
+ engine_config = llm_config.get_engine_config() + self.frontend_args = FrontendArgs(**engine_config.frontend_kwargs) + self.engine_args = AsyncEngineArgs(**engine_config.engine_kwargs) + + self.namespace_args = CustomNamespace(self.engine_args, self.frontend_args) if vllm is None: raise ImportError( @@ -318,12 +339,12 @@ async def start(self): from starlette.datastructures import State state = State() - + await init_app_state( engine_client=self.engine, vllm_config=self.vllm_config, state=state, - args=self.vllm_cli_args, + args=self.namespace_args, ) self.oai_serving_chat = state.openai_serving_chat @@ -523,6 +544,11 @@ def _start_async_llm_engine( from vllm.v1.executor.abstract import Executor vllm_config.parallel_config.placement_group = placement_group + + if use_v1: + from vllm.v1.engine.async_llm import AsyncLLM as AsyncLLMEngine + else: + from vllm.engine.async_llm_engine import AsyncLLMEngine _clear_current_platform_cache() @@ -538,7 +564,7 @@ def _start_async_llm_engine( executor_class = Executor.get_class(vllm_config) logger.info(f"Using executor class: {executor_class}") - engine = vllm.engine.async_llm_engine.AsyncLLMEngine( + engine = AsyncLLMEngine( vllm_config=vllm_config, executor_class=executor_class, log_stats=not engine_args.disable_log_stats, @@ -614,9 +640,15 @@ def apply_hf_chat_template(model_config, **kwargs): return vllm_request async def chat(self, request: GenerationRequest) -> AsyncGenerator[LLMRawResponse, None]: - generator = self.oai_serving_chat.create_chat_completion(request) - async for response in generator: - yield response + + chat_response = await self.oai_serving_chat.create_chat_completion(request) + + if isinstance(chat_response, AsyncGenerator): + async for response in chat_response: + yield response + else: + logger.info(f"[Kourosh] non streaming response received, chat_response: {chat_response}") + yield chat_response async def generate( diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py index 91dc471474b7..33eb441e2d4c 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py @@ -1,5 +1,6 @@ import os from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union +import dataclasses from pydantic import ConfigDict, Field, ValidationError, field_validator @@ -26,6 +27,9 @@ placement_group_table, ) +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.entrypoints.openai.cli_args import FrontendArgs + # The key for the kv_transfer_params in the internal metadata. 
KV_TRANSFER_PARAMS_KEY = "kv_transfer_params" @@ -64,6 +68,7 @@ class VLLMEngineConfig(BaseModelExtended): ) runtime_env: Optional[Dict[str, Any]] = None engine_kwargs: Dict[str, Any] = {} + frontend_kwargs: Dict[str, Any] = {} @property def actual_hf_model_id(self) -> str: @@ -106,6 +111,25 @@ def from_llm_config(cls, llm_config: LLMConfig) -> "VLLMEngineConfig": else: # If it's a CloudMirrorConfig (or subtype) mirror_config = llm_config.model_loading_config.model_source + + + all_engine_kwargs = llm_config.engine_kwargs.copy() + engine_kwargs = {} + frontend_kwargs = {} + + # Get field names from dataclasses + frontend_field_names = {field.name for field in dataclasses.fields(FrontendArgs)} + async_engine_field_names = {field.name for field in dataclasses.fields(AsyncEngineArgs)} + + for key, value in all_engine_kwargs.items(): + if key in frontend_field_names: + frontend_kwargs[key] = value + elif key in async_engine_field_names: + engine_kwargs[key] = value + else: + raise ValueError(f"Unknown engine argument: {key}") + engine_kwargs["model"] = hf_model_id + engine_kwargs["served_model_name"] = [llm_config.model_id] return VLLMEngineConfig( model_id=llm_config.model_id, @@ -113,7 +137,8 @@ def from_llm_config(cls, llm_config: LLMConfig) -> "VLLMEngineConfig": mirror_config=mirror_config, resources_per_bundle=llm_config.resources_per_bundle, accelerator_type=llm_config.accelerator_type, - engine_kwargs=llm_config.engine_kwargs, + engine_kwargs=engine_kwargs, + frontend_kwargs=frontend_kwargs, runtime_env=llm_config.runtime_env, ) diff --git a/python/ray/llm/_internal/serve/deployments/routers/router.py b/python/ray/llm/_internal/serve/deployments/routers/router.py index e488f269605c..0cf5e4cd2fbf 100644 --- a/python/ray/llm/_internal/serve/deployments/routers/router.py +++ b/python/ray/llm/_internal/serve/deployments/routers/router.py @@ -30,7 +30,7 @@ ROUTER_TO_MODEL_REPLICA_RATIO, ) from ray.llm._internal.serve.configs.openai_api_models import ( - ChatCompletionRequest, + # ChatCompletionRequest, ChatCompletionResponse, ChatCompletionStreamResponse, CompletionRequest, @@ -44,6 +44,7 @@ OpenAIHTTPException, to_model_metadata, ) +from vllm.entrypoints.openai.protocol import ChatCompletionRequest from ray.llm._internal.serve.configs.openai_api_models_patch import ( ErrorResponse, ) @@ -139,7 +140,9 @@ def _apply_openai_json_format( return "".join(f"data: {r.model_dump_json()}\n\n" for r in response) if hasattr(response, "model_dump_json"): return f"data: {response.model_dump_json()}\n\n" - raise ValueError(f"Unexpected response type: {type(response)}") + if isinstance(response, str): + return response + raise ValueError(f"Unexpected response type: {type(response)}, {response=}") async def _peek_at_generator( @@ -294,6 +297,7 @@ async def _get_response( model_handle = self._get_configured_serve_handle(model) async for response in getattr(model_handle, call_method).remote(body): + logger.info(f"[Kourosh] in router._get_response, response: {response}") yield response async def model(self, model_id: str) -> Optional[ModelData]: @@ -381,6 +385,7 @@ async def _process_llm_request( first_chunk = initial_response if isinstance(first_chunk, ErrorResponse): + logger.info(f"[Kourosh] error encountered in first_chunk: {first_chunk}") raise OpenAIHTTPException( message=first_chunk.message, status_code=first_chunk.code, @@ -389,6 +394,7 @@ async def _process_llm_request( if isinstance(first_chunk, NoneStreamingResponseType): # Not streaming, first chunk should be a single response + 
logger.info(f"[Kourosh] non streaming response received, first_chunk: {first_chunk}") return JSONResponse(content=first_chunk.model_dump()) # In case of streaming we need to iterate over the chunks and yield them From 7dfabdec38ec5cb040b63ed07cef7c1789a6810f Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 24 Jun 2025 18:39:12 -0700 Subject: [PATCH 03/37] wip Signed-off-by: Kourosh Hakhamaneshi --- .../serve/deployments/llm/vllm/vllm_engine.py | 34 +++++++++++-------- .../serve/deployments/llm/vllm/vllm_models.py | 13 ++++--- .../serve/deployments/routers/router.py | 8 +++-- 3 files changed, 33 insertions(+), 22 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index 67c3ad13694e..d145338a5d89 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -181,13 +181,14 @@ def get_error(self): class CustomNamespace: def __init__(self, *args): self.classes = args - + def __getattr__(self, name): for cls in self.classes: if hasattr(cls, name): return getattr(cls, name) raise AttributeError(f"Attribute {name} not found in {self.classes}") + class VLLMEngine(LLMEngine): def __init__( self, @@ -199,7 +200,6 @@ def __init__( llm_config: The llm configuration for this engine """ super().__init__(llm_config) - # Convert this to a namespace object # TODO: How to get the args in a way that is also inherits the default values? @@ -208,12 +208,12 @@ def __init__( # self.vllm_cli_args.update( # disable_request_logs=True, # ) - - # filter out the llm_config.engine_kwargs to those that belong to FrontendArgs and pop them over. + + # filter out the llm_config.engine_kwargs to those that belong to FrontendArgs and pop them over. 
engine_config = llm_config.get_engine_config() self.frontend_args = FrontendArgs(**engine_config.frontend_kwargs) self.engine_args = AsyncEngineArgs(**engine_config.engine_kwargs) - + self.namespace_args = CustomNamespace(self.engine_args, self.frontend_args) if vllm is None: @@ -332,12 +332,13 @@ async def start(self): # given_format="auto", # tokenizer=self._tokenizer, # ) - - + from vllm.entrypoints.openai.api_server import init_app_state + self.engine = await self._start_engine() - + from starlette.datastructures import State + state = State() await init_app_state( @@ -346,11 +347,11 @@ async def start(self): state=state, args=self.namespace_args, ) - + self.oai_serving_chat = state.openai_serving_chat self.oai_serving_completion = state.openai_serving_completion self.oai_serving_embedding = state.openai_serving_embedding - + self.running = True logger.info("Started vLLM engine.") @@ -544,7 +545,7 @@ def _start_async_llm_engine( from vllm.v1.executor.abstract import Executor vllm_config.parallel_config.placement_group = placement_group - + if use_v1: from vllm.v1.engine.async_llm import AsyncLLM as AsyncLLMEngine else: @@ -639,18 +640,21 @@ def apply_hf_chat_template(model_config, **kwargs): vllm_request = VLLMGenerationRequest(**request_params) return vllm_request - async def chat(self, request: GenerationRequest) -> AsyncGenerator[LLMRawResponse, None]: + async def chat( + self, request: GenerationRequest + ) -> AsyncGenerator[LLMRawResponse, None]: chat_response = await self.oai_serving_chat.create_chat_completion(request) - + if isinstance(chat_response, AsyncGenerator): async for response in chat_response: yield response else: - logger.info(f"[Kourosh] non streaming response received, chat_response: {chat_response}") + logger.info( + f"[Kourosh] non streaming response received, chat_response: {chat_response}" + ) yield chat_response - async def generate( self, request: GenerationRequest ) -> AsyncGenerator[LLMRawResponse, None]: diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py index 33eb441e2d4c..400947343b0b 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py @@ -111,16 +111,19 @@ def from_llm_config(cls, llm_config: LLMConfig) -> "VLLMEngineConfig": else: # If it's a CloudMirrorConfig (or subtype) mirror_config = llm_config.model_loading_config.model_source - all_engine_kwargs = llm_config.engine_kwargs.copy() engine_kwargs = {} frontend_kwargs = {} - + # Get field names from dataclasses - frontend_field_names = {field.name for field in dataclasses.fields(FrontendArgs)} - async_engine_field_names = {field.name for field in dataclasses.fields(AsyncEngineArgs)} - + frontend_field_names = { + field.name for field in dataclasses.fields(FrontendArgs) + } + async_engine_field_names = { + field.name for field in dataclasses.fields(AsyncEngineArgs) + } + for key, value in all_engine_kwargs.items(): if key in frontend_field_names: frontend_kwargs[key] = value diff --git a/python/ray/llm/_internal/serve/deployments/routers/router.py b/python/ray/llm/_internal/serve/deployments/routers/router.py index 0cf5e4cd2fbf..a75f952ebd58 100644 --- a/python/ray/llm/_internal/serve/deployments/routers/router.py +++ b/python/ray/llm/_internal/serve/deployments/routers/router.py @@ -385,7 +385,9 @@ async def _process_llm_request( first_chunk = initial_response if isinstance(first_chunk, ErrorResponse): - 
logger.info(f"[Kourosh] error encountered in first_chunk: {first_chunk}") + logger.info( + f"[Kourosh] error encountered in first_chunk: {first_chunk}" + ) raise OpenAIHTTPException( message=first_chunk.message, status_code=first_chunk.code, @@ -394,7 +396,9 @@ async def _process_llm_request( if isinstance(first_chunk, NoneStreamingResponseType): # Not streaming, first chunk should be a single response - logger.info(f"[Kourosh] non streaming response received, first_chunk: {first_chunk}") + logger.info( + f"[Kourosh] non streaming response received, first_chunk: {first_chunk}" + ) return JSONResponse(content=first_chunk.model_dump()) # In case of streaming we need to iterate over the chunks and yield them From 07d42fbb2fae368678ef7c0c27cd847946b8a987 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 24 Jun 2025 22:56:48 -0700 Subject: [PATCH 04/37] fixed error handling and lora Signed-off-by: Kourosh Hakhamaneshi --- .../serve/deployments/llm/llm_server.py | 14 +++++- .../serve/deployments/llm/vllm/vllm_engine.py | 46 +++++++++++++++---- .../serve/deployments/routers/router.py | 12 +++-- 3 files changed, 58 insertions(+), 14 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py index e5132d2f4a6d..ff4c24baebf6 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py @@ -598,9 +598,21 @@ async def chat(self, request: ChatCompletionRequest) -> LLMChatResponse: Returns: A LLMChatResponse object. """ + + multiplexed_model_id = serve.get_multiplexed_model_id() + + if multiplexed_model_id: + assert ( + self._llm_config.lora_config is not None + ), "Must setup lora config for multiplexed requests." + disk_lora_model = await self._disk_lora_model(multiplexed_model_id) + await self.engine.resolve_lora(disk_lora_model) + else: + disk_lora_model = None + # return self._process_llm_request(request, is_chat=True) async for response in self.engine.chat(request): - logger.info(f"[Kourosh] in llm_server.chat, response: {response}") + logger.info(f"[Kourosh] in llm_server.chat, response_type: {type(response)} response: {response}") yield response async def completions(self, request: CompletionRequest) -> LLMCompletionsResponse: diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index d145338a5d89..eab49205309c 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -56,6 +56,8 @@ from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from vllm.entrypoints.openai.cli_args import FrontendArgs from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.entrypoints.openai.protocol import ErrorResponse +from ray.llm._internal.serve.configs.openai_api_models_patch import ErrorResponse as PatchedErrorResponse if TYPE_CHECKING: from vllm import SamplingParams as VLLMInternalSamplingParams @@ -201,14 +203,6 @@ def __init__( """ super().__init__(llm_config) - # Convert this to a namespace object - # TODO: How to get the args in a way that is also inherits the default values? 
- # vllm_cli_args = llm_config.experimental_configs.get("vllm_cli_args", {}) - # self.vllm_cli_args = CustomNamespace(**vllm_cli_args) - # self.vllm_cli_args.update( - # disable_request_logs=True, - # ) - # filter out the llm_config.engine_kwargs to those that belong to FrontendArgs and pop them over. engine_config = llm_config.get_engine_config() self.frontend_args = FrontendArgs(**engine_config.frontend_kwargs) @@ -348,6 +342,7 @@ async def start(self): args=self.namespace_args, ) + self.oai_models = state.openai_serving_models self.oai_serving_chat = state.openai_serving_chat self.oai_serving_completion = state.openai_serving_completion self.oai_serving_embedding = state.openai_serving_embedding @@ -573,6 +568,31 @@ def _start_async_llm_engine( ) return engine + + + async def resolve_lora(self, disk_lora_model: DiskMultiplexConfig): + from vllm.entrypoints.openai.protocol import LoadLoRAAdapterRequest + # lora_add_response = await self.oai_models.load_lora_adapter( + # request=LoadLoRAAdapterRequest( + # lora_name=disk_lora_model.model_id, + # lora_path=disk_lora_model.local_path, + # ) + # ) + + if disk_lora_model.model_id in self.oai_models.lora_requests: + return self.oai_models.lora_requests[disk_lora_model.model_id] + else: + lora_request = await self.oai_models.load_lora_adapter( + request=LoadLoRAAdapterRequest( + lora_name=disk_lora_model.model_id, + lora_path=disk_lora_model.local_path, + ) + ) + + if isinstance(lora_request, ErrorResponse): + raise ValueError(f"Failed to load lora model: {lora_request.message}") + + return lora_request async def prepare_request( self, @@ -653,7 +673,15 @@ async def chat( logger.info( f"[Kourosh] non streaming response received, chat_response: {chat_response}" ) - yield chat_response + if isinstance(chat_response, ErrorResponse): + yield PatchedErrorResponse( + message=chat_response.message, + internal_message=chat_response.message, + type=chat_response.type, + code=chat_response.code, + ) + else: + yield chat_response async def generate( self, request: GenerationRequest diff --git a/python/ray/llm/_internal/serve/deployments/routers/router.py b/python/ray/llm/_internal/serve/deployments/routers/router.py index a75f952ebd58..277f360acb4e 100644 --- a/python/ray/llm/_internal/serve/deployments/routers/router.py +++ b/python/ray/llm/_internal/serve/deployments/routers/router.py @@ -31,8 +31,8 @@ ) from ray.llm._internal.serve.configs.openai_api_models import ( # ChatCompletionRequest, - ChatCompletionResponse, - ChatCompletionStreamResponse, + # ChatCompletionResponse, + # ChatCompletionStreamResponse, CompletionRequest, CompletionResponse, CompletionStreamResponse, @@ -44,7 +44,11 @@ OpenAIHTTPException, to_model_metadata, ) -from vllm.entrypoints.openai.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + ChatCompletionResponse, + ChatCompletionStreamResponse, +) from ray.llm._internal.serve.configs.openai_api_models_patch import ( ErrorResponse, ) @@ -297,7 +301,7 @@ async def _get_response( model_handle = self._get_configured_serve_handle(model) async for response in getattr(model_handle, call_method).remote(body): - logger.info(f"[Kourosh] in router._get_response, response: {response}") + logger.info(f"[Kourosh] in router._get_response, response_type: {type(response)}, response: {response}") yield response async def model(self, model_id: str) -> Optional[ModelData]: From eddc710cd9d3b33bfb19cefc6bd95c50ec361b79 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 24 Jun 
2025 22:57:02 -0700 Subject: [PATCH 05/37] wip Signed-off-by: Kourosh Hakhamaneshi --- .../serve/deployments/llm/llm_server.py | 8 +++++--- .../serve/deployments/llm/vllm/vllm_engine.py | 16 +++++++++------- .../serve/deployments/routers/router.py | 8 +++++--- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py index ff4c24baebf6..0d4aa28671a3 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py @@ -598,7 +598,7 @@ async def chat(self, request: ChatCompletionRequest) -> LLMChatResponse: Returns: A LLMChatResponse object. """ - + multiplexed_model_id = serve.get_multiplexed_model_id() if multiplexed_model_id: @@ -609,10 +609,12 @@ async def chat(self, request: ChatCompletionRequest) -> LLMChatResponse: await self.engine.resolve_lora(disk_lora_model) else: disk_lora_model = None - + # return self._process_llm_request(request, is_chat=True) async for response in self.engine.chat(request): - logger.info(f"[Kourosh] in llm_server.chat, response_type: {type(response)} response: {response}") + logger.info( + f"[Kourosh] in llm_server.chat, response_type: {type(response)} response: {response}" + ) yield response async def completions(self, request: CompletionRequest) -> LLMCompletionsResponse: diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index eab49205309c..43af1f6d89db 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -57,7 +57,9 @@ from vllm.entrypoints.openai.cli_args import FrontendArgs from vllm.engine.arg_utils import AsyncEngineArgs from vllm.entrypoints.openai.protocol import ErrorResponse -from ray.llm._internal.serve.configs.openai_api_models_patch import ErrorResponse as PatchedErrorResponse +from ray.llm._internal.serve.configs.openai_api_models_patch import ( + ErrorResponse as PatchedErrorResponse, +) if TYPE_CHECKING: from vllm import SamplingParams as VLLMInternalSamplingParams @@ -568,17 +570,17 @@ def _start_async_llm_engine( ) return engine - - + async def resolve_lora(self, disk_lora_model: DiskMultiplexConfig): from vllm.entrypoints.openai.protocol import LoadLoRAAdapterRequest + # lora_add_response = await self.oai_models.load_lora_adapter( # request=LoadLoRAAdapterRequest( # lora_name=disk_lora_model.model_id, # lora_path=disk_lora_model.local_path, # ) # ) - + if disk_lora_model.model_id in self.oai_models.lora_requests: return self.oai_models.lora_requests[disk_lora_model.model_id] else: @@ -588,10 +590,10 @@ async def resolve_lora(self, disk_lora_model: DiskMultiplexConfig): lora_path=disk_lora_model.local_path, ) ) - + if isinstance(lora_request, ErrorResponse): - raise ValueError(f"Failed to load lora model: {lora_request.message}") - + raise ValueError(f"Failed to load lora model: {lora_request.message}") + return lora_request async def prepare_request( diff --git a/python/ray/llm/_internal/serve/deployments/routers/router.py b/python/ray/llm/_internal/serve/deployments/routers/router.py index 277f360acb4e..d5b6fc8fa71a 100644 --- a/python/ray/llm/_internal/serve/deployments/routers/router.py +++ b/python/ray/llm/_internal/serve/deployments/routers/router.py @@ -45,8 +45,8 @@ to_model_metadata, ) from vllm.entrypoints.openai.protocol import ( 
- ChatCompletionRequest, - ChatCompletionResponse, + ChatCompletionRequest, + ChatCompletionResponse, ChatCompletionStreamResponse, ) from ray.llm._internal.serve.configs.openai_api_models_patch import ( @@ -301,7 +301,9 @@ async def _get_response( model_handle = self._get_configured_serve_handle(model) async for response in getattr(model_handle, call_method).remote(body): - logger.info(f"[Kourosh] in router._get_response, response_type: {type(response)}, response: {response}") + logger.info( + f"[Kourosh] in router._get_response, response_type: {type(response)}, response: {response}" + ) yield response async def model(self, model_id: str) -> Optional[ModelData]: From 14e5263246ab9ad276ada099a99030a11166c5b0 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 24 Jun 2025 23:33:30 -0700 Subject: [PATCH 06/37] mistral Signed-off-by: Kourosh Hakhamaneshi --- .../_internal/serve/deployments/llm/vllm/vllm_engine.py | 1 + .../serve/deployments/utils/node_initialization_utils.py | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index 43af1f6d89db..074bde703ca0 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -393,6 +393,7 @@ async def _prepare_engine_config(self, use_v1: bool): node_initialization: The node initialization. """ # Initialize node and return all configurations + # TODO: NEEDED for Mistral models node_initialization = await self.initialize_node(self.llm_config) if self.engine_config.use_gpu: diff --git a/python/ray/llm/_internal/serve/deployments/utils/node_initialization_utils.py b/python/ray/llm/_internal/serve/deployments/utils/node_initialization_utils.py index e066722accab..e1733bebe54f 100644 --- a/python/ray/llm/_internal/serve/deployments/utils/node_initialization_utils.py +++ b/python/ray/llm/_internal/serve/deployments/utils/node_initialization_utils.py @@ -148,7 +148,8 @@ def _initialize_local_node( if not isinstance(local_path, str) or not os.path.exists(local_path): logger.info(f"Downloading the tokenizer for {engine_config.actual_hf_model_id}") - _ = transformers.AutoTokenizer.from_pretrained( - engine_config.actual_hf_model_id, - trust_remote_code=engine_config.trust_remote_code, - ) + # TODO: NEEDED for Mistral models that don't support tekken + # _ = transformers.AutoTokenizer.from_pretrained( + # engine_config.actual_hf_model_id, + # trust_remote_code=engine_config.trust_remote_code, + # ) From cdfb32c7135f024a95fc03e1491891fff4dbd9fd Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 27 Jun 2025 19:34:21 -0700 Subject: [PATCH 07/37] batching is also done Signed-off-by: Kourosh Hakhamaneshi --- .../_internal/serve/deployments/llm/llm_server.py | 13 ++++++++++--- .../_internal/serve/deployments/routers/router.py | 3 +++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py index 0d4aa28671a3..e960670ba25e 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py @@ -607,11 +607,18 @@ async def chat(self, request: ChatCompletionRequest) -> LLMChatResponse: ), "Must setup lora config for multiplexed requests." 
disk_lora_model = await self._disk_lora_model(multiplexed_model_id) await self.engine.resolve_lora(disk_lora_model) + + + if request.stream: + # 4. Apply batching with appropriate interval in case of streaming + response_generator = OpenAIResponseBatcher( + self.engine.chat(request), + interval_ms=self._get_batch_interval_ms(), + ).stream() else: - disk_lora_model = None + response_generator = self.engine.chat(request) - # return self._process_llm_request(request, is_chat=True) - async for response in self.engine.chat(request): + async for response in response_generator: logger.info( f"[Kourosh] in llm_server.chat, response_type: {type(response)} response: {response}" ) diff --git a/python/ray/llm/_internal/serve/deployments/routers/router.py b/python/ray/llm/_internal/serve/deployments/routers/router.py index d5b6fc8fa71a..e9e7eb306df7 100644 --- a/python/ray/llm/_internal/serve/deployments/routers/router.py +++ b/python/ray/llm/_internal/serve/deployments/routers/router.py @@ -141,6 +141,9 @@ def _apply_openai_json_format( data: \n\ndata: \n\n... """ if isinstance(response, list): + first_response = next(iter(response)) + if isinstance(first_response, str): + return "".join(response) return "".join(f"data: {r.model_dump_json()}\n\n" for r in response) if hasattr(response, "model_dump_json"): return f"data: {response.model_dump_json()}\n\n" From dbb2db7190136ecbb55f22eb999568f685505dc7 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Sat, 28 Jun 2025 12:54:11 -0700 Subject: [PATCH 08/37] wip Signed-off-by: Kourosh Hakhamaneshi --- python/ray/llm/_internal/serve/deployments/llm/llm_server.py | 1 - .../serve/deployments/utils/node_initialization_utils.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py index e960670ba25e..ce06e5a8f015 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py @@ -608,7 +608,6 @@ async def chat(self, request: ChatCompletionRequest) -> LLMChatResponse: disk_lora_model = await self._disk_lora_model(multiplexed_model_id) await self.engine.resolve_lora(disk_lora_model) - if request.stream: # 4. 
Apply batching with appropriate interval in case of streaming response_generator = OpenAIResponseBatcher( diff --git a/python/ray/llm/_internal/serve/deployments/utils/node_initialization_utils.py b/python/ray/llm/_internal/serve/deployments/utils/node_initialization_utils.py index e1733bebe54f..893778024801 100644 --- a/python/ray/llm/_internal/serve/deployments/utils/node_initialization_utils.py +++ b/python/ray/llm/_internal/serve/deployments/utils/node_initialization_utils.py @@ -148,7 +148,7 @@ def _initialize_local_node( if not isinstance(local_path, str) or not os.path.exists(local_path): logger.info(f"Downloading the tokenizer for {engine_config.actual_hf_model_id}") - # TODO: NEEDED for Mistral models that don't support tekken + # TODO: NEEDED for Mistral models that don't support tekken # _ = transformers.AutoTokenizer.from_pretrained( # engine_config.actual_hf_model_id, # trust_remote_code=engine_config.trust_remote_code, From 2fc73d9cd4dae706a2c83f60abea88de71620772 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Sat, 28 Jun 2025 16:40:35 -0700 Subject: [PATCH 09/37] wip Signed-off-by: Kourosh Hakhamaneshi --- .../serve/deployments/llm/llm_engine.py | 45 +- .../serve/deployments/llm/llm_server.py | 200 ++----- .../serve/deployments/llm/vllm/vllm_engine.py | 532 ++++++++++-------- 3 files changed, 355 insertions(+), 422 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/llm_engine.py index b55720ab85d2..6bbdc444350f 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_engine.py @@ -1,7 +1,5 @@ import abc -from typing import AsyncGenerator, Optional - -from transformers.dynamic_module_utils import init_hf_modules +from typing import AsyncGenerator, Optional, Any from ray.llm._internal.serve.configs.server_models import ( DiskMultiplexConfig, @@ -13,37 +11,36 @@ class LLMEngine(abc.ABC): - """Base class for all LLM engines""" + """Base protocal class for all LLM engines""" + @abc.abstractmethod def __init__(self, llm_config: LLMConfig): - self._llm_config = llm_config - - # Ensure transformers_modules is initialized early in worker processes. - # This is critical for models with trust_remote_code=True to avoid pickle errors. 
- init_hf_modules() + """Initialize the engine with the llm config""" + pass @abc.abstractmethod async def start(self): """Start the engine""" pass - + @abc.abstractmethod - async def prepare_request( - self, - request_id: str, - prompt: Prompt, - stream: bool, - disk_lora_model: Optional[DiskMultiplexConfig] = None, - **kwargs, - ) -> GenerationRequest: - """Prepare a GenerationRequest for the engine""" + async def resolve_lora(self, lora_model: DiskMultiplexConfig): + """Resolve the lora model""" pass - + + @abc.abstractmethod + async def chat(self, request) -> AsyncGenerator[Any, None]: + """Chat with the engine""" + pass + + @abc.abstractmethod + async def completions(self, request) -> AsyncGenerator[Any, None]: + """Completion with the engine""" + pass + @abc.abstractmethod - async def generate( - self, request: GenerationRequest - ) -> AsyncGenerator[LLMRawResponse, None]: - """Generate an LLMRawResponse stream based on the GenerationRequest""" + async def embeddings(self, request) -> AsyncGenerator[Any, None]: + """Embed with the engine""" pass async def check_health(self) -> None: diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py index 305218fa9c7b..9430a064f9f5 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py @@ -78,11 +78,10 @@ class _LLMServerBase(ABC): """ # TODO (Kourosh): I don't know why this is an async init. Need to fix. - async def __init__(self, llm_config: LLMConfig): + async def __init__(self): """ Constructor takes in an LLMConfig object and start the underlying engine. """ - self._llm_config = llm_config @abstractmethod async def chat(self, request: ChatCompletionRequest) -> LLMChatResponse: @@ -105,9 +104,10 @@ async def check_health(self) -> None: the engine is dead and needs to be restarted. """ ... - - async def llm_config(self) -> LLMConfig: - return self._llm_config + + # TODO (Kourosh): This does not belong here. + async def llm_config(self) -> Optional[LLMConfig]: + return None class ResponsePostprocessor: @@ -408,6 +408,12 @@ async def process_completions( class LLMServer(_LLMServerBase): + """This is a shm layer to decouple the LLM engine from the ingress deployment. + + It has a very similar API as the engine. Almost all of the abstractions are implemented by the engine. This class just a little bit more logic on top, e.g.: + 1. Logic for serve multiplexing, etc. + 2. Telemetry reporting + """ _default_engine_cls = VLLMEngine async def __init__( @@ -415,7 +421,6 @@ async def __init__( llm_config: LLMConfig, *, engine_cls: Optional[Type[LLMEngine]] = None, - model_downloader: Optional[LoraModelLoader] = None, ): """Constructor of LLMServer. @@ -426,10 +431,9 @@ async def __init__( llm_config: LLMConfig for the model. engine_cls: Dependency injection for the vllm engine class. Defaults to `VLLMEngine`. - model_downloader: Dependency injection for the model downloader - object. Defaults to be initialized with `LoraModelLoader`. 
""" - await super().__init__(llm_config) + await super().__init__() + self._llm_config = llm_config self._engine_cls = engine_cls or self._get_default_engine_class() self.engine: Optional[LLMEngine] = None @@ -437,24 +441,6 @@ async def __init__( self.engine = self._engine_cls(self._llm_config) await asyncio.wait_for(self._start_engine(), timeout=ENGINE_START_TIMEOUT_S) - multiplex_config = self._llm_config.multiplex_config() - if model_downloader: - self.model_downloader = model_downloader - elif multiplex_config: - self.model_downloader = LoraModelLoader( - download_timeout_s=multiplex_config.download_timeout_s, - max_tries=multiplex_config.max_download_tries, - ) - else: - self.model_downloader = LoraModelLoader() - - # Hack that lets us set max_num_models_per_replica from the llm_config - if multiplex_config: - self.load_model = serve.multiplexed( - max_num_models_per_replica=multiplex_config.max_num_models_per_replica - )(lambda lora_model_id: self._load_model(lora_model_id)) - - self.response_postprocessor = ResponsePostprocessor() def _get_default_engine_class(self) -> Type[LLMEngine]: """Helper to load the engine class from the environment variable. @@ -478,39 +464,6 @@ async def _start_engine(self): if self._llm_config.model_architecture: push_telemetry_report_for_all_models(all_models=[self._llm_config]) - async def _predict( - self, - request_id: str, - prompt: Prompt, - stream: bool, - ) -> AsyncGenerator[LLMRawResponse, None]: - """A thin wrapper around VLLMEngine.generate(). - - 1. Load the model to disk - 2. Format parameters correctly - 3. Forward request to VLLMEngine.generate() - """ - - logger.info(f"Received streaming request {request_id}") - multiplexed_model_id = serve.get_multiplexed_model_id() - - if multiplexed_model_id: - assert ( - self._llm_config.lora_config is not None - ), "Must setup lora config for multiplexed requests." - disk_lora_model = await self._disk_lora_model(multiplexed_model_id) - else: - disk_lora_model = None - - llm_request = await self.engine.prepare_request( - request_id=request_id, - prompt=prompt, - stream=stream, - disk_lora_model=disk_lora_model, - ) - - async for llm_response in self.engine.generate(llm_request): - yield llm_response def _get_batch_interval_ms(self, stream: bool = True) -> int: """Calculate the batching interval for responses.""" @@ -520,93 +473,41 @@ def _get_batch_interval_ms(self, stream: bool = True) -> int: if stream_batching_interval_ms is None: stream_batching_interval_ms = MODEL_RESPONSE_BATCH_TIMEOUT_MS return stream_batching_interval_ms if stream else None - - def _process_llm_request( - self, request: Union[ChatCompletionRequest, CompletionRequest], is_chat: bool - ) -> Union[LLMChatResponse, LLMCompletionsResponse]: - """Common processing pipeline for both chat and completions APIs. - - Args: - request: Either a ChatCompletionRequest or CompletionRequest object - is_chat: Whether this is a chat request (True) or completions request (False) - - Returns: - A generator of response objects (either chat completion or text completion) - """ - request_id = get_serve_request_id() - - # 1. Construct the appropriate prompt based on request type - if is_chat: - prompt = Prompt( - prompt=[ - Message.model_validate(message) for message in request.messages - ], - parameters=request, - ) - else: - prompt = Prompt( - prompt=request.prompt, - parameters=request, - use_prompt_format=False, - ) - - # 2. Predict using the engine - gen = self._predict(request_id=request_id, prompt=prompt, stream=request.stream) - - # 3. 
Convert raw LLM responses to OpenAI format - processor_method = ( - self.response_postprocessor.process_chat - if is_chat - else self.response_postprocessor.process_completions - ) - openai_resp_generator = processor_method( - model=self._llm_config.model_id, gen=gen, stream=request.stream - ) - - if request.stream: - # 4. Apply batching with appropriate interval in case of streaming - batched_openai_response_stream = OpenAIResponseBatcher( - openai_resp_generator, - interval_ms=self._get_batch_interval_ms(), - ) - - return batched_openai_response_stream.stream() - - return openai_resp_generator - - async def chat(self, request: ChatCompletionRequest) -> LLMChatResponse: - """Runs a chat request to the LLM engine and returns the response. - - Args: - request: A ChatCompletionRequest object. - - Returns: - A LLMChatResponse object. - """ - + + + async def _maybe_resolve_lora_from_multiplex(self) -> None: + """Handle the lora model for the request.""" multiplexed_model_id = serve.get_multiplexed_model_id() - if multiplexed_model_id: assert ( self._llm_config.lora_config is not None ), "Must setup lora config for multiplexed requests." disk_lora_model = await self._disk_lora_model(multiplexed_model_id) await self.engine.resolve_lora(disk_lora_model) + + def _batch_output_stream(self, generator): + return OpenAIResponseBatcher( + generator, + interval_ms=self._get_batch_interval_ms(), + ).stream() + + + async def chat(self, request: ChatCompletionRequest): + """Runs a chat request to the LLM engine and returns the response. - if request.stream: - # 4. Apply batching with appropriate interval in case of streaming - response_generator = OpenAIResponseBatcher( - self.engine.chat(request), - interval_ms=self._get_batch_interval_ms(), - ).stream() - else: - response_generator = self.engine.chat(request) + Args: + request: A ChatCompletionRequest object. - async for response in response_generator: - logger.info( - f"[Kourosh] in llm_server.chat, response_type: {type(response)} response: {response}" - ) - yield response + Returns: + A LLMChatResponse object. + """ + await self._maybe_resolve_lora_from_multiplex() + stream = self._batch_output_stream( + self.engine.chat(request) + ) + + async for chunk in stream: + yield chunk async def completions(self, request: CompletionRequest) -> LLMCompletionsResponse: """Runs a completion request to the LLM engine and returns the response. @@ -617,7 +518,15 @@ async def completions(self, request: CompletionRequest) -> LLMCompletionsRespons Returns: A LLMCompletionsResponse object. 
""" - return self._process_llm_request(request, is_chat=False) + await self._maybe_resolve_lora_from_multiplex() + response_generator = self._batch_output_stream( + request, + self.engine.completions(request) + ) + + async for response in response_generator: + yield response + async def check_health(self) -> None: """ @@ -681,16 +590,9 @@ async def embeddings(self, request: EmbeddingRequest) -> LLMEmbeddingsResponse: exc_info=e, ) - async def _load_model(self, lora_model_id: str) -> DiskMultiplexConfig: - return await self.model_downloader.load_model( - lora_model_id=lora_model_id, - llm_config=self._llm_config, - ) - - async def _disk_lora_model(self, lora_model_id: str) -> DiskMultiplexConfig: - disk_lora_model: DiskMultiplexConfig = await self.load_model(lora_model_id) - return disk_lora_model - + async def llm_config(self) -> Optional[LLMConfig]: + return self._llm_config + @classmethod def as_deployment( cls, deployment_options: Dict[str, Any] = None diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index 074bde703ca0..3a41d103a93b 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -27,6 +27,8 @@ LogProbs, Prompt, ) +from transformers.dynamic_module_utils import init_hf_modules + from ray.llm._internal.serve.deployments.llm.llm_engine import LLMEngine from ray.llm._internal.serve.deployments.llm.vllm.vllm_engine_stats import ( ArgUsage, @@ -204,6 +206,10 @@ def __init__( llm_config: The llm configuration for this engine """ super().__init__(llm_config) + + # Ensure transformers_modules is initialized early in worker processes. + # This is critical for models with trust_remote_code=True to avoid pickle errors. + init_hf_modules() # filter out the llm_config.engine_kwargs to those that belong to FrontendArgs and pop them over. engine_config = llm_config.get_engine_config() @@ -542,7 +548,7 @@ def _start_async_llm_engine( """Creates an async LLM engine from the engine arguments.""" from vllm.v1.executor.abstract import Executor - vllm_config.parallel_config.placement_group = placement_group + # vllm_config.parallel_config.placement_group = placement_group if use_v1: from vllm.v1.engine.async_llm import AsyncLLM as AsyncLLMEngine @@ -597,77 +603,100 @@ async def resolve_lora(self, disk_lora_model: DiskMultiplexConfig): return lora_request - async def prepare_request( - self, - request_id: str, - prompt: Prompt, - stream: bool, - disk_lora_model: Optional[DiskMultiplexConfig] = None, - ) -> GenerationRequest: - from vllm.entrypoints.chat_utils import ( - apply_hf_chat_template as _apply_hf_chat_template, - parse_chat_messages_futures, - ) - - model_config = self.model_config - mm_data = None - - if isinstance(prompt.prompt, list): - messages = [m.model_dump() for m in prompt.prompt] - conversation, mm_futures = parse_chat_messages_futures( - messages=messages, - model_config=model_config, - tokenizer=self._tokenizer, - content_format=self._resolved_content_format, - ) - mm_data = await mm_futures - - def apply_hf_chat_template(model_config, **kwargs): - try: - return _apply_hf_chat_template(model_config=model_config, **kwargs) - except TypeError: - # Legacy API before vLLM 0.9.0. - # TODO(#52975): Remove above once vLLM <0.9.0 is no longer supported. 
- return _apply_hf_chat_template( - trust_remote_code=model_config.trust_remote_code, **kwargs - ) - - prompt_text = apply_hf_chat_template( - model_config=model_config, - tokenizer=self._tokenizer, - conversation=conversation, - chat_template=None, - tools=None, - tokenize=False, - # **kwargs for tokenizer.apply_chat_template - trust_remote_code=model_config.trust_remote_code, - add_generation_prompt=True, - continue_final_message=False, - ) - else: - prompt_text = prompt.prompt - - prompt_token_ids = await self._atokenize(prompt_text) - - request_params = { - "prompt": prompt_text, - "prompt_token_ids": prompt_token_ids, - "request_id": request_id, - "sampling_params": VLLMSamplingParams.from_prompt(prompt), - "disk_multiplex_config": disk_lora_model, - "stream": stream, - } - if mm_data: - request_params["multi_modal_data"] = mm_data - - vllm_request = VLLMGenerationRequest(**request_params) - return vllm_request + # async def prepare_request( + # self, + # request_id: str, + # prompt: Prompt, + # stream: bool, + # disk_lora_model: Optional[DiskMultiplexConfig] = None, + # ) -> GenerationRequest: + # from vllm.entrypoints.chat_utils import ( + # apply_hf_chat_template as _apply_hf_chat_template, + # parse_chat_messages_futures, + # ) + + # model_config = self.model_config + # mm_data = None + + # if isinstance(prompt.prompt, list): + # messages = [m.model_dump() for m in prompt.prompt] + # conversation, mm_futures = parse_chat_messages_futures( + # messages=messages, + # model_config=model_config, + # tokenizer=self._tokenizer, + # content_format=self._resolved_content_format, + # ) + # mm_data = await mm_futures + + # def apply_hf_chat_template(model_config, **kwargs): + # try: + # return _apply_hf_chat_template(model_config=model_config, **kwargs) + # except TypeError: + # # Legacy API before vLLM 0.9.0. + # # TODO(#52975): Remove above once vLLM <0.9.0 is no longer supported. + # return _apply_hf_chat_template( + # trust_remote_code=model_config.trust_remote_code, **kwargs + # ) + + # prompt_text = apply_hf_chat_template( + # model_config=model_config, + # tokenizer=self._tokenizer, + # conversation=conversation, + # chat_template=None, + # tools=None, + # tokenize=False, + # # **kwargs for tokenizer.apply_chat_template + # trust_remote_code=model_config.trust_remote_code, + # add_generation_prompt=True, + # continue_final_message=False, + # ) + # else: + # prompt_text = prompt.prompt + + # prompt_token_ids = await self._atokenize(prompt_text) + + # request_params = { + # "prompt": prompt_text, + # "prompt_token_ids": prompt_token_ids, + # "request_id": request_id, + # "sampling_params": VLLMSamplingParams.from_prompt(prompt), + # "disk_multiplex_config": disk_lora_model, + # "stream": stream, + # } + # if mm_data: + # request_params["multi_modal_data"] = mm_data + + # vllm_request = VLLMGenerationRequest(**request_params) + # return vllm_request async def chat( self, request: GenerationRequest - ) -> AsyncGenerator[LLMRawResponse, None]: + ) -> AsyncGenerator[str, None]: + """ + + input: Take a genric free form input type and cast it to the target engine request type inside the engine. 
+ + output: + - stream: True --> for each chunk, yield astring representing data: \n\n + - stream: False --> yield only one string representing the response + + Error: + option A: + when request hits an error, raise an HTTPException(msg, code, type) + option B: + yield a HTTPException object + """ - chat_response = await self.oai_serving_chat.create_chat_completion(request) + try: + chat_response = await self.oai_serving_chat.create_chat_completion(request) + except Exception as e: + logger.error(f"[Kourosh] error in chat: {e}") + yield PatchedErrorResponse( + message=str(e), + internal_message=str(e), + type="internal_error", + code=500, + ) if isinstance(chat_response, AsyncGenerator): async for response in chat_response: @@ -684,188 +713,193 @@ async def chat( code=chat_response.code, ) else: - yield chat_response - - async def generate( - self, request: GenerationRequest - ) -> AsyncGenerator[LLMRawResponse, None]: - """Generate an LLMRawResponse stream - - The vLLM generation request will be passed into vLLM, and the resulting output - will be wrapped in an LLMRawResponse and yielded back to the user. - - Error handling: - - We schedule a finalizer that will abort the request on the engine. - - If an exception is raised in this function or vllm, the finalizer guarantees that the request is aborted. - If an exception is raised in the caller, when this generator is gced, it will run the finalizer and abort the request. - - This should also handle the case where the caller is cancelled (raises asyncio.CancelledError) - """ - if RAYLLM_ENABLE_REQUEST_PROMPT_LOGS: - logger.info( - f"Request {request.request_id} started. " f"Prompt: {request.prompt}" - ) - - if request.prompt_token_ids is not None: - prompt = vllm.inputs.TokensPrompt( - prompt_token_ids=request.prompt_token_ids, - multi_modal_data=request.multi_modal_data, - ) - else: - prompt = vllm.inputs.TextPrompt( - prompt=request.prompt, - multi_modal_data=request.multi_modal_data, - ) - - # Construct a results generator from vLLM - results_generator: AsyncGenerator["RequestOutput", None] = self.engine.generate( - prompt=prompt, - sampling_params=self._parse_sampling_params(request.sampling_params), - request_id=request.request_id, - lora_request=request.lora_request, # type: ignore - ) - - # Loop over the results - num_text_returned = 0 - all_tokens_collected = 0 - clock = MsClock(unit=ClockUnit.s) - log_probs_idx = 0 - finish_reason = None - num_input_tokens = 0 - try: - start = time.perf_counter() - request_output = None - async for request_output in self._stats.auto_track(results_generator): - # TODO(tchordia): handle more than one output - assert ( - len(request_output.outputs) == 1 - ), "Received more than 1 output from vllm, aborting" - - output = request_output.outputs[0] - text_output = output.text[num_text_returned:] - num_text_returned += len(text_output) - num_input_tokens = len(request_output.prompt_token_ids) - tokens_collected = len(output.token_ids) - all_tokens_collected - all_tokens_collected += tokens_collected - finish_reason = FinishReason.from_vllm_finish_reason( - output.finish_reason - ) - - self._handle_input_too_long(request_output, finish_reason) - - log_probs, log_probs_idx = self._extract_logprobs( - output, - log_probs_idx, - request.sampling_params.top_logprobs, - ) - internal_metadata = {} - if getattr(request_output, "kv_transfer_params", None) is not None: - internal_metadata[ - KV_TRANSFER_PARAMS_KEY - ] = request_output.kv_transfer_params - yield LLMRawResponse( - generated_text=text_output, - 
num_generated_tokens=tokens_collected, - logprobs=log_probs, - num_generated_tokens_batch=tokens_collected, - num_input_tokens=num_input_tokens, - num_input_tokens_batch=num_input_tokens, - preprocessing_time=0, - generation_time=clock.reset_interval(), - finish_reason=finish_reason, - metadata=internal_metadata, - ) - - if request_output is not None: - total_request_time = time.perf_counter() - start - if request_output.metrics is None: - # vLLM V1 metrics are not included in the request output yet. - queue_time = "N/A" - generation_time_str = "N/A" - tokens_s = "N/A" - generated_tokens_s = "N/A" - else: - time_in_queue_histogram.observe( - request_output.metrics.time_in_queue - ) - queue_time = f"{request_output.metrics.time_in_queue}s" - generation_time = ( - total_request_time - request_output.metrics.time_in_queue - ) - generation_time_str = f"{generation_time}s" - tokens_s = ( - num_input_tokens + all_tokens_collected - ) / generation_time - generated_tokens_s = all_tokens_collected / generation_time - - logger.info( - f"Request {request.request_id} finished ({finish_reason}). " - f"Total time: {total_request_time}s, " - f"Queue time: {queue_time}, " - f"Generation+async time: {generation_time_str}, " - f"Input tokens: {num_input_tokens}, " - f"Generated tokens: {all_tokens_collected}, " - f"tokens/s: {tokens_s}, " - f"generated tokens/s: {generated_tokens_s}." - ) - else: - logger.warning( - f"Request {request.request_id} " - "finished without any output. " - f"Input tokens: {num_input_tokens}." - ) - except ValueError as e: - error_args = e.args - if len(error_args) == 3 and "Input too long." == error_args[0]: - _, input_length, max_input_length = error_args - raise InputTooLong(input_length, max_input_length).exception from None - elif len(error_args) == 1 and V1_TOO_LONG_PATTERN.match(error_args[0]): - parsed_error = V1_TOO_LONG_PATTERN.match(error_args[0]) - raise InputTooLong( - int(parsed_error[1]), int(parsed_error[2]) - ).exception from None - else: - raise e from None - finally: - # Ensure that we cancel on the engine once we have exited the streaming - # phase - await self.engine.abort(request.request_id) - - def _get_prompt_limit(self) -> int: - """Helper to get the prompt limit from scheduler config - - Port from https://github.com/vllm-project/vllm/blob/7b5ecf79bd94aab0d782c70126d0dcc37c16bc60/vllm/core/scheduler.py#L939 - """ - scheduler_config = self.vllm_config.scheduler_config - if ( - scheduler_config.chunked_prefill_enabled - and not scheduler_config.is_multi_step - ): - prompt_limit = scheduler_config.max_model_len - else: - prompt_limit = min( - scheduler_config.max_model_len, - scheduler_config.max_num_batched_tokens, - ) - return prompt_limit - - def _handle_input_too_long( - self, request_output: "RequestOutput", finish_reason: Optional[FinishReason] + yield chat_response.model_dump_json() + + # async def generate( + # self, request: GenerationRequest + # ) -> AsyncGenerator[LLMRawResponse, None]: + # """Generate an LLMRawResponse stream + + # The vLLM generation request will be passed into vLLM, and the resulting output + # will be wrapped in an LLMRawResponse and yielded back to the user. + + # Error handling: + + # We schedule a finalizer that will abort the request on the engine. + + # If an exception is raised in this function or vllm, the finalizer guarantees that the request is aborted. + # If an exception is raised in the caller, when this generator is gced, it will run the finalizer and abort the request. 
+ + # This should also handle the case where the caller is cancelled (raises asyncio.CancelledError) + # """ + # if RAYLLM_ENABLE_REQUEST_PROMPT_LOGS: + # logger.info( + # f"Request {request.request_id} started. " f"Prompt: {request.prompt}" + # ) + + # if request.prompt_token_ids is not None: + # prompt = vllm.inputs.TokensPrompt( + # prompt_token_ids=request.prompt_token_ids, + # multi_modal_data=request.multi_modal_data, + # ) + # else: + # prompt = vllm.inputs.TextPrompt( + # prompt=request.prompt, + # multi_modal_data=request.multi_modal_data, + # ) + + # # Construct a results generator from vLLM + # results_generator: AsyncGenerator["RequestOutput", None] = self.engine.generate( + # prompt=prompt, + # sampling_params=self._parse_sampling_params(request.sampling_params), + # request_id=request.request_id, + # lora_request=request.lora_request, # type: ignore + # ) + + # # Loop over the results + # num_text_returned = 0 + # all_tokens_collected = 0 + # clock = MsClock(unit=ClockUnit.s) + # log_probs_idx = 0 + # finish_reason = None + # num_input_tokens = 0 + # try: + # start = time.perf_counter() + # request_output = None + # async for request_output in self._stats.auto_track(results_generator): + # # TODO(tchordia): handle more than one output + # assert ( + # len(request_output.outputs) == 1 + # ), "Received more than 1 output from vllm, aborting" + + # output = request_output.outputs[0] + # text_output = output.text[num_text_returned:] + # num_text_returned += len(text_output) + # num_input_tokens = len(request_output.prompt_token_ids) + # tokens_collected = len(output.token_ids) - all_tokens_collected + # all_tokens_collected += tokens_collected + # finish_reason = FinishReason.from_vllm_finish_reason( + # output.finish_reason + # ) + + # self._handle_input_too_long(request_output, finish_reason) + + # log_probs, log_probs_idx = self._extract_logprobs( + # output, + # log_probs_idx, + # request.sampling_params.top_logprobs, + # ) + # internal_metadata = {} + # if getattr(request_output, "kv_transfer_params", None) is not None: + # internal_metadata[ + # KV_TRANSFER_PARAMS_KEY + # ] = request_output.kv_transfer_params + # yield LLMRawResponse( + # generated_text=text_output, + # num_generated_tokens=tokens_collected, + # logprobs=log_probs, + # num_generated_tokens_batch=tokens_collected, + # num_input_tokens=num_input_tokens, + # num_input_tokens_batch=num_input_tokens, + # preprocessing_time=0, + # generation_time=clock.reset_interval(), + # finish_reason=finish_reason, + # metadata=internal_metadata, + # ) + + # if request_output is not None: + # total_request_time = time.perf_counter() - start + # if request_output.metrics is None: + # # vLLM V1 metrics are not included in the request output yet. + # queue_time = "N/A" + # generation_time_str = "N/A" + # tokens_s = "N/A" + # generated_tokens_s = "N/A" + # else: + # time_in_queue_histogram.observe( + # request_output.metrics.time_in_queue + # ) + # queue_time = f"{request_output.metrics.time_in_queue}s" + # generation_time = ( + # total_request_time - request_output.metrics.time_in_queue + # ) + # generation_time_str = f"{generation_time}s" + # tokens_s = ( + # num_input_tokens + all_tokens_collected + # ) / generation_time + # generated_tokens_s = all_tokens_collected / generation_time + + # logger.info( + # f"Request {request.request_id} finished ({finish_reason}). 
" + # f"Total time: {total_request_time}s, " + # f"Queue time: {queue_time}, " + # f"Generation+async time: {generation_time_str}, " + # f"Input tokens: {num_input_tokens}, " + # f"Generated tokens: {all_tokens_collected}, " + # f"tokens/s: {tokens_s}, " + # f"generated tokens/s: {generated_tokens_s}." + # ) + # else: + # logger.warning( + # f"Request {request.request_id} " + # "finished without any output. " + # f"Input tokens: {num_input_tokens}." + # ) + # except ValueError as e: + # error_args = e.args + # if len(error_args) == 3 and "Input too long." == error_args[0]: + # _, input_length, max_input_length = error_args + # raise InputTooLong(input_length, max_input_length).exception from None + # elif len(error_args) == 1 and V1_TOO_LONG_PATTERN.match(error_args[0]): + # parsed_error = V1_TOO_LONG_PATTERN.match(error_args[0]) + # raise InputTooLong( + # int(parsed_error[1]), int(parsed_error[2]) + # ).exception from None + # else: + # raise e from None + # finally: + # # Ensure that we cancel on the engine once we have exited the streaming + # # phase + # await self.engine.abort(request.request_id) + + # def _get_prompt_limit(self) -> int: + # """Helper to get the prompt limit from scheduler config + + # Port from https://github.com/vllm-project/vllm/blob/7b5ecf79bd94aab0d782c70126d0dcc37c16bc60/vllm/core/scheduler.py#L939 + # """ + # scheduler_config = self.vllm_config.scheduler_config + # if ( + # scheduler_config.chunked_prefill_enabled + # and not scheduler_config.is_multi_step + # ): + # prompt_limit = scheduler_config.max_model_len + # else: + # prompt_limit = min( + # scheduler_config.max_model_len, + # scheduler_config.max_num_batched_tokens, + # ) + # return prompt_limit + + # def _handle_input_too_long( + # self, request_output: "RequestOutput", finish_reason: Optional[FinishReason] + # ): + # if ( + # finish_reason + # and finish_reason == FinishReason.LENGTH + # and hasattr(request_output.metrics, "first_token_time") + # and request_output.metrics.first_token_time is None + # ): + # # This means that the prompt was too long and we did not generate anything. + # raise InputTooLong( + # len(request_output.prompt_token_ids), self._get_prompt_limit() + # ).exception + + async def completions( + self, request ): - if ( - finish_reason - and finish_reason == FinishReason.LENGTH - and hasattr(request_output.metrics, "first_token_time") - and request_output.metrics.first_token_time is None - ): - # This means that the prompt was too long and we did not generate anything. 
- raise InputTooLong( - len(request_output.prompt_token_ids), self._get_prompt_limit() - ).exception - - async def embed( + raise NotImplementedError("Completions are not supported yet") + + async def embeddings( self, vllm_embedding_request: VLLMEmbeddingRequest ) -> Tuple[List[List[float]], int]: """Return (embeddings, num_prompt_tokens)""" From 00ac8680e20214c303d7854a6dda43615f326876 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 30 Jun 2025 10:19:42 -0700 Subject: [PATCH 10/37] wip Signed-off-by: Kourosh Hakhamaneshi --- .../serve/deployments/llm/vllm/vllm_engine.py | 344 ++++-------------- .../serve/deployments/routers/router.py | 1 + 2 files changed, 74 insertions(+), 271 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index 3a41d103a93b..0262b690febb 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -146,44 +146,6 @@ def _clear_current_platform_cache(): current_platform.get_device_capability.cache_clear() -class _EngineBackgroundProcess: - def __init__(self, ipc_path, engine_args, engine_config): - from vllm.engine.multiprocessing.engine import MQLLMEngine - - # Adapted from vllm.engine.multiprocessing.engine.MQLLMEngine.from_engine_args - vllm.plugins.load_general_plugins() - - # Note (genesu): There is a bug in vllm 0.7.2 forced the use of uni processing - # executor when world_size is 1. This is a bug in vllm 0.7.2 and - # is fixed by https://github.com/vllm-project/vllm/pull/12934 which is shipped - # with vllm 0.7.3. However, in Ray's llm package, we will enforce the use of - # ray distributed executor for all cases so it's always compatible with Ray. - from vllm.executor.ray_distributed_executor import RayDistributedExecutor - - # Clear the cache of the current platform. - _clear_current_platform_cache() - - self.engine = MQLLMEngine( - ipc_path=ipc_path, - use_async_sockets=engine_config.model_config.use_async_output_proc, - vllm_config=engine_config, - executor_class=RayDistributedExecutor, - log_requests=not engine_args.disable_log_requests, - log_stats=not engine_args.disable_log_stats, - usage_context=vllm.usage.usage_lib.UsageContext.API_SERVER, - ) - self._error = None - - def start(self): - try: - self.engine.start() - except Exception as e: - self._error = e - - def get_error(self): - return self._error - - class CustomNamespace: def __init__(self, *args): self.classes = args @@ -206,6 +168,7 @@ def __init__( llm_config: The llm configuration for this engine """ super().__init__(llm_config) + # Ensure transformers_modules is initialized early in worker processes. # This is critical for models with trust_remote_code=True to avoid pickle errors. @@ -222,12 +185,14 @@ def __init__( raise ImportError( "vLLM is not installed. Please install it with `pip install ray[llm]`." ) + + if not vllm.envs.VLLM_USE_V1: + raise ValueError("vLLM v0 is getting fully deprecated. As a result in Ray Serve LLM only v1 is supported.") + # TODO (Kourosh): This validation logic belongs to the PDProxy module. # Pick a random port in P/D case. 
kv_transfer_config = llm_config.engine_kwargs.get("kv_transfer_config", None) if kv_transfer_config is not None: - if not vllm.envs.VLLM_USE_V1: - logger.warning("Ray Serve LLM only supports P/D with v1 vLLM engine.") connector_type = getattr(kv_transfer_config, "kv_connector", "") if connector_type != "NixlConnector": raise ValueError("Only NixlConnector is supported for kv transfer.") @@ -253,27 +218,32 @@ def __init__( port = vllm.envs.VLLM_NIXL_SIDE_CHANNEL_PORT kv_transfer_config.engine_id = "-".join([engine_id, host, str(port)]) - assert isinstance( - llm_config, LLMConfig - ), f"Got invalid config {llm_config} of type {type(llm_config)}" + self.llm_config = llm_config self.engine_config = VLLMEngineConfig.from_llm_config(llm_config) self._stats = VLLMEngineStatTracker() self.running = False self.model_config: "ModelConfig" = None - self.engine = None + # self.engine = None self.vllm_config: "VllmConfig" = None - # Chat template content format (openai or string) - self._resolved_content_format = None - # Also need local instance of the tokenizer to manage prompt formatting. - self._tokenizer = None + # # Chat template content format (openai or string) + # self._resolved_content_format = None + # # Also need local instance of the tokenizer to manage prompt formatting. + # self._tokenizer = None - self._tokenizer_executor = ThreadPoolExecutor(max_workers=1) - self._atokenize = vllm.utils.make_async( - self._tokenize, executor=self._tokenizer_executor - ) + # self._tokenizer_executor = ThreadPoolExecutor(max_workers=1) + # self._atokenize = vllm.utils.make_async( + # self._tokenize, executor=self._tokenizer_executor + # ) + + # vLLM Integration points. Will be set through .start() + self._engine_client = None + self._oai_models = None + self._oai_serving_chat = None + self._oai_serving_completion = None + self._oai_serving_embedding = None @staticmethod async def initialize_node(llm_config: LLMConfig) -> InitializeNodeOutput: @@ -285,114 +255,68 @@ async def initialize_node(llm_config: LLMConfig) -> InitializeNodeOutput: """ return await initialize_node_util(llm_config) - def _tokenize( - self, prompt_text: str, add_special_tokens: bool = False - ) -> List[int]: - encoded = self._tokenizer(prompt_text, add_special_tokens=add_special_tokens) - return encoded.input_ids async def start(self): """Start the vLLM engine. If the engine is already running, do nothing. """ - # from vllm.entrypoints.chat_utils import ( - # resolve_chat_template_content_format as _resolve_chat_template_content_format, - # ) - - # if self.running: - # # The engine is already running! - # logger.info("Skipping engine restart because the engine is already running") - # return - - # self.engine = await self._start_engine() - # self.running = True - # self.model_config = await self.engine.get_model_config() - - # self._tokenizer = await self.engine.get_tokenizer() - - # def resolve_chat_template_content_format(model_config, **kwargs): - # try: - # return _resolve_chat_template_content_format( - # model_config=model_config, **kwargs - # ) - # except TypeError: - # # Legacy API before vLLM 0.9.0. - # # TODO(#52975): Remove this try-except once vLLM <0.9.0 is no longer supported. - # return _resolve_chat_template_content_format( - # trust_remote_code=model_config.trust_remote_code, **kwargs - # ) - - # self._resolved_content_format = resolve_chat_template_content_format( - # model_config=self.model_config, - # # Use HF to get the chat template so set it to None here. 
- # chat_template=None, - # # Default to None, change when it's needed. - # # vLLM does not have a high level API to support all of this. - # tools=None, - # # Let vLLM decide the content format. - # given_format="auto", - # tokenizer=self._tokenizer, - # ) + + if self.running: + # The engine is already running! + logger.info("Skipping engine restart because the engine is already running") + return from vllm.entrypoints.openai.api_server import init_app_state - self.engine = await self._start_engine() + self._engine_client = await self._start_engine_client() from starlette.datastructures import State state = State() await init_app_state( - engine_client=self.engine, + engine_client=self._engine_client, vllm_config=self.vllm_config, state=state, args=self.namespace_args, ) - self.oai_models = state.openai_serving_models - self.oai_serving_chat = state.openai_serving_chat - self.oai_serving_completion = state.openai_serving_completion - self.oai_serving_embedding = state.openai_serving_embedding + self._validate_openai_serving_models(state.openai_serving_models) + self._oai_models = state.openai_serving_models + + self._oai_serving_chat = state.openai_serving_chat + self._oai_serving_completion = state.openai_serving_completion + self._oai_serving_embedding = state.openai_serving_embedding self.running = True logger.info("Started vLLM engine.") - async def _start_engine(self) -> "EngineClient": - from vllm import envs - - # Since vLLM 0.8.0, the logic to determine v0/v1 engine is as follows: - # 1. If VLLM_USE_V1 is not set, then it tries to use v1 engine. However, - # if any feature specified in the engine config is not supported, then - # it falls back to v0. Note that launching vLLM on a non-main thread - # is an experimental feature, so vLLM will fall back to v0 in this case. - # 2. If VLLM_USE_V1 is set to 1, then it will use v1 engine even with - # experimental features (such as launching vLLM on a non-main thread). - # 3. If VLLM_USE_V1 is set to 0, force using v0 engine. - # In Ray Serve LLM, we forbid case 1 because we have to know exactly which engine is used. - if not envs.is_set("VLLM_USE_V1"): - logger.warning( - "VLLM_USE_V1 environment variable is not set, using vLLM v0 as default. " - "Later we may switch default to use v1 once vLLM v1 is mature." - ) - envs.set_vllm_use_v1(False) - - if not envs.VLLM_USE_V1: - if self.llm_config.log_engine_metrics: - raise ValueError("V1 vLLM Engine is required to log engine metrics") - - return await self._start_engine_v0() + def _validate_openai_serving_models(self, models): + if not hasattr(models, "lora_requests"): + raise ValueError("oai_models must have a lora_requests attribute") + + if not hasattr(models, "load_lora_adapter"): + raise ValueError("oai_models must have a load_lora_adapter attribute") + + async def _start_engine_client(self) -> "EngineClient": + ( + engine_args, + engine_config, + node_initialization, + ) = await self._prepare_engine_config() - return await self._start_engine_v1() + return self._start_async_llm_engine( + engine_args, + engine_config, + node_initialization.placement_group, + ) - async def _prepare_engine_config(self, use_v1: bool): + async def _prepare_engine_config(self): """ Prepare the engine config to start the engine. - Args: - use_v1: Whether to use vLLM V1 engine. - Returns: engine_args: The engine arguments. engine_config: The engine configuration. 
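The hunks in this series split `llm_config.engine_kwargs` into vLLM's flattened engine args and frontend args and ultimately hand `init_app_state` a single flat namespace built from both (see the later `argparse.Namespace(**vllm_frontend_args.__dict__, **vllm_engine_args.__dict__)` change). A minimal, self-contained sketch of that merging pattern follows; the stub dataclasses and their fields are illustrative stand-ins, not vLLM's real `AsyncEngineArgs`/`FrontendArgs`.

# Sketch of the args-merging pattern: flatten the frontend and engine argument
# objects into one argparse.Namespace so a single object can be passed where a
# parsed-CLI-args namespace is expected. Stub dataclasses below are hypothetical
# stand-ins for illustration only.
import argparse
from dataclasses import asdict, dataclass
from typing import Optional


@dataclass
class _StubEngineArgs:
    model: str = "some-org/some-model"  # placeholder model id
    distributed_executor_backend: str = "ray"
    disable_log_stats: bool = False


@dataclass
class _StubFrontendArgs:
    response_role: str = "assistant"
    chat_template: Optional[str] = None


def merge_args(engine_args, frontend_args) -> argparse.Namespace:
    # Engine keys win on collision; real code should check the key sets are disjoint.
    merged = {**asdict(frontend_args), **asdict(engine_args)}
    return argparse.Namespace(**merged)


if __name__ == "__main__":
    ns = merge_args(_StubEngineArgs(), _StubFrontendArgs())
    print(ns.model, ns.response_role)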
@@ -428,133 +352,18 @@ async def _prepare_engine_config(self, use_v1: bool): self.vllm_config = engine_config return engine_args, engine_config, node_initialization - async def _start_engine_v1(self) -> "EngineClient": - """Start the vLLM v1 engine. Note that we only use _get_async_engine_args - to get the engine args and don't use _get_vllm_engine_config, because - we integrate vLLM v1 using the highest-level async engine API. - TODO: Refactor vLLM v0 integration to use the same async engine API - to simplify the code. - """ - ( - engine_args, - engine_config, - node_initialization, - ) = await self._prepare_engine_config(use_v1=True) - - return self._start_async_llm_engine( - engine_args, - engine_config, - node_initialization.placement_group, - use_v1=True, - ) - - async def _start_engine_v0(self) -> "EngineClient": - from vllm.engine.multiprocessing.client import MQLLMEngineClient - - ( - engine_args, - engine_config, - node_initialization, - ) = await self._prepare_engine_config(use_v1=False) - - if MQLLMEngineClient.is_unsupported_config(engine_config): - # If the engine is not supported, we fall back to the legacy async engine. - # - # Note (genesu): as of 2025-02-11, this code path is only triggered when - # pipeline parallelism is > 1. And this is due to the vllm mq engine have - # not implemented the pipeline parallelism yet. - return self._start_async_llm_engine( - engine_args, - engine_config, - node_initialization.placement_group, - use_v1=False, - ) - - return await self._start_mq_engine( - engine_args, engine_config, node_initialization.placement_group - ) - - async def _start_mq_engine( - self, - engine_args: "AsyncEngineArgs", - engine_config: "VllmConfig", - placement_group: PlacementGroup, - ) -> "EngineClient": - from vllm.engine.multiprocessing.client import MQLLMEngineClient - - ipc_path = vllm.utils.get_open_zmq_ipc_path() - - BackgroundCls = ray.remote( - num_cpus=0, - scheduling_strategy=PlacementGroupSchedulingStrategy( - placement_group=placement_group, - placement_group_capture_child_tasks=True, - ), - runtime_env=dict( - env_vars=dict( - VLLM_USE_V1="0", - ), - ), - )(_EngineBackgroundProcess) - # Run the process in the background - process_ref = BackgroundCls.remote(ipc_path, engine_args, engine_config) - process_ref.start.remote() - engine_client = MQLLMEngineClient( - ipc_path=ipc_path, - engine_config=engine_config, - engine_pid=os.getpid(), - ) - - logger.info("[STATUS] Getting the server ready ...") - while True: - try: - await engine_client.setup() - break - except TimeoutError: - # A timeout is raised if client cannot connect to the background process. - # This could be due to one of the following reasons: - # 1. The engine has died during construction of the actor: In this case - # get() on any of its methods will raise an ActorDiedError which should - # be re-raised - # 2. The engine is just not up yet (downloading the model, sharding, etc.) - # In this case, we should just wait. - # 3. Something in the .start() has caused the engine to fail: In this - # case the exception is caught and get_error will return the error - # which should be re-raised. 
- logger.info("[STATUS] Waiting for engine process ...") - try: - # Wait 1 second to get any potential error raised in the engine loop - err = ray.get(process_ref.get_error.remote(), timeout=1) - if err: - raise RuntimeError("Background Engine loop is dead.") from err - except ray.exceptions.GetTimeoutError: - # If it times out then the background loop is keeping it busy - pass - except ray.exceptions.ActorDiedError as e: - logger.error("[ERROR] Actor died.") - raise RuntimeError("Background Engine loop is dead.") from e - - logger.info("[STATUS] Server is ready.") - - return engine_client def _start_async_llm_engine( self, engine_args: "AsyncEngineArgs", vllm_config: "VllmConfig", placement_group: PlacementGroup, - use_v1: bool = False, ) -> "EngineClient": """Creates an async LLM engine from the engine arguments.""" from vllm.v1.executor.abstract import Executor + from vllm.v1.engine.async_llm import AsyncLLM - # vllm_config.parallel_config.placement_group = placement_group - - if use_v1: - from vllm.v1.engine.async_llm import AsyncLLM as AsyncLLMEngine - else: - from vllm.engine.async_llm_engine import AsyncLLMEngine - + vllm_config.parallel_config.placement_group = placement_group _clear_current_platform_cache() custom_stat_loggers = None @@ -563,13 +372,13 @@ def _start_async_llm_engine( RayPrometheusStatLogger, ) - # V1 AsyncLLMEngine does not yet support add_logger + # V1 AsyncLLM does not yet support add_logger # For now, assume folks enabling log_engine_metrics do not require LoggingStatLogger, PrometheusStatLogger custom_stat_loggers = [RayPrometheusStatLogger] executor_class = Executor.get_class(vllm_config) logger.info(f"Using executor class: {executor_class}") - engine = AsyncLLMEngine( + engine = AsyncLLM( vllm_config=vllm_config, executor_class=executor_class, log_stats=not engine_args.disable_log_stats, @@ -581,27 +390,20 @@ def _start_async_llm_engine( async def resolve_lora(self, disk_lora_model: DiskMultiplexConfig): from vllm.entrypoints.openai.protocol import LoadLoRAAdapterRequest - # lora_add_response = await self.oai_models.load_lora_adapter( - # request=LoadLoRAAdapterRequest( - # lora_name=disk_lora_model.model_id, - # lora_path=disk_lora_model.local_path, - # ) - # ) - if disk_lora_model.model_id in self.oai_models.lora_requests: - return self.oai_models.lora_requests[disk_lora_model.model_id] - else: - lora_request = await self.oai_models.load_lora_adapter( - request=LoadLoRAAdapterRequest( - lora_name=disk_lora_model.model_id, - lora_path=disk_lora_model.local_path, - ) + # Lora is already loaded, return + return + + lora_request = await self.oai_models.load_lora_adapter( + request=LoadLoRAAdapterRequest( + lora_name=disk_lora_model.model_id, + lora_path=disk_lora_model.local_path, ) + ) - if isinstance(lora_request, ErrorResponse): - raise ValueError(f"Failed to load lora model: {lora_request.message}") + if isinstance(lora_request, ErrorResponse): + raise ValueError(f"Failed to load lora model: {lora_request.message}") - return lora_request # async def prepare_request( # self, @@ -688,7 +490,7 @@ async def chat( """ try: - chat_response = await self.oai_serving_chat.create_chat_completion(request) + chat_response = await self._oai_serving_chat.create_chat_completion(request) except Exception as e: logger.error(f"[Kourosh] error in chat: {e}") yield PatchedErrorResponse( @@ -919,7 +721,7 @@ async def embeddings( for i, prompt in enumerate(prompts): request_id = f"{vllm_embedding_request.request_id}-{i}" - gen: AsyncGenerator["PoolingRequestOutput", None] = 
self.engine.encode( + gen: AsyncGenerator["PoolingRequestOutput", None] = self._engine_client.encode( prompt=vllm.inputs.TextPrompt( prompt=prompt, ), @@ -944,11 +746,11 @@ async def embeddings( return embedding_data, total_prompt_tokens async def check_health(self) -> None: - if not hasattr(self.engine, "check_health"): - raise RuntimeError(f"{type(self.engine)} does not support health check.") + if not hasattr(self._engine_client, "check_health"): + raise RuntimeError(f"{type(self._engine_client)} does not support health check.") try: - await self.engine.check_health() + await self._engine_client.check_health() except BaseException as e: logger.error("Healthcheck failed. The replica will be restarted") raise e from None diff --git a/python/ray/llm/_internal/serve/deployments/routers/router.py b/python/ray/llm/_internal/serve/deployments/routers/router.py index e9e7eb306df7..731a91a1b183 100644 --- a/python/ray/llm/_internal/serve/deployments/routers/router.py +++ b/python/ray/llm/_internal/serve/deployments/routers/router.py @@ -413,6 +413,7 @@ async def _process_llm_request( # In case of streaming we need to iterate over the chunks and yield them openai_stream_generator = _openai_json_wrapper(gen) + print("Hitting streaming response") return StreamingResponse( openai_stream_generator, media_type="text/event-stream" ) From 02e5ecf8b0efd7b6223fb6a44ebc4b7d095532e0 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 30 Jun 2025 10:58:37 -0700 Subject: [PATCH 11/37] wip Signed-off-by: Kourosh Hakhamaneshi --- .../serve/deployments/llm/vllm/vllm_engine.py | 288 +++++++++--------- 1 file changed, 140 insertions(+), 148 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index 0262b690febb..db30538a1c33 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -146,15 +146,7 @@ def _clear_current_platform_cache(): current_platform.get_device_capability.cache_clear() -class CustomNamespace: - def __init__(self, *args): - self.classes = args - def __getattr__(self, name): - for cls in self.classes: - if hasattr(cls, name): - return getattr(cls, name) - raise AttributeError(f"Attribute {name} not found in {self.classes}") class VLLMEngine(LLMEngine): @@ -174,12 +166,12 @@ def __init__( # This is critical for models with trust_remote_code=True to avoid pickle errors. init_hf_modules() - # filter out the llm_config.engine_kwargs to those that belong to FrontendArgs and pop them over. - engine_config = llm_config.get_engine_config() - self.frontend_args = FrontendArgs(**engine_config.frontend_kwargs) - self.engine_args = AsyncEngineArgs(**engine_config.engine_kwargs) + # filter out the llm_config.engine_kwargs to those that belong to FrontendArgs and engine_args and decouple them. 
+ self.llm_config = llm_config + self._engine_config = llm_config.get_engine_config() + self._vllm_frontend_args = FrontendArgs(**self._engine_config.frontend_kwargs) + self._vllm_engine_args = AsyncEngineArgs(**self._engine_config.engine_kwargs) - self.namespace_args = CustomNamespace(self.engine_args, self.frontend_args) if vllm is None: raise ImportError( @@ -219,25 +211,10 @@ def __init__( kv_transfer_config.engine_id = "-".join([engine_id, host, str(port)]) - self.llm_config = llm_config - self.engine_config = VLLMEngineConfig.from_llm_config(llm_config) - + # TODO (Kourosh): What do we do with this stats tracker? self._stats = VLLMEngineStatTracker() - self.running = False - self.model_config: "ModelConfig" = None - # self.engine = None - self.vllm_config: "VllmConfig" = None - - # # Chat template content format (openai or string) - # self._resolved_content_format = None - # # Also need local instance of the tokenizer to manage prompt formatting. - # self._tokenizer = None - - # self._tokenizer_executor = ThreadPoolExecutor(max_workers=1) - # self._atokenize = vllm.utils.make_async( - # self._tokenize, executor=self._tokenizer_executor - # ) - + self._running = False + # vLLM Integration points. Will be set through .start() self._engine_client = None self._oai_models = None @@ -262,56 +239,74 @@ async def start(self): If the engine is already running, do nothing. """ - if self.running: + if self._running: # The engine is already running! logger.info("Skipping engine restart because the engine is already running") return from vllm.entrypoints.openai.api_server import init_app_state - self._engine_client = await self._start_engine_client() + self._engine_client, vllm_config = await self._start_engine_client() from starlette.datastructures import State + + class _Namespace: + def __init__(self, *args): + self.classes = args + + def __getattr__(self, name): + for cls in self.classes: + if hasattr(cls, name): + return getattr(cls, name) + raise AttributeError(f"Attribute {name} not found in {self.classes}") state = State() + args = _Namespace(self._vllm_engine_args, self._vllm_frontend_args) await init_app_state( engine_client=self._engine_client, - vllm_config=self.vllm_config, + vllm_config=vllm_config, state=state, - args=self.namespace_args, + args=args, ) - self._validate_openai_serving_models(state.openai_serving_models) self._oai_models = state.openai_serving_models - self._oai_serving_chat = state.openai_serving_chat self._oai_serving_completion = state.openai_serving_completion self._oai_serving_embedding = state.openai_serving_embedding + + self._validate_openai_serving_models() + self._validate_openai_serving_chat() + - self.running = True + self._running = True logger.info("Started vLLM engine.") - def _validate_openai_serving_models(self, models): - if not hasattr(models, "lora_requests"): + def _validate_openai_serving_models(self): + if not hasattr(self._oai_models, "lora_requests"): raise ValueError("oai_models must have a lora_requests attribute") - if not hasattr(models, "load_lora_adapter"): + if not hasattr(self._oai_models, "load_lora_adapter"): raise ValueError("oai_models must have a load_lora_adapter attribute") - async def _start_engine_client(self) -> "EngineClient": + def _validate_openai_serving_chat(self): + if not hasattr(self._oai_serving_chat, "create_chat_completion"): + raise ValueError("oai_serving_chat must have a create_chat_completion attribute") + + async def _start_engine_client(self) -> Tuple["EngineClient", "VllmConfig"]: ( engine_args, - 
engine_config, + vllm_config, node_initialization, ) = await self._prepare_engine_config() - return self._start_async_llm_engine( + engine_client = self._start_async_llm_engine( engine_args, - engine_config, + vllm_config, node_initialization.placement_group, ) + return engine_client, vllm_config async def _prepare_engine_config(self): """ @@ -326,7 +321,7 @@ async def _prepare_engine_config(self): # TODO: NEEDED for Mistral models node_initialization = await self.initialize_node(self.llm_config) - if self.engine_config.use_gpu: + if self._engine_config.use_gpu: # Create engine config on a task with access to GPU, # as GPU capability may be queried. ref = ( @@ -343,14 +338,11 @@ async def _prepare_engine_config(self): ) .remote(self.llm_config) ) - engine_args, engine_config = ray.get(ref) + engine_args, vllm_config = ray.get(ref) else: - engine_args, engine_config = _get_vllm_engine_config(self.llm_config) + engine_args, vllm_config = _get_vllm_engine_config(self.llm_config) - # Note (genesu): vllm_config is used to extract the scheduler config for - # computing the correct prompt limit. - self.vllm_config = engine_config - return engine_args, engine_config, node_initialization + return engine_args, vllm_config, node_initialization def _start_async_llm_engine( @@ -505,7 +497,7 @@ async def chat( yield response else: logger.info( - f"[Kourosh] non streaming response received, chat_response: {chat_response}" + f"[Kourosh] non streaming response received, type: {type(chat_response)}, chat_response: {chat_response}" ) if isinstance(chat_response, ErrorResponse): yield PatchedErrorResponse( @@ -790,103 +782,103 @@ def _collect_usage_metrics(sampling_params: VLLMSamplingParams) -> None: if sampling_params.logprobs is not None: usage_counters[ArgUsage.LOGPROBS].inc() - def _parse_sampling_params( - self, sampling_params: VLLMSamplingParams - ) -> "VLLMInternalSamplingParams": - """Parse the vllm sampling parameters from the prompt. - This function is used to parse the sampling parameters from the prompt. - It also collects the usage metrics for the sampling parameters. - Args: - sampling_params: The sampling parameters defined in ray.serve.llm. - Returns: - vllm.SamplingParams, The parsed sampling parameters. - """ - self._collect_usage_metrics(sampling_params) - try: - if self.model_config is None: - raise RuntimeError( - "VLLMEngine.model_config not set. Maybe VLLMEngine.start() was not called?" - ) - - log_probs = None - if sampling_params.logprobs: - max_logprobs = getattr(self.model_config, "max_logprobs", 0) - max_logprobs = min(MAX_NUM_TOPLOGPROBS_ALLOWED, max_logprobs) - if max_logprobs == 0: - raise ValueError("This model doesn't support outputting logprobs.") - if sampling_params.top_logprobs: - if not ( - MIN_NUM_TOPLOGPROBS_ALLOWED - <= sampling_params.top_logprobs - <= max_logprobs - ): - raise ValueError( - f"top_logprobs must be between {MIN_NUM_TOPLOGPROBS_ALLOWED} " - f"and {max_logprobs}. Got {sampling_params.top_logprobs}." - ) - log_probs = sampling_params.top_logprobs - else: - log_probs = 1 - else: - if sampling_params.top_logprobs: - raise ValueError( - "if top_logprobs is specified, logprobs must be set to `True`" - ) + # def _parse_sampling_params( + # self, sampling_params: VLLMSamplingParams + # ) -> "VLLMInternalSamplingParams": + # """Parse the vllm sampling parameters from the prompt. + # This function is used to parse the sampling parameters from the prompt. + # It also collects the usage metrics for the sampling parameters. 
+ # Args: + # sampling_params: The sampling parameters defined in ray.serve.llm. + # Returns: + # vllm.SamplingParams, The parsed sampling parameters. + # """ + # self._collect_usage_metrics(sampling_params) + # try: + # if self.model_config is None: + # raise RuntimeError( + # "VLLMEngine.model_config not set. Maybe VLLMEngine.start() was not called?" + # ) - kwargs = dict( - n=1, - best_of=sampling_params.best_of, - presence_penalty=0.0, - frequency_penalty=0.0, - repetition_penalty=1.0, - temperature=1.0, - top_p=1.0, - top_k=-1, - stop=sampling_params.stop, - stop_token_ids=sampling_params.stop_tokens, - ignore_eos=False, - # vLLM will cancel internally if input+output>max_tokens - max_tokens=self.model_config.max_model_len, - logprobs=log_probs, - ) - if sampling_params.presence_penalty is not None: - kwargs["presence_penalty"] = sampling_params.presence_penalty - if sampling_params.frequency_penalty is not None: - kwargs["frequency_penalty"] = sampling_params.frequency_penalty - if sampling_params.repetition_penalty is not None: - kwargs["repetition_penalty"] = sampling_params.repetition_penalty - if sampling_params.temperature is not None: - kwargs["temperature"] = sampling_params.temperature - if sampling_params.top_p is not None: - kwargs["top_p"] = sampling_params.top_p - if sampling_params.top_k is not None: - kwargs["top_k"] = sampling_params.top_k - if sampling_params.ignore_eos is not None: - kwargs["ignore_eos"] = sampling_params.ignore_eos - if sampling_params.max_tokens is not None: - kwargs["max_tokens"] = sampling_params.max_tokens - # If we set it to None, vLLM will throw an exception - # as that is not the default value. Omitting it - # will allow vLLM to generate a new seed internally, - # as expected. - if sampling_params.seed is not None: - kwargs["seed"] = sampling_params.seed - if sampling_params.response_format is not None: - kwargs[ - "guided_decoding" - ] = sampling_params.response_format.to_guided_decoding_params( - backend=RAYLLM_GUIDED_DECODING_BACKEND - ) - if sampling_params.kv_transfer_params is not None: - kwargs["extra_args"] = { - KV_TRANSFER_PARAMS_KEY: sampling_params.kv_transfer_params - } + # log_probs = None + # if sampling_params.logprobs: + # max_logprobs = getattr(self.model_config, "max_logprobs", 0) + # max_logprobs = min(MAX_NUM_TOPLOGPROBS_ALLOWED, max_logprobs) + # if max_logprobs == 0: + # raise ValueError("This model doesn't support outputting logprobs.") + # if sampling_params.top_logprobs: + # if not ( + # MIN_NUM_TOPLOGPROBS_ALLOWED + # <= sampling_params.top_logprobs + # <= max_logprobs + # ): + # raise ValueError( + # f"top_logprobs must be between {MIN_NUM_TOPLOGPROBS_ALLOWED} " + # f"and {max_logprobs}. Got {sampling_params.top_logprobs}." + # ) + # log_probs = sampling_params.top_logprobs + # else: + # log_probs = 1 + # else: + # if sampling_params.top_logprobs: + # raise ValueError( + # "if top_logprobs is specified, logprobs must be set to `True`" + # ) - return vllm.SamplingParams(**kwargs) - except Exception as e: - # Wrap the error in ValidationError so the status code - # returned to the user is correct. 
- raise ValidationError(str(e)) from e + # kwargs = dict( + # n=1, + # best_of=sampling_params.best_of, + # presence_penalty=0.0, + # frequency_penalty=0.0, + # repetition_penalty=1.0, + # temperature=1.0, + # top_p=1.0, + # top_k=-1, + # stop=sampling_params.stop, + # stop_token_ids=sampling_params.stop_tokens, + # ignore_eos=False, + # # vLLM will cancel internally if input+output>max_tokens + # max_tokens=self.model_config.max_model_len, + # logprobs=log_probs, + # ) + # if sampling_params.presence_penalty is not None: + # kwargs["presence_penalty"] = sampling_params.presence_penalty + # if sampling_params.frequency_penalty is not None: + # kwargs["frequency_penalty"] = sampling_params.frequency_penalty + # if sampling_params.repetition_penalty is not None: + # kwargs["repetition_penalty"] = sampling_params.repetition_penalty + # if sampling_params.temperature is not None: + # kwargs["temperature"] = sampling_params.temperature + # if sampling_params.top_p is not None: + # kwargs["top_p"] = sampling_params.top_p + # if sampling_params.top_k is not None: + # kwargs["top_k"] = sampling_params.top_k + # if sampling_params.ignore_eos is not None: + # kwargs["ignore_eos"] = sampling_params.ignore_eos + # if sampling_params.max_tokens is not None: + # kwargs["max_tokens"] = sampling_params.max_tokens + # # If we set it to None, vLLM will throw an exception + # # as that is not the default value. Omitting it + # # will allow vLLM to generate a new seed internally, + # # as expected. + # if sampling_params.seed is not None: + # kwargs["seed"] = sampling_params.seed + # if sampling_params.response_format is not None: + # kwargs[ + # "guided_decoding" + # ] = sampling_params.response_format.to_guided_decoding_params( + # backend=RAYLLM_GUIDED_DECODING_BACKEND + # ) + # if sampling_params.kv_transfer_params is not None: + # kwargs["extra_args"] = { + # KV_TRANSFER_PARAMS_KEY: sampling_params.kv_transfer_params + # } + + # return vllm.SamplingParams(**kwargs) + # except Exception as e: + # # Wrap the error in ValidationError so the status code + # # returned to the user is correct. + # raise ValidationError(str(e)) from e @staticmethod def _extract_logprobs( From 7640a92a65cab5e3f7736bd6db1d0cf4025f683a Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 30 Jun 2025 12:06:40 -0700 Subject: [PATCH 12/37] wip Signed-off-by: Kourosh Hakhamaneshi --- .../serve/deployments/llm/vllm/vllm_engine.py | 97 +++++++------------ .../serve/deployments/llm/vllm/vllm_models.py | 24 ++++- 2 files changed, 57 insertions(+), 64 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index db30538a1c33..3940be75a584 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -82,36 +82,14 @@ r".* (\d+).* is longer than the maximum model length of (\d+).*" ) - -def _get_async_engine_args(llm_config: LLMConfig) -> "AsyncEngineArgs": - engine_config = llm_config.get_engine_config() - - # This `model` is the local path on disk, or the hf model id. - # If it is the hf_model_id, vLLM automatically downloads the correct model from HF. - # We want this to be the local path on the disk when we already downloaded the - # model artifacts from a remote storage during node initialization, - # so vLLM will not require HF token for it and try to download it again. 
- model = engine_config.actual_hf_model_id - if isinstance(llm_config.model_loading_config.model_source, str): - model = llm_config.model_loading_config.model_source - - return vllm.engine.arg_utils.AsyncEngineArgs( - **{ - "model": model, - "distributed_executor_backend": "ray", - "guided_decoding_backend": RAYLLM_GUIDED_DECODING_BACKEND, - "disable_log_stats": False, - **engine_config.get_initialization_kwargs(), - } - ) - - def _get_vllm_engine_config( llm_config: LLMConfig, ) -> Tuple["AsyncEngineArgs", "VllmConfig"]: - async_engine_args = _get_async_engine_args(llm_config) - vllm_config = async_engine_args.create_engine_config() - return async_engine_args, vllm_config + engine_config = llm_config.get_engine_config() + async_engine_args = vllm.engine.arg_utils.AsyncEngineArgs(**engine_config.engine_kwargs) + print(f"[Kourosh] async_engine_args: {engine_config.engine_kwargs}") + vllm_engine_config = async_engine_args.create_engine_config() + return async_engine_args, vllm_engine_config def _clear_current_platform_cache(): @@ -166,11 +144,7 @@ def __init__( # This is critical for models with trust_remote_code=True to avoid pickle errors. init_hf_modules() - # filter out the llm_config.engine_kwargs to those that belong to FrontendArgs and engine_args and decouple them. self.llm_config = llm_config - self._engine_config = llm_config.get_engine_config() - self._vllm_frontend_args = FrontendArgs(**self._engine_config.frontend_kwargs) - self._vllm_engine_args = AsyncEngineArgs(**self._engine_config.engine_kwargs) if vllm is None: @@ -223,7 +197,7 @@ def __init__( self._oai_serving_embedding = None @staticmethod - async def initialize_node(llm_config: LLMConfig) -> InitializeNodeOutput: + async def _initialize_node(llm_config: LLMConfig) -> InitializeNodeOutput: """Run the node initializer. This is separate from `start` so it can run concurrently while starting the engine actor. @@ -245,8 +219,20 @@ async def start(self): return from vllm.entrypoints.openai.api_server import init_app_state - - self._engine_client, vllm_config = await self._start_engine_client() + + + node_initialization = await self._initialize_node(self.llm_config) + ( + vllm_engine_args, + vllm_frontend_args, + vllm_engine_config, + ) = self._prepare_engine_config(node_initialization) + + self._engine_client = self._start_async_llm_engine( + vllm_engine_args, + vllm_engine_config, + node_initialization.placement_group, + ) from starlette.datastructures import State @@ -261,11 +247,11 @@ def __getattr__(self, name): raise AttributeError(f"Attribute {name} not found in {self.classes}") state = State() - args = _Namespace(self._vllm_engine_args, self._vllm_frontend_args) + args = _Namespace(vllm_engine_args, vllm_frontend_args) await init_app_state( engine_client=self._engine_client, - vllm_config=vllm_config, + vllm_config=vllm_engine_config, state=state, args=args, ) @@ -294,34 +280,20 @@ def _validate_openai_serving_chat(self): if not hasattr(self._oai_serving_chat, "create_chat_completion"): raise ValueError("oai_serving_chat must have a create_chat_completion attribute") - async def _start_engine_client(self) -> Tuple["EngineClient", "VllmConfig"]: - ( - engine_args, - vllm_config, - node_initialization, - ) = await self._prepare_engine_config() - - engine_client = self._start_async_llm_engine( - engine_args, - vllm_config, - node_initialization.placement_group, - ) - return engine_client, vllm_config - async def _prepare_engine_config(self): - """ - Prepare the engine config to start the engine. 
+ def _prepare_engine_config(self, node_initialization: InitializeNodeOutput): + """Prepare the engine config to start the engine. Returns: - engine_args: The engine arguments. - engine_config: The engine configuration. - node_initialization: The node initialization. + engine_args: The vLLM's internal engine arguments that is flattened. + frontend_args: The vLLM's internal frontend arguments that is + flattened. + engine_config: The vLLM's internal engine config that is nested. """ - # Initialize node and return all configurations - # TODO: NEEDED for Mistral models - node_initialization = await self.initialize_node(self.llm_config) + + engine_config: VLLMEngineConfig = self.llm_config.get_engine_config() - if self._engine_config.use_gpu: + if engine_config.use_gpu: # Create engine config on a task with access to GPU, # as GPU capability may be queried. ref = ( @@ -338,11 +310,12 @@ async def _prepare_engine_config(self): ) .remote(self.llm_config) ) - engine_args, vllm_config = ray.get(ref) + vllm_engine_args, vllm_engine_config = ray.get(ref) else: - engine_args, vllm_config = _get_vllm_engine_config(self.llm_config) + vllm_engine_args, vllm_engine_config = _get_vllm_engine_config(self.llm_config) - return engine_args, vllm_config, node_initialization + vllm_frontend_args = FrontendArgs(**engine_config.frontend_kwargs) + return vllm_engine_args, vllm_frontend_args, vllm_engine_config def _start_async_llm_engine( diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py index 400947343b0b..405c8e000049 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py @@ -131,8 +131,9 @@ def from_llm_config(cls, llm_config: LLMConfig) -> "VLLMEngineConfig": engine_kwargs[key] = value else: raise ValueError(f"Unknown engine argument: {key}") - engine_kwargs["model"] = hf_model_id - engine_kwargs["served_model_name"] = [llm_config.model_id] + + + VLLMEngineConfig._validate_engine_kwargs(engine_kwargs, hf_model_id, llm_config) return VLLMEngineConfig( model_id=llm_config.model_id, @@ -144,6 +145,25 @@ def from_llm_config(cls, llm_config: LLMConfig) -> "VLLMEngineConfig": frontend_kwargs=frontend_kwargs, runtime_env=llm_config.runtime_env, ) + + @staticmethod + def _validate_engine_kwargs(engine_kwargs: Dict[str, Any], hf_model_id: str, llm_config: LLMConfig): + # Modify the engine_kwargs to match with expectations of Ray Serve LLM Configs. + + if "model" in engine_kwargs or "served_model_name" in engine_kwargs: + raise ValueError("model or served_model_name is not allowed in engine_kwargs when using Ray Serve LLM. Please use `model_loading_config` in LLMConfig instead.") + + engine_kwargs["model"] = hf_model_id or llm_config.model_id + engine_kwargs["served_model_name"] = [llm_config.model_id] + + if "distributed_executor_backend" in engine_kwargs and engine_kwargs["distributed_executor_backend"] != "ray": + raise ValueError("distributed_executor_backend != 'ray' is not allowed in engine_kwargs when using Ray Serve LLM Configs.") + else: + engine_kwargs["distributed_executor_backend"] = "ray" + + if "disable_log_stats" in engine_kwargs and engine_kwargs["disable_log_stats"] != False: + logger.warning("disable_log_stats = True is not allowed in engine_kwargs when using Ray Serve LLM Configs. 
Setting it to False.") + engine_kwargs["disable_log_stats"] = False def ray_accelerator_type(self) -> str: """Converts the accelerator type to the Ray Core format.""" From 8df78df540ce64c4905de0aa83724a620a717a6e Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 30 Jun 2025 12:45:12 -0700 Subject: [PATCH 13/37] wip Signed-off-by: Kourosh Hakhamaneshi --- python/ray/llm/_internal/serve/configs/server_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/llm/_internal/serve/configs/server_models.py b/python/ray/llm/_internal/serve/configs/server_models.py index c1b4972e8590..45a4d37e7781 100644 --- a/python/ray/llm/_internal/serve/configs/server_models.py +++ b/python/ray/llm/_internal/serve/configs/server_models.py @@ -262,7 +262,7 @@ def _set_model_architecture( """ if model_id_or_path: hf_config = transformers.PretrainedConfig.from_pretrained(model_id_or_path) - if hasattr(hf_config, "architectures"): + if hasattr(hf_config, "architectures") and hf_config.architectures: self._model_architecture = hf_config.architectures[0] if model_architecture: From c7d67b5bb8d93fdc08ea6b97a7041b62abc23fab Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 30 Jun 2025 13:10:01 -0700 Subject: [PATCH 14/37] wip Signed-off-by: Kourosh Hakhamaneshi --- .../serve/deployments/llm/vllm/vllm_engine.py | 3 +- .../serve/deployments/llm/vllm/vllm_models.py | 40 +++++++++---------- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index 3940be75a584..faf6c873a644 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -86,8 +86,7 @@ def _get_vllm_engine_config( llm_config: LLMConfig, ) -> Tuple["AsyncEngineArgs", "VllmConfig"]: engine_config = llm_config.get_engine_config() - async_engine_args = vllm.engine.arg_utils.AsyncEngineArgs(**engine_config.engine_kwargs) - print(f"[Kourosh] async_engine_args: {engine_config.engine_kwargs}") + async_engine_args = vllm.engine.arg_utils.AsyncEngineArgs(**engine_config.get_initialization_kwargs()) vllm_engine_config = async_engine_args.create_engine_config() return async_engine_args, vllm_engine_config diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py index 405c8e000049..e30e142bae9b 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py @@ -87,7 +87,24 @@ def get_initialization_kwargs(self) -> dict: Get kwargs that will be actually passed to the LLMInitializer constructor. """ - return self.engine_kwargs.copy() + engine_kwargs = self.engine_kwargs.copy() + + if "model" in engine_kwargs or "served_model_name" in engine_kwargs: + raise ValueError("model or served_model_name is not allowed in engine_kwargs when using Ray Serve LLM. 
Please use `model_loading_config` in LLMConfig instead.") + + engine_kwargs["model"] = self.actual_hf_model_id + engine_kwargs["served_model_name"] = [self.model_id] + + if "distributed_executor_backend" in engine_kwargs and engine_kwargs["distributed_executor_backend"] != "ray": + raise ValueError("distributed_executor_backend != 'ray' is not allowed in engine_kwargs when using Ray Serve LLM Configs.") + else: + engine_kwargs["distributed_executor_backend"] = "ray" + + if "disable_log_stats" in engine_kwargs and engine_kwargs["disable_log_stats"] != False: + logger.warning("disable_log_stats = True is not allowed in engine_kwargs when using Ray Serve LLM Configs. Setting it to False.") + engine_kwargs["disable_log_stats"] = False + + return engine_kwargs def get_runtime_env_with_local_env_vars(self) -> dict: runtime_env = self.runtime_env or {} @@ -132,8 +149,6 @@ def from_llm_config(cls, llm_config: LLMConfig) -> "VLLMEngineConfig": else: raise ValueError(f"Unknown engine argument: {key}") - - VLLMEngineConfig._validate_engine_kwargs(engine_kwargs, hf_model_id, llm_config) return VLLMEngineConfig( model_id=llm_config.model_id, @@ -145,25 +160,8 @@ def from_llm_config(cls, llm_config: LLMConfig) -> "VLLMEngineConfig": frontend_kwargs=frontend_kwargs, runtime_env=llm_config.runtime_env, ) - - @staticmethod - def _validate_engine_kwargs(engine_kwargs: Dict[str, Any], hf_model_id: str, llm_config: LLMConfig): - # Modify the engine_kwargs to match with expectations of Ray Serve LLM Configs. - - if "model" in engine_kwargs or "served_model_name" in engine_kwargs: - raise ValueError("model or served_model_name is not allowed in engine_kwargs when using Ray Serve LLM. Please use `model_loading_config` in LLMConfig instead.") + - engine_kwargs["model"] = hf_model_id or llm_config.model_id - engine_kwargs["served_model_name"] = [llm_config.model_id] - - if "distributed_executor_backend" in engine_kwargs and engine_kwargs["distributed_executor_backend"] != "ray": - raise ValueError("distributed_executor_backend != 'ray' is not allowed in engine_kwargs when using Ray Serve LLM Configs.") - else: - engine_kwargs["distributed_executor_backend"] = "ray" - - if "disable_log_stats" in engine_kwargs and engine_kwargs["disable_log_stats"] != False: - logger.warning("disable_log_stats = True is not allowed in engine_kwargs when using Ray Serve LLM Configs. 
Setting it to False.") - engine_kwargs["disable_log_stats"] = False def ray_accelerator_type(self) -> str: """Converts the accelerator type to the Ray Core format.""" From 0e97923780f9ad49c7fd17c209adb2d5ffa0b08c Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 30 Jun 2025 18:59:49 -0700 Subject: [PATCH 15/37] wip Signed-off-by: Kourosh Hakhamaneshi --- .../serve/deployments/llm/vllm/vllm_engine.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index faf6c873a644..89ed4445c112 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -2,6 +2,9 @@ import re import time import uuid +import argparse +from starlette.datastructures import State + from concurrent.futures.thread import ThreadPoolExecutor from typing import TYPE_CHECKING, AsyncGenerator, List, Optional, Tuple @@ -233,20 +236,12 @@ async def start(self): node_initialization.placement_group, ) - from starlette.datastructures import State - class _Namespace: - def __init__(self, *args): - self.classes = args - - def __getattr__(self, name): - for cls in self.classes: - if hasattr(cls, name): - return getattr(cls, name) - raise AttributeError(f"Attribute {name} not found in {self.classes}") - state = State() - args = _Namespace(vllm_engine_args, vllm_frontend_args) + args = argparse.Namespace( + **vllm_frontend_args.__dict__, + **vllm_engine_args.__dict__, + ) await init_app_state( engine_client=self._engine_client, From 1d74fc91b34f9389401cabafc455d24f4fa0f301 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 30 Jun 2025 22:29:50 -0700 Subject: [PATCH 16/37] simplify by reusing vllm apis Signed-off-by: Kourosh Hakhamaneshi --- .../serve/configs/openai_api_models.py | 769 ++---------------- .../_internal/serve/configs/server_models.py | 28 +- .../serve/deployments/llm/llm_server.py | 331 +------- .../serve/deployments/llm/vllm/vllm_engine.py | 464 +---------- .../serve/deployments/routers/router.py | 42 +- python/ray/serve/llm/openai_api_models.py | 9 +- 6 files changed, 99 insertions(+), 1544 deletions(-) diff --git a/python/ray/llm/_internal/serve/configs/openai_api_models.py b/python/ray/llm/_internal/serve/configs/openai_api_models.py index 0936abb9589b..76a1be4f65a0 100644 --- a/python/ray/llm/_internal/serve/configs/openai_api_models.py +++ b/python/ray/llm/_internal/serve/configs/openai_api_models.py @@ -1,719 +1,61 @@ -""" -Note (genesu): majority of this file is adapted from -- https://github.com/vllm-project/vllm/blob/5095e966069b9e65b7c4c63427e06cebacaad0a0/vllm/entrypoints/openai/protocol.py -- https://github.com/vllm-project/vllm/blob/5095e966069b9e65b7c4c63427e06cebacaad0a0/vllm/entrypoints/chat_utils.py -- https://github.com/openai/openai-python/tree/2e56c8da6f163db00a4ca362020148bb391edca9/src/openai/types/chat - -We patched `ErrorResponse` and `ResponseFormat` to be slightly different from the -original source. 
-""" - - -import time -from argparse import Namespace -from typing import ( - Any, - AsyncGenerator, - Dict, - Iterable, - List, - Literal, - Optional, - TypeVar, - Union, -) +from typing import Union, AsyncGenerator, Optional, Dict, Any, List from pydantic import ( BaseModel, - Field, - model_validator, + ConfigDict, ) -from typing_extensions import Annotated, Required, TypeAlias, TypedDict -from ray.llm._internal.serve.configs.openai_api_models_patch import ( - ErrorResponse, - ResponseFormatType as ResponseFormat, -) -from ray.llm._internal.serve.configs.server_models import ( - LLMConfig, - LLMRawResponse, - ModelData, -) -from ray.serve._private.utils import ( - generate_request_id, -) - -# openai.types.chat aliases. -# We use aliases becasuse openai.types.chat is not installed in the docs build. -# This is a hack to make the docs build pass. -ChatCompletionContentPartInputAudioParam = TypeVar( - "ChatCompletionContentPartInputAudioParam", bound=Any -) -ChatCompletionContentPartRefusalParam = TypeVar( - "ChatCompletionContentPartRefusalParam", bound=Any -) -ChatCompletionMessageToolCallParam = TypeVar( - "ChatCompletionMessageToolCallParam", bound=Any -) -OpenAIChatCompletionContentPartParam = TypeVar( - "OpenAIChatCompletionContentPartParam", bound=Any +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest as vLLMChatCompletionRequest, + ChatCompletionResponse as vLLMChatCompletionResponse, + ChatCompletionStreamResponse as vLLMChatCompletionStreamResponse, + ErrorResponse as vLLMErrorResponse, + CompletionRequest as vLLMCompletionRequest, + CompletionResponse as vLLMCompletionResponse, + CompletionStreamResponse as vLLMCompletionStreamResponse, + EmbeddingCompletionRequest as vLLMEmbeddingCompletionRequest, + EmbeddingChatRequest as vLLMEmbeddingChatRequest, + EmbeddingResponse as vLLMEmbeddingResponse, ) -_LONG_INFO = Namespace(min=-9223372036854775808, max=9223372036854775807) - - -class AudioURL(TypedDict, total=False): - url: Required[str] - """ - Either a URL of the audio or a data URL with base64 encoded audio data. - """ - - -class ChatCompletionContentPartAudioParam(TypedDict, total=False): - audio_url: Required[AudioURL] - - type: Required[Literal["audio_url"]] - """The type of the content part.""" - - -class VideoURL(TypedDict, total=False): - url: Required[str] - """ - Either a URL of the video or a data URL with base64 encoded video data. - """ - - -class ChatCompletionContentPartVideoParam(TypedDict, total=False): - video_url: Required[VideoURL] - - type: Required[Literal["video_url"]] - """The type of the content part.""" - - -class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): - """A simpler version of the param that only accepts a plain image_url. - This is supported by OpenAI API, although it is not documented. - - Example: - { - "image_url": "https://example.com/image.jpg" - } - """ - - image_url: Required[str] - - -class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): - """A simpler version of the param that only accepts a plain audio_url. - - Example: - { - "audio_url": "https://example.com/audio.mp3" - } - """ - - audio_url: Required[str] - - -class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): - """A simpler version of the param that only accepts a plain audio_url. 
- - Example: - { - "video_url": "https://example.com/video.mp4" - } - """ - - video_url: Required[str] - - -# Ref: https://huggingface.co/mistral-community/pixtral-12b -# -# Community version of pixtral uses the key `content` instead of `text` in the content. -# This is to support the "content" content type in the prompt format, as opposite of -# the "text" content from the above which most other model uses. -class ChatCompletionContentPartContentParam(TypedDict, total=False): - content: Required[str] - """The content content.""" - - type: Required[Literal["text"]] - """The type of the content part.""" - - -ChatCompletionContentPartParam: TypeAlias = Union[ - OpenAIChatCompletionContentPartParam, - ChatCompletionContentPartAudioParam, - ChatCompletionContentPartInputAudioParam, - ChatCompletionContentPartVideoParam, - ChatCompletionContentPartRefusalParam, - CustomChatCompletionContentSimpleImageParam, - CustomChatCompletionContentSimpleAudioParam, - CustomChatCompletionContentSimpleVideoParam, - str, -] - - -class ChatCompletionMessageParam(TypedDict, total=False): - """Enables custom roles in the Chat Completion API.""" - - role: Required[str] - """The role of the message's author.""" - - content: Union[str, List[ChatCompletionContentPartParam]] - """The contents of the message.""" - - name: str - """An optional name for the participant. - - Provides the model information to differentiate between participants of the - same role. - """ - - tool_call_id: Optional[str] - """Tool call that this message is responding to.""" - - tool_calls: Optional[Iterable[ChatCompletionMessageToolCallParam]] - """The tool calls generated by the model, such as function calls.""" - - -class StreamOptions(BaseModel): - include_usage: Optional[bool] = True - continuous_usage_stats: Optional[bool] = False - - -class FunctionDefinition(BaseModel): - name: str - description: Optional[str] = None - parameters: Optional[Dict[str, Any]] = None +from typing import TYPE_CHECKING +if TYPE_CHECKING: + from ray.llm._internal.serve.configs.server_models import LLMConfig -class ChatCompletionToolsParam(BaseModel): - type: Literal["function"] = "function" - function: FunctionDefinition -class ChatCompletionNamedFunction(BaseModel): - name: str +class ChatCompletionRequest(vLLMChatCompletionRequest): + pass +class ChatCompletionResponse(vLLMChatCompletionResponse): + pass -class ChatCompletionNamedToolChoiceParam(BaseModel): - function: ChatCompletionNamedFunction - type: Literal["function"] = "function" +class ChatCompletionStreamResponse(vLLMChatCompletionStreamResponse): + pass +class ErrorResponse(vLLMErrorResponse): + pass -class LogitsProcessorConstructor(BaseModel): - qualname: str - args: Optional[List[Any]] = None - kwargs: Optional[Dict[str, Any]] = None +class CompletionRequest(vLLMCompletionRequest): + pass +class CompletionResponse(vLLMCompletionResponse): + pass -LogitsProcessors = List[Union[str, LogitsProcessorConstructor]] +class CompletionStreamResponse(vLLMCompletionStreamResponse): + pass +class EmbeddingCompletionRequest(vLLMEmbeddingCompletionRequest): + pass -class ChatCompletionRequest(BaseModel): - # Ordered by official OpenAI API documentation - # https://platform.openai.com/docs/api-reference/chat/create - messages: Annotated[List[ChatCompletionMessageParam], Field(min_length=1)] - model: str - frequency_penalty: Optional[float] = 0.0 - logit_bias: Optional[Dict[str, float]] = None - logprobs: Optional[bool] = False - top_logprobs: Optional[int] = 0 - # TODO(#9845): remove max_tokens when field is 
removed from OpenAI API - max_tokens: Optional[int] = Field( - default=None, - deprecated="max_tokens is deprecated in favor of the max_completion_tokens field", - ) - max_completion_tokens: Optional[int] = None - n: Optional[int] = 1 - presence_penalty: Optional[float] = 0.0 - response_format: Optional[ResponseFormat] = None - seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) - stop: Optional[Union[str, List[str]]] = Field(default_factory=list) - stream: Optional[bool] = False - stream_options: Optional[StreamOptions] = None - temperature: Optional[float] = None - top_p: Optional[float] = None - tools: Optional[List[ChatCompletionToolsParam]] = None - tool_choice: Optional[ - Union[Literal["none"], Literal["auto"], ChatCompletionNamedToolChoiceParam] - ] = "none" - - # NOTE this will be ignored by vLLM -- the model determines the behavior - parallel_tool_calls: Optional[bool] = False - user: Optional[str] = None - - # doc: begin-chat-completion-sampling-params - best_of: Optional[int] = None - use_beam_search: bool = False - top_k: Optional[int] = None - min_p: Optional[float] = None - repetition_penalty: Optional[float] = None - length_penalty: float = 1.0 - stop_token_ids: Optional[List[int]] = Field(default_factory=list) - include_stop_str_in_output: bool = False - ignore_eos: bool = False - min_tokens: int = 0 - skip_special_tokens: bool = True - spaces_between_special_tokens: bool = True - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None - prompt_logprobs: Optional[int] = None - # doc: end-chat-completion-sampling-params - - # doc: begin-chat-completion-extra-params - echo: bool = Field( - default=False, - description=( - "If true, the new message will be prepended with the last message " - "if they belong to the same role." - ), - ) - add_generation_prompt: bool = Field( - default=True, - description=( - "If true, the generation prompt will be added to the chat template. " - "This is a parameter used by chat template in tokenizer config of the " - "model." - ), - ) - continue_final_message: bool = Field( - default=False, - description=( - "If this is set, the chat will be formatted so that the final " - "message in the chat is open-ended, without any EOS tokens. The " - "model will continue this message rather than starting a new one. " - 'This allows you to "prefill" part of the model\'s response for it. ' - "Cannot be used at the same time as `add_generation_prompt`." - ), - ) - add_special_tokens: bool = Field( - default=False, - description=( - "If true, special tokens (e.g. BOS) will be added to the prompt " - "on top of what is added by the chat template. " - "For most models, the chat template takes care of adding the " - "special tokens so this should be set to false (as is the " - "default)." - ), - ) - documents: Optional[List[Dict[str, str]]] = Field( - default=None, - description=( - "A list of dicts representing documents that will be accessible to " - "the model if it is performing RAG (retrieval-augmented generation)." - " If the template does not support RAG, this argument will have no " - "effect. We recommend that each document should be a dict containing " - '"title" and "text" keys.' - ), - ) - chat_template: Optional[str] = Field( - default=None, - description=( - "A Jinja template to use for this conversion. " - "As of transformers v4.44, default chat template is no longer " - "allowed, so you must provide a chat template if the tokenizer " - "does not define one." 
- ), - ) - chat_template_kwargs: Optional[Dict[str, Any]] = Field( - default=None, - description=( - "Additional kwargs to pass to the template renderer. " - "Will be accessible by the chat template." - ), - ) - guided_json: Optional[Union[str, dict, BaseModel]] = Field( - default=None, - description=("If specified, the output will follow the JSON schema."), - ) - guided_regex: Optional[str] = Field( - default=None, - description=("If specified, the output will follow the regex pattern."), - ) - guided_choice: Optional[List[str]] = Field( - default=None, - description=("If specified, the output will be exactly one of the choices."), - ) - guided_grammar: Optional[str] = Field( - default=None, - description=("If specified, the output will follow the context free grammar."), - ) - guided_decoding_backend: Optional[str] = Field( - default=None, - description=( - "If specified, will override the default guided decoding backend " - "of the server for this specific request. If set, must be either " - "'outlines' / 'lm-format-enforcer'" - ), - ) - guided_whitespace_pattern: Optional[str] = Field( - default=None, - description=( - "If specified, will override the default whitespace pattern " - "for guided json decoding." - ), - ) - priority: int = Field( - default=0, - description=( - "The priority of the request (lower means earlier handling; " - "default: 0). Any priority other than 0 will raise an error " - "if the served model does not use priority scheduling." - ), - ) - request_id: str = Field( - default_factory=lambda: f"{generate_request_id()}", - description=( - "The request_id related to this request. If the caller does " - "not set it, a generate_request_id will be generated. This id is used " - "through out the inference process and return in response." - ), - ) - logits_processors: Optional[LogitsProcessors] = Field( - default=None, - description=( - "A list of either qualified names of logits processors, or " - "constructor objects, to apply when sampling. A constructor is " - "a JSON object with a required 'qualname' field specifying the " - "qualified name of the processor class/factory, and optional " - "'args' and 'kwargs' fields containing positional and keyword " - "arguments. For example: {'qualname': " - "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': " - "{'param': 'value'}}." 
- ), - ) +class EmbeddingChatRequest(vLLMEmbeddingChatRequest): + pass - # doc: end-chat-completion-extra-params - - -class CompletionRequest(BaseModel): - # Ordered by official OpenAI API documentation - # https://platform.openai.com/docs/api-reference/completions/create - model: str - prompt: Union[List[int], List[List[int]], str, List[str]] - best_of: Optional[int] = None - echo: Optional[bool] = False - frequency_penalty: Optional[float] = 0.0 - logit_bias: Optional[Dict[str, float]] = None - logprobs: Optional[int] = None - max_tokens: Optional[int] = 16 - n: int = 1 - presence_penalty: Optional[float] = 0.0 - seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) - stop: Optional[Union[str, List[str]]] = Field(default_factory=list) - stream: Optional[bool] = False - stream_options: Optional[StreamOptions] = None - suffix: Optional[str] = None - temperature: Optional[float] = None - top_p: Optional[float] = None - user: Optional[str] = None - - # doc: begin-completion-sampling-params - use_beam_search: bool = False - top_k: Optional[int] = None - min_p: Optional[float] = None - repetition_penalty: Optional[float] = None - length_penalty: float = 1.0 - stop_token_ids: Optional[List[int]] = Field(default_factory=list) - include_stop_str_in_output: bool = False - ignore_eos: bool = False - min_tokens: int = 0 - skip_special_tokens: bool = True - spaces_between_special_tokens: bool = True - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None - allowed_token_ids: Optional[List[int]] = None - prompt_logprobs: Optional[int] = None - # doc: end-completion-sampling-params - - # doc: begin-completion-extra-params - add_special_tokens: bool = Field( - default=True, - description=( - "If true (the default), special tokens (e.g. BOS) will be added to " - "the prompt." - ), - ) - response_format: Optional[ResponseFormat] = Field( - default=None, - description=( - "Similar to chat completion, this parameter specifies the format of " - "output. Only {'type': 'json_object'}, {'type': 'json_schema'} or " - "{'type': 'text' } is supported." - ), - ) - guided_json: Optional[Union[str, dict, BaseModel]] = Field( - default=None, - description="If specified, the output will follow the JSON schema.", - ) - guided_regex: Optional[str] = Field( - default=None, - description=("If specified, the output will follow the regex pattern."), - ) - guided_choice: Optional[List[str]] = Field( - default=None, - description=("If specified, the output will be exactly one of the choices."), - ) - guided_grammar: Optional[str] = Field( - default=None, - description=("If specified, the output will follow the context free grammar."), - ) - guided_decoding_backend: Optional[str] = Field( - default=None, - description=( - "If specified, will override the default guided decoding backend " - "of the server for this specific request. If set, must be one of " - "'outlines' / 'lm-format-enforcer'" - ), - ) - guided_whitespace_pattern: Optional[str] = Field( - default=None, - description=( - "If specified, will override the default whitespace pattern " - "for guided json decoding." - ), - ) - priority: int = Field( - default=0, - description=( - "The priority of the request (lower means earlier handling; " - "default: 0). Any priority other than 0 will raise an error " - "if the served model does not use priority scheduling." 
- ), - ) - logits_processors: Optional[LogitsProcessors] = Field( - default=None, - description=( - "A list of either qualified names of logits processors, or " - "constructor objects, to apply when sampling. A constructor is " - "a JSON object with a required 'qualname' field specifying the " - "qualified name of the processor class/factory, and optional " - "'args' and 'kwargs' fields containing positional and keyword " - "arguments. For example: {'qualname': " - "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': " - "{'param': 'value'}}." - ), - ) - - # doc: end-completion-extra-params - - -class FunctionCall(BaseModel): - name: str - arguments: str - - -class ToolCall(BaseModel): - id: str = Field(default_factory=lambda: f"chatcmpl-tool-{generate_request_id()}") - type: Literal["function"] = "function" - function: FunctionCall - - -class ChatMessage(BaseModel): - role: str - reasoning_content: Optional[str] = None - content: Optional[str] = None - tool_calls: List[ToolCall] = Field(default_factory=list) - - -class ChatCompletionLogProb(BaseModel): - token: str - logprob: float = -9999.0 - bytes: Optional[List[int]] = None - - -class ChatCompletionLogProbsContent(ChatCompletionLogProb): - top_logprobs: List[ChatCompletionLogProb] = Field(default_factory=list) - - -class ChatCompletionLogProbs(BaseModel): - content: Optional[List[ChatCompletionLogProbsContent]] = None - - -class ChatCompletionResponseChoice(BaseModel): - index: int - message: ChatMessage - logprobs: Optional[ChatCompletionLogProbs] = None - # per OpenAI spec this is the default - finish_reason: Optional[str] = "stop" - # not part of the OpenAI spec but included in vLLM for legacy reasons - stop_reason: Optional[Union[int, str]] = None - - -class DeltaFunctionCall(BaseModel): - name: Optional[str] = None - arguments: Optional[str] = None - - -class DeltaToolCall(BaseModel): - id: str = Field(default_factory=lambda: f"chatcmpl-tool-{generate_request_id()}") - type: Literal["function"] = "function" - index: int - function: Optional[DeltaFunctionCall] = None - - -class DeltaMessage(BaseModel): - role: Optional[str] = None - content: Optional[str] = None - reasoning_content: Optional[str] = None - tool_calls: List[DeltaToolCall] = Field(default_factory=list) - - @model_validator(mode="after") - def _non_null_content(self): - self.content = self.content or "" - return self - - -class ChatCompletionResponseStreamChoice(BaseModel): - index: int - delta: DeltaMessage - logprobs: Optional[ChatCompletionLogProbs] = None - finish_reason: Optional[str] = None - stop_reason: Optional[Union[int, str]] = None - - -class PromptTokenUsageInfo(BaseModel): - cached_tokens: Optional[int] = None - - -class UsageInfo(BaseModel): - prompt_tokens: int = 0 - total_tokens: int = 0 - completion_tokens: Optional[int] = 0 - prompt_tokens_details: Optional[PromptTokenUsageInfo] = None - - -class Logprob(BaseModel): - """Infos for supporting OpenAI compatible logprobs and token ranks. 
- - Attributes: - logprob: The logprob of chosen token - rank: The vocab rank of chosen token (>=1) - decoded_token: The decoded chosen token index - """ - - logprob: float - rank: Optional[int] = None - decoded_token: Optional[str] = None - - -class ChatCompletionStreamResponse(BaseModel): - id: str = Field(default_factory=lambda: f"chatcmpl-{generate_request_id()}") - object: Literal["chat.completion.chunk"] = "chat.completion.chunk" - created: int = Field(default_factory=lambda: int(time.time())) - model: str - choices: List[ChatCompletionResponseStreamChoice] - usage: Optional[UsageInfo] = Field(default=None) - - -class ChatCompletionResponse(BaseModel): - id: str = Field(default_factory=lambda: f"chatcmpl-{generate_request_id()}") - object: Literal["chat.completion"] = "chat.completion" - created: int = Field(default_factory=lambda: int(time.time())) - model: str - choices: List[ChatCompletionResponseChoice] - usage: UsageInfo - prompt_logprobs: Optional[List[Optional[Dict[int, Logprob]]]] = None - - -class CompletionLogProbs(BaseModel): - text_offset: List[int] = Field(default_factory=list) - token_logprobs: List[Optional[float]] = Field(default_factory=list) - tokens: List[str] = Field(default_factory=list) - top_logprobs: List[Optional[Dict[str, float]]] = Field(default_factory=list) - - -class CompletionResponseChoice(BaseModel): - index: int - text: str - logprobs: Optional[CompletionLogProbs] = None - finish_reason: Optional[str] = None - stop_reason: Optional[Union[int, str]] = Field( - default=None, - description=( - "The stop string or token id that caused the completion " - "to stop, None if the completion finished for some other reason " - "including encountering the EOS token" - ), - ) - prompt_logprobs: Optional[List[Optional[Dict[int, Logprob]]]] = None - - -class CompletionResponse(BaseModel): - id: str = Field(default_factory=lambda: f"cmpl-{generate_request_id()}") - object: str = "text_completion" - created: int = Field(default_factory=lambda: int(time.time())) - model: str - choices: List[CompletionResponseChoice] - usage: UsageInfo - - -class CompletionResponseStreamChoice(BaseModel): - index: int - text: str - logprobs: Optional[CompletionLogProbs] = None - finish_reason: Optional[str] = None - stop_reason: Optional[Union[int, str]] = Field( - default=None, - description=( - "The stop string or token id that caused the completion " - "to stop, None if the completion finished for some other reason " - "including encountering the EOS token" - ), - ) - - -class CompletionStreamResponse(BaseModel): - id: str = Field(default_factory=lambda: f"cmpl-{generate_request_id()}") - object: str = "text_completion" - created: int = Field(default_factory=lambda: int(time.time())) - model: str - choices: List[CompletionResponseStreamChoice] - usage: Optional[UsageInfo] = Field(default=None) - - -class EmbeddingCompletionRequest(BaseModel): - model: Optional[str] = None - input: Union[List[int], List[List[int]], str, List[str]] - encoding_format: Literal["float", "base64"] = "float" - dimensions: Optional[int] = None - user: Optional[str] = None - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None - - additional_data: Optional[Any] = None - add_special_tokens: bool = Field( - default=True, - description=( - "If true (the default), special tokens (e.g. BOS) will be added to " - "the prompt." - ), - ) - priority: int = Field( - default=0, - description=( - "The priority of the request (lower means earlier handling; " - "default: 0). 
Any priority other than 0 will raise an error " - "if the served model does not use priority scheduling." - ), - ) - - -EmbeddingRequest = EmbeddingCompletionRequest - - -class EmbeddingResponseData(BaseModel): - index: int - object: str = "embedding" - embedding: Union[List[float], str] - - -class EmbeddingResponse(BaseModel): - id: str = Field(default_factory=lambda: f"embd-{generate_request_id()}") - object: str = "list" - created: int = Field(default_factory=lambda: int(time.time())) - model: str - data: List[EmbeddingResponseData] - usage: UsageInfo +class EmbeddingResponse(vLLMEmbeddingResponse): + pass +EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest] LLMEmbeddingsResponse = Union[ AsyncGenerator[Union[EmbeddingResponse, ErrorResponse], None], @@ -731,7 +73,7 @@ class EmbeddingResponse(BaseModel): ], ] - +# TODO: remove this class class OpenAIHTTPException(Exception): def __init__( self, @@ -745,21 +87,32 @@ def __init__( self.type = type self.internal_message = internal_message - @classmethod - def from_model_response(cls, response: LLMRawResponse) -> "OpenAIHTTPException": - return cls( - status_code=response.error.code, - message=response.error.message, - type=response.error.type, - internal_message=response.error.internal_message, - ) + +# TODO: upstream metadata for ModelData +# Compared to vLLM this has a metadata field. +class ModelCard(BaseModel): + model_config = ConfigDict(protected_namespaces=tuple()) + + id: str + object: str + owned_by: str + permission: List[str] + metadata: Dict[str, Any] + + @property + def model_type(self) -> str: + return self.metadata["engine_config"]["model_type"] + +class ModelList(BaseModel): + data: List[ModelCard] + object: str = "list" def to_model_metadata( model_id: str, - model_config: LLMConfig, + model_config: "LLMConfig", overrides: Optional[Dict[str, Any]] = None, -): +) -> ModelCard: """Creates an OpenAI-compatible ModelData object. 
Args: @@ -779,10 +132,10 @@ def to_model_metadata( if overrides: metadata.update(overrides) - return ModelData( + return ModelCard( id=model_id, - rayllm_metadata=metadata, object="model", owned_by="organization-owner", permission=[], + metadata=metadata, ) diff --git a/python/ray/llm/_internal/serve/configs/server_models.py b/python/ray/llm/_internal/serve/configs/server_models.py index 45a4d37e7781..c8fd87c93bc4 100644 --- a/python/ray/llm/_internal/serve/configs/server_models.py +++ b/python/ray/llm/_internal/serve/configs/server_models.py @@ -45,6 +45,9 @@ ErrorResponse, ResponseFormatType, ) +from ray.llm._internal.serve.configs.openai_api_models import ( + ModelCard, +) from ray.llm._internal.serve.configs.prompt_formats import ( Prompt, ) @@ -572,31 +575,6 @@ def parse_args(self) -> "LLMServingArgs": return LLMServingArgs(llm_configs=llm_configs) -TModel = TypeVar("TModel", bound="Model") - - -class ModelData(BaseModel): - model_config = ConfigDict(protected_namespaces=tuple()) - - id: str - object: str - owned_by: str - permission: List[str] - rayllm_metadata: Dict[str, Any] - - @property - def model_type(self) -> str: - return self.rayllm_metadata["engine_config"]["model_type"] - - -class Model(BaseModel): - data: List[ModelData] - object: str = "list" - - @classmethod - def list(cls) -> TModel: - pass - class FinishReason(str, Enum): LENGTH = "length" diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py index 9430a064f9f5..d96103adccb1 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py @@ -1,7 +1,7 @@ import asyncio import os from abc import ABC, abstractmethod -from typing import Any, AsyncGenerator, Dict, Optional, Type, Union +from typing import Any, Dict, Optional, Type # Third-party imports from ray import serve @@ -16,51 +16,23 @@ RAYLLM_VLLM_ENGINE_CLS_ENV, ) from ray.llm._internal.serve.configs.openai_api_models import ( - ChatCompletionLogProb, - ChatCompletionLogProbs, - ChatCompletionLogProbsContent, - # ChatCompletionRequest, - ChatCompletionResponse, - ChatCompletionResponseChoice, - ChatCompletionResponseStreamChoice, - ChatCompletionStreamResponse, - ChatMessage, + ChatCompletionRequest, CompletionRequest, - CompletionResponse, - CompletionResponseChoice, - CompletionResponseStreamChoice, - CompletionStreamResponse, - DeltaMessage, EmbeddingRequest, - EmbeddingResponse, - EmbeddingResponseData, LLMChatResponse, LLMCompletionsResponse, LLMEmbeddingsResponse, - UsageInfo, ) -from vllm.entrypoints.openai.protocol import ChatCompletionRequest -from ray.llm._internal.serve.configs.prompt_formats import Message, Prompt from ray.llm._internal.serve.configs.server_models import ( - DiskMultiplexConfig, LLMConfig, - LLMRawResponse, ) from ray.llm._internal.serve.deployments.llm.llm_engine import LLMEngine -from ray.llm._internal.serve.deployments.llm.multiplex.lora_model_loader import ( - LoraModelLoader, -) from ray.llm._internal.serve.deployments.llm.vllm.vllm_engine import VLLMEngine from ray.llm._internal.serve.deployments.llm.vllm.vllm_models import ( VLLMEmbeddingRequest, ) from ray.llm._internal.serve.deployments.utils.batcher import OpenAIResponseBatcher -from ray.llm._internal.serve.deployments.utils.error_handling_utils import ( - StreamingErrorHandler, -) from ray.llm._internal.serve.deployments.utils.server_utils import ( - get_model_request_id, - get_response_for_error, 
get_serve_request_id, ) from ray.llm._internal.serve.observability.logging import get_logger @@ -110,303 +82,6 @@ async def llm_config(self) -> Optional[LLMConfig]: return None -class ResponsePostprocessor: - """Processes raw LLM responses into OpenAI-compatible formats. - - This class handles: - 1. Error handling for the response stream - 2. Converting LLMRawResponse to Chat/Completion API formats - 3. Supporting both streaming and non-streaming responses - """ - - def __init__(self): - self.metrics_wrapper = StreamingErrorHandler() - - async def handle_failure( - self, model: str, gen: AsyncGenerator[LLMRawResponse, None] - ) -> AsyncGenerator[LLMRawResponse, None]: - async for llm_response in self.metrics_wrapper.handle_failure(model, gen): - yield llm_response - - @staticmethod - async def merge_stream( - response_stream: AsyncGenerator[LLMRawResponse, None] - ) -> LLMRawResponse: - responses = [resp async for resp in response_stream] - return LLMRawResponse.merge_stream(*responses) - - async def process_chat( - self, model: str, gen: AsyncGenerator[LLMRawResponse, None], stream: bool - ) -> LLMChatResponse: - """Process raw LLM responses into chat completion format.""" - gen = self.handle_failure(model=model, gen=gen) - request_id = get_serve_request_id() - completion_id = get_model_request_id(model) - - if stream: - # Stream processing - preserve batching from generator - yielded_role = False - all_results = [] - try: - async for batched_results in gen: - - for result in batched_results.unpack(): - all_results.append(result) - - # Handle errors - if result.error: - logger.error(f"{result.error}") - # Drop finish reason as OpenAI doesn't expect it for errors - result.finish_reason = None - all_results.pop() - yield result.error - return - - finish_reason = result.finish_reason - - # Send role message first - if not yielded_role: - yield ChatCompletionStreamResponse( - id=completion_id, - model=model, - choices=[ - ChatCompletionResponseStreamChoice( - delta=DeltaMessage(role="assistant"), - index=0, - finish_reason=None, - logprobs=ChatCompletionLogProbs(content=[]), - ) - ], - usage=None, - ) - yielded_role = True - - # Process logprobs if present - logprobs = None - if result.logprobs: - logprobs = ChatCompletionLogProbs( - content=[ - ChatCompletionLogProbsContent( - token=logprobs.token, - logprob=logprobs.logprob, - bytes=logprobs.bytes, - top_logprobs=[ - ChatCompletionLogProb( - token=logprob.token, - logprob=logprob.logprob, - bytes=logprob.bytes, - ) - for logprob in logprobs.top_logprobs - ], - ) - for logprobs in result.logprobs - ] - ) - - yield ChatCompletionStreamResponse( - id=completion_id, - model=model, - choices=[ - ChatCompletionResponseStreamChoice( - delta=DeltaMessage( - content=result.generated_text or "" - ), - index=0, - finish_reason=None, - logprobs=logprobs, - ) - ], - usage=None, - ) - - # Send final message with finish_reason if there were any results - # TODO (Kourosh): Doing this much for the last token - # (usage token) might add extra overhead to ITL of the last token. - # We should find a better way to do this. 
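Aside on the TODO above (this whole postprocessor is deleted in this commit): the cheaper pattern it hints at is to accumulate usage while streaming instead of merging every chunk at the end. A rough, illustrative sketch using the LLMRawResponse field names seen in this file:

    from typing import Any, AsyncGenerator

    async def stream_with_running_usage(
        results: AsyncGenerator[Any, None],
    ) -> AsyncGenerator[Any, None]:
        # Keep running counters so the final usage chunk does not require
        # re-merging the full list of per-token responses.
        prompt_tokens = 0
        completion_tokens = 0
        async for chunk in results:
            prompt_tokens = chunk.num_input_tokens or prompt_tokens
            completion_tokens += chunk.num_generated_tokens or 0
            yield chunk
        yield {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        }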
- if all_results: - merged_results = LLMRawResponse.merge_stream(*all_results) - finish_reason = merged_results.finish_reason - usage = UsageInfo( - prompt_tokens=merged_results.num_input_tokens or 0, - completion_tokens=merged_results.num_generated_tokens or 0, - total_tokens=(merged_results.num_input_tokens or 0) - + (merged_results.num_generated_tokens or 0), - ) - - yield ChatCompletionStreamResponse( - id=completion_id, - model=model, - choices=[ - ChatCompletionResponseStreamChoice( - delta=DeltaMessage(), - index=0, - finish_reason=finish_reason, - ) - ], - usage=usage, - ) - except Exception as e: - logger.error( - f"Failed while handling chat-completions for request ({request_id}): {repr(e)}", - exc_info=e, - ) - yield get_response_for_error(e, request_id).error - else: - # Non-streaming processing - merge and return a single response - try: - results: LLMRawResponse = await self.merge_stream(gen) - if results.error: - yield results.error - return - - logprobs = None - if results.logprobs: - logprobs = ChatCompletionLogProbs( - content=[ - ChatCompletionLogProbsContent( - token=logprobs.token, - logprob=logprobs.logprob, - bytes=logprobs.bytes, - top_logprobs=[ - ChatCompletionLogProb( - token=logprob.token, - logprob=logprob.logprob, - bytes=logprob.bytes, - ) - for logprob in logprobs.top_logprobs - ], - ) - for logprobs in results.logprobs - ] - ) - - yield ChatCompletionResponse( - id=completion_id, - model=model, - choices=[ - ChatCompletionResponseChoice( - message=ChatMessage( - role="assistant", - content=results.generated_text or "", - ), - index=0, - finish_reason=results.finish_reason, - logprobs=logprobs, - ) - ], - usage=UsageInfo( - prompt_tokens=results.num_input_tokens or 0, - completion_tokens=results.num_generated_tokens or 0, - total_tokens=(results.num_input_tokens or 0) - + (results.num_generated_tokens or 0), - ), - ) - except Exception as e: - logger.error( - f"Failed while handling chat-completions for request ({request_id}): {repr(e)}", - exc_info=e, - ) - yield get_response_for_error(e, request_id).error - - async def process_completions( - self, model: str, gen: AsyncGenerator[LLMRawResponse, None], stream: bool - ) -> LLMCompletionsResponse: - """Process raw LLM responses into completions format.""" - gen = self.handle_failure(model=model, gen=gen) - request_id = get_serve_request_id() - completion_id = get_model_request_id(model) - - if stream: - # Stream processing - preserve batching from generator - all_results = [] - try: - async for batched_results in gen: - - for result in batched_results.unpack(): - all_results.append(result) - - # Handle errors - if result.error: - # Drop finish reason as OpenAI doesn't expect it for errors - result.finish_reason = None - logger.error( - f"Reporting back an error: {result.error}", - extra={ - "ray_serve_extra_fields": {"response": str(result)} - }, - ) - all_results.pop() - yield result.error - return - - # Calculate usage if finished - usage = None - if result.finish_reason: - merged_results = LLMRawResponse.merge_stream(*all_results) - usage = UsageInfo( - prompt_tokens=merged_results.num_input_tokens or 0, - completion_tokens=merged_results.num_generated_tokens - or 0, - total_tokens=(merged_results.num_input_tokens or 0) - + (merged_results.num_generated_tokens or 0), - ) - - chunk = CompletionStreamResponse( - id=completion_id, - model=model, - choices=[ - CompletionResponseStreamChoice( - text=result.generated_text or "", - index=0, - logprobs={}, - finish_reason=result.finish_reason, - ) - ], - 
usage=usage, - ) - - yield chunk - - except Exception as e: - logger.error( - f"Failed while handling completions for request ({request_id}): {repr(e)}", - exc_info=e, - ) - yield get_response_for_error(e, request_id).error - else: - # Non-streaming processing - merge and return a single response - try: - results: LLMRawResponse = await self.merge_stream(gen) - if results.error: - yield results.error - return - - yield CompletionResponse( - id=completion_id, - model=model, - choices=[ - CompletionResponseChoice( - text=results.generated_text or "", - index=0, - logprobs={}, - finish_reason=results.finish_reason, - ) - ], - usage=UsageInfo( - prompt_tokens=results.num_input_tokens or 0, - completion_tokens=results.num_generated_tokens or 0, - total_tokens=(results.num_input_tokens or 0) - + (results.num_generated_tokens or 0), - ), - ) - except Exception as e: - logger.error( - f"Failed while handling completions for request ({request_id}): {repr(e)}", - exc_info=e, - ) - yield get_response_for_error(e, request_id).error - - class LLMServer(_LLMServerBase): """This is a shm layer to decouple the LLM engine from the ingress deployment. @@ -595,7 +270,7 @@ async def llm_config(self) -> Optional[LLMConfig]: @classmethod def as_deployment( - cls, deployment_options: Dict[str, Any] = None + cls, deployment_options: Optional[Dict[str, Any]] = None ) -> serve.Deployment: """Convert the LLMServer to a Ray Serve deployment. diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index 89ed4445c112..e2e887cf0b6c 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -1,34 +1,19 @@ import os -import re -import time import uuid import argparse from starlette.datastructures import State -from concurrent.futures.thread import ThreadPoolExecutor -from typing import TYPE_CHECKING, AsyncGenerator, List, Optional, Tuple +from typing import TYPE_CHECKING, AsyncGenerator, List, Tuple import ray from ray.llm._internal.common.utils.import_utils import try_import from ray.llm._internal.serve.configs.constants import ( - MAX_NUM_TOPLOGPROBS_ALLOWED, - MIN_NUM_TOPLOGPROBS_ALLOWED, RAYLLM_ENABLE_REQUEST_PROMPT_LOGS, - RAYLLM_GUIDED_DECODING_BACKEND, -) -from ray.llm._internal.serve.configs.error_handling import ( - InputTooLong, - ValidationError, ) from ray.llm._internal.serve.configs.server_models import ( DiskMultiplexConfig, - FinishReason, GenerationRequest, LLMConfig, - LLMRawResponse, - LogProb, - LogProbs, - Prompt, ) from transformers.dynamic_module_utils import init_hf_modules @@ -39,51 +24,32 @@ usage_counters, ) from ray.llm._internal.serve.deployments.llm.vllm.vllm_models import ( - KV_TRANSFER_PARAMS_KEY, VLLMEmbeddingRequest, VLLMEngineConfig, - VLLMGenerationRequest, VLLMSamplingParams, ) from ray.llm._internal.serve.deployments.utils.node_initialization_utils import ( InitializeNodeOutput, - initialize_node as initialize_node_util, + initialize_node, ) from ray.llm._internal.serve.deployments.utils.server_utils import floats_to_base64 from ray.llm._internal.serve.observability.logging import get_logger -from ray.llm._internal.serve.observability.metrics.utils import ( - LONG_RANGE_LATENCY_HISTOGRAM_BUCKETS_MS, - ClockUnit, - MsClock, -) from ray.util import metrics from ray.util.placement_group import PlacementGroup from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from 
vllm.entrypoints.openai.cli_args import FrontendArgs from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.entrypoints.openai.protocol import ErrorResponse -from ray.llm._internal.serve.configs.openai_api_models_patch import ( - ErrorResponse as PatchedErrorResponse, -) +from vllm.entrypoints.openai.protocol import ErrorResponse as VLLMErrorResponse + if TYPE_CHECKING: - from vllm import SamplingParams as VLLMInternalSamplingParams - from vllm.config import ModelConfig, VllmConfig + from vllm.config import VllmConfig from vllm.engine.protocol import EngineClient - from vllm.outputs import PoolingRequestOutput, RequestOutput + from vllm.outputs import PoolingRequestOutput vllm = try_import("vllm") logger = get_logger(__name__) -time_in_queue_histogram = metrics.Histogram( - "vllm_engine_stats_time_in_queue_ms", - "Time a request spends in the queue first forward pass not included (ms).", - boundaries=LONG_RANGE_LATENCY_HISTOGRAM_BUCKETS_MS, -) - -V1_TOO_LONG_PATTERN = re.compile( - r".* (\d+).* is longer than the maximum model length of (\d+).*" -) def _get_vllm_engine_config( llm_config: LLMConfig, @@ -92,7 +58,7 @@ def _get_vllm_engine_config( async_engine_args = vllm.engine.arg_utils.AsyncEngineArgs(**engine_config.get_initialization_kwargs()) vllm_engine_config = async_engine_args.create_engine_config() return async_engine_args, vllm_engine_config - + def _clear_current_platform_cache(): """Clear the cache of the current platform. @@ -127,8 +93,6 @@ def _clear_current_platform_cache(): - - class VLLMEngine(LLMEngine): def __init__( self, @@ -198,18 +162,8 @@ def __init__( self._oai_serving_completion = None self._oai_serving_embedding = None - @staticmethod - async def _initialize_node(llm_config: LLMConfig) -> InitializeNodeOutput: - """Run the node initializer. - - This is separate from `start` so it can run concurrently while starting the engine actor. - It's a static method so it can be overridden for testing. - """ - return await initialize_node_util(llm_config) - - - async def start(self): + async def start(self) -> None: """Start the vLLM engine. If the engine is already running, do nothing. 
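For orientation, the end state these hunks build toward: construct AsyncEngineArgs from the Serve config, create the vLLM config, then let vLLM's init_app_state populate its own OpenAI serving objects, which the Ray Serve layer simply delegates to. A minimal sketch of that wiring with assumed kwargs and helper names (not the exact Ray Serve code):

    import argparse

    from starlette.datastructures import State
    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.entrypoints.openai.api_server import init_app_state


    async def build_openai_handlers(engine_client, engine_kwargs: dict, frontend_kwargs: dict):
        # Engine-side config, mirroring _get_vllm_engine_config above.
        engine_args = AsyncEngineArgs(**engine_kwargs)
        vllm_config = engine_args.create_engine_config()

        # vLLM's CLI normally yields one argparse.Namespace; merging the two
        # dicts imitates that shape (assumes disjoint keys, as in the patch).
        args = argparse.Namespace(**frontend_kwargs, **engine_kwargs)

        state = State()
        await init_app_state(
            engine_client=engine_client,
            vllm_config=vllm_config,
            state=state,
            args=args,
        )
        # These are the handlers the engine wrapper stores and delegates to.
        return (
            state.openai_serving_chat,
            state.openai_serving_completion,
            state.openai_serving_embedding,
        )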
@@ -223,7 +177,7 @@ async def start(self): from vllm.entrypoints.openai.api_server import init_app_state - node_initialization = await self._initialize_node(self.llm_config) + node_initialization = await initialize_node(self.llm_config) ( vllm_engine_args, vllm_frontend_args, @@ -360,76 +314,9 @@ async def resolve_lora(self, disk_lora_model: DiskMultiplexConfig): ) ) - if isinstance(lora_request, ErrorResponse): + if isinstance(lora_request, VLLMErrorResponse): raise ValueError(f"Failed to load lora model: {lora_request.message}") - - # async def prepare_request( - # self, - # request_id: str, - # prompt: Prompt, - # stream: bool, - # disk_lora_model: Optional[DiskMultiplexConfig] = None, - # ) -> GenerationRequest: - # from vllm.entrypoints.chat_utils import ( - # apply_hf_chat_template as _apply_hf_chat_template, - # parse_chat_messages_futures, - # ) - - # model_config = self.model_config - # mm_data = None - - # if isinstance(prompt.prompt, list): - # messages = [m.model_dump() for m in prompt.prompt] - # conversation, mm_futures = parse_chat_messages_futures( - # messages=messages, - # model_config=model_config, - # tokenizer=self._tokenizer, - # content_format=self._resolved_content_format, - # ) - # mm_data = await mm_futures - - # def apply_hf_chat_template(model_config, **kwargs): - # try: - # return _apply_hf_chat_template(model_config=model_config, **kwargs) - # except TypeError: - # # Legacy API before vLLM 0.9.0. - # # TODO(#52975): Remove above once vLLM <0.9.0 is no longer supported. - # return _apply_hf_chat_template( - # trust_remote_code=model_config.trust_remote_code, **kwargs - # ) - - # prompt_text = apply_hf_chat_template( - # model_config=model_config, - # tokenizer=self._tokenizer, - # conversation=conversation, - # chat_template=None, - # tools=None, - # tokenize=False, - # # **kwargs for tokenizer.apply_chat_template - # trust_remote_code=model_config.trust_remote_code, - # add_generation_prompt=True, - # continue_final_message=False, - # ) - # else: - # prompt_text = prompt.prompt - - # prompt_token_ids = await self._atokenize(prompt_text) - - # request_params = { - # "prompt": prompt_text, - # "prompt_token_ids": prompt_token_ids, - # "request_id": request_id, - # "sampling_params": VLLMSamplingParams.from_prompt(prompt), - # "disk_multiplex_config": disk_lora_model, - # "stream": stream, - # } - # if mm_data: - # request_params["multi_modal_data"] = mm_data - - # vllm_request = VLLMGenerationRequest(**request_params) - # return vllm_request - async def chat( self, request: GenerationRequest ) -> AsyncGenerator[str, None]: @@ -448,212 +335,22 @@ async def chat( yield a HTTPException object """ - try: - chat_response = await self._oai_serving_chat.create_chat_completion(request) - except Exception as e: - logger.error(f"[Kourosh] error in chat: {e}") - yield PatchedErrorResponse( - message=str(e), - internal_message=str(e), - type="internal_error", - code=500, - ) + + chat_response = await self._oai_serving_chat.create_chat_completion(request) if isinstance(chat_response, AsyncGenerator): async for response in chat_response: + if not isinstance(response, str): + raise ValueError(f"Expected create_chat_completion to return a stream of strings, got and item with type {type(response)}") yield response else: logger.info( f"[Kourosh] non streaming response received, type: {type(chat_response)}, chat_response: {chat_response}" ) - if isinstance(chat_response, ErrorResponse): - yield PatchedErrorResponse( - message=chat_response.message, - 
internal_message=chat_response.message, - type=chat_response.type, - code=chat_response.code, - ) - else: - yield chat_response.model_dump_json() - - # async def generate( - # self, request: GenerationRequest - # ) -> AsyncGenerator[LLMRawResponse, None]: - # """Generate an LLMRawResponse stream - - # The vLLM generation request will be passed into vLLM, and the resulting output - # will be wrapped in an LLMRawResponse and yielded back to the user. - - # Error handling: - - # We schedule a finalizer that will abort the request on the engine. - - # If an exception is raised in this function or vllm, the finalizer guarantees that the request is aborted. - # If an exception is raised in the caller, when this generator is gced, it will run the finalizer and abort the request. - - # This should also handle the case where the caller is cancelled (raises asyncio.CancelledError) - # """ - # if RAYLLM_ENABLE_REQUEST_PROMPT_LOGS: - # logger.info( - # f"Request {request.request_id} started. " f"Prompt: {request.prompt}" - # ) - - # if request.prompt_token_ids is not None: - # prompt = vllm.inputs.TokensPrompt( - # prompt_token_ids=request.prompt_token_ids, - # multi_modal_data=request.multi_modal_data, - # ) - # else: - # prompt = vllm.inputs.TextPrompt( - # prompt=request.prompt, - # multi_modal_data=request.multi_modal_data, - # ) - - # # Construct a results generator from vLLM - # results_generator: AsyncGenerator["RequestOutput", None] = self.engine.generate( - # prompt=prompt, - # sampling_params=self._parse_sampling_params(request.sampling_params), - # request_id=request.request_id, - # lora_request=request.lora_request, # type: ignore - # ) - - # # Loop over the results - # num_text_returned = 0 - # all_tokens_collected = 0 - # clock = MsClock(unit=ClockUnit.s) - # log_probs_idx = 0 - # finish_reason = None - # num_input_tokens = 0 - # try: - # start = time.perf_counter() - # request_output = None - # async for request_output in self._stats.auto_track(results_generator): - # # TODO(tchordia): handle more than one output - # assert ( - # len(request_output.outputs) == 1 - # ), "Received more than 1 output from vllm, aborting" - - # output = request_output.outputs[0] - # text_output = output.text[num_text_returned:] - # num_text_returned += len(text_output) - # num_input_tokens = len(request_output.prompt_token_ids) - # tokens_collected = len(output.token_ids) - all_tokens_collected - # all_tokens_collected += tokens_collected - # finish_reason = FinishReason.from_vllm_finish_reason( - # output.finish_reason - # ) - - # self._handle_input_too_long(request_output, finish_reason) - - # log_probs, log_probs_idx = self._extract_logprobs( - # output, - # log_probs_idx, - # request.sampling_params.top_logprobs, - # ) - # internal_metadata = {} - # if getattr(request_output, "kv_transfer_params", None) is not None: - # internal_metadata[ - # KV_TRANSFER_PARAMS_KEY - # ] = request_output.kv_transfer_params - # yield LLMRawResponse( - # generated_text=text_output, - # num_generated_tokens=tokens_collected, - # logprobs=log_probs, - # num_generated_tokens_batch=tokens_collected, - # num_input_tokens=num_input_tokens, - # num_input_tokens_batch=num_input_tokens, - # preprocessing_time=0, - # generation_time=clock.reset_interval(), - # finish_reason=finish_reason, - # metadata=internal_metadata, - # ) - - # if request_output is not None: - # total_request_time = time.perf_counter() - start - # if request_output.metrics is None: - # # vLLM V1 metrics are not included in the request output yet. 
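For context on the new chat() path above: vLLM's create_chat_completion either returns a finished ChatCompletionResponse / ErrorResponse, or an async generator that already yields SSE-formatted strings, so the Serve layer can pass chunks through untouched. A rough consumer sketch (engine and request objects are assumed):

    async def collect_chat(engine, request) -> list:
        # Drain VLLMEngine.chat(): with stream=True each item is a
        # "data: {...}\n\n" string produced by vLLM; with stream=False the
        # single item is a ChatCompletionResponse or ErrorResponse.
        chunks = []
        async for item in engine.chat(request):
            chunks.append(item)
        return chunks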
- # queue_time = "N/A" - # generation_time_str = "N/A" - # tokens_s = "N/A" - # generated_tokens_s = "N/A" - # else: - # time_in_queue_histogram.observe( - # request_output.metrics.time_in_queue - # ) - # queue_time = f"{request_output.metrics.time_in_queue}s" - # generation_time = ( - # total_request_time - request_output.metrics.time_in_queue - # ) - # generation_time_str = f"{generation_time}s" - # tokens_s = ( - # num_input_tokens + all_tokens_collected - # ) / generation_time - # generated_tokens_s = all_tokens_collected / generation_time - - # logger.info( - # f"Request {request.request_id} finished ({finish_reason}). " - # f"Total time: {total_request_time}s, " - # f"Queue time: {queue_time}, " - # f"Generation+async time: {generation_time_str}, " - # f"Input tokens: {num_input_tokens}, " - # f"Generated tokens: {all_tokens_collected}, " - # f"tokens/s: {tokens_s}, " - # f"generated tokens/s: {generated_tokens_s}." - # ) - # else: - # logger.warning( - # f"Request {request.request_id} " - # "finished without any output. " - # f"Input tokens: {num_input_tokens}." - # ) - # except ValueError as e: - # error_args = e.args - # if len(error_args) == 3 and "Input too long." == error_args[0]: - # _, input_length, max_input_length = error_args - # raise InputTooLong(input_length, max_input_length).exception from None - # elif len(error_args) == 1 and V1_TOO_LONG_PATTERN.match(error_args[0]): - # parsed_error = V1_TOO_LONG_PATTERN.match(error_args[0]) - # raise InputTooLong( - # int(parsed_error[1]), int(parsed_error[2]) - # ).exception from None - # else: - # raise e from None - # finally: - # # Ensure that we cancel on the engine once we have exited the streaming - # # phase - # await self.engine.abort(request.request_id) - - # def _get_prompt_limit(self) -> int: - # """Helper to get the prompt limit from scheduler config - - # Port from https://github.com/vllm-project/vllm/blob/7b5ecf79bd94aab0d782c70126d0dcc37c16bc60/vllm/core/scheduler.py#L939 - # """ - # scheduler_config = self.vllm_config.scheduler_config - # if ( - # scheduler_config.chunked_prefill_enabled - # and not scheduler_config.is_multi_step - # ): - # prompt_limit = scheduler_config.max_model_len - # else: - # prompt_limit = min( - # scheduler_config.max_model_len, - # scheduler_config.max_num_batched_tokens, - # ) - # return prompt_limit - - # def _handle_input_too_long( - # self, request_output: "RequestOutput", finish_reason: Optional[FinishReason] - # ): - # if ( - # finish_reason - # and finish_reason == FinishReason.LENGTH - # and hasattr(request_output.metrics, "first_token_time") - # and request_output.metrics.first_token_time is None - # ): - # # This means that the prompt was too long and we did not generate anything. - # raise InputTooLong( - # len(request_output.prompt_token_ids), self._get_prompt_limit() - # ).exception + if isinstance(chat_response, VLLMErrorResponse): + yield ErrorResponse(**chat_response.model_dump()) + yield ChatCompletionResponse(**chat_response.model_dump()) + async def completions( self, request @@ -748,128 +445,3 @@ def _collect_usage_metrics(sampling_params: VLLMSamplingParams) -> None: if sampling_params.logprobs is not None: usage_counters[ArgUsage.LOGPROBS].inc() - - # def _parse_sampling_params( - # self, sampling_params: VLLMSamplingParams - # ) -> "VLLMInternalSamplingParams": - # """Parse the vllm sampling parameters from the prompt. - # This function is used to parse the sampling parameters from the prompt. 
- # It also collects the usage metrics for the sampling parameters. - # Args: - # sampling_params: The sampling parameters defined in ray.serve.llm. - # Returns: - # vllm.SamplingParams, The parsed sampling parameters. - # """ - # self._collect_usage_metrics(sampling_params) - # try: - # if self.model_config is None: - # raise RuntimeError( - # "VLLMEngine.model_config not set. Maybe VLLMEngine.start() was not called?" - # ) - - # log_probs = None - # if sampling_params.logprobs: - # max_logprobs = getattr(self.model_config, "max_logprobs", 0) - # max_logprobs = min(MAX_NUM_TOPLOGPROBS_ALLOWED, max_logprobs) - # if max_logprobs == 0: - # raise ValueError("This model doesn't support outputting logprobs.") - # if sampling_params.top_logprobs: - # if not ( - # MIN_NUM_TOPLOGPROBS_ALLOWED - # <= sampling_params.top_logprobs - # <= max_logprobs - # ): - # raise ValueError( - # f"top_logprobs must be between {MIN_NUM_TOPLOGPROBS_ALLOWED} " - # f"and {max_logprobs}. Got {sampling_params.top_logprobs}." - # ) - # log_probs = sampling_params.top_logprobs - # else: - # log_probs = 1 - # else: - # if sampling_params.top_logprobs: - # raise ValueError( - # "if top_logprobs is specified, logprobs must be set to `True`" - # ) - - # kwargs = dict( - # n=1, - # best_of=sampling_params.best_of, - # presence_penalty=0.0, - # frequency_penalty=0.0, - # repetition_penalty=1.0, - # temperature=1.0, - # top_p=1.0, - # top_k=-1, - # stop=sampling_params.stop, - # stop_token_ids=sampling_params.stop_tokens, - # ignore_eos=False, - # # vLLM will cancel internally if input+output>max_tokens - # max_tokens=self.model_config.max_model_len, - # logprobs=log_probs, - # ) - # if sampling_params.presence_penalty is not None: - # kwargs["presence_penalty"] = sampling_params.presence_penalty - # if sampling_params.frequency_penalty is not None: - # kwargs["frequency_penalty"] = sampling_params.frequency_penalty - # if sampling_params.repetition_penalty is not None: - # kwargs["repetition_penalty"] = sampling_params.repetition_penalty - # if sampling_params.temperature is not None: - # kwargs["temperature"] = sampling_params.temperature - # if sampling_params.top_p is not None: - # kwargs["top_p"] = sampling_params.top_p - # if sampling_params.top_k is not None: - # kwargs["top_k"] = sampling_params.top_k - # if sampling_params.ignore_eos is not None: - # kwargs["ignore_eos"] = sampling_params.ignore_eos - # if sampling_params.max_tokens is not None: - # kwargs["max_tokens"] = sampling_params.max_tokens - # # If we set it to None, vLLM will throw an exception - # # as that is not the default value. Omitting it - # # will allow vLLM to generate a new seed internally, - # # as expected. - # if sampling_params.seed is not None: - # kwargs["seed"] = sampling_params.seed - # if sampling_params.response_format is not None: - # kwargs[ - # "guided_decoding" - # ] = sampling_params.response_format.to_guided_decoding_params( - # backend=RAYLLM_GUIDED_DECODING_BACKEND - # ) - # if sampling_params.kv_transfer_params is not None: - # kwargs["extra_args"] = { - # KV_TRANSFER_PARAMS_KEY: sampling_params.kv_transfer_params - # } - - # return vllm.SamplingParams(**kwargs) - # except Exception as e: - # # Wrap the error in ValidationError so the status code - # # returned to the user is correct. 
- # raise ValidationError(str(e)) from e - - @staticmethod - def _extract_logprobs( - output: "RequestOutput", - log_probs_idx: int, - top_logprobs: Optional[int] = None, - ) -> Tuple[List[LogProbs], int]: - all_log_probs = output.logprobs[log_probs_idx:] if output.logprobs else None - return_log_probs = [] - if all_log_probs: - for log_probs in all_log_probs: - log_probs_for_n_sampled = [ - LogProb( - logprob=log_prob.logprob, - token=log_prob.decoded_token, - bytes=list(log_prob.decoded_token.encode()), - ) - for log_prob in log_probs.values() - if log_prob.decoded_token is not None - ] - if log_probs_for_n_sampled: - return_log_probs += [ - LogProbs.create( - logprobs=log_probs_for_n_sampled, top_logprobs=top_logprobs - ) - ] - return return_log_probs, log_probs_idx + len(return_log_probs) diff --git a/python/ray/llm/_internal/serve/deployments/routers/router.py b/python/ray/llm/_internal/serve/deployments/routers/router.py index 731a91a1b183..6dc3f3ad005c 100644 --- a/python/ray/llm/_internal/serve/deployments/routers/router.py +++ b/python/ray/llm/_internal/serve/deployments/routers/router.py @@ -30,9 +30,9 @@ ROUTER_TO_MODEL_REPLICA_RATIO, ) from ray.llm._internal.serve.configs.openai_api_models import ( - # ChatCompletionRequest, - # ChatCompletionResponse, - # ChatCompletionStreamResponse, + ChatCompletionRequest, + ChatCompletionResponse, + ChatCompletionStreamResponse, CompletionRequest, CompletionResponse, CompletionStreamResponse, @@ -43,19 +43,13 @@ LLMEmbeddingsResponse, OpenAIHTTPException, to_model_metadata, -) -from vllm.entrypoints.openai.protocol import ( - ChatCompletionRequest, - ChatCompletionResponse, - ChatCompletionStreamResponse, -) -from ray.llm._internal.serve.configs.openai_api_models_patch import ( ErrorResponse, + ModelCard, + ModelList ) + from ray.llm._internal.serve.configs.server_models import ( - LLMConfig, - Model, - ModelData, + LLMConfig ) from ray.llm._internal.serve.deployments.llm.multiplex.utils import ( get_base_model_id, @@ -304,12 +298,9 @@ async def _get_response( model_handle = self._get_configured_serve_handle(model) async for response in getattr(model_handle, call_method).remote(body): - logger.info( - f"[Kourosh] in router._get_response, response_type: {type(response)}, response: {response}" - ) yield response - async def model(self, model_id: str) -> Optional[ModelData]: + async def model(self, model_id: str) -> Optional[ModelCard]: if model_id in self._llm_configs: return to_model_metadata(model_id, self._llm_configs[model_id]) @@ -335,8 +326,8 @@ async def model(self, model_id: str) -> Optional[ModelData]: "Check that adapter config file exists in cloud bucket." 
) - @fastapi_router_app.get("/v1/models", response_model=Model) - async def models(self) -> Model: + @fastapi_router_app.get("/v1/models", response_model=ModelList) + async def models(self) -> ModelList: """OpenAI API-compliant endpoint to get all rayllm models.""" all_models = dict() for base_model_id, llm_config in self._llm_configs.items(): @@ -354,11 +345,11 @@ async def models(self) -> Model: if model_data is not None: all_models[lora_id] = model_data - return Model(data=list(all_models.values())) + return ModelList(data=list(all_models.values())) # :path allows us to have slashes in the model name - @fastapi_router_app.get("/v1/models/{model:path}", response_model=ModelData) - async def model_data(self, model: str) -> ModelData: + @fastapi_router_app.get("/v1/models/{model:path}", response_model=ModelCard) + async def model_data(self, model: str) -> ModelCard: """OpenAI API-compliant endpoint to get one rayllm model. :param model: The model ID (e.g. "amazon/LightGPT") @@ -394,9 +385,6 @@ async def _process_llm_request( first_chunk = initial_response if isinstance(first_chunk, ErrorResponse): - logger.info( - f"[Kourosh] error encountered in first_chunk: {first_chunk}" - ) raise OpenAIHTTPException( message=first_chunk.message, status_code=first_chunk.code, @@ -405,15 +393,11 @@ async def _process_llm_request( if isinstance(first_chunk, NoneStreamingResponseType): # Not streaming, first chunk should be a single response - logger.info( - f"[Kourosh] non streaming response received, first_chunk: {first_chunk}" - ) return JSONResponse(content=first_chunk.model_dump()) # In case of streaming we need to iterate over the chunks and yield them openai_stream_generator = _openai_json_wrapper(gen) - print("Hitting streaming response") return StreamingResponse( openai_stream_generator, media_type="text/event-stream" ) diff --git a/python/ray/serve/llm/openai_api_models.py b/python/ray/serve/llm/openai_api_models.py index 210984cc1bd0..496cf794ac4b 100644 --- a/python/ray/serve/llm/openai_api_models.py +++ b/python/ray/serve/llm/openai_api_models.py @@ -72,14 +72,7 @@ class CompletionResponse(_CompletionResponse): pass -@PublicAPI(stability="alpha") -class EmbeddingRequest(_EmbeddingRequest): - """EmbeddingRequest is the request body for the embedding API. - - This model is compatible with vLLM's OpenAI API models. - """ - - pass +EmbeddingRequest = _EmbeddingRequest @PublicAPI(stability="alpha") From 59ac15a5e3e23c6dfbb5c88abc07a2733823eb84 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 30 Jun 2025 22:32:01 -0700 Subject: [PATCH 17/37] wip Signed-off-by: Kourosh Hakhamaneshi --- .../serve/deployments/llm/vllm/vllm_engine.py | 35 ------------------- 1 file changed, 35 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index e2e887cf0b6c..59a055aad4b4 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -410,38 +410,3 @@ async def check_health(self) -> None: except BaseException as e: logger.error("Healthcheck failed. 
The replica will be restarted") raise e from None - - @staticmethod - def _collect_usage_metrics(sampling_params: VLLMSamplingParams) -> None: - if sampling_params.best_of is not None: - usage_counters[ArgUsage.BEST_OF].inc() - - if sampling_params.presence_penalty is not None: - usage_counters[ArgUsage.PRESENCE_PENALTY].inc() - - if sampling_params.frequency_penalty is not None: - usage_counters[ArgUsage.FREQUENCY_PENALTY].inc() - - if ( - sampling_params.presence_penalty is not None - and sampling_params.frequency_penalty is not None - ): - usage_counters[ArgUsage.PRESENCE_AND_FREQUENCY_PENALTY].inc() - - if sampling_params.temperature is not None: - usage_counters[ArgUsage.TEMPERATURE].inc() - - if sampling_params.top_p is not None: - usage_counters[ArgUsage.TOP_P].inc() - - if sampling_params.top_k is not None: - usage_counters[ArgUsage.TOP_K].inc() - - if sampling_params.stop is not None: - usage_counters[ArgUsage.STOP].inc() - - if sampling_params.max_tokens is not None: - usage_counters[ArgUsage.MAX_TOKENS].inc() - - if sampling_params.logprobs is not None: - usage_counters[ArgUsage.LOGPROBS].inc() From 89002a7934356a3bc49fd31ee4113d59b1f6bfcb Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 30 Jun 2025 22:32:43 -0700 Subject: [PATCH 18/37] wip Signed-off-by: Kourosh Hakhamaneshi --- .../llm/_internal/serve/deployments/llm/vllm/vllm_engine.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index 59a055aad4b4..cd9fab50ae5e 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -19,14 +19,11 @@ from ray.llm._internal.serve.deployments.llm.llm_engine import LLMEngine from ray.llm._internal.serve.deployments.llm.vllm.vllm_engine_stats import ( - ArgUsage, VLLMEngineStatTracker, - usage_counters, ) from ray.llm._internal.serve.deployments.llm.vllm.vllm_models import ( VLLMEmbeddingRequest, VLLMEngineConfig, - VLLMSamplingParams, ) from ray.llm._internal.serve.deployments.utils.node_initialization_utils import ( InitializeNodeOutput, @@ -34,7 +31,6 @@ ) from ray.llm._internal.serve.deployments.utils.server_utils import floats_to_base64 from ray.llm._internal.serve.observability.logging import get_logger -from ray.util import metrics from ray.util.placement_group import PlacementGroup from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from vllm.entrypoints.openai.cli_args import FrontendArgs From 5db78c738896afaf9e94efa6590c5256870cbce0 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 1 Jul 2025 00:06:53 -0700 Subject: [PATCH 19/37] wip Signed-off-by: Kourosh Hakhamaneshi --- .../serve/deployments/llm/llm_server.py | 84 ++++--------- .../serve/deployments/llm/vllm/vllm_engine.py | 113 ++++++++++-------- 2 files changed, 92 insertions(+), 105 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py index d96103adccb1..c38a5be875a3 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py @@ -1,7 +1,7 @@ import asyncio import os from abc import ABC, abstractmethod -from typing import Any, Dict, Optional, Type +from typing import Any, Dict, Optional, Type, Union, AsyncGenerator # Third-party imports from ray import serve @@ -19,6 
+19,7 @@ ChatCompletionRequest, CompletionRequest, EmbeddingRequest, + EmbeddingResponse, LLMChatResponse, LLMCompletionsResponse, LLMEmbeddingsResponse, @@ -28,9 +29,6 @@ ) from ray.llm._internal.serve.deployments.llm.llm_engine import LLMEngine from ray.llm._internal.serve.deployments.llm.vllm.vllm_engine import VLLMEngine -from ray.llm._internal.serve.deployments.llm.vllm.vllm_models import ( - VLLMEmbeddingRequest, -) from ray.llm._internal.serve.deployments.utils.batcher import OpenAIResponseBatcher from ray.llm._internal.serve.deployments.utils.server_utils import ( get_serve_request_id, @@ -40,6 +38,7 @@ push_telemetry_report_for_all_models, ) + logger = get_logger(__name__) @@ -149,6 +148,12 @@ def _get_batch_interval_ms(self, stream: bool = True) -> int: stream_batching_interval_ms = MODEL_RESPONSE_BATCH_TIMEOUT_MS return stream_batching_interval_ms if stream else None + async def _maybe_add_request_id_to_request(self, request: Union[ChatCompletionRequest, CompletionRequest, EmbeddingRequest]): + """Add the request id to the request.""" + request_id = get_serve_request_id() + if request_id: + request.request_id = request_id + async def _maybe_resolve_lora_from_multiplex(self) -> None: """Handle the lora model for the request.""" @@ -166,6 +171,19 @@ def _batch_output_stream(self, generator): interval_ms=self._get_batch_interval_ms(), ).stream() + + async def _run_request(self, request, *, engine_method: str, batch_output_stream: bool = False) -> AsyncGenerator[Any, None]: + """Run the stream flow for the request.""" + await self._maybe_add_request_id_to_request(request) + await self._maybe_resolve_lora_from_multiplex() + if batch_output_stream: + stream = self._batch_output_stream( + getattr(self.engine, engine_method)(request) + ) + else: + stream = getattr(self.engine, engine_method)(request) + + return stream async def chat(self, request: ChatCompletionRequest): """Runs a chat request to the LLM engine and returns the response. @@ -176,13 +194,7 @@ async def chat(self, request: ChatCompletionRequest): Returns: A LLMChatResponse object. """ - await self._maybe_resolve_lora_from_multiplex() - stream = self._batch_output_stream( - self.engine.chat(request) - ) - - async for chunk in stream: - yield chunk + return await self._run_request(request, engine_method="chat", batch_output_stream=True) async def completions(self, request: CompletionRequest) -> LLMCompletionsResponse: """Runs a completion request to the LLM engine and returns the response. @@ -193,14 +205,7 @@ async def completions(self, request: CompletionRequest) -> LLMCompletionsRespons Returns: A LLMCompletionsResponse object. """ - await self._maybe_resolve_lora_from_multiplex() - response_generator = self._batch_output_stream( - request, - self.engine.completions(request) - ) - - async for response in response_generator: - yield response + return await self._run_request(request, engine_method="completions", batch_output_stream=True) async def check_health(self) -> None: @@ -225,45 +230,8 @@ async def embeddings(self, request: EmbeddingRequest) -> LLMEmbeddingsResponse: Returns: A LLMEmbeddingsResponse object. """ - request_id = get_serve_request_id() - try: - multiplexed_model_id = serve.get_multiplexed_model_id() - - if multiplexed_model_id: - assert ( - self._llm_config.lora_config is not None - ), "Must setup lora config for multiplexed requests." 
- disk_lora_model = await self._disk_lora_model(multiplexed_model_id) - else: - disk_lora_model = None - - request_params = { - "request_id": request_id, - "prompt": request.input, - "encoding_format": request.encoding_format, - "disk_multiplex_config": disk_lora_model, - "serve_request_context": serve.context._serve_request_context.get(), - } - vllm_request = VLLMEmbeddingRequest(**request_params) - embedding_data, total_tokens = await self.engine.embed(vllm_request) - - data = [ - EmbeddingResponseData( - object="embedding", index=index, embedding=embedding - ) - for index, embedding in enumerate(embedding_data) - ] - - usage = UsageInfo(prompt_tokens=total_tokens, total_tokens=total_tokens) - - yield EmbeddingResponse( - model=self._llm_config.model_id, data=data, usage=usage, object="list" - ) - except Exception as e: - logger.error( - f"Failed while handling embeddings for request ({request_id}): {repr(e)}", - exc_info=e, - ) + # NOTE: Embeddings does not need batching. + return await self._run_request(request, engine_method="embeddings", batch_output_stream=False) async def llm_config(self) -> Optional[LLMConfig]: return self._llm_config diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index cd9fab50ae5e..67bdabf42889 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -3,16 +3,25 @@ import argparse from starlette.datastructures import State -from typing import TYPE_CHECKING, AsyncGenerator, List, Tuple +from typing import TYPE_CHECKING, AsyncGenerator, List, Tuple, Union import ray from ray.llm._internal.common.utils.import_utils import try_import from ray.llm._internal.serve.configs.constants import ( RAYLLM_ENABLE_REQUEST_PROMPT_LOGS, ) +from ray.llm._internal.serve.configs.openai_api_models import ( + CompletionRequest, + CompletionResponse, + ChatCompletionRequest, + ChatCompletionResponse, + EmbeddingRequest, + EmbeddingResponse, + ErrorResponse, +) + from ray.llm._internal.serve.configs.server_models import ( DiskMultiplexConfig, - GenerationRequest, LLMConfig, ) from transformers.dynamic_module_utils import init_hf_modules @@ -299,11 +308,11 @@ def _start_async_llm_engine( async def resolve_lora(self, disk_lora_model: DiskMultiplexConfig): from vllm.entrypoints.openai.protocol import LoadLoRAAdapterRequest - if disk_lora_model.model_id in self.oai_models.lora_requests: + if disk_lora_model.model_id in self._oai_models.lora_requests: # Lora is already loaded, return return - lora_request = await self.oai_models.load_lora_adapter( + lora_request = await self._oai_models.load_lora_adapter( request=LoadLoRAAdapterRequest( lora_name=disk_lora_model.model_id, lora_path=disk_lora_model.local_path, @@ -314,8 +323,8 @@ async def resolve_lora(self, disk_lora_model: DiskMultiplexConfig): raise ValueError(f"Failed to load lora model: {lora_request.message}") async def chat( - self, request: GenerationRequest - ) -> AsyncGenerator[str, None]: + self, request: ChatCompletionRequest + ) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: """ input: Take a genric free form input type and cast it to the target engine request type inside the engine. 
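The chat/completions contract introduced above yields raw OpenAI-style server-sent-event strings when stream=True and a single response object otherwise. Below is a minimal consumer sketch under that assumption; the helper name collect_chat_output, the engine handle, and the SSE parsing details are illustrative assumptions, not part of this patch series.

    import json

    async def collect_chat_output(engine, request):
        # Streaming: chunks arrive as "data: {json}\n\n" strings, terminated by "data: [DONE]\n\n".
        # Non-streaming: a single ChatCompletionResponse (or ErrorResponse) object is yielded.
        pieces = []
        async for item in engine.chat(request):
            if not isinstance(item, str):
                return item  # non-streaming: the full response object
            payload = item.strip()
            if payload.startswith("data: "):
                payload = payload[len("data: "):]
            if payload == "[DONE]":
                break
            delta = json.loads(payload)["choices"][0]["delta"]
            pieces.append(delta.get("content") or "")
        return "".join(pieces)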
@@ -349,53 +358,63 @@ async def chat(
 
     async def completions(
-        self, request
-    ):
-        raise NotImplementedError("Completions are not supported yet")
-
-    async def embeddings(
-        self, vllm_embedding_request: VLLMEmbeddingRequest
-    ) -> Tuple[List[List[float]], int]:
-        """Return (embeddings, num_prompt_tokens)"""
+        self, request: CompletionRequest
+    ) -> AsyncGenerator[Union[str, CompletionResponse, ErrorResponse], None]:
+        """
+
+        input: Take a generic free-form input type and cast it to the target engine request type inside the engine.
+
+        output:
+        - stream: True --> for each chunk, yield a string representing data: \n\n
+        - stream: False --> yield only one string representing the response
 
-        num_prompts = len(vllm_embedding_request.prompt)
-        if RAYLLM_ENABLE_REQUEST_PROMPT_LOGS:
-            logger.info(
-                f"Encoding request {vllm_embedding_request.request_id} started. "
-                f"Num prompts: {num_prompts}"
-            )
+        Error:
+            option A:
+                when the request hits an error, raise an HTTPException(msg, code, type)
+            option B:
+                yield an HTTPException object
+        """
 
-        generators: List[AsyncGenerator["PoolingRequestOutput", None]] = []
-
-        prompts = vllm_embedding_request.prompt
-        if isinstance(prompts, str):
-            prompts = [prompts]
-
-        for i, prompt in enumerate(prompts):
-            request_id = f"{vllm_embedding_request.request_id}-{i}"
-            gen: AsyncGenerator["PoolingRequestOutput", None] = self._engine_client.encode(
-                prompt=vllm.inputs.TextPrompt(
-                    prompt=prompt,
-                ),
-                pooling_params=vllm.pooling_params.PoolingParams(),
-                request_id=request_id,
-                lora_request=vllm_embedding_request.lora_request,  # type: ignore
-            )
-            generators.append(gen)
+        if self._oai_serving_completion is None:
+            raise RuntimeError("Completion service is not available. Make sure the engine is started and supports completions.")
+
+        completion_response = await self._oai_serving_completion.create_completion(request)
 
-        embedding_data = []
-        total_prompt_tokens = 0
+        if isinstance(completion_response, AsyncGenerator):
+            async for response in completion_response:
+                if not isinstance(response, str):
+                    raise ValueError(f"Expected create_completion to return a stream of strings, got an item with type {type(response)}")
+                yield response
+        else:
+            logger.info(
+                f"[Kourosh] non streaming response received, type: {type(completion_response)}, completion_response: {completion_response}"
+            )
+            if isinstance(completion_response, VLLMErrorResponse):
+                yield ErrorResponse(**completion_response.model_dump())
+            else:
+                yield CompletionResponse(**completion_response.model_dump())
 
-        for gen in generators:
-            async for result in gen:
-                embedding = result.outputs.embedding
-                if vllm_embedding_request.encoding_format == "base64":
-                    embedding = floats_to_base64(embedding)
+    async def embeddings(
+        self, request: EmbeddingRequest
+    ) -> AsyncGenerator[Union[EmbeddingResponse, ErrorResponse], None]:
+        """Generate embeddings using vLLM's OpenAI-compatible API.
 
-                embedding_data.append(embedding)
-                total_prompt_tokens += len(result.prompt_token_ids)
+        Args:
+            request: An EmbeddingRequest object.
 
-        return embedding_data, total_prompt_tokens
+        Yields:
+            An EmbeddingResponse or ErrorResponse object.
+        """
+
+        if self._oai_serving_embedding is None:
+            raise RuntimeError("Embedding service is not available.
Make sure the engine is started and supports embeddings.") + + embedding_response = await self._oai_serving_embedding.create_embedding(request) + + if isinstance(embedding_response, VLLMErrorResponse): + yield ErrorResponse(**embedding_response.model_dump()) + else: + yield EmbeddingResponse(**embedding_response.model_dump()) async def check_health(self) -> None: if not hasattr(self._engine_client, "check_health"): From e39daf227273d39cce213b26fedbc463a6e0ce77 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Tue, 1 Jul 2025 09:16:35 -0700 Subject: [PATCH 20/37] wip Signed-off-by: Kourosh Hakhamaneshi --- .../serve/deployments/llm/vllm/vllm_engine.py | 29 +++++++++++++++++-- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index 67bdabf42889..a74d23feac6a 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -214,9 +214,7 @@ async def start(self) -> None: self._oai_serving_completion = state.openai_serving_completion self._oai_serving_embedding = state.openai_serving_embedding - self._validate_openai_serving_models() - self._validate_openai_serving_chat() - + self._validate_openai_serving_models() self._running = True @@ -270,6 +268,25 @@ def _prepare_engine_config(self, node_initialization: InitializeNodeOutput): vllm_frontend_args = FrontendArgs(**engine_config.frontend_kwargs) return vllm_engine_args, vllm_frontend_args, vllm_engine_config + def _start_async_llm_engine_v0(self, engine_args: "AsyncEngineArgs", vllm_config: "VllmConfig", placement_group: PlacementGroup) -> "EngineClient": + + from vllm import envs + envs.set_vllm_use_v1(False) + + from vllm.executor.ray_distributed_executor import RayDistributedExecutor + from vllm.engine.async_llm_engine import AsyncLLMEngine + vllm_config.parallel_config.placement_group = placement_group + + _clear_current_platform_cache() + + engine = AsyncLLMEngine( + vllm_config=vllm_config, + executor_class=RayDistributedExecutor, + log_stats=not engine_args.disable_log_stats, + ) + + return engine + def _start_async_llm_engine( self, @@ -278,6 +295,11 @@ def _start_async_llm_engine( placement_group: PlacementGroup, ) -> "EngineClient": """Creates an async LLM engine from the engine arguments.""" + + # NOTE: This is a temporary solution untill vLLM v1 supports embeddings. 
+        if self.llm_config.experimental_configs.get("enable_embeddings", False):
+            return self._start_async_llm_engine_v0(engine_args, vllm_config, placement_group)
+
         from vllm.v1.executor.abstract import Executor
         from vllm.v1.engine.async_llm import AsyncLLM
@@ -340,6 +362,7 @@ async def chat(
             yield a HTTPException object
         """
+        self._validate_openai_serving_chat()
 
         chat_response = await self._oai_serving_chat.create_chat_completion(request)

From 4fe3cef83d570b7abd227aa0d4edc6291a019023 Mon Sep 17 00:00:00 2001
From: Kourosh Hakhamaneshi
Date: Tue, 1 Jul 2025 09:22:32 -0700
Subject: [PATCH 21/37] for embedding user must set VLLM_USE_V1=0

Signed-off-by: Kourosh Hakhamaneshi
---
 .../_internal/serve/deployments/llm/vllm/vllm_engine.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py
index a74d23feac6a..807ce7d762c1 100644
--- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py
+++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py
@@ -124,7 +124,7 @@ def __init__(
         )
 
         if not vllm.envs.VLLM_USE_V1:
-            raise ValueError("vLLM v0 is getting fully deprecated. As a result in Ray Serve LLM only v1 is supported.")
+            logger.warning("vLLM v0 is being fully deprecated; Ray Serve LLM supports only v1. Set VLLM_USE_V1=0 only if you know what you are doing.")
 
         # TODO (Kourosh): This validation logic belongs to the PDProxy module.
         # Pick a random port in P/D case.
@@ -270,9 +270,6 @@ def _prepare_engine_config(self, node_initialization: InitializeNodeOutput):
 
     def _start_async_llm_engine_v0(self, engine_args: "AsyncEngineArgs", vllm_config: "VllmConfig", placement_group: PlacementGroup) -> "EngineClient":
 
-        from vllm import envs
-        envs.set_vllm_use_v1(False)
-
         from vllm.executor.ray_distributed_executor import RayDistributedExecutor
         from vllm.engine.async_llm_engine import AsyncLLMEngine
         vllm_config.parallel_config.placement_group = placement_group
@@ -297,7 +294,7 @@ def _start_async_llm_engine(
         """Creates an async LLM engine from the engine arguments."""
 
         # NOTE: This is a temporary solution untill vLLM v1 supports embeddings.
-        if self.llm_config.experimental_configs.get("enable_embeddings", False):
+        if not vllm.envs.VLLM_USE_V1:
             return self._start_async_llm_engine_v0(engine_args, vllm_config, placement_group)
 
         from vllm.v1.executor.abstract import Executor
         from vllm.v1.engine.async_llm import AsyncLLM

From b1c0163b984204078121e87760bb7bf1866a2164 Mon Sep 17 00:00:00 2001
From: Kourosh Hakhamaneshi
Date: Tue, 1 Jul 2025 15:35:02 -0700
Subject: [PATCH 22/37] added self contained test for first llm engine mock

Signed-off-by: Kourosh Hakhamaneshi
---
 .../cpu/deployments/llm/test_llm_engine.py | 212 +++
 .../llm/vllm/test_vllm_engine_gpu.py | 55 +-
 .../llm/tests/serve/mocks/mock_vllm_engine.py | 1343 ++++++++++-------
 3 files changed, 992 insertions(+), 618 deletions(-)
 create mode 100644 python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py

diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py
new file mode 100644
index 000000000000..fc90517fdb61
--- /dev/null
+++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py
@@ -0,0 +1,212 @@
+"""This tests the LLM engine by testing the mocked implementations directly.
+
+This implicitly tests the consistency of the engine API through time.
+It also checks that the mocks behave as expected, so that downstream tests built on these mocks are correct from the mock-implementation perspective.
+
+
+We have the following mocks:
+
+- An engine that returns a string of the form "test_i" for i in range(max_tokens)
+- An engine that echoes the sent request in its response
+- An engine that exercises the multiplexing logic (e.g. LoRA)
+- An engine that exercises the structured output logic (e.g. JSON mode)
+- An engine that exercises the prefill-disaggregation logic
+"""
+
+from ray.llm.tests.serve.mocks.mock_vllm_engine import MockVLLMEngine
+from ray.serve.llm import LLMConfig, ModelLoadingConfig
+from ray.llm._internal.serve.configs.openai_api_models import (
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    CompletionRequest,
+    CompletionResponse,
+    EmbeddingCompletionRequest,
+    EmbeddingResponse
+)
+import pytest
+import re
+import json
+from typing import Union, List, AsyncGenerator, Optional
+
+
+class LLMResponseValidator:
+    """Reusable validation logic for LLM responses."""
+
+    @staticmethod
+    def get_expected_content(api_type: str, max_tokens: int) -> str:
+        """Get expected content based on API type."""
+        return " ".join(f"test_{i}" for i in range(max_tokens))
+
+    @staticmethod
+    def validate_non_streaming_response(
+        response: Union[ChatCompletionResponse, CompletionResponse],
+        api_type: str,
+        max_tokens: int
+    ):
+        """Validate non-streaming responses."""
+        expected_content = LLMResponseValidator.get_expected_content(api_type, max_tokens)
+
+        if api_type == "chat":
+            assert isinstance(response, ChatCompletionResponse)
+            assert response.choices[0].message.content == expected_content
+        elif api_type == "completion":
+            assert isinstance(response, CompletionResponse)
+            assert response.choices[0].text == expected_content
+
+    @staticmethod
+    def validate_streaming_chunks(
+        chunks: List[str],
+        api_type: str,
+        max_tokens: int
+    ):
+        """Validate streaming response chunks."""
+        # Should have max_tokens + 1 chunks (tokens + [DONE])
+        assert len(chunks) == max_tokens + 1
+
+        # Validate each chunk except the last [DONE] chunk
+        for chunk_iter, chunk in enumerate(chunks[:-1]):
+            pattern = r"data: (.*)\n\n"
+            match = re.match(pattern, chunk)
+            assert match is not None
+            chunk_data = json.loads(match.group(1))
+
+            if api_type == "chat":
+                delta = chunk_data["choices"][0]["delta"]
+                if chunk_iter == 0:
+                    assert delta["role"] == "assistant"
+                else:
+                    assert delta["role"] is None
+                assert delta["content"].strip() == f"test_{chunk_iter}"
+            elif api_type == "completion":
+                text = chunk_data["choices"][0]["text"]
+                assert text.strip() == f"test_{chunk_iter}"
+
+    @staticmethod
+    def validate_embedding_response(
+        response: EmbeddingResponse,
+        expected_dimensions: Optional[int] = None
+    ):
+        """Validate embedding responses."""
+        assert isinstance(response, EmbeddingResponse)
+        assert response.object == "list"
+        assert len(response.data) == 1
+        assert response.data[0].object == "embedding"
+        assert isinstance(response.data[0].embedding, list)
+        assert len(response.data[0].embedding) > 0  # Should have some embedding dimensions
+        assert response.data[0].index == 0
+
+        # Check dimensions if specified
+        if expected_dimensions:
+            assert len(response.data[0].embedding) == expected_dimensions
+
+
+@pytest.fixture
+def llm_config():
+    return LLMConfig(
+        model_loading_config=ModelLoadingConfig(model_id="mock-model"),
+        runtime_env={},
+        log_engine_metrics=False,
+    )
+
+
+@pytest.fixture
+def chat_request(stream, max_tokens):
+    """Fixture for creating chat
completion requests.""" + return ChatCompletionRequest( + model="mock-model", + messages=[ + {"role": "user", "content": "Hello, world!"} + ], + max_tokens=max_tokens, + stream=stream, + ) + + +@pytest.fixture +def completion_request(stream, max_tokens): + """Fixture for creating text completion requests.""" + return CompletionRequest( + model="mock-model", + prompt="Complete this text:", + max_tokens=max_tokens, + stream=stream, + ) + + +@pytest.fixture +def embedding_request(dimensions): + """Fixture for creating embedding requests.""" + request = EmbeddingCompletionRequest( + model="mock-model", + input="Text to embed", + ) + if dimensions: + request.dimensions = dimensions + return request + + +class TestMockLLMEngine: + + @pytest.mark.parametrize("api_type", ["chat", "completion"]) + @pytest.mark.parametrize("stream", [False, True]) + @pytest.mark.parametrize("max_tokens", [5, 10, 15]) + @pytest.mark.asyncio + async def test_unified_llm_engine( + self, + llm_config, + chat_request, + completion_request, + api_type: str, + stream: bool, + max_tokens: int + ): + """Unified test for both chat and completion APIs, streaming and non-streaming.""" + # Create and start the engine + engine = MockVLLMEngine(llm_config) + await engine.start() + + # Create request based on API type + if api_type == "chat": + request = chat_request + response_generator = engine.chat(request) + elif api_type == "completion": + request = completion_request + response_generator = engine.completions(request) + + print(f"\n\n_____ {api_type.upper()} ({'STREAMING' if stream else 'NON-STREAMING'}) max_tokens={max_tokens} _____\n\n") + + if stream: + # Collect streaming chunks + chunks = [] + async for chunk in response_generator: + assert isinstance(chunk, str) + chunks.append(chunk) + + # Validate streaming response + LLMResponseValidator.validate_streaming_chunks(chunks, api_type, max_tokens) + else: + # Validate non-streaming response + async for response in response_generator: + LLMResponseValidator.validate_non_streaming_response(response, api_type, max_tokens) + + @pytest.mark.parametrize("dimensions", [None, 512]) + @pytest.mark.asyncio + async def test_embedding_mock_engine( + self, + llm_config, + embedding_request, + dimensions: Optional[int] + ): + """Test embedding API with different dimensions.""" + # Create and start the engine + engine = MockVLLMEngine(llm_config) + await engine.start() + + # Create embedding request + request = embedding_request + + print(f"\n\n_____ EMBEDDING dimensions={dimensions} _____\n\n") + + async for response in engine.embeddings(request): + LLMResponseValidator.validate_embedding_response(response, dimensions) + diff --git a/python/ray/llm/tests/serve/gpu/deployments/llm/vllm/test_vllm_engine_gpu.py b/python/ray/llm/tests/serve/gpu/deployments/llm/vllm/test_vllm_engine_gpu.py index 0607bd59951d..5ca24ac025ab 100644 --- a/python/ray/llm/tests/serve/gpu/deployments/llm/vllm/test_vllm_engine_gpu.py +++ b/python/ray/llm/tests/serve/gpu/deployments/llm/vllm/test_vllm_engine_gpu.py @@ -13,60 +13,7 @@ class TestVLLMEngine: """Test the VLLMEngine.""" - - @pytest.mark.asyncio - @pytest.mark.parametrize( - "engine_kwargs, expected_prompt_limit", - [ - ({"enable_chunked_prefill": True}, 1024000), - ( - { - "enable_chunked_prefill": True, - "max_model_len": 999, - }, - 999, - ), - ( - { - "enable_chunked_prefill": True, - "max_num_batched_tokens": 888, - }, - 1024000, - ), - ( - { - "enable_chunked_prefill": True, - "max_model_len": 999, - "max_num_batched_tokens": 888, - "enforce_eager": 
True, - }, - 999, - ), - ({"enable_chunked_prefill": False}, 1024000), - ( - { - "enable_chunked_prefill": False, - "max_model_len": 999, - }, - 999, - ), - ], - ) - async def test_get_prompt_limit( - # llm_config is a fixture defined in serve.tests.conftest.py - self, - llm_config: LLMConfig, - engine_kwargs: dict, - expected_prompt_limit: int, - ): - llm_config = llm_config.model_copy(deep=True) - vllm_engine = VLLMEngine(llm_config) - - # Test with default engine kwargs - llm_config.engine_kwargs = engine_kwargs - _, vllm_config = _get_vllm_engine_config(llm_config) - vllm_engine.vllm_config = vllm_config - assert vllm_engine._get_prompt_limit() == expected_prompt_limit + pass if __name__ == "__main__": diff --git a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py index 579c374493e0..54cc412945d3 100644 --- a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py +++ b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py @@ -2,632 +2,841 @@ import json import random from random import randint -from typing import AsyncGenerator, Dict, Optional - -from PIL import Image -from transformers import AutoTokenizer -from vllm import CompletionOutput, PromptType, RequestOutput -from vllm.config import DeviceConfig, KVTransferConfig, ModelConfig, VllmConfig -from vllm.engine.protocol import EngineClient -from vllm.sampling_params import SamplingParams as VLLMInternalSamplingParams - -from ray.llm._internal.serve.configs.error_handling import ValidationError -from ray.llm._internal.serve.configs.openai_api_models_patch import ( - ResponseFormatJsonObject, +from typing import AsyncGenerator, Dict, Optional, Any, List, Union + +from ray.llm._internal.serve.configs.openai_api_models import ( + ChatCompletionRequest, + ChatCompletionResponse, + CompletionRequest, + CompletionResponse, + EmbeddingRequest, + EmbeddingResponse, + ErrorResponse, ) from ray.llm._internal.serve.configs.server_models import ( DiskMultiplexConfig, - FinishReason, LLMConfig, - LLMRawResponse, - LogProb, - LogProbs, - Prompt, ) from ray.llm._internal.serve.deployments.llm.llm_engine import LLMEngine -from ray.llm._internal.serve.deployments.llm.vllm.vllm_engine import VLLMEngine -from ray.llm._internal.serve.deployments.llm.vllm.vllm_engine_stats import ( - VLLMEngineStats, - VLLMEngineStatTracker, -) -from ray.llm._internal.serve.deployments.llm.vllm.vllm_models import ( - KV_TRANSFER_PARAMS_KEY, - VLLMGenerationRequest, - VLLMSamplingParams, -) -from ray.llm._internal.serve.deployments.utils.node_initialization_utils import ( - InitializeNodeOutput, -) class MockVLLMEngine(LLMEngine): + """Mock vLLM Engine that generates fake text responses.""" + def __init__(self, llm_config: LLMConfig): - """Create a vLLM Engine class + """Create a mock vLLM Engine. 
Args: llm_config: The llm configuration for this engine """ - assert isinstance( - llm_config, LLMConfig - ), f"Got invalid config {llm_config} of type {type(llm_config)}" self.llm_config = llm_config - - self._stats = VLLMEngineStatTracker() - - @staticmethod - async def initialize_node(llm_config: LLMConfig) -> InitializeNodeOutput: - return InitializeNodeOutput( - placement_group=None, - runtime_env={}, - extra_init_kwargs={}, - ) + self.started = False + self._current_lora_model: Optional[DiskMultiplexConfig] = None async def start(self): - """No-Op""" - return - - @staticmethod - async def async_range(count): - for i in range(count): - yield i - await asyncio.sleep(0.0) - - async def prepare_request( - self, request_id: str, prompt: Prompt, stream: bool, **kwargs - ) -> VLLMGenerationRequest: - - if isinstance(prompt.prompt, list): - # Simplification: Assume prompt is a list of messages with one user message - assert len(prompt.prompt) == 1 - assert hasattr(prompt.prompt[0], "content") - prompt_text = prompt.prompt[0].content - else: - prompt_text = prompt.prompt - - return VLLMGenerationRequest( - request_id=request_id, - prompt=prompt_text, - stream=stream, - sampling_params=VLLMSamplingParams.from_prompt(prompt), - ) - - async def generate(self, vllm_engine_request: VLLMGenerationRequest): - sampling_params = self._parse_sampling_params( - vllm_engine_request.sampling_params - ) - max_tokens = sampling_params.max_tokens - if not max_tokens: - max_tokens = randint(1, 10) - prompt = vllm_engine_request.prompt - prompt_len = ( - len(prompt.split()) if isinstance(prompt, str) else len(prompt.prompt) - ) - generation_time = 0.001 + """Start the mock engine.""" + self.started = True - async for i in self.async_range(max_tokens): - if i == max_tokens - 1: - finish_reason = FinishReason.STOP - else: - finish_reason = None - llm_response = LLMRawResponse( - generated_text=f"test_{i} ", - num_input_tokens=prompt_len, - num_input_tokens_batch=prompt_len, - num_generated_tokens=1, - preprocessing_time=0, - generation_time=generation_time, - finish_reason=finish_reason, - logprobs=self.get_logprobs(i, vllm_engine_request, sampling_params), - ) - yield llm_response - await asyncio.sleep(generation_time) + async def resolve_lora(self, lora_model: DiskMultiplexConfig): + """Resolve/load a LoRA model.""" + self._current_lora_model = lora_model async def check_health(self) -> None: - return - - def stats(self) -> VLLMEngineStats: - return self._stats.to_stats() - - def shutdown(self, shutdown_pg: bool = True): - raise NotImplementedError() - - def _parse_sampling_params( - self, sampling_params: VLLMSamplingParams - ) -> VLLMInternalSamplingParams: - try: - if sampling_params.n != 1: - raise ValueError("n>1 is not supported yet in rayllm") - if sampling_params.logprobs: - if sampling_params.top_logprobs: - if not (0 <= sampling_params.top_logprobs <= 5): - raise ValueError("top_logprobs must be between 0 and 5") - log_probs = sampling_params.top_logprobs - else: - log_probs = 1 - else: - if sampling_params.top_logprobs: - raise ValueError( - "if top_logprobs is specified, logprobs must be set to `True`" - ) - log_probs = None - - return VLLMInternalSamplingParams( - n=1, - best_of=sampling_params.best_of, - presence_penalty=sampling_params.presence_penalty - if sampling_params.presence_penalty is not None - else 0.0, - frequency_penalty=sampling_params.frequency_penalty - if sampling_params.frequency_penalty is not None - else 0.0, - repetition_penalty=sampling_params.repetition_penalty - if 
sampling_params.repetition_penalty is not None - else 1.0, - temperature=sampling_params.temperature - if sampling_params.temperature is not None - else 1.0, - top_p=sampling_params.top_p - if sampling_params.top_p is not None - else 1.0, - top_k=sampling_params.top_k - if sampling_params.top_k is not None - else -1, - stop=sampling_params.stop, - stop_token_ids=sampling_params.stop_tokens, - ignore_eos=False, - # vLLM will cancel internally if input+output>max_tokens - max_tokens=sampling_params.max_tokens - or self.llm_config.max_request_context_length, - logprobs=log_probs, + """Check the health of the mock engine.""" + if not self.started: + raise RuntimeError("Engine not started") + + async def chat(self, request: ChatCompletionRequest) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: + """Mock chat completion.""" + if not self.started: + raise RuntimeError("Engine not started") + + # Extract prompt text from messages + prompt_text = "" + if request.messages: + for message in request.messages: + if hasattr(message, 'content') and message.content: + prompt_text += str(message.content) + " " + + max_tokens = getattr(request, 'max_tokens', None) or randint(1, 10) + + # Generate streaming response + async for response in self._generate_chat_response( + request=request, + prompt_text=prompt_text.strip(), + max_tokens=max_tokens + ): + yield response + + async def completions(self, request: CompletionRequest) -> AsyncGenerator[Union[str, CompletionResponse, ErrorResponse], None]: + """Mock text completion.""" + if not self.started: + raise RuntimeError("Engine not started") + + prompt_text = str(request.prompt) if request.prompt else "" + max_tokens = getattr(request, 'max_tokens', None) or randint(5, 20) + + # Generate streaming response + async for response in self._generate_completion_response( + request=request, + prompt_text=prompt_text, + max_tokens=max_tokens + ): + yield response + + async def embeddings(self, request: EmbeddingRequest) -> AsyncGenerator[Union[str, EmbeddingResponse, ErrorResponse], None]: + """Mock embeddings generation.""" + if not self.started: + raise RuntimeError("Engine not started") + + # Generate a mock embedding response + embedding_data = [] + inputs = request.input if isinstance(request.input, list) else [request.input] + + for i, text in enumerate(inputs): + # Generate random embedding vector + dimensions = getattr(request, 'dimensions', None) or 1536 + embedding = [random.uniform(-1, 1) for _ in range(dimensions)] + + embedding_data.append({ + "object": "embedding", + "embedding": embedding, + "index": i + }) + + response = EmbeddingResponse( + object="list", + data=embedding_data, + model=getattr(request, 'model', 'mock-model'), + usage={ + "prompt_tokens": len(str(request.input).split()), + "total_tokens": len(str(request.input).split()) + } + ) + yield response + + async def _generate_chat_response( + self, + request: ChatCompletionRequest, + prompt_text: str, + max_tokens: int + ) -> AsyncGenerator[Union[str, ChatCompletionResponse], None]: + """Generate mock chat completion response.""" + + if request.stream: + # Streaming response - return SSE formatted strings + request_id = f"chatcmpl-{random.randint(1000, 9999)}" + created_time = int(asyncio.get_event_loop().time()) + model_name = getattr(request, 'model', 'mock-model') + + for i in range(max_tokens): + token = f"test_{i} " + if i == max_tokens - 1: + # no space for the last token + token = f"test_{i}" + + # Create streaming chunk + choice = { + "index": 0, + "delta": 
{ + "content": token, + "role": "assistant" if i == 0 else None + }, + "finish_reason": "stop" if i == max_tokens - 1 else None + } + + chunk_data = { + "id": request_id, + "object": "chat.completion.chunk", + "created": created_time, + "model": model_name, + "choices": [choice] + } + + # Format as SSE + yield f"data: {json.dumps(chunk_data)}\n\n" + await asyncio.sleep(0.01) # Simulate processing time + + # Send final [DONE] message + yield "data: [DONE]\n\n" + else: + # Non-streaming response - return response object + generated_text = " ".join([f"test_{i}" for i in range(max_tokens)]) + + choice = { + "index": 0, + "message": { + "role": "assistant", + "content": generated_text + }, + "finish_reason": "stop" + } + + response = ChatCompletionResponse( + id=f"chatcmpl-{random.randint(1000, 9999)}", + object="chat.completion", + created=int(asyncio.get_event_loop().time()), + model=getattr(request, 'model', 'mock-model'), + choices=[choice], + usage={ + "prompt_tokens": len(prompt_text.split()), + "completion_tokens": max_tokens, + "total_tokens": len(prompt_text.split()) + max_tokens + } ) - except Exception as e: - # Wrap the error in ValidationError so the status code - # returned to the user is correct. - raise ValidationError(str(e)) from e - - def get_logprobs( - self, - i: int, - vllm_engine_request: VLLMGenerationRequest, - sampling_params: VLLMSamplingParams, - ): - """Helper function for generating LLMRawResponse logprobs""" - num_logprobs = sampling_params.logprobs - top_logprobs = vllm_engine_request.sampling_params.top_logprobs - if num_logprobs: - log_probs = [ - LogProbs.create( - logprobs=[ - LogProb( - logprob=0.0, - token=( - f"test_{i} " if idx == 0 else f"candidate_token_{idx}" - ), - bytes=[], - ) - for idx in range(num_logprobs) - ], - top_logprobs=top_logprobs, - ) - ] + + yield response + + async def _generate_completion_response( + self, + request: CompletionRequest, + prompt_text: str, + max_tokens: int + ) -> AsyncGenerator[Union[str, CompletionResponse], None]: + """Generate mock completion response.""" + + if request.stream: + # Streaming response - return SSE formatted strings + request_id = f"cmpl-{random.randint(1000, 9999)}" + created_time = int(asyncio.get_event_loop().time()) + model_name = getattr(request, 'model', 'mock-model') + + for i in range(max_tokens): + token = f"test_{i} " + if i == max_tokens - 1: + # no space for the last token + token = f"test_{i}" + + choice = { + "index": 0, + "text": token, + "finish_reason": "stop" if i == max_tokens - 1 else None + } + + chunk_data = { + "id": request_id, + "object": "text_completion", + "created": created_time, + "model": model_name, + "choices": [choice] + } + + # Format as SSE + yield f"data: {json.dumps(chunk_data)}\n\n" + await asyncio.sleep(0.01) + + # Send final [DONE] message + yield "data: [DONE]\n\n" else: - log_probs = None - - return log_probs + # Non-streaming response - return response object + generated_text = " ".join([f"test_{i}" for i in range(max_tokens)]) + + choice = { + "index": 0, + "text": generated_text, + "finish_reason": "stop" + } + + response = CompletionResponse( + id=f"cmpl-{random.randint(1000, 9999)}", + object="text_completion", + created=int(asyncio.get_event_loop().time()), + model=getattr(request, 'model', 'mock-model'), + choices=[choice], + usage={ + "prompt_tokens": len(prompt_text.split()), + "completion_tokens": max_tokens, + "total_tokens": len(prompt_text.split()) + max_tokens + } + ) + + yield response class MockEchoVLLMEngine(MockVLLMEngine): - """ - Mock 
engine that responds with information about the request sent to it. Useful - for testing the contents of VLLMGenerationRequests created in RayLLM code up to - the vLLM boundary. + """Mock engine that responds with information about the request sent to it. + + Useful for testing the contents of requests created in data plane code. """ - def _convert_to_json(self, vllm_engine_request: VLLMGenerationRequest) -> Dict: - """Converts request to json. - - If the request contains an image, this method removes the image - from `vllm_engine_request` and sets `has_image: true` in the - output dictionary. - This is because `Image.Image` is not json serializable. - """ - mm_data = vllm_engine_request.multi_modal_data - if isinstance(mm_data, dict) and "image" in mm_data: - assert isinstance(mm_data["image"], Image.Image) or ( - isinstance(mm_data["image"], list) - and all( - [ - isinstance(image, Image.Image) - for image in vllm_engine_request.multi_modal_data["image"] - ] - ) - ), "Image must be of type Image.Image or a list of Image.Image" - mm_data["image"] = None - has_image = True + async def chat(self, request: ChatCompletionRequest) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: + """Echo the chat request information.""" + if not self.started: + raise RuntimeError("Engine not started") + + # Convert request to JSON for echoing + request_info = { + "request_type": "chat", + "model": getattr(request, 'model', None), + "messages": getattr(request, 'messages', []), + "max_tokens": getattr(request, 'max_tokens', None), + "temperature": getattr(request, 'temperature', None), + "stream": getattr(request, 'stream', False), + "current_lora_model": self._current_lora_model.model_dump() if self._current_lora_model else None + } + + echo_text = json.dumps(request_info, indent=2) + + if request.stream: + # Return as SSE for streaming + chunk_data = { + "id": f"chatcmpl-echo-{random.randint(1000, 9999)}", + "object": "chat.completion.chunk", + "created": int(asyncio.get_event_loop().time()), + "model": getattr(request, 'model', 'mock-echo-model'), + "choices": [{ + "index": 0, + "delta": { + "role": "assistant", + "content": echo_text + }, + "finish_reason": "stop" + }] + } + yield f"data: {json.dumps(chunk_data)}\n\n" + yield "data: [DONE]\n\n" else: - has_image = False - res = vllm_engine_request.model_dump() - res.update({"has_image": has_image}) - return json.dumps(res) - - async def generate(self, vllm_engine_request: VLLMGenerationRequest): - yield LLMRawResponse( - generated_text=self._convert_to_json(vllm_engine_request), - num_input_tokens=0, - num_input_tokens_batch=0, - num_generated_tokens=1, - preprocessing_time=0, - generation_time=0.01, - finish_reason=FinishReason.STOP, - logprobs=None, - ) - - -class MockMultiplexEngine(LLMEngine): - def __init__(self, *args, **kwargs): - self.started = False - - @staticmethod - async def initialize_node(llm_config: LLMConfig) -> InitializeNodeOutput: - return InitializeNodeOutput( - placement_group=None, - runtime_env={}, - extra_init_kwargs={}, - ) - - async def prepare_request( - self, - request_id: str, - prompt: Prompt, - stream: bool, - disk_lora_model: Optional[DiskMultiplexConfig] = None, - ) -> VLLMGenerationRequest: - - if isinstance(prompt.prompt, list): - # Simplification: Assume prompt is a list of messages with one user message - assert len(prompt.prompt) == 1 - assert hasattr(prompt.prompt[0], "content") - prompt_text = prompt.prompt[0].content + # Return as response object + choice = { + "index": 0, + "message": { 
+ "role": "assistant", + "content": echo_text + }, + "finish_reason": "stop" + } + + response = ChatCompletionResponse( + id=f"chatcmpl-echo-{random.randint(1000, 9999)}", + object="chat.completion", + created=int(asyncio.get_event_loop().time()), + model=getattr(request, 'model', 'mock-echo-model'), + choices=[choice] + ) + + yield response + + async def completions(self, request: CompletionRequest) -> AsyncGenerator[Union[str, CompletionResponse, ErrorResponse], None]: + """Echo the completion request information.""" + if not self.started: + raise RuntimeError("Engine not started") + + request_info = { + "request_type": "completion", + "model": getattr(request, 'model', None), + "prompt": getattr(request, 'prompt', None), + "max_tokens": getattr(request, 'max_tokens', None), + "temperature": getattr(request, 'temperature', None), + "stream": getattr(request, 'stream', False), + "current_lora_model": self._current_lora_model.model_dump() if self._current_lora_model else None + } + + echo_text = json.dumps(request_info, indent=2) + + if request.stream: + # Return as SSE for streaming + chunk_data = { + "id": f"cmpl-echo-{random.randint(1000, 9999)}", + "object": "text_completion", + "created": int(asyncio.get_event_loop().time()), + "model": getattr(request, 'model', 'mock-echo-model'), + "choices": [{ + "index": 0, + "text": echo_text, + "finish_reason": "stop" + }] + } + yield f"data: {json.dumps(chunk_data)}\n\n" + yield "data: [DONE]\n\n" else: - prompt_text = prompt.prompt - - output = VLLMGenerationRequest( - request_id=request_id, - prompt=prompt_text, - stream=stream, - sampling_params=VLLMSamplingParams.from_prompt(prompt), - disk_multiplex_config=disk_lora_model, + # Return as response object + choice = { + "index": 0, + "text": echo_text, + "finish_reason": "stop" + } + + response = CompletionResponse( + id=f"cmpl-echo-{random.randint(1000, 9999)}", + object="text_completion", + created=int(asyncio.get_event_loop().time()), + model=getattr(request, 'model', 'mock-echo-model'), + choices=[choice] + ) + + yield response + + async def embeddings(self, request: EmbeddingRequest) -> AsyncGenerator[Union[str, EmbeddingResponse, ErrorResponse], None]: + """Echo the embedding request information.""" + if not self.started: + raise RuntimeError("Engine not started") + + request_info = { + "request_type": "embedding", + "model": getattr(request, 'model', None), + "input": getattr(request, 'input', None), + "encoding_format": getattr(request, 'encoding_format', None), + "dimensions": getattr(request, 'dimensions', None), + "current_lora_model": self._current_lora_model.model_dump() if self._current_lora_model else None + } + + # Return request info as mock embedding + echo_text = json.dumps(request_info, indent=2) + mock_embedding = [float(ord(c)) for c in echo_text[:10]] # Mock embedding from first 10 chars + + response = EmbeddingResponse( + object="list", + data=[{ + "object": "embedding", + "embedding": mock_embedding, + "index": 0 + }], + model=getattr(request, 'model', 'mock-echo-model'), + usage={ + "prompt_tokens": len(str(request.input).split()), + "total_tokens": len(str(request.input).split()) + } ) - return output + + yield response - async def start(self): - self.started = True - - async def generate(self, arg): - assert self.started, "Engine was not started" - yield arg - - async def check_health(self): - return True +class MockMultiplexEngine(MockVLLMEngine): + """Mock engine for testing multiplex/LoRA functionality.""" -class FakeLoraModelLoader: - async def load_model( - 
self, lora_model_id: str, llm_config: LLMConfig - ) -> DiskMultiplexConfig: - return DiskMultiplexConfig.model_validate( - { - "model_id": lora_model_id, - "max_total_tokens": llm_config.max_request_context_length, - "local_path": "/local/path", - "lora_assigned_int_id": 1, + def __init__(self, llm_config: LLMConfig): + super().__init__(llm_config) + self.loaded_lora_models: List[DiskMultiplexConfig] = [] + + async def resolve_lora(self, lora_model: DiskMultiplexConfig): + """Mock LoRA model loading.""" + self._current_lora_model = lora_model + # Keep track of loaded models + if lora_model not in self.loaded_lora_models: + self.loaded_lora_models.append(lora_model) + + async def chat(self, request: ChatCompletionRequest) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: + """Chat with multiplex information.""" + if not self.started: + raise RuntimeError("Engine not started") + + # Include multiplex info in response + lora_info = "" + if self._current_lora_model: + lora_info = f" [LoRA: {self._current_lora_model.model_id}]" + + generated_text = f"Mock multiplex response{lora_info}" + + if request.stream: + # Return as SSE for streaming + chunk_data = { + "id": f"chatcmpl-multiplex-{random.randint(1000, 9999)}", + "object": "chat.completion.chunk", + "created": int(asyncio.get_event_loop().time()), + "model": getattr(request, 'model', 'mock-multiplex-model'), + "choices": [{ + "index": 0, + "delta": { + "role": "assistant", + "content": generated_text + }, + "finish_reason": "stop" + }] } - ) + yield f"data: {json.dumps(chunk_data)}\n\n" + yield "data: [DONE]\n\n" + else: + # Return as response object + choice = { + "index": 0, + "message": { + "role": "assistant", + "content": generated_text + }, + "finish_reason": "stop" + } + + response = ChatCompletionResponse( + id=f"chatcmpl-multiplex-{random.randint(1000, 9999)}", + object="chat.completion", + created=int(asyncio.get_event_loop().time()), + model=getattr(request, 'model', 'mock-multiplex-model'), + choices=[choice] + ) + + yield response class MockJSONModeVLLMEngine(MockVLLMEngine): - async def generate_text(self, max_tokens, prompt_len): - generation_time = 0.001 - async for i in self.async_range(max_tokens): - if i == max_tokens - 1: - finish_reason = FinishReason.STOP - else: - finish_reason = None - llm_response = LLMRawResponse( - generated_text=f"test_{i} ", - num_input_tokens=prompt_len, - num_input_tokens_batch=prompt_len, - num_generated_tokens=1, - preprocessing_time=0, - generation_time=generation_time, - finish_reason=finish_reason, - ) - yield llm_response - await asyncio.sleep(generation_time) - - async def generate_json(self, json_schema, max_tokens, prompt_len): - random_valid_json = str(generate_from_schema(json_schema)) - # the json has double quotes where single quotes should be and single quotes where double quotes should be: - random_valid_json = random_valid_json.replace("'", '"') - - tokens = split_string_into_chunks(random_valid_json, max_tokens) - - generation_time = 0.001 - async for i in self.async_range(max_tokens): - finish_reason = None - if i == max_tokens - 1: - finish_reason = FinishReason.STOP - - generated_text = tokens[i] - llm_response = LLMRawResponse( - generated_text=generated_text, - num_input_tokens=prompt_len, - num_input_tokens_batch=prompt_len, - num_generated_tokens=1, - preprocessing_time=0, - generation_time=generation_time, - finish_reason=finish_reason, - ) - yield llm_response - await asyncio.sleep(generation_time) - - async def generate(self, 
vllm_engine_request: VLLMGenerationRequest): - sampling_params = self._parse_sampling_params( - vllm_engine_request.sampling_params + """Mock engine that generates valid JSON responses when JSON mode is requested.""" + + async def chat(self, request: ChatCompletionRequest) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: + """Generate JSON or text response based on request format.""" + if not self.started: + raise RuntimeError("Engine not started") + + # Check if JSON mode is requested + response_format = getattr(request, 'response_format', None) + is_json_mode = ( + response_format and + hasattr(response_format, 'type') and + response_format.type == "json_object" ) - max_tokens = sampling_params.max_tokens - if not max_tokens: - max_tokens = randint(1, 10) - prompt = vllm_engine_request.prompt - prompt_len = get_prompt_length(prompt) - response_format = sampling_params.response_format - if response_format and isinstance(response_format, ResponseFormatJsonObject): - response_format = sampling_params.response_format - generator = self.generate_json( - response_format.json_schema, - max_tokens=max_tokens, - prompt_len=prompt_len, - ) + + if is_json_mode: + # Generate valid JSON based on schema if provided + if hasattr(response_format, 'json_schema') and response_format.json_schema: + try: + # Use the schema to generate a valid response + json_response = generate_from_schema(response_format.json_schema) + generated_text = json.dumps(json_response, ensure_ascii=False) + except Exception as e: + # Fallback to default JSON if schema generation fails + json_response = { + "error": f"Schema generation failed: {str(e)}", + "schema_provided": bool(response_format.json_schema), + "fallback_response": True + } + generated_text = json.dumps(json_response, indent=2) + else: + # Default JSON response when no schema is provided + json_response = { + "message": "This is a mock JSON response", + "timestamp": int(asyncio.get_event_loop().time()), + "request_info": { + "model": getattr(request, 'model', 'unknown'), + "has_messages": bool(getattr(request, 'messages', [])), + "lora_model": self._current_lora_model.model_id if self._current_lora_model else None + } + } + generated_text = json.dumps(json_response, indent=2) else: - generator = self.generate_text(max_tokens=max_tokens, prompt_len=prompt_len) - async for x in generator: - yield x - - def _parse_sampling_params( - self, sampling_params: VLLMSamplingParams - ) -> VLLMInternalSamplingParams: - new_sampling_params = super()._parse_sampling_params(sampling_params) - new_sampling_params.response_format = sampling_params.response_format - return new_sampling_params - - -class MockPDDisaggVLLMEngineClient(EngineClient): - """ - Mock vllm EngineClient that supports PD Disaggregation. 
- """ - - def __init__(self, vllm_config: VllmConfig): - self._llm_config = vllm_config - self._model_config = vllm_config.model_config - - @property - def kv_transfer_config(self): - # https://github.com/vllm-project/vllm/blob/980a172474fa0f32433dda87ae1fa4aadba24c51/vllm/config.py#L4061 - kv_transfer_config = self._llm_config.kv_transfer_config - if kv_transfer_config is not None: - assert isinstance(kv_transfer_config, KVTransferConfig) - return kv_transfer_config - - @staticmethod - async def async_range(count): - for i in range(count): - yield i - await asyncio.sleep(0.0) - - def is_running(self) -> bool: - return True + # Generate regular text + generated_text = "Mock response from JSON mode engine" + + if request.stream: + # Return as SSE for streaming with realistic JSON chunking + request_id = f"chatcmpl-json-{random.randint(1000, 9999)}" + created_time = int(asyncio.get_event_loop().time()) + model_name = getattr(request, 'model', 'mock-json-model') + + if is_json_mode: + # For JSON streaming, split the JSON into realistic chunks + # This simulates how a real LLM would generate JSON token by token + max_chunk_size = 10 # Characters per chunk + chunks = [generated_text[i:i+max_chunk_size] for i in range(0, len(generated_text), max_chunk_size)] + + for i, chunk in enumerate(chunks): + chunk_data = { + "id": request_id, + "object": "chat.completion.chunk", + "created": created_time, + "model": model_name, + "choices": [{ + "index": 0, + "delta": { + "content": chunk, + "role": "assistant" if i == 0 else None + }, + "finish_reason": "stop" if i == len(chunks) - 1 else None + }] + } + yield f"data: {json.dumps(chunk_data)}\n\n" + await asyncio.sleep(0.01) # Simulate processing time + else: + # For non-JSON streaming, return as single chunk + chunk_data = { + "id": request_id, + "object": "chat.completion.chunk", + "created": created_time, + "model": model_name, + "choices": [{ + "index": 0, + "delta": { + "role": "assistant", + "content": generated_text + }, + "finish_reason": "stop" + }] + } + yield f"data: {json.dumps(chunk_data)}\n\n" + + # Send final [DONE] message + yield "data: [DONE]\n\n" + else: + # Return as response object + choice = { + "index": 0, + "message": { + "role": "assistant", + "content": generated_text + }, + "finish_reason": "stop" + } + + response = ChatCompletionResponse( + id=f"chatcmpl-json-{random.randint(1000, 9999)}", + object="chat.completion", + created=int(asyncio.get_event_loop().time()), + model=getattr(request, 'model', 'mock-json-model'), + choices=[choice] + ) + + yield response - @property - def is_stopped(self) -> bool: - return False - @property - def errored(self) -> bool: - return False +class MockPDDisaggVLLMEngine(MockVLLMEngine): + """Mock engine for testing Prefill/Decode disaggregated functionality.""" - @property - def dead_error(self) -> BaseException: - return None + def __init__(self, llm_config: LLMConfig): + super().__init__(llm_config) + self.prefill_cache = {} + self.kv_transfer_enabled = False - def generate( - self, - prompt: PromptType, - sampling_params: VLLMInternalSamplingParams, - request_id: str, - **kwargs, - ) -> AsyncGenerator[RequestOutput, None]: - """Generate outputs for a request.""" - max_tokens = sampling_params.max_tokens or randint(1, 10) - - # vLLM uses `extra_args` to pass in `kv_transfer_params`: - # https://github.com/vllm-project/vllm/blob/980a172474fa0f32433dda87ae1fa4aadba24c51/vllm/v1/request.py#L65 - kv_transfer_params = None - if ( - self.kv_transfer_config is not 
None - and KV_TRANSFER_PARAMS_KEY in sampling_params.extra_args - ): - # For now we don't test the items in request/response, so just pass empty dict. - kv_transfer_params = {} # noqa: F841 - - async def generate_response(): - # vLLM EngineClient spits accumulated output in the response. - # ray serve's engine spits output in chunk. - accumulated_output = "" - async for i in self.async_range(max_tokens): - accumulated_output += f"mock_pd_client_response_{i} " - yield RequestOutput( - finished=(i == max_tokens - 1), - request_id=request_id, - prompt=prompt, - prompt_token_ids=[i], - prompt_logprobs=[0.0], - outputs=[ - CompletionOutput( - index=i, - text=accumulated_output, - token_ids=[i], - cumulative_logprob=None, - logprobs=None, - ) - ], - kv_transfer_params=kv_transfer_params, - ) - - return generate_response() - - def encode( - self, - prompt: PromptType, - request_id: str, - **kwargs, - ) -> AsyncGenerator: - """Generate outputs for a request from a pooling model.""" - raise NotImplementedError("Not expected to be reached") - - async def abort(self, request_id: str) -> None: - """Abort a request. + async def start(self): + """Start with disaggregation support.""" + await super().start() + # Mock enabling KV transfer + self.kv_transfer_enabled = True + + async def chat(self, request: ChatCompletionRequest) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: + """Chat with disaggregation simulation.""" + if not self.started: + raise RuntimeError("Engine not started") + + # Simulate prefill/decode disaggregation + request_id = getattr(request, 'request_id', f"req-{random.randint(1000, 9999)}") + + # Mock prefill phase + prompt_text = "" + if hasattr(request, 'messages') and request.messages: + for message in request.messages: + if hasattr(message, 'content') and message.content: + prompt_text += str(message.content) + " " + + # Cache prefill result + self.prefill_cache[request_id] = { + "prompt": prompt_text.strip(), + "kv_cache": f"mock_kv_cache_{len(prompt_text)}" + } + + # Mock decode phase + generated_text = f"Mock PD disagg response [cached: {request_id}]" + if self.kv_transfer_enabled: + generated_text += " [KV transfer enabled]" + + if request.stream: + # Return as SSE for streaming + chunk_data = { + "id": f"chatcmpl-pd-{request_id}", + "object": "chat.completion.chunk", + "created": int(asyncio.get_event_loop().time()), + "model": getattr(request, 'model', 'mock-pd-model'), + "choices": [{ + "index": 0, + "delta": { + "role": "assistant", + "content": generated_text + }, + "finish_reason": "stop" + }] + } + yield f"data: {json.dumps(chunk_data)}\n\n" + yield "data: [DONE]\n\n" + else: + # Return as response object + choice = { + "index": 0, + "message": { + "role": "assistant", + "content": generated_text + }, + "finish_reason": "stop" + } + + response = ChatCompletionResponse( + id=f"chatcmpl-pd-{request_id}", + object="chat.completion", + created=int(asyncio.get_event_loop().time()), + model=getattr(request, 'model', 'mock-pd-model'), + choices=[choice] + ) + + yield response - Args: - request_id: The unique id of the request. 
- """ - return - - async def get_vllm_config(self): - """Get the vllm configuration of the vLLM engine.""" - return self._llm_config - - async def get_model_config(self): - """Get the model configuration of the vLLM engine.""" - return self._model_config - - async def get_decoding_config(self): - """Get the decoding configuration of the vLLM engine.""" - raise NotImplementedError("Not expected to be reached") - - async def get_input_preprocessor(self): - """Get the input processor of the vLLM engine.""" - raise NotImplementedError("Not expected to be reached") - - async def get_tokenizer( - self, - lora_request=None, - ) -> any: - """Get the appropriate tokenizer for the request""" - return AutoTokenizer.from_pretrained(self._model_config.model) - - async def is_tracing_enabled(self) -> bool: - """Check if tracing is enabled""" - raise NotImplementedError("Not expected to be reached") - - async def do_log_stats( - self, - scheduler_outputs=None, - model_output=None, - ) -> None: - raise NotImplementedError("Not expected to be reached") - async def check_health(self) -> None: - """Raise if unhealthy""" - return - - async def start_profile(self) -> None: - """Start profiling the engine""" - raise NotImplementedError("Not expected to be reached") - - async def stop_profile(self) -> None: - """Start profiling the engine""" - raise NotImplementedError("Not expected to be reached") - - async def reset_prefix_cache(self, device=None) -> None: - """Reset the prefix cache""" - raise NotImplementedError("Not expected to be reached") - - async def sleep(self, level: int = 1) -> None: - """Sleep the engine""" - raise NotImplementedError("Not expected to be reached") - - async def wake_up(self, tags: Optional[list[str]] = None) -> None: - """Wake up the engine""" - raise NotImplementedError("Not expected to be reached") - - async def is_sleeping(self) -> bool: - """Check whether the engine is sleeping""" - raise NotImplementedError("Not expected to be reached") - - async def add_lora(self, lora_request) -> None: - """Load a new LoRA adapter into the engine for future requests.""" - raise NotImplementedError("Not expected to be reached") - - async def reset_mm_cache(self) -> None: - """Reset the multi-modal cache""" - raise NotImplementedError("Not expected to be reached") - - -class MockPDDisaggVLLMEngine(VLLMEngine): - async def _start_engine(self) -> EngineClient: - return MockPDDisaggVLLMEngineClient( - VllmConfig( - model_config=ModelConfig( - model=self.llm_config.model_loading_config.model_id, - task="auto", - tokenizer=self.llm_config.model_loading_config.model_id, - tokenizer_mode="auto", - trust_remote_code=False, - dtype="auto", - seed=0, - ), - device_config=DeviceConfig( - device="cpu", - ), - ) +class FakeLoraModelLoader: + """Fake LoRA model loader for testing.""" + + async def load_model(self, lora_model_id: str, llm_config: LLMConfig) -> DiskMultiplexConfig: + """Load a fake LoRA model.""" + return DiskMultiplexConfig( + model_id=lora_model_id, + max_total_tokens=llm_config.max_request_context_length, + local_path="/fake/local/path", + lora_assigned_int_id=random.randint(1, 100), ) -def generate_from_schema(schema): +# Utility functions for JSON generation and validation +def generate_from_schema(schema: dict) -> Any: + """Generate mock data from JSON schema.""" if "type" not in schema: raise ValueError("Schema must have a 'type' property") - # Check for enum and return a random value from it + # Handle enum values first (takes precedence over type) if "enum" in schema: - return 
schema["enum"][0] + return random.choice(schema["enum"]) + + # Handle const values + if "const" in schema: + return schema["const"] - if schema["type"] == "object": + schema_type = schema["type"] + + if schema_type == "object": obj = {} - for prop, prop_schema in schema.get("properties", {}).items(): - obj[prop] = generate_from_schema(prop_schema) + properties = schema.get("properties", {}) + required = schema.get("required", []) + + # Generate required properties first + for prop in required: + if prop in properties: + obj[prop] = generate_from_schema(properties[prop]) + + # Generate optional properties (randomly include some) + for prop, prop_schema in properties.items(): + if prop not in obj and random.choice([True, False]): + obj[prop] = generate_from_schema(prop_schema) + return obj - elif schema["type"] == "array": - item_schema = schema.get("items", {}) - return [generate_from_schema(item_schema) for _ in range(random.randint(1, 3))] - - elif schema["type"] == "string": - return "sample_string" + elif schema_type == "array": + item_schema = schema.get("items", {"type": "string"}) + min_items = schema.get("minItems", 1) + max_items = schema.get("maxItems", 5) + array_length = random.randint(min_items, max_items) + + return [generate_from_schema(item_schema) for _ in range(array_length)] + + elif schema_type == "string": + # Handle string patterns and formats + if "pattern" in schema: + # For testing purposes, return a string that might match common patterns + pattern = schema["pattern"] + if "email" in pattern.lower() or "@" in pattern: + return "test@example.com" + elif "phone" in pattern.lower() or "\\d" in pattern: + return "123-456-7890" + else: + return "pattern_match_string" + + if "format" in schema: + format_type = schema["format"] + if format_type == "email": + return "test@example.com" + elif format_type == "date": + return "2024-01-15" + elif format_type == "date-time": + return "2024-01-15T10:30:00Z" + elif format_type == "uri": + return "https://example.com" + elif format_type == "uuid": + return "550e8400-e29b-41d4-a716-446655440000" + + # Handle string length constraints + min_length = schema.get("minLength", 1) + max_length = schema.get("maxLength", 20) + base_string = "mock_string_value" + + if max_length < len(base_string): + return base_string[:max_length] + elif min_length > len(base_string): + return base_string + "x" * (min_length - len(base_string)) + else: + return base_string - elif schema["type"] == "integer": - return random.randint(0, 100) + elif schema_type == "integer": + minimum = schema.get("minimum", 0) + maximum = schema.get("maximum", 100) + return random.randint(minimum, maximum) - elif schema["type"] == "number": - return random.uniform(0, 100) + elif schema_type == "number": + minimum = schema.get("minimum", 0.0) + maximum = schema.get("maximum", 100.0) + return random.uniform(minimum, maximum) - elif schema["type"] == "boolean": + elif schema_type == "boolean": return random.choice([True, False]) + elif schema_type == "null": + return None + + # Handle multiple types (anyOf, oneOf) + elif isinstance(schema_type, list): + chosen_type = random.choice(schema_type) + return generate_from_schema({"type": chosen_type}) + else: - raise ValueError(f"Unsupported type: {schema['type']}") + raise ValueError(f"Unsupported schema type: {schema_type}") -def split_string_into_chunks(s, n): +def validate_json_schema_response(response_text: str, schema: dict) -> bool: + """ + Validate that a JSON response conforms to the provided schema. 
+ This is a simple validation for testing purposes. + """ + try: + data = json.loads(response_text) + # Basic validation - in a real implementation you'd use jsonschema library + return _validate_against_schema(data, schema) + except (json.JSONDecodeError, Exception): + return False + + +def _validate_against_schema(data: Any, schema: dict) -> bool: + """Helper function for basic schema validation.""" + schema_type = schema.get("type") + + if schema_type == "object" and isinstance(data, dict): + # Check required properties + required = schema.get("required", []) + for prop in required: + if prop not in data: + return False + + # Check property types + properties = schema.get("properties", {}) + for prop, value in data.items(): + if prop in properties: + if not _validate_against_schema(value, properties[prop]): + return False + return True + + elif schema_type == "array" and isinstance(data, list): + item_schema = schema.get("items", {}) + return all(_validate_against_schema(item, item_schema) for item in data) + + elif schema_type == "string" and isinstance(data, str): + return True + + elif schema_type == "integer" and isinstance(data, int): + return True + + elif schema_type == "number" and isinstance(data, (int, float)): + return True + + elif schema_type == "boolean" and isinstance(data, bool): + return True + + elif schema_type == "null" and data is None: + return True + + return False + + +def split_string_into_chunks(s: str, n: int) -> List[str]: + """Split string into n chunks.""" if n <= 0: raise ValueError("Number of chunks must be greater than 0") @@ -644,5 +853,11 @@ def split_string_into_chunks(s, n): return chunks -def get_prompt_length(prompt): - return len(prompt.split()) if isinstance(prompt, str) else len(prompt) +def get_prompt_length(prompt: Union[str, List[str], List[int]]) -> int: + """Get the length of a prompt.""" + if isinstance(prompt, str): + return len(prompt.split()) + elif isinstance(prompt, list): + return len(prompt) + else: + return 0 From f385cf2f617dee9a57ba411ea91135c454cbfe78 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 2 Jul 2025 10:30:18 -0700 Subject: [PATCH 23/37] testing llm_server now with refactor testing utils for more consistent testing Signed-off-by: Kourosh Hakhamaneshi --- .../serve/deployments/llm/llm_server.py | 58 +- python/ray/llm/tests/serve/conftest.py | 51 + .../test_lora_deployment_base_client.py | 10 +- .../cpu/deployments/llm/test_llm_engine.py | 139 +- .../cpu/deployments/llm/test_llm_server.py | 503 ++----- .../llm/tests/serve/mocks/mock_vllm_engine.py | 1158 ++++++++--------- python/ray/llm/tests/serve/utils/__init__.py | 1 + .../llm/tests/serve/utils/testing_utils.py | 82 ++ 8 files changed, 874 insertions(+), 1128 deletions(-) create mode 100644 python/ray/llm/tests/serve/utils/__init__.py create mode 100644 python/ray/llm/tests/serve/utils/testing_utils.py diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py index c38a5be875a3..3a0e68ae5aa4 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py @@ -1,7 +1,7 @@ import asyncio import os from abc import ABC, abstractmethod -from typing import Any, Dict, Optional, Type, Union, AsyncGenerator +from typing import Any, Dict, Optional, Type, Union, AsyncGenerator, List # Third-party imports from ray import serve @@ -17,7 +17,9 @@ ) from ray.llm._internal.serve.configs.openai_api_models import ( 
ChatCompletionRequest, + ChatCompletionResponse, CompletionRequest, + CompletionResponse, EmbeddingRequest, EmbeddingResponse, LLMChatResponse, @@ -172,11 +174,22 @@ def _batch_output_stream(self, generator): ).stream() - async def _run_request(self, request, *, engine_method: str, batch_output_stream: bool = False) -> AsyncGenerator[Any, None]: - """Run the stream flow for the request.""" + async def _run_request(self, request: Union[ChatCompletionRequest, CompletionRequest, EmbeddingRequest], *, engine_method: str, batch_output_stream: bool = False) -> AsyncGenerator[Any, None]: + """Run the engine method on the request + perform batching when stream=True. + + Args: + request: The request to run. + engine_method: The method to call on the engine. + batch_output_stream: Whether to batch the output stream. + + Returns: + An AsyncGenerator of the response. If stream is True and batching is enabled, then the generator will yield a list of streaming responses (strings of the format data: {response_json}\n\n). Otherwise, it will yield the non-streaming response from engine directly. + """ await self._maybe_add_request_id_to_request(request) await self._maybe_resolve_lora_from_multiplex() - if batch_output_stream: + + is_stream = hasattr(request, "stream") and request.stream + if is_stream and batch_output_stream: stream = self._batch_output_stream( getattr(self.engine, engine_method)(request) ) @@ -185,28 +198,44 @@ async def _run_request(self, request, *, engine_method: str, batch_output_stream return stream - async def chat(self, request: ChatCompletionRequest): + async def chat(self, request: ChatCompletionRequest) -> \ + AsyncGenerator[Union[List[str], ChatCompletionResponse], None]: """Runs a chat request to the LLM engine and returns the response. Args: request: A ChatCompletionRequest object. Returns: - A LLMChatResponse object. + An AsyncGenerator of the response. If stream is True and batching is enabled, then the generator will yield a list of chat streaming responses (strings of the format data: {response_json}\n\n). Otherwise, it will yield the ChatCompletionResponse object directly. """ return await self._run_request(request, engine_method="chat", batch_output_stream=True) - async def completions(self, request: CompletionRequest) -> LLMCompletionsResponse: + async def completions(self, request: CompletionRequest) -> \ + AsyncGenerator[Union[List[str], CompletionResponse], None]: """Runs a completion request to the LLM engine and returns the response. Args: request: A CompletionRequest object. Returns: - A LLMCompletionsResponse object. + An AsyncGenerator of the response. If stream is True and batching is enabled, then the generator will yield a list of completion streaming responses (strings of the format data: {response_json}\n\n). Otherwise, it will yield the CompletionResponse object directly. """ return await self._run_request(request, engine_method="completions", batch_output_stream=True) - + + + async def embeddings(self, request: EmbeddingRequest) -> AsyncGenerator[EmbeddingResponse, None]: + """Runs an embeddings request to the engine and returns the response. + + Returns an AsyncGenerator over the EmbeddingResponse object. This is so that the caller can have a consistent interface across all the methods of chat, completions, and embeddings. + + Args: + request: An EmbeddingRequest object. + + Returns: + An AsyncGenerator over the EmbeddingResponse object. + """ + # NOTE: Embeddings does not need batching. 
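+        #
+        # Illustrative sketch (not part of the patch itself): how a caller is
+        # expected to consume the generators returned by chat()/completions()/
+        # embeddings() after this refactor. `server` and `chat_request` below
+        # are hypothetical placeholders.
+        #
+        #   gen = await server.chat(chat_request)
+        #   if chat_request.stream:
+        #       # Streaming with batching enabled: each item is a *list* of SSE
+        #       # strings of the form "data: {response_json}\n\n".
+        #       async for batch in gen:
+        #           for sse_chunk in batch:
+        #               ...
+        #   else:
+        #       # Non-streaming: a single response object is yielded.
+        #       async for response in gen:
+        #           ...
+        #
+        # embeddings() always yields one EmbeddingResponse, since batching is
+        # skipped here.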
+ return await self._run_request(request, engine_method="embeddings", batch_output_stream=False) async def check_health(self) -> None: """ @@ -221,17 +250,6 @@ async def check_health(self) -> None: logger.error("Engine health check failed in LLMServer.check_health: %s", e) raise e - async def embeddings(self, request: EmbeddingRequest) -> LLMEmbeddingsResponse: - """Runs an embeddings request to the vllm engine, and return the response. - - Args: - request: An EmbeddingRequest object. - - Returns: - A LLMEmbeddingsResponse object. - """ - # NOTE: Embeddings does not need batching. - return await self._run_request(request, engine_method="embeddings", batch_output_stream=False) async def llm_config(self) -> Optional[LLMConfig]: return self._llm_config diff --git a/python/ray/llm/tests/serve/conftest.py b/python/ray/llm/tests/serve/conftest.py index 4ca469db2bea..b3780f96dad6 100644 --- a/python/ray/llm/tests/serve/conftest.py +++ b/python/ray/llm/tests/serve/conftest.py @@ -14,6 +14,11 @@ from ray.llm._internal.serve.deployments.llm.vllm.vllm_models import ( VLLMEngineConfig, ) +from ray.llm._internal.serve.configs.openai_api_models import ( + ChatCompletionRequest, + CompletionRequest, + EmbeddingCompletionRequest, +) from ray.serve.llm import ( LLMConfig, LLMServer, @@ -62,6 +67,52 @@ def llm_config(model_pixtral_12b, disable_placement_bundles): ) +@pytest.fixture +def mock_llm_config(): + """LLM config for mock engine testing.""" + return LLMConfig( + model_loading_config=ModelLoadingConfig(model_id="mock-model"), + runtime_env={}, + log_engine_metrics=False, + ) + + +@pytest.fixture +def mock_chat_request(stream, max_tokens): + """Fixture for creating chat completion requests for mock testing.""" + return ChatCompletionRequest( + model="mock-model", + messages=[ + {"role": "user", "content": "Hello, world!"} + ], + max_tokens=max_tokens, + stream=stream, + ) + + +@pytest.fixture +def mock_completion_request(stream, max_tokens): + """Fixture for creating text completion requests for mock testing.""" + return CompletionRequest( + model="mock-model", + prompt="Complete this text:", + max_tokens=max_tokens, + stream=stream, + ) + + +@pytest.fixture +def mock_embedding_request(dimensions): + """Fixture for creating embedding requests for mock testing.""" + request = EmbeddingCompletionRequest( + model="mock-model", + input="Text to embed", + ) + if dimensions: + request.dimensions = dimensions + return request + + def get_test_model_path(yaml_file: str) -> pathlib.Path: current_file_dir = pathlib.Path(__file__).absolute().parent test_model_path = current_file_dir / yaml_file diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/multiplex/test_lora_deployment_base_client.py b/python/ray/llm/tests/serve/cpu/deployments/llm/multiplex/test_lora_deployment_base_client.py index 282130cefa20..ec2e75bbf267 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/multiplex/test_lora_deployment_base_client.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/multiplex/test_lora_deployment_base_client.py @@ -6,9 +6,9 @@ from fastapi import HTTPException from ray import serve -from ray.llm._internal.serve.configs.server_models import ModelData +from ray.llm._internal.serve.configs.openai_api_models import ModelCard from ray.llm._internal.serve.deployments.llm.llm_server import LLMDeployment -from ray.llm.tests.serve.mocks.mock_vllm_engine import MockEchoVLLMEngine +from ray.llm.tests.serve.mocks.mock_vllm_engine import MockMultiplexEngine from ray.serve.handle import DeploymentHandle from 
ray.serve.llm import LLMConfig, LLMRouter, LoraConfig @@ -57,7 +57,7 @@ def get_mocked_llm_deployments(llm_configs) -> List[DeploymentHandle]: llm_deployments.append( deployment.bind( llm_config=llm_config, - engine_cls=MockEchoVLLMEngine, + engine_cls=MockMultiplexEngine, ) ) return llm_deployments @@ -97,7 +97,7 @@ async def test_lora_get_model(shutdown_ray_and_serve, disable_placement_bundles) # Case 2: Model has only the base model config. base_model_config = await router_handle.model.remote(base_model_id) - assert isinstance(base_model_config, ModelData) + assert isinstance(base_model_config, ModelCard) base_model_data = base_model_config.model_dump() assert base_model_data["id"] == base_model_id base_model_config = base_model_data["rayllm_metadata"] @@ -122,7 +122,7 @@ async def fake_get_lora_model_metadata(*args, **kwargs): router_handle = serve.run(router_deployment) lora_model_config = await router_handle.model.remote(lora_model) - assert isinstance(lora_model_config, ModelData) + assert isinstance(lora_model_config, ModelCard) lora_model_data = lora_model_config.model_dump() assert lora_model_data["id"] == lora_model lora_metadata = lora_model_data["rayllm_metadata"] diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py index fc90517fdb61..cdc44c95077d 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py @@ -14,6 +14,7 @@ """ from ray.llm.tests.serve.mocks.mock_vllm_engine import MockVLLMEngine +from ray.llm.tests.serve.utils.testing_utils import LLMResponseValidator from ray.serve.llm import LLMConfig, ModelLoadingConfig from ray.llm._internal.serve.configs.openai_api_models import ( ChatCompletionRequest, @@ -24,153 +25,39 @@ EmbeddingResponse ) import pytest -import re -import json -from typing import Union, List, AsyncGenerator, Optional +from typing import AsyncGenerator, Optional -class LLMResponseValidator: - """Reusable validation logic for LLM responses.""" - - @staticmethod - def get_expected_content(api_type: str, max_tokens: int) -> str: - """Get expected content based on API type.""" - return " ".join(f"test_{i}" for i in range(max_tokens)) - - @staticmethod - def validate_non_streaming_response( - response: Union[ChatCompletionResponse, CompletionResponse], - api_type: str, - max_tokens: int - ): - """Validate non-streaming responses.""" - expected_content = LLMResponseValidator.get_expected_content(api_type, max_tokens) - - if api_type == "chat": - assert isinstance(response, ChatCompletionResponse) - assert response.choices[0].message.content == expected_content - elif api_type == "completion": - assert isinstance(response, CompletionResponse) - assert response.choices[0].text == expected_content - - @staticmethod - def validate_streaming_chunks( - chunks: List[str], - api_type: str, - max_tokens: int - ): - """Validate streaming response chunks.""" - # Should have max_tokens + 1 chunks (tokens + [DONE]) - assert len(chunks) == max_tokens + 1 - - # Validate each chunk except the last [DONE] chunk - for chunk_iter, chunk in enumerate(chunks[:-1]): - pattern = r"data: (.*)\n\n" - match = re.match(pattern, chunk) - assert match is not None - chunk_data = json.loads(match.group(1)) - - if api_type == "chat": - delta = chunk_data["choices"][0]["delta"] - if chunk_iter == 0: - assert delta["role"] == "assistant" - else: - assert delta["role"] is None - assert 
delta["content"].strip() == f"test_{chunk_iter}" - elif api_type == "completion": - text = chunk_data["choices"][0]["text"] - assert text.strip() == f"test_{chunk_iter}" - - @staticmethod - def validate_embedding_response( - response: EmbeddingResponse, - expected_dimensions: Optional[int] = None - ): - """Validate embedding responses.""" - assert isinstance(response, EmbeddingResponse) - assert response.object == "list" - assert len(response.data) == 1 - assert response.data[0].object == "embedding" - assert isinstance(response.data[0].embedding, list) - assert len(response.data[0].embedding) > 0 # Should have some embedding dimensions - assert response.data[0].index == 0 - - # Check dimensions if specified - if expected_dimensions: - assert len(response.data[0].embedding) == expected_dimensions - - -@pytest.fixture -def llm_config(): - return LLMConfig( - model_loading_config=ModelLoadingConfig(model_id="mock-model"), - runtime_env={}, - log_engine_metrics=False, - ) - - -@pytest.fixture -def chat_request(stream, max_tokens): - """Fixture for creating chat completion requests.""" - return ChatCompletionRequest( - model="mock-model", - messages=[ - {"role": "user", "content": "Hello, world!"} - ], - max_tokens=max_tokens, - stream=stream, - ) - - -@pytest.fixture -def completion_request(stream, max_tokens): - """Fixture for creating text completion requests.""" - return CompletionRequest( - model="mock-model", - prompt="Complete this text:", - max_tokens=max_tokens, - stream=stream, - ) -@pytest.fixture -def embedding_request(dimensions): - """Fixture for creating embedding requests.""" - request = EmbeddingCompletionRequest( - model="mock-model", - input="Text to embed", - ) - if dimensions: - request.dimensions = dimensions - return request class TestMockLLMEngine: @pytest.mark.parametrize("api_type", ["chat", "completion"]) @pytest.mark.parametrize("stream", [False, True]) - @pytest.mark.parametrize("max_tokens", [5, 10, 15]) + @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.asyncio async def test_unified_llm_engine( self, - llm_config, - chat_request, - completion_request, + mock_llm_config, + mock_chat_request, + mock_completion_request, api_type: str, stream: bool, max_tokens: int ): """Unified test for both chat and completion APIs, streaming and non-streaming.""" # Create and start the engine - engine = MockVLLMEngine(llm_config) + engine = MockVLLMEngine(mock_llm_config) await engine.start() # Create request based on API type if api_type == "chat": - request = chat_request + request = mock_chat_request response_generator = engine.chat(request) elif api_type == "completion": - request = completion_request + request = mock_completion_request response_generator = engine.completions(request) print(f"\n\n_____ {api_type.upper()} ({'STREAMING' if stream else 'NON-STREAMING'}) max_tokens={max_tokens} _____\n\n") @@ -193,17 +80,17 @@ async def test_unified_llm_engine( @pytest.mark.asyncio async def test_embedding_mock_engine( self, - llm_config, - embedding_request, + mock_llm_config, + mock_embedding_request, dimensions: Optional[int] ): """Test embedding API with different dimensions.""" # Create and start the engine - engine = MockVLLMEngine(llm_config) + engine = MockVLLMEngine(mock_llm_config) await engine.start() # Create embedding request - request = embedding_request + request = mock_embedding_request print(f"\n\n_____ EMBEDDING dimensions={dimensions} _____\n\n") diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py 
b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py index 146aa7f96d8e..4a4ccbe7972d 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py @@ -1,427 +1,134 @@ import sys -from unittest.mock import AsyncMock +from typing import Optional import pytest -from ray.llm._internal.serve.configs.constants import MODEL_RESPONSE_BATCH_TIMEOUT_MS -from ray.llm._internal.serve.configs.openai_api_models import ( - ChatCompletionRequest, - CompletionRequest, - ErrorResponse, -) -from ray.llm._internal.serve.configs.server_models import ( - FinishReason, - LLMConfig, - LLMRawResponse, - ModelLoadingConfig, -) -from ray.llm._internal.serve.deployments.llm.llm_server import ( - ResponsePostprocessor, -) from ray.llm.tests.serve.mocks.mock_vllm_engine import MockVLLMEngine +from ray.llm.tests.serve.utils.testing_utils import LLMResponseValidator -async def stream_generator(): - yield LLMRawResponse( - generated_text="Hello", - num_generated_tokens=1, - num_generated_tokens_batch=1, - num_input_tokens=5, - finish_reason=None, - ) - yield LLMRawResponse( - generated_text=" world", - num_generated_tokens=1, - num_generated_tokens_batch=1, - num_input_tokens=5, - finish_reason=FinishReason.STOP, - ) - - -class TestResponsePostprocessor: - @pytest.mark.asyncio - async def test_process_chat_streaming(self): - """Test processing streaming chat responses.""" - postprocessor = ResponsePostprocessor() - model = "test_model" - - # Process the generator as a streaming chat response - response_gen = postprocessor.process_chat( - model, stream_generator(), stream=True - ) - - # Collect all responses - responses = [resp async for resp in response_gen] - - # Verify we got the expected responses - assert len(responses) >= 3 # Role message + content chunks + final message - assert ( - responses[0].choices[0].delta.role == "assistant" - ) # First message has role - assert ( - responses[1].choices[0].delta.content == "Hello" - ) # Second has first chunk - assert ( - responses[-1].choices[0].finish_reason == "stop" - ) # Last has finish reason - - @pytest.mark.asyncio - async def test_process_chat_non_streaming(self): - """Test processing non-streaming chat responses.""" - postprocessor = ResponsePostprocessor() - model = "test_model" - - # Process the generator as a non-streaming chat response - response_gen = postprocessor.process_chat( - model, stream_generator(), stream=False - ) - - # Collect the single response - responses = [resp async for resp in response_gen] - assert len(responses) == 1 - - # Verify the content of the response - response = responses[0] - assert response.choices[0].message.role == "assistant" - assert response.choices[0].message.content == "Hello world" - assert response.choices[0].finish_reason == "stop" - assert response.usage.prompt_tokens == 5 - assert response.usage.completion_tokens == 2 - assert response.usage.total_tokens == 7 - - @pytest.mark.asyncio - async def test_process_completions_streaming(self): - """Test processing streaming completion responses.""" - postprocessor = ResponsePostprocessor() - model = "test_model" - - # Process the generator as a streaming completion response - response_gen = postprocessor.process_completions( - model, stream_generator(), stream=True - ) - - # Collect all responses - responses = [resp async for resp in response_gen] - - # Verify we got the expected responses - assert len(responses) == 2 - assert responses[0].choices[0].text == "Hello" - 
assert responses[0].choices[0].finish_reason is None - assert responses[1].choices[0].text == " world" - assert responses[1].choices[0].finish_reason == "stop" - - @pytest.mark.asyncio - async def test_process_completions_non_streaming(self): - """Test processing non-streaming completion responses.""" - postprocessor = ResponsePostprocessor() - model = "test_model" - - # Process the generator as a non-streaming completion response - response_gen = postprocessor.process_completions( - model, stream_generator(), stream=False - ) - - # Collect the single response - responses = [resp async for resp in response_gen] - assert len(responses) == 1 - - # Verify the content of the response - response = responses[0] - assert response.choices[0].text == "Hello world" - assert response.choices[0].finish_reason == "stop" - assert response.usage.prompt_tokens == 5 - assert response.usage.completion_tokens == 2 - assert response.usage.total_tokens == 7 - - @pytest.mark.asyncio - async def test_error_handling(self): - """Test error handling in response streams.""" - postprocessor = ResponsePostprocessor() - model = "test_model" - - # Create a generator that raises an exception - - error_response = ErrorResponse( - message="Test error", - code=500, - internal_message="Test error", - type="Test error", - original_exception=Exception("Test error"), - ) - - async def gen(): - yield LLMRawResponse( - error=error_response, - ) - yield LLMRawResponse( - generated_text="Hello", - num_generated_tokens=1, - num_generated_tokens_batch=1, - num_input_tokens=5, - finish_reason=None, - ) - - # Process the generator as a non-streaming chat response - response_gen = postprocessor.process_chat(model, gen(), stream=False) - - # Collect the responses, should contain the error - responses = [resp async for resp in response_gen] - assert len(responses) == 1 - assert responses[0] == error_response - class TestLLMServer: - @pytest.mark.asyncio - async def test_get_batch_interval_ms(self, create_server): - """Test that the batch interval is set correctly in the config.""" - - # Test with a no stream_batching_interval_ms. - llm_config = LLMConfig( - model_loading_config=ModelLoadingConfig( - model_id="llm_model_id", - ), - ) - server = await create_server(llm_config, engine_cls=MockVLLMEngine) - - assert server._get_batch_interval_ms() == MODEL_RESPONSE_BATCH_TIMEOUT_MS - - # Test with a non-zero stream_batching_interval_ms. - llm_config = LLMConfig( - model_loading_config=ModelLoadingConfig( - model_id="llm_model_id", - ), - experimental_configs={ - "stream_batching_interval_ms": 13, - }, - ) - server = await create_server(llm_config, engine_cls=MockVLLMEngine) - assert server._get_batch_interval_ms() == 13 - - # Test with zero stream_batching_interval_ms. 
- llm_config = LLMConfig( - model_loading_config=ModelLoadingConfig( - model_id="llm_model_id", - ), - experimental_configs={ - "stream_batching_interval_ms": 0, - }, - ) - server = await create_server(llm_config, engine_cls=MockVLLMEngine) - assert server._get_batch_interval_ms() == 0 - - @pytest.mark.asyncio - async def test_chat_streaming(self, create_server): - """Test chat completion in streaming mode.""" - llm_config = LLMConfig( - model_loading_config=ModelLoadingConfig( - model_id="test_model", - ), - experimental_configs={ - # Maximum batching - "stream_batching_interval_ms": 10000, - }, - ) - - server = await create_server(llm_config, engine_cls=MockVLLMEngine) - - # Create a chat completion request - request = ChatCompletionRequest( - model="test_model", - messages=[dict(role="user", content="Hello")], - stream=True, - max_tokens=5, - ) - - # Get the response stream - response_stream = await server.chat(request) - - # Collect responses from the stream - responses = [] - async for response in response_stream: - responses.append(response) - - # Each response should be an iterator over ChatCompletionStreamResponse - # Check that we got responses - assert len(responses) > 0 - - text = "" - role = None - for response in responses: - assert isinstance(response, list) - for chunk in response: - if chunk.choices[0].delta.role is not None and role is None: - role = chunk.choices[0].delta.role - - text += chunk.choices[0].delta.content - - assert role == "assistant" - # What mock vllm engine returns - assert text == "test_0 test_1 test_2 test_3 test_4 " + @pytest.mark.parametrize("api_type", ["chat", "completion"]) + @pytest.mark.parametrize("stream", [False, True]) + @pytest.mark.parametrize("max_tokens", [5]) + @pytest.mark.parametrize("batching_interval_ms", [0, 10000]) @pytest.mark.asyncio - async def test_chat_non_streaming(self, create_server): - """Test non-streaming chat completion.""" - llm_config = LLMConfig( - model_loading_config=ModelLoadingConfig( - model_id="test_model", - ), - ) - - server = await create_server(llm_config, engine_cls=MockVLLMEngine) - - # Create a chat completion request - request = ChatCompletionRequest( - model="test_model", - messages=[dict(role="user", content="Hello")], - stream=False, - max_tokens=5, - ) - + async def test_unified_llm_server( + self, + create_server, + mock_llm_config, + mock_chat_request, + mock_completion_request, + api_type: str, + stream: bool, + max_tokens: int, + batching_interval_ms: int + ): + """Unified test for both chat and completion APIs, streaming and non-streaming.""" + # Override the batching interval config (only matters for streaming) + if stream: + mock_llm_config.experimental_configs = { + "stream_batching_interval_ms": batching_interval_ms, + } + + server = await create_server(mock_llm_config, engine_cls=MockVLLMEngine) + + # Create request based on API type + if api_type == "chat": + request = mock_chat_request + batched_chunks = await server.chat(request) + elif api_type == "completion": + request = mock_completion_request + batched_chunks = await server.completions(request) + + print(f"\n\n_____ {api_type.upper()} ({'STREAMING' if stream else 'NON-STREAMING'}) max_tokens={max_tokens} batching_interval_ms={batching_interval_ms} _____\n\n") + + if stream: + # Collect responses from the stream + chunks = [] + async for batch in batched_chunks: + chunks.extend(batch) + + # Check that we got responses + assert len(chunks) > 0 + + # Validate streaming response + LLMResponseValidator.validate_streaming_chunks(chunks, 
api_type, max_tokens) + else: + # Collect non-streaming response + chunks = [] + async for batch in batched_chunks: + chunks.append(batch) + + # Check that we got one response + assert len(chunks) == 1 + + # Validate non-streaming response + LLMResponseValidator.validate_non_streaming_response(chunks[0], api_type, max_tokens) + + @pytest.mark.parametrize("dimensions", [None, 512]) + @pytest.mark.asyncio + async def test_embedding_llm_server( + self, + create_server, + mock_llm_config, + mock_embedding_request, + dimensions: Optional[int] + ): + """Test embedding API from LLMServer perspective.""" + server = await create_server(mock_llm_config, engine_cls=MockVLLMEngine) + + # Create embedding request + request = mock_embedding_request + + print(f"\n\n_____ EMBEDDING SERVER dimensions={dimensions} _____\n\n") + # Get the response - response_stream = await server.chat(request) - + batched_chunks = await server.embeddings(request) + # Collect responses (should be just one) - responses = [] - async for response in response_stream: - responses.append(response) + chunks = [] + async for batch in batched_chunks: + chunks.append(batch) # Check that we got one response - assert len(responses) == 1 - assert responses[0].choices[0].message.role == "assistant" - assert ( - responses[0].choices[0].message.content - == "test_0 test_1 test_2 test_3 test_4 " - ) - assert responses[0].choices[0].finish_reason == "stop" - - @pytest.mark.asyncio - async def test_completions_streaming(self, create_server): - """Test streaming text completion.""" - llm_config = LLMConfig( - model_loading_config=ModelLoadingConfig( - model_id="test_model", - ), - experimental_configs={ - # Maximum batching - "stream_batching_interval_ms": 10000, - }, - ) - - server = await create_server(llm_config, engine_cls=MockVLLMEngine) - - # Create a completion request - request = CompletionRequest( - model="test_model", - prompt="Hello", - stream=True, - max_tokens=5, - ) - - # Get the response stream - response_stream = await server.completions(request) - - # Collect responses from the stream - responses = [] - async for response in response_stream: - responses.append(response) - - # Check that we got responses - assert len(responses) > 0 - - text = "" - for response in responses: - assert isinstance(response, list) - for chunk in response: - text += chunk.choices[0].text - - assert text == "test_0 test_1 test_2 test_3 test_4 " + assert len(chunks) == 1 + + # Validate embedding response + LLMResponseValidator.validate_embedding_response(chunks[0], dimensions) @pytest.mark.asyncio - async def test_completions_non_streaming(self, create_server): - """Test non-streaming text completion.""" - llm_config = LLMConfig( - model_loading_config=ModelLoadingConfig( - model_id="test_model", - ), - ) - - server = await create_server(llm_config, engine_cls=MockVLLMEngine) - - # Create a completion request - request = CompletionRequest( - model="test_model", - prompt="Hello", - stream=False, - max_tokens=5, - ) - - # Get the response - response_stream = await server.completions(request) - - # Collect responses (should be just one) - responses = [] - async for response in response_stream: - responses.append(response) - - # Check that we got one response - assert len(responses) == 1 - assert responses[0].choices[0].text == "test_0 test_1 test_2 test_3 test_4 " - assert responses[0].choices[0].finish_reason == "stop" - - @pytest.mark.asyncio - async def test_check_health(self, create_server): + async def test_check_health(self, create_server, 
mock_llm_config): """Test health check functionality.""" - llm_config = LLMConfig( - model_loading_config=ModelLoadingConfig( - model_id="test_model", - ), - ) + + # Mock the engine's check_health method + class LocalMockEngine(MockVLLMEngine): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.check_health_called = False + + async def check_health(self): + self.check_health_called = True # Create a server with a mocked engine - server = await create_server(llm_config, engine_cls=MockVLLMEngine) - - # Mock the engine's check_health method - server.engine.check_health = AsyncMock(return_value=None) + server = await create_server(mock_llm_config, engine_cls=LocalMockEngine) # Perform the health check, no exceptions should be raised await server.check_health() - server.engine.check_health.assert_called_once() - - @pytest.mark.asyncio - async def test_error_handling(self, create_server): - """Test error handling in the server.""" - llm_config = LLMConfig( - model_loading_config=ModelLoadingConfig( - model_id="test_model", - ), - ) - - server = await create_server(llm_config, engine_cls=MockVLLMEngine) - - # Mock the _predict method to raise an exception - server._predict = AsyncMock(side_effect=Exception("Test error")) - # Create a chat completion request - request = ChatCompletionRequest( - model="test_model", - messages=[dict(role="user", content="Hello")], - stream=False, - ) + # Check that the health check method was called + assert server.engine.check_health_called - # Get the response - response_stream = await server.chat(request) - - # Collect responses (should contain an error) - responses = [] - async for response in response_stream: - responses.append(response) - - # Check that we got an error response - assert len(responses) > 0 - assert isinstance(responses[0], ErrorResponse) + @pytest.mark.asyncio + async def test_llm_config_property(self, create_server, mock_llm_config): + """Test the llm_config property.""" + server = await create_server(mock_llm_config, engine_cls=MockVLLMEngine) + llm_config = await server.llm_config() + assert isinstance(llm_config, type(mock_llm_config)) - # Internal server error - assert responses[0].code == 500 if __name__ == "__main__": diff --git a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py index 54cc412945d3..bc82616aa44a 100644 --- a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py +++ b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py @@ -253,611 +253,611 @@ async def _generate_completion_response( yield response -class MockEchoVLLMEngine(MockVLLMEngine): - """Mock engine that responds with information about the request sent to it. +# class MockEchoVLLMEngine(MockVLLMEngine): +# """Mock engine that responds with information about the request sent to it. - Useful for testing the contents of requests created in data plane code. 
- """ - - async def chat(self, request: ChatCompletionRequest) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: - """Echo the chat request information.""" - if not self.started: - raise RuntimeError("Engine not started") - - # Convert request to JSON for echoing - request_info = { - "request_type": "chat", - "model": getattr(request, 'model', None), - "messages": getattr(request, 'messages', []), - "max_tokens": getattr(request, 'max_tokens', None), - "temperature": getattr(request, 'temperature', None), - "stream": getattr(request, 'stream', False), - "current_lora_model": self._current_lora_model.model_dump() if self._current_lora_model else None - } - - echo_text = json.dumps(request_info, indent=2) - - if request.stream: - # Return as SSE for streaming - chunk_data = { - "id": f"chatcmpl-echo-{random.randint(1000, 9999)}", - "object": "chat.completion.chunk", - "created": int(asyncio.get_event_loop().time()), - "model": getattr(request, 'model', 'mock-echo-model'), - "choices": [{ - "index": 0, - "delta": { - "role": "assistant", - "content": echo_text - }, - "finish_reason": "stop" - }] - } - yield f"data: {json.dumps(chunk_data)}\n\n" - yield "data: [DONE]\n\n" - else: - # Return as response object - choice = { - "index": 0, - "message": { - "role": "assistant", - "content": echo_text - }, - "finish_reason": "stop" - } +# Useful for testing the contents of requests created in data plane code. +# """ + +# async def chat(self, request: ChatCompletionRequest) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: +# """Echo the chat request information.""" +# if not self.started: +# raise RuntimeError("Engine not started") + +# # Convert request to JSON for echoing +# request_info = { +# "request_type": "chat", +# "model": getattr(request, 'model', None), +# "messages": getattr(request, 'messages', []), +# "max_tokens": getattr(request, 'max_tokens', None), +# "temperature": getattr(request, 'temperature', None), +# "stream": getattr(request, 'stream', False), +# "current_lora_model": self._current_lora_model.model_dump() if self._current_lora_model else None +# } + +# echo_text = json.dumps(request_info, indent=2) + +# if request.stream: +# # Return as SSE for streaming +# chunk_data = { +# "id": f"chatcmpl-echo-{random.randint(1000, 9999)}", +# "object": "chat.completion.chunk", +# "created": int(asyncio.get_event_loop().time()), +# "model": getattr(request, 'model', 'mock-echo-model'), +# "choices": [{ +# "index": 0, +# "delta": { +# "role": "assistant", +# "content": echo_text +# }, +# "finish_reason": "stop" +# }] +# } +# yield f"data: {json.dumps(chunk_data)}\n\n" +# yield "data: [DONE]\n\n" +# else: +# # Return as response object +# choice = { +# "index": 0, +# "message": { +# "role": "assistant", +# "content": echo_text +# }, +# "finish_reason": "stop" +# } - response = ChatCompletionResponse( - id=f"chatcmpl-echo-{random.randint(1000, 9999)}", - object="chat.completion", - created=int(asyncio.get_event_loop().time()), - model=getattr(request, 'model', 'mock-echo-model'), - choices=[choice] - ) +# response = ChatCompletionResponse( +# id=f"chatcmpl-echo-{random.randint(1000, 9999)}", +# object="chat.completion", +# created=int(asyncio.get_event_loop().time()), +# model=getattr(request, 'model', 'mock-echo-model'), +# choices=[choice] +# ) - yield response - - async def completions(self, request: CompletionRequest) -> AsyncGenerator[Union[str, CompletionResponse, ErrorResponse], None]: - """Echo the completion request 
information.""" - if not self.started: - raise RuntimeError("Engine not started") - - request_info = { - "request_type": "completion", - "model": getattr(request, 'model', None), - "prompt": getattr(request, 'prompt', None), - "max_tokens": getattr(request, 'max_tokens', None), - "temperature": getattr(request, 'temperature', None), - "stream": getattr(request, 'stream', False), - "current_lora_model": self._current_lora_model.model_dump() if self._current_lora_model else None - } - - echo_text = json.dumps(request_info, indent=2) - - if request.stream: - # Return as SSE for streaming - chunk_data = { - "id": f"cmpl-echo-{random.randint(1000, 9999)}", - "object": "text_completion", - "created": int(asyncio.get_event_loop().time()), - "model": getattr(request, 'model', 'mock-echo-model'), - "choices": [{ - "index": 0, - "text": echo_text, - "finish_reason": "stop" - }] - } - yield f"data: {json.dumps(chunk_data)}\n\n" - yield "data: [DONE]\n\n" - else: - # Return as response object - choice = { - "index": 0, - "text": echo_text, - "finish_reason": "stop" - } +# yield response + +# async def completions(self, request: CompletionRequest) -> AsyncGenerator[Union[str, CompletionResponse, ErrorResponse], None]: +# """Echo the completion request information.""" +# if not self.started: +# raise RuntimeError("Engine not started") + +# request_info = { +# "request_type": "completion", +# "model": getattr(request, 'model', None), +# "prompt": getattr(request, 'prompt', None), +# "max_tokens": getattr(request, 'max_tokens', None), +# "temperature": getattr(request, 'temperature', None), +# "stream": getattr(request, 'stream', False), +# "current_lora_model": self._current_lora_model.model_dump() if self._current_lora_model else None +# } + +# echo_text = json.dumps(request_info, indent=2) + +# if request.stream: +# # Return as SSE for streaming +# chunk_data = { +# "id": f"cmpl-echo-{random.randint(1000, 9999)}", +# "object": "text_completion", +# "created": int(asyncio.get_event_loop().time()), +# "model": getattr(request, 'model', 'mock-echo-model'), +# "choices": [{ +# "index": 0, +# "text": echo_text, +# "finish_reason": "stop" +# }] +# } +# yield f"data: {json.dumps(chunk_data)}\n\n" +# yield "data: [DONE]\n\n" +# else: +# # Return as response object +# choice = { +# "index": 0, +# "text": echo_text, +# "finish_reason": "stop" +# } - response = CompletionResponse( - id=f"cmpl-echo-{random.randint(1000, 9999)}", - object="text_completion", - created=int(asyncio.get_event_loop().time()), - model=getattr(request, 'model', 'mock-echo-model'), - choices=[choice] - ) +# response = CompletionResponse( +# id=f"cmpl-echo-{random.randint(1000, 9999)}", +# object="text_completion", +# created=int(asyncio.get_event_loop().time()), +# model=getattr(request, 'model', 'mock-echo-model'), +# choices=[choice] +# ) - yield response - - async def embeddings(self, request: EmbeddingRequest) -> AsyncGenerator[Union[str, EmbeddingResponse, ErrorResponse], None]: - """Echo the embedding request information.""" - if not self.started: - raise RuntimeError("Engine not started") - - request_info = { - "request_type": "embedding", - "model": getattr(request, 'model', None), - "input": getattr(request, 'input', None), - "encoding_format": getattr(request, 'encoding_format', None), - "dimensions": getattr(request, 'dimensions', None), - "current_lora_model": self._current_lora_model.model_dump() if self._current_lora_model else None - } - - # Return request info as mock embedding - echo_text = json.dumps(request_info, 
indent=2) - mock_embedding = [float(ord(c)) for c in echo_text[:10]] # Mock embedding from first 10 chars - - response = EmbeddingResponse( - object="list", - data=[{ - "object": "embedding", - "embedding": mock_embedding, - "index": 0 - }], - model=getattr(request, 'model', 'mock-echo-model'), - usage={ - "prompt_tokens": len(str(request.input).split()), - "total_tokens": len(str(request.input).split()) - } - ) - - yield response - - -class MockMultiplexEngine(MockVLLMEngine): - """Mock engine for testing multiplex/LoRA functionality.""" - - def __init__(self, llm_config: LLMConfig): - super().__init__(llm_config) - self.loaded_lora_models: List[DiskMultiplexConfig] = [] - - async def resolve_lora(self, lora_model: DiskMultiplexConfig): - """Mock LoRA model loading.""" - self._current_lora_model = lora_model - # Keep track of loaded models - if lora_model not in self.loaded_lora_models: - self.loaded_lora_models.append(lora_model) - - async def chat(self, request: ChatCompletionRequest) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: - """Chat with multiplex information.""" - if not self.started: - raise RuntimeError("Engine not started") - - # Include multiplex info in response - lora_info = "" - if self._current_lora_model: - lora_info = f" [LoRA: {self._current_lora_model.model_id}]" - - generated_text = f"Mock multiplex response{lora_info}" - - if request.stream: - # Return as SSE for streaming - chunk_data = { - "id": f"chatcmpl-multiplex-{random.randint(1000, 9999)}", - "object": "chat.completion.chunk", - "created": int(asyncio.get_event_loop().time()), - "model": getattr(request, 'model', 'mock-multiplex-model'), - "choices": [{ - "index": 0, - "delta": { - "role": "assistant", - "content": generated_text - }, - "finish_reason": "stop" - }] - } - yield f"data: {json.dumps(chunk_data)}\n\n" - yield "data: [DONE]\n\n" - else: - # Return as response object - choice = { - "index": 0, - "message": { - "role": "assistant", - "content": generated_text - }, - "finish_reason": "stop" - } +# yield response + +# async def embeddings(self, request: EmbeddingRequest) -> AsyncGenerator[Union[str, EmbeddingResponse, ErrorResponse], None]: +# """Echo the embedding request information.""" +# if not self.started: +# raise RuntimeError("Engine not started") + +# request_info = { +# "request_type": "embedding", +# "model": getattr(request, 'model', None), +# "input": getattr(request, 'input', None), +# "encoding_format": getattr(request, 'encoding_format', None), +# "dimensions": getattr(request, 'dimensions', None), +# "current_lora_model": self._current_lora_model.model_dump() if self._current_lora_model else None +# } + +# # Return request info as mock embedding +# echo_text = json.dumps(request_info, indent=2) +# mock_embedding = [float(ord(c)) for c in echo_text[:10]] # Mock embedding from first 10 chars + +# response = EmbeddingResponse( +# object="list", +# data=[{ +# "object": "embedding", +# "embedding": mock_embedding, +# "index": 0 +# }], +# model=getattr(request, 'model', 'mock-echo-model'), +# usage={ +# "prompt_tokens": len(str(request.input).split()), +# "total_tokens": len(str(request.input).split()) +# } +# ) + +# yield response + + +# class MockMultiplexEngine(MockVLLMEngine): +# """Mock engine for testing multiplex/LoRA functionality.""" + +# def __init__(self, llm_config: LLMConfig): +# super().__init__(llm_config) +# self.loaded_lora_models: List[DiskMultiplexConfig] = [] + +# async def resolve_lora(self, lora_model: DiskMultiplexConfig): +# """Mock 
LoRA model loading.""" +# self._current_lora_model = lora_model +# # Keep track of loaded models +# if lora_model not in self.loaded_lora_models: +# self.loaded_lora_models.append(lora_model) + +# async def chat(self, request: ChatCompletionRequest) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: +# """Chat with multiplex information.""" +# if not self.started: +# raise RuntimeError("Engine not started") + +# # Include multiplex info in response +# lora_info = "" +# if self._current_lora_model: +# lora_info = f" [LoRA: {self._current_lora_model.model_id}]" + +# generated_text = f"Mock multiplex response{lora_info}" + +# if request.stream: +# # Return as SSE for streaming +# chunk_data = { +# "id": f"chatcmpl-multiplex-{random.randint(1000, 9999)}", +# "object": "chat.completion.chunk", +# "created": int(asyncio.get_event_loop().time()), +# "model": getattr(request, 'model', 'mock-multiplex-model'), +# "choices": [{ +# "index": 0, +# "delta": { +# "role": "assistant", +# "content": generated_text +# }, +# "finish_reason": "stop" +# }] +# } +# yield f"data: {json.dumps(chunk_data)}\n\n" +# yield "data: [DONE]\n\n" +# else: +# # Return as response object +# choice = { +# "index": 0, +# "message": { +# "role": "assistant", +# "content": generated_text +# }, +# "finish_reason": "stop" +# } - response = ChatCompletionResponse( - id=f"chatcmpl-multiplex-{random.randint(1000, 9999)}", - object="chat.completion", - created=int(asyncio.get_event_loop().time()), - model=getattr(request, 'model', 'mock-multiplex-model'), - choices=[choice] - ) +# response = ChatCompletionResponse( +# id=f"chatcmpl-multiplex-{random.randint(1000, 9999)}", +# object="chat.completion", +# created=int(asyncio.get_event_loop().time()), +# model=getattr(request, 'model', 'mock-multiplex-model'), +# choices=[choice] +# ) - yield response - - -class MockJSONModeVLLMEngine(MockVLLMEngine): - """Mock engine that generates valid JSON responses when JSON mode is requested.""" - - async def chat(self, request: ChatCompletionRequest) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: - """Generate JSON or text response based on request format.""" - if not self.started: - raise RuntimeError("Engine not started") - - # Check if JSON mode is requested - response_format = getattr(request, 'response_format', None) - is_json_mode = ( - response_format and - hasattr(response_format, 'type') and - response_format.type == "json_object" - ) - - if is_json_mode: - # Generate valid JSON based on schema if provided - if hasattr(response_format, 'json_schema') and response_format.json_schema: - try: - # Use the schema to generate a valid response - json_response = generate_from_schema(response_format.json_schema) - generated_text = json.dumps(json_response, ensure_ascii=False) - except Exception as e: - # Fallback to default JSON if schema generation fails - json_response = { - "error": f"Schema generation failed: {str(e)}", - "schema_provided": bool(response_format.json_schema), - "fallback_response": True - } - generated_text = json.dumps(json_response, indent=2) - else: - # Default JSON response when no schema is provided - json_response = { - "message": "This is a mock JSON response", - "timestamp": int(asyncio.get_event_loop().time()), - "request_info": { - "model": getattr(request, 'model', 'unknown'), - "has_messages": bool(getattr(request, 'messages', [])), - "lora_model": self._current_lora_model.model_id if self._current_lora_model else None - } - } - generated_text = 
json.dumps(json_response, indent=2) - else: - # Generate regular text - generated_text = "Mock response from JSON mode engine" - - if request.stream: - # Return as SSE for streaming with realistic JSON chunking - request_id = f"chatcmpl-json-{random.randint(1000, 9999)}" - created_time = int(asyncio.get_event_loop().time()) - model_name = getattr(request, 'model', 'mock-json-model') +# yield response + + +# class MockJSONModeVLLMEngine(MockVLLMEngine): +# """Mock engine that generates valid JSON responses when JSON mode is requested.""" + +# async def chat(self, request: ChatCompletionRequest) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: +# """Generate JSON or text response based on request format.""" +# if not self.started: +# raise RuntimeError("Engine not started") + +# # Check if JSON mode is requested +# response_format = getattr(request, 'response_format', None) +# is_json_mode = ( +# response_format and +# hasattr(response_format, 'type') and +# response_format.type == "json_object" +# ) + +# if is_json_mode: +# # Generate valid JSON based on schema if provided +# if hasattr(response_format, 'json_schema') and response_format.json_schema: +# try: +# # Use the schema to generate a valid response +# json_response = generate_from_schema(response_format.json_schema) +# generated_text = json.dumps(json_response, ensure_ascii=False) +# except Exception as e: +# # Fallback to default JSON if schema generation fails +# json_response = { +# "error": f"Schema generation failed: {str(e)}", +# "schema_provided": bool(response_format.json_schema), +# "fallback_response": True +# } +# generated_text = json.dumps(json_response, indent=2) +# else: +# # Default JSON response when no schema is provided +# json_response = { +# "message": "This is a mock JSON response", +# "timestamp": int(asyncio.get_event_loop().time()), +# "request_info": { +# "model": getattr(request, 'model', 'unknown'), +# "has_messages": bool(getattr(request, 'messages', [])), +# "lora_model": self._current_lora_model.model_id if self._current_lora_model else None +# } +# } +# generated_text = json.dumps(json_response, indent=2) +# else: +# # Generate regular text +# generated_text = "Mock response from JSON mode engine" + +# if request.stream: +# # Return as SSE for streaming with realistic JSON chunking +# request_id = f"chatcmpl-json-{random.randint(1000, 9999)}" +# created_time = int(asyncio.get_event_loop().time()) +# model_name = getattr(request, 'model', 'mock-json-model') - if is_json_mode: - # For JSON streaming, split the JSON into realistic chunks - # This simulates how a real LLM would generate JSON token by token - max_chunk_size = 10 # Characters per chunk - chunks = [generated_text[i:i+max_chunk_size] for i in range(0, len(generated_text), max_chunk_size)] +# if is_json_mode: +# # For JSON streaming, split the JSON into realistic chunks +# # This simulates how a real LLM would generate JSON token by token +# max_chunk_size = 10 # Characters per chunk +# chunks = [generated_text[i:i+max_chunk_size] for i in range(0, len(generated_text), max_chunk_size)] - for i, chunk in enumerate(chunks): - chunk_data = { - "id": request_id, - "object": "chat.completion.chunk", - "created": created_time, - "model": model_name, - "choices": [{ - "index": 0, - "delta": { - "content": chunk, - "role": "assistant" if i == 0 else None - }, - "finish_reason": "stop" if i == len(chunks) - 1 else None - }] - } - yield f"data: {json.dumps(chunk_data)}\n\n" - await asyncio.sleep(0.01) # Simulate processing 
time - else: - # For non-JSON streaming, return as single chunk - chunk_data = { - "id": request_id, - "object": "chat.completion.chunk", - "created": created_time, - "model": model_name, - "choices": [{ - "index": 0, - "delta": { - "role": "assistant", - "content": generated_text - }, - "finish_reason": "stop" - }] - } - yield f"data: {json.dumps(chunk_data)}\n\n" +# for i, chunk in enumerate(chunks): +# chunk_data = { +# "id": request_id, +# "object": "chat.completion.chunk", +# "created": created_time, +# "model": model_name, +# "choices": [{ +# "index": 0, +# "delta": { +# "content": chunk, +# "role": "assistant" if i == 0 else None +# }, +# "finish_reason": "stop" if i == len(chunks) - 1 else None +# }] +# } +# yield f"data: {json.dumps(chunk_data)}\n\n" +# await asyncio.sleep(0.01) # Simulate processing time +# else: +# # For non-JSON streaming, return as single chunk +# chunk_data = { +# "id": request_id, +# "object": "chat.completion.chunk", +# "created": created_time, +# "model": model_name, +# "choices": [{ +# "index": 0, +# "delta": { +# "role": "assistant", +# "content": generated_text +# }, +# "finish_reason": "stop" +# }] +# } +# yield f"data: {json.dumps(chunk_data)}\n\n" - # Send final [DONE] message - yield "data: [DONE]\n\n" - else: - # Return as response object - choice = { - "index": 0, - "message": { - "role": "assistant", - "content": generated_text - }, - "finish_reason": "stop" - } +# # Send final [DONE] message +# yield "data: [DONE]\n\n" +# else: +# # Return as response object +# choice = { +# "index": 0, +# "message": { +# "role": "assistant", +# "content": generated_text +# }, +# "finish_reason": "stop" +# } - response = ChatCompletionResponse( - id=f"chatcmpl-json-{random.randint(1000, 9999)}", - object="chat.completion", - created=int(asyncio.get_event_loop().time()), - model=getattr(request, 'model', 'mock-json-model'), - choices=[choice] - ) +# response = ChatCompletionResponse( +# id=f"chatcmpl-json-{random.randint(1000, 9999)}", +# object="chat.completion", +# created=int(asyncio.get_event_loop().time()), +# model=getattr(request, 'model', 'mock-json-model'), +# choices=[choice] +# ) - yield response - - -class MockPDDisaggVLLMEngine(MockVLLMEngine): - """Mock engine for testing Prefill/Decode disaggregated functionality.""" - - def __init__(self, llm_config: LLMConfig): - super().__init__(llm_config) - self.prefill_cache = {} - self.kv_transfer_enabled = False - - async def start(self): - """Start with disaggregation support.""" - await super().start() - # Mock enabling KV transfer - self.kv_transfer_enabled = True - - async def chat(self, request: ChatCompletionRequest) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: - """Chat with disaggregation simulation.""" - if not self.started: - raise RuntimeError("Engine not started") - - # Simulate prefill/decode disaggregation - request_id = getattr(request, 'request_id', f"req-{random.randint(1000, 9999)}") - - # Mock prefill phase - prompt_text = "" - if hasattr(request, 'messages') and request.messages: - for message in request.messages: - if hasattr(message, 'content') and message.content: - prompt_text += str(message.content) + " " - - # Cache prefill result - self.prefill_cache[request_id] = { - "prompt": prompt_text.strip(), - "kv_cache": f"mock_kv_cache_{len(prompt_text)}" - } - - # Mock decode phase - generated_text = f"Mock PD disagg response [cached: {request_id}]" - if self.kv_transfer_enabled: - generated_text += " [KV transfer enabled]" - - if request.stream: - # Return 
as SSE for streaming - chunk_data = { - "id": f"chatcmpl-pd-{request_id}", - "object": "chat.completion.chunk", - "created": int(asyncio.get_event_loop().time()), - "model": getattr(request, 'model', 'mock-pd-model'), - "choices": [{ - "index": 0, - "delta": { - "role": "assistant", - "content": generated_text - }, - "finish_reason": "stop" - }] - } - yield f"data: {json.dumps(chunk_data)}\n\n" - yield "data: [DONE]\n\n" - else: - # Return as response object - choice = { - "index": 0, - "message": { - "role": "assistant", - "content": generated_text - }, - "finish_reason": "stop" - } +# yield response + + +# class MockPDDisaggVLLMEngine(MockVLLMEngine): +# """Mock engine for testing Prefill/Decode disaggregated functionality.""" + +# def __init__(self, llm_config: LLMConfig): +# super().__init__(llm_config) +# self.prefill_cache = {} +# self.kv_transfer_enabled = False + +# async def start(self): +# """Start with disaggregation support.""" +# await super().start() +# # Mock enabling KV transfer +# self.kv_transfer_enabled = True + +# async def chat(self, request: ChatCompletionRequest) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: +# """Chat with disaggregation simulation.""" +# if not self.started: +# raise RuntimeError("Engine not started") + +# # Simulate prefill/decode disaggregation +# request_id = getattr(request, 'request_id', f"req-{random.randint(1000, 9999)}") + +# # Mock prefill phase +# prompt_text = "" +# if hasattr(request, 'messages') and request.messages: +# for message in request.messages: +# if hasattr(message, 'content') and message.content: +# prompt_text += str(message.content) + " " + +# # Cache prefill result +# self.prefill_cache[request_id] = { +# "prompt": prompt_text.strip(), +# "kv_cache": f"mock_kv_cache_{len(prompt_text)}" +# } + +# # Mock decode phase +# generated_text = f"Mock PD disagg response [cached: {request_id}]" +# if self.kv_transfer_enabled: +# generated_text += " [KV transfer enabled]" + +# if request.stream: +# # Return as SSE for streaming +# chunk_data = { +# "id": f"chatcmpl-pd-{request_id}", +# "object": "chat.completion.chunk", +# "created": int(asyncio.get_event_loop().time()), +# "model": getattr(request, 'model', 'mock-pd-model'), +# "choices": [{ +# "index": 0, +# "delta": { +# "role": "assistant", +# "content": generated_text +# }, +# "finish_reason": "stop" +# }] +# } +# yield f"data: {json.dumps(chunk_data)}\n\n" +# yield "data: [DONE]\n\n" +# else: +# # Return as response object +# choice = { +# "index": 0, +# "message": { +# "role": "assistant", +# "content": generated_text +# }, +# "finish_reason": "stop" +# } - response = ChatCompletionResponse( - id=f"chatcmpl-pd-{request_id}", - object="chat.completion", - created=int(asyncio.get_event_loop().time()), - model=getattr(request, 'model', 'mock-pd-model'), - choices=[choice] - ) +# response = ChatCompletionResponse( +# id=f"chatcmpl-pd-{request_id}", +# object="chat.completion", +# created=int(asyncio.get_event_loop().time()), +# model=getattr(request, 'model', 'mock-pd-model'), +# choices=[choice] +# ) - yield response - - -class FakeLoraModelLoader: - """Fake LoRA model loader for testing.""" - - async def load_model(self, lora_model_id: str, llm_config: LLMConfig) -> DiskMultiplexConfig: - """Load a fake LoRA model.""" - return DiskMultiplexConfig( - model_id=lora_model_id, - max_total_tokens=llm_config.max_request_context_length, - local_path="/fake/local/path", - lora_assigned_int_id=random.randint(1, 100), - ) - - -# Utility functions for JSON 
generation and validation -def generate_from_schema(schema: dict) -> Any: - """Generate mock data from JSON schema.""" - if "type" not in schema: - raise ValueError("Schema must have a 'type' property") - - # Handle enum values first (takes precedence over type) - if "enum" in schema: - return random.choice(schema["enum"]) - - # Handle const values - if "const" in schema: - return schema["const"] - - schema_type = schema["type"] - - if schema_type == "object": - obj = {} - properties = schema.get("properties", {}) - required = schema.get("required", []) - - # Generate required properties first - for prop in required: - if prop in properties: - obj[prop] = generate_from_schema(properties[prop]) - - # Generate optional properties (randomly include some) - for prop, prop_schema in properties.items(): - if prop not in obj and random.choice([True, False]): - obj[prop] = generate_from_schema(prop_schema) - - return obj - - elif schema_type == "array": - item_schema = schema.get("items", {"type": "string"}) - min_items = schema.get("minItems", 1) - max_items = schema.get("maxItems", 5) - array_length = random.randint(min_items, max_items) - - return [generate_from_schema(item_schema) for _ in range(array_length)] - - elif schema_type == "string": - # Handle string patterns and formats - if "pattern" in schema: - # For testing purposes, return a string that might match common patterns - pattern = schema["pattern"] - if "email" in pattern.lower() or "@" in pattern: - return "test@example.com" - elif "phone" in pattern.lower() or "\\d" in pattern: - return "123-456-7890" - else: - return "pattern_match_string" - - if "format" in schema: - format_type = schema["format"] - if format_type == "email": - return "test@example.com" - elif format_type == "date": - return "2024-01-15" - elif format_type == "date-time": - return "2024-01-15T10:30:00Z" - elif format_type == "uri": - return "https://example.com" - elif format_type == "uuid": - return "550e8400-e29b-41d4-a716-446655440000" - - # Handle string length constraints - min_length = schema.get("minLength", 1) - max_length = schema.get("maxLength", 20) - base_string = "mock_string_value" - - if max_length < len(base_string): - return base_string[:max_length] - elif min_length > len(base_string): - return base_string + "x" * (min_length - len(base_string)) - else: - return base_string - - elif schema_type == "integer": - minimum = schema.get("minimum", 0) - maximum = schema.get("maximum", 100) - return random.randint(minimum, maximum) - - elif schema_type == "number": - minimum = schema.get("minimum", 0.0) - maximum = schema.get("maximum", 100.0) - return random.uniform(minimum, maximum) - - elif schema_type == "boolean": - return random.choice([True, False]) - - elif schema_type == "null": - return None - - # Handle multiple types (anyOf, oneOf) - elif isinstance(schema_type, list): - chosen_type = random.choice(schema_type) - return generate_from_schema({"type": chosen_type}) - - else: - raise ValueError(f"Unsupported schema type: {schema_type}") - - -def validate_json_schema_response(response_text: str, schema: dict) -> bool: - """ - Validate that a JSON response conforms to the provided schema. - This is a simple validation for testing purposes. 
- """ - try: - data = json.loads(response_text) - # Basic validation - in a real implementation you'd use jsonschema library - return _validate_against_schema(data, schema) - except (json.JSONDecodeError, Exception): - return False - - -def _validate_against_schema(data: Any, schema: dict) -> bool: - """Helper function for basic schema validation.""" - schema_type = schema.get("type") +# yield response + + +# class FakeLoraModelLoader: +# """Fake LoRA model loader for testing.""" + +# async def load_model(self, lora_model_id: str, llm_config: LLMConfig) -> DiskMultiplexConfig: +# """Load a fake LoRA model.""" +# return DiskMultiplexConfig( +# model_id=lora_model_id, +# max_total_tokens=llm_config.max_request_context_length, +# local_path="/fake/local/path", +# lora_assigned_int_id=random.randint(1, 100), +# ) + + +# # Utility functions for JSON generation and validation +# def generate_from_schema(schema: dict) -> Any: +# """Generate mock data from JSON schema.""" +# if "type" not in schema: +# raise ValueError("Schema must have a 'type' property") + +# # Handle enum values first (takes precedence over type) +# if "enum" in schema: +# return random.choice(schema["enum"]) + +# # Handle const values +# if "const" in schema: +# return schema["const"] + +# schema_type = schema["type"] + +# if schema_type == "object": +# obj = {} +# properties = schema.get("properties", {}) +# required = schema.get("required", []) + +# # Generate required properties first +# for prop in required: +# if prop in properties: +# obj[prop] = generate_from_schema(properties[prop]) + +# # Generate optional properties (randomly include some) +# for prop, prop_schema in properties.items(): +# if prop not in obj and random.choice([True, False]): +# obj[prop] = generate_from_schema(prop_schema) + +# return obj + +# elif schema_type == "array": +# item_schema = schema.get("items", {"type": "string"}) +# min_items = schema.get("minItems", 1) +# max_items = schema.get("maxItems", 5) +# array_length = random.randint(min_items, max_items) + +# return [generate_from_schema(item_schema) for _ in range(array_length)] + +# elif schema_type == "string": +# # Handle string patterns and formats +# if "pattern" in schema: +# # For testing purposes, return a string that might match common patterns +# pattern = schema["pattern"] +# if "email" in pattern.lower() or "@" in pattern: +# return "test@example.com" +# elif "phone" in pattern.lower() or "\\d" in pattern: +# return "123-456-7890" +# else: +# return "pattern_match_string" + +# if "format" in schema: +# format_type = schema["format"] +# if format_type == "email": +# return "test@example.com" +# elif format_type == "date": +# return "2024-01-15" +# elif format_type == "date-time": +# return "2024-01-15T10:30:00Z" +# elif format_type == "uri": +# return "https://example.com" +# elif format_type == "uuid": +# return "550e8400-e29b-41d4-a716-446655440000" + +# # Handle string length constraints +# min_length = schema.get("minLength", 1) +# max_length = schema.get("maxLength", 20) +# base_string = "mock_string_value" + +# if max_length < len(base_string): +# return base_string[:max_length] +# elif min_length > len(base_string): +# return base_string + "x" * (min_length - len(base_string)) +# else: +# return base_string + +# elif schema_type == "integer": +# minimum = schema.get("minimum", 0) +# maximum = schema.get("maximum", 100) +# return random.randint(minimum, maximum) + +# elif schema_type == "number": +# minimum = schema.get("minimum", 0.0) +# maximum = schema.get("maximum", 
100.0) +# return random.uniform(minimum, maximum) + +# elif schema_type == "boolean": +# return random.choice([True, False]) + +# elif schema_type == "null": +# return None + +# # Handle multiple types (anyOf, oneOf) +# elif isinstance(schema_type, list): +# chosen_type = random.choice(schema_type) +# return generate_from_schema({"type": chosen_type}) + +# else: +# raise ValueError(f"Unsupported schema type: {schema_type}") + + +# def validate_json_schema_response(response_text: str, schema: dict) -> bool: +# """ +# Validate that a JSON response conforms to the provided schema. +# This is a simple validation for testing purposes. +# """ +# try: +# data = json.loads(response_text) +# # Basic validation - in a real implementation you'd use jsonschema library +# return _validate_against_schema(data, schema) +# except (json.JSONDecodeError, Exception): +# return False + + +# def _validate_against_schema(data: Any, schema: dict) -> bool: +# """Helper function for basic schema validation.""" +# schema_type = schema.get("type") - if schema_type == "object" and isinstance(data, dict): - # Check required properties - required = schema.get("required", []) - for prop in required: - if prop not in data: - return False - - # Check property types - properties = schema.get("properties", {}) - for prop, value in data.items(): - if prop in properties: - if not _validate_against_schema(value, properties[prop]): - return False - return True +# if schema_type == "object" and isinstance(data, dict): +# # Check required properties +# required = schema.get("required", []) +# for prop in required: +# if prop not in data: +# return False + +# # Check property types +# properties = schema.get("properties", {}) +# for prop, value in data.items(): +# if prop in properties: +# if not _validate_against_schema(value, properties[prop]): +# return False +# return True - elif schema_type == "array" and isinstance(data, list): - item_schema = schema.get("items", {}) - return all(_validate_against_schema(item, item_schema) for item in data) +# elif schema_type == "array" and isinstance(data, list): +# item_schema = schema.get("items", {}) +# return all(_validate_against_schema(item, item_schema) for item in data) - elif schema_type == "string" and isinstance(data, str): - return True +# elif schema_type == "string" and isinstance(data, str): +# return True - elif schema_type == "integer" and isinstance(data, int): - return True +# elif schema_type == "integer" and isinstance(data, int): +# return True - elif schema_type == "number" and isinstance(data, (int, float)): - return True +# elif schema_type == "number" and isinstance(data, (int, float)): +# return True - elif schema_type == "boolean" and isinstance(data, bool): - return True +# elif schema_type == "boolean" and isinstance(data, bool): +# return True - elif schema_type == "null" and data is None: - return True +# elif schema_type == "null" and data is None: +# return True - return False +# return False -def split_string_into_chunks(s: str, n: int) -> List[str]: - """Split string into n chunks.""" - if n <= 0: - raise ValueError("Number of chunks must be greater than 0") +# def split_string_into_chunks(s: str, n: int) -> List[str]: +# """Split string into n chunks.""" +# if n <= 0: +# raise ValueError("Number of chunks must be greater than 0") - chunk_size = len(s) // n - remainder = len(s) % n +# chunk_size = len(s) // n +# remainder = len(s) % n - chunks = [] - start = 0 - for i in range(n): - end = start + chunk_size + (1 if i < remainder else 0) - 
chunks.append(s[start:end]) - start = end +# chunks = [] +# start = 0 +# for i in range(n): +# end = start + chunk_size + (1 if i < remainder else 0) +# chunks.append(s[start:end]) +# start = end - return chunks +# return chunks -def get_prompt_length(prompt: Union[str, List[str], List[int]]) -> int: - """Get the length of a prompt.""" - if isinstance(prompt, str): - return len(prompt.split()) - elif isinstance(prompt, list): - return len(prompt) - else: - return 0 +# def get_prompt_length(prompt: Union[str, List[str], List[int]]) -> int: +# """Get the length of a prompt.""" +# if isinstance(prompt, str): +# return len(prompt.split()) +# elif isinstance(prompt, list): +# return len(prompt) +# else: +# return 0 diff --git a/python/ray/llm/tests/serve/utils/__init__.py b/python/ray/llm/tests/serve/utils/__init__.py new file mode 100644 index 000000000000..f6befe644317 --- /dev/null +++ b/python/ray/llm/tests/serve/utils/__init__.py @@ -0,0 +1 @@ +# Testing utilities for Ray LLM serve tests \ No newline at end of file diff --git a/python/ray/llm/tests/serve/utils/testing_utils.py b/python/ray/llm/tests/serve/utils/testing_utils.py new file mode 100644 index 000000000000..e4bb2b9c7860 --- /dev/null +++ b/python/ray/llm/tests/serve/utils/testing_utils.py @@ -0,0 +1,82 @@ +"""Shared testing utilities for Ray LLM serve tests.""" + +import json +import re +from typing import Union, List, Optional + +from ray.llm._internal.serve.configs.openai_api_models import ( + ChatCompletionResponse, + CompletionResponse, + EmbeddingResponse +) + + +class LLMResponseValidator: + """Reusable validation logic for LLM responses.""" + + @staticmethod + def get_expected_content(api_type: str, max_tokens: int) -> str: + """Get expected content based on API type.""" + return " ".join(f"test_{i}" for i in range(max_tokens)) + + @staticmethod + def validate_non_streaming_response( + response: Union[ChatCompletionResponse, CompletionResponse], + api_type: str, + max_tokens: int + ): + """Validate non-streaming responses.""" + expected_content = LLMResponseValidator.get_expected_content(api_type, max_tokens) + + if api_type == "chat": + assert isinstance(response, ChatCompletionResponse) + assert response.choices[0].message.content == expected_content + elif api_type == "completion": + assert isinstance(response, CompletionResponse) + assert response.choices[0].text == expected_content + + @staticmethod + def validate_streaming_chunks( + chunks: List[str], + api_type: str, + max_tokens: int + ): + """Validate streaming response chunks.""" + # Should have max_tokens + 1 chunks (tokens + [DONE]) + assert len(chunks) == max_tokens + 1 + + # Validate each chunk except the last [DONE] chunk + for chunk_iter, chunk in enumerate(chunks[:-1]): + pattern = r"data: (.*)\n\n" + match = re.match(pattern, chunk) + assert match is not None + chunk_data = json.loads(match.group(1)) + + if api_type == "chat": + delta = chunk_data["choices"][0]["delta"] + if chunk_iter == 0: + assert delta["role"] == "assistant" + else: + assert delta["role"] is None + assert delta["content"].strip() == f"test_{chunk_iter}" + elif api_type == "completion": + text = chunk_data["choices"][0]["text"] + assert text.strip() == f"test_{chunk_iter}" + + @staticmethod + def validate_embedding_response( + response: EmbeddingResponse, + expected_dimensions: Optional[int] = None + ): + """Validate embedding responses.""" + assert isinstance(response, EmbeddingResponse) + assert response.object == "list" + assert len(response.data) == 1 + assert 
response.data[0].object == "embedding"
+        assert isinstance(response.data[0].embedding, list)
+        assert len(response.data[0].embedding) > 0  # Should have some embedding dimensions
+        assert response.data[0].index == 0
+
+        # Check dimensions if specified
+        if expected_dimensions:
+            assert len(response.data[0].embedding) == expected_dimensions
\ No newline at end of file

From ccd188ba81c3b4e47471a81453f20ed1d7f5683a Mon Sep 17 00:00:00 2001
From: Kourosh Hakhamaneshi
Date: Wed, 2 Jul 2025 12:26:30 -0700
Subject: [PATCH 24/37] added lora logic back and tested the request_id handling from serve

Signed-off-by: Kourosh Hakhamaneshi
---
 .../serve/deployments/llm/llm_server.py       | 46 +++++++++++++++----
 .../cpu/deployments/llm/test_llm_server.py    | 29 ++++++++++++
 .../llm/tests/serve/mocks/mock_vllm_engine.py |  8 ++--
 3 files changed, 71 insertions(+), 12 deletions(-)

diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py
index 3a0e68ae5aa4..793f9218d5f5 100644
--- a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py
+++ b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py
@@ -26,6 +26,9 @@
     LLMCompletionsResponse,
     LLMEmbeddingsResponse,
 )
+from ray.llm._internal.serve.deployments.llm.multiplex.lora_model_loader import (
+    LoraModelLoader,
+)
 from ray.llm._internal.serve.configs.server_models import (
     LLMConfig,
 )
@@ -35,6 +38,7 @@
 from ray.llm._internal.serve.deployments.utils.server_utils import (
     get_serve_request_id,
 )
+from ray.llm._internal.serve.configs.server_models import DiskMultiplexConfig
 from ray.llm._internal.serve.observability.logging import get_logger
 from ray.llm._internal.serve.observability.usage_telemetry.usage import (
     push_telemetry_report_for_all_models,
@@ -86,9 +90,12 @@ async def llm_config(self) -> Optional[LLMConfig]:
 class LLMServer(_LLMServerBase):
     """This is a shm layer to decouple the LLM engine from the ingress deployment.

-    It has a very similar API as the engine. Almost all of the abstractions are implemented by the engine. This class just a little bit more logic on top, e.g.:
-    1. Logic for serve multiplexing, etc.
-    2. Telemetry reporting
+    It has a very similar API to the engine. Almost all of the abstractions are implemented by the engine. This class just adds a little bit more logic on top:
+
+    1. Logic for serve multiplexing (e.g. LoRA loading).
+    2. Request id handling from serve context.
+    3. Batching in case of streaming (only for chat and completions).
+    4. Telemetry reporting.
""" _default_engine_cls = VLLMEngine @@ -116,8 +123,32 @@ async def __init__( if self._engine_cls is not None: self.engine = self._engine_cls(self._llm_config) await asyncio.wait_for(self._start_engine(), timeout=ENGINE_START_TIMEOUT_S) + + self._init_multiplex_loader() + def _init_multiplex_loader(self): + """Initialize the multiplex loader.""" + + mx_config = self._llm_config.multiplex_config() + self._load_model = lambda lora_model_id: None + + if mx_config is not None: + model_downloader = LoraModelLoader( + download_timeout_s=mx_config.download_timeout_s, + max_tries=mx_config.max_download_tries, + ) + + async def _load_model(lora_model_id: str) -> DiskMultiplexConfig: + return await model_downloader.load_model( + lora_model_id=lora_model_id, + llm_config=self._llm_config, + ) + + self._load_model = serve.multiplexed(max_num_models_per_replica=mx_config.max_num_models_per_replica)(_load_model) + + + def _get_default_engine_class(self) -> Type[LLMEngine]: """Helper to load the engine class from the environment variable. This is used for testing or escape-hatch for patching purposes. @@ -155,16 +186,15 @@ async def _maybe_add_request_id_to_request(self, request: Union[ChatCompletionRe request_id = get_serve_request_id() if request_id: request.request_id = request_id - + async def _maybe_resolve_lora_from_multiplex(self) -> None: """Handle the lora model for the request.""" multiplexed_model_id = serve.get_multiplexed_model_id() if multiplexed_model_id: - assert ( - self._llm_config.lora_config is not None - ), "Must setup lora config for multiplexed requests." - disk_lora_model = await self._disk_lora_model(multiplexed_model_id) + if self._llm_config.lora_config is None: + raise ValueError("Must setup lora config for multiplexed requests.") + disk_lora_model = await self._load_model(multiplexed_model_id) await self.engine.resolve_lora(disk_lora_model) def _batch_output_stream(self, generator): diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py index 4a4ccbe7972d..a9539bbf0672 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py @@ -2,11 +2,21 @@ from typing import Optional import pytest +from unittest.mock import patch from ray.llm.tests.serve.mocks.mock_vllm_engine import MockVLLMEngine from ray.llm.tests.serve.utils.testing_utils import LLMResponseValidator +from ray import serve +from ray.llm._internal.serve.deployments.llm.llm_server import LLMServer +@pytest.fixture +def serve_handle(mock_llm_config): + app = serve.deployment(LLMServer).bind(mock_llm_config, engine_cls=MockVLLMEngine) + handle = serve.run(app) + handle = handle.options(stream=True) + yield handle + serve.shutdown() class TestLLMServer: @@ -129,6 +139,25 @@ async def test_llm_config_property(self, create_server, mock_llm_config): llm_config = await server.llm_config() assert isinstance(llm_config, type(mock_llm_config)) + @pytest.mark.parametrize("stream", [False]) + @pytest.mark.parametrize("max_tokens", [5]) + @pytest.mark.asyncio + async def test_request_id_handling(self, serve_handle, mock_llm_config, mock_chat_request, stream: bool, max_tokens: int): + """Test that the request id is handled correctly.""" + + # Create a chat completion request + # We should patch get_server_request_id to return a test_request_id + serve.context._serve_request_context.set( + serve.context._RequestContext(**{"request_id": 
"test_request_id"}) + ) + # Get the response + chunks = [] + async for chunk in serve_handle.chat.remote(mock_chat_request): + chunks.append(chunk) + + assert len(chunks) == 1 + assert chunks[0].id == "test_request_id" + if __name__ == "__main__": diff --git a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py index bc82616aa44a..55b1735906d2 100644 --- a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py +++ b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py @@ -123,9 +123,9 @@ async def _generate_chat_response( ) -> AsyncGenerator[Union[str, ChatCompletionResponse], None]: """Generate mock chat completion response.""" + request_id = request.request_id or f"chatcmpl-{random.randint(1000, 9999)}" if request.stream: # Streaming response - return SSE formatted strings - request_id = f"chatcmpl-{random.randint(1000, 9999)}" created_time = int(asyncio.get_event_loop().time()) model_name = getattr(request, 'model', 'mock-model') @@ -173,7 +173,7 @@ async def _generate_chat_response( } response = ChatCompletionResponse( - id=f"chatcmpl-{random.randint(1000, 9999)}", + id=request_id, object="chat.completion", created=int(asyncio.get_event_loop().time()), model=getattr(request, 'model', 'mock-model'), @@ -195,9 +195,9 @@ async def _generate_completion_response( ) -> AsyncGenerator[Union[str, CompletionResponse], None]: """Generate mock completion response.""" + request_id = request.request_id or f"cmpl-{random.randint(1000, 9999)}" if request.stream: # Streaming response - return SSE formatted strings - request_id = f"cmpl-{random.randint(1000, 9999)}" created_time = int(asyncio.get_event_loop().time()) model_name = getattr(request, 'model', 'mock-model') @@ -238,7 +238,7 @@ async def _generate_completion_response( } response = CompletionResponse( - id=f"cmpl-{random.randint(1000, 9999)}", + id=request_id, object="text_completion", created=int(asyncio.get_event_loop().time()), model=getattr(request, 'model', 'mock-model'), From 61e8902f5de2c0dbca47e3ed553075da96ae945e Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 2 Jul 2025 13:45:51 -0700 Subject: [PATCH 25/37] tested multiplexing Signed-off-by: Kourosh Hakhamaneshi --- .../serve/deployments/llm/llm_server.py | 16 +++- .../cpu/deployments/llm/test_llm_server.py | 96 +++++++++++++++++-- .../llm/tests/serve/mocks/mock_vllm_engine.py | 46 +++++---- .../llm/tests/serve/utils/testing_utils.py | 28 ++++-- 4 files changed, 150 insertions(+), 36 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py index 793f9218d5f5..8eb476066055 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py @@ -104,6 +104,7 @@ async def __init__( llm_config: LLMConfig, *, engine_cls: Optional[Type[LLMEngine]] = None, + model_downloader: Optional[Type[LoraModelLoader]] = None, ): """Constructor of LLMServer. @@ -114,6 +115,8 @@ async def __init__( llm_config: LLMConfig for the model. engine_cls: Dependency injection for the vllm engine class. Defaults to `VLLMEngine`. + model_downloader: Dependency injection for the model downloader. + Defaults to `LoraModelLoader`. 
""" await super().__init__() self._llm_config = llm_config @@ -124,17 +127,17 @@ async def __init__( self.engine = self._engine_cls(self._llm_config) await asyncio.wait_for(self._start_engine(), timeout=ENGINE_START_TIMEOUT_S) - self._init_multiplex_loader() + self._init_multiplex_loader(model_downloader) - def _init_multiplex_loader(self): + def _init_multiplex_loader(self, model_downloader_cls: Optional[Type[LoraModelLoader]] = None): """Initialize the multiplex loader.""" + model_downloader_cls = model_downloader_cls or LoraModelLoader mx_config = self._llm_config.multiplex_config() - self._load_model = lambda lora_model_id: None if mx_config is not None: - model_downloader = LoraModelLoader( + model_downloader = model_downloader_cls( download_timeout_s=mx_config.download_timeout_s, max_tries=mx_config.max_download_tries, ) @@ -146,6 +149,11 @@ async def _load_model(lora_model_id: str) -> DiskMultiplexConfig: ) self._load_model = serve.multiplexed(max_num_models_per_replica=mx_config.max_num_models_per_replica)(_load_model) + else: + async def _load_model(lora_model_id: str) -> DiskMultiplexConfig: + raise ValueError("LoRA config is not set in the LLMConfig") + + self._load_model = _load_model diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py index a9539bbf0672..f51fd2f67abc 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py @@ -2,21 +2,43 @@ from typing import Optional import pytest -from unittest.mock import patch -from ray.llm.tests.serve.mocks.mock_vllm_engine import MockVLLMEngine +from ray.llm.tests.serve.mocks.mock_vllm_engine import MockVLLMEngine, FakeLoraModelLoader from ray.llm.tests.serve.utils.testing_utils import LLMResponseValidator from ray import serve from ray.llm._internal.serve.deployments.llm.llm_server import LLMServer - +from ray.llm._internal.serve.configs.server_models import LoraConfig @pytest.fixture def serve_handle(mock_llm_config): - app = serve.deployment(LLMServer).bind(mock_llm_config, engine_cls=MockVLLMEngine) - handle = serve.run(app) - handle = handle.options(stream=True) - yield handle - serve.shutdown() + + app = serve.deployment(LLMServer).bind(mock_llm_config, engine_cls=MockVLLMEngine) + handle = serve.run(app) + # We set stream=True because the interfaces are async generators regardless + # of the stream flag on request. 
+ handle = handle.options(stream=True) + yield handle + serve.shutdown() + +@pytest.fixture +def multiplexed_serve_handle(mock_llm_config, stream_batching_interval_ms): + mock_llm_config.experimental_configs = { + "stream_batching_interval_ms": stream_batching_interval_ms, + } + mock_llm_config.lora_config = LoraConfig( + dynamic_lora_loading_path="s3://my/s3/path_here", + download_timeout_s=60, + max_download_tries=3, + ) + app = serve.deployment(LLMServer).bind( + mock_llm_config, + engine_cls=MockVLLMEngine, + model_downloader=FakeLoraModelLoader, + ) + handle = serve.run(app) + handle = handle.options(stream=True, multiplexed_model_id="test_model_id") + yield handle + serve.shutdown() class TestLLMServer: @@ -157,6 +179,64 @@ async def test_request_id_handling(self, serve_handle, mock_llm_config, mock_cha assert len(chunks) == 1 assert chunks[0].id == "test_request_id" + + + @pytest.mark.parametrize("api_type", ["chat", "completion"]) + @pytest.mark.parametrize("stream", [False, True]) + @pytest.mark.parametrize("max_tokens", [5]) + @pytest.mark.parametrize("stream_batching_interval_ms", [0, 10000]) + @pytest.mark.asyncio + async def test_multiplexed_request_handling( + self, + multiplexed_serve_handle, + mock_chat_request, + mock_completion_request, + api_type: str, + stream: bool, + max_tokens: int, + stream_batching_interval_ms: int + ): + """Unified test for multiplexed (LoRA) requests - both chat and completion APIs, streaming and non-streaming.""" + + # Create request based on API type and set model ID for multiplexing + if api_type == "chat": + request = mock_chat_request + batched_chunks = multiplexed_serve_handle.chat.remote(request) + elif api_type == "completion": + request = mock_completion_request + batched_chunks = multiplexed_serve_handle.completions.remote(request) + + request.model = "test_model_id" + print(f"\n\n_____ MULTIPLEXED {api_type.upper()} ({'STREAMING' if stream else 'NON-STREAMING'}) max_tokens={max_tokens} batching_interval_ms={stream_batching_interval_ms} _____\n\n") + + if stream: + # Collect responses from the stream + chunks = [] + async for batch in batched_chunks: + if isinstance(batch, list): + chunks.extend(batch) + else: + chunks.append(batch) + + # Check that we got responses + assert len(chunks) > 0 + + # Validate streaming response with LoRA model ID + LLMResponseValidator.validate_streaming_chunks(chunks, api_type, max_tokens, lora_model_id=request.model) + else: + # Collect non-streaming response + chunks = [] + async for batch in batched_chunks: + if isinstance(batch, list): + chunks.extend(batch) + else: + chunks.append(batch) + + # Check that we got one response + assert len(chunks) == 1 + + # Validate non-streaming response with LoRA model ID + LLMResponseValidator.validate_non_streaming_response(chunks[0], api_type, max_tokens, lora_model_id=request.model) diff --git a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py index 55b1735906d2..fd75c13debe3 100644 --- a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py +++ b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py @@ -18,10 +18,14 @@ LLMConfig, ) from ray.llm._internal.serve.deployments.llm.llm_engine import LLMEngine +from ray.llm._internal.serve.deployments.llm.multiplex.lora_model_loader import LoraModelLoader class MockVLLMEngine(LLMEngine): - """Mock vLLM Engine that generates fake text responses.""" + """Mock vLLM Engine that generates fake text responses. 
+ + - In case of LoRA it generates a prefix with the model name in the text part of the response. + """ def __init__(self, llm_config: LLMConfig): """Create a mock vLLM Engine. @@ -31,7 +35,7 @@ def __init__(self, llm_config: LLMConfig): """ self.llm_config = llm_config self.started = False - self._current_lora_model: Optional[DiskMultiplexConfig] = None + self._current_lora_model: Dict[str, DiskMultiplexConfig] = {} async def start(self): """Start the mock engine.""" @@ -39,7 +43,7 @@ async def start(self): async def resolve_lora(self, lora_model: DiskMultiplexConfig): """Resolve/load a LoRA model.""" - self._current_lora_model = lora_model + self._current_lora_model[lora_model.model_id] = lora_model async def check_health(self) -> None: """Check the health of the mock engine.""" @@ -124,13 +128,17 @@ async def _generate_chat_response( """Generate mock chat completion response.""" request_id = request.request_id or f"chatcmpl-{random.randint(1000, 9999)}" + lora_prefix = "" if request.model not in self._current_lora_model else f"[lora_model] {request.model}: " if request.stream: # Streaming response - return SSE formatted strings created_time = int(asyncio.get_event_loop().time()) model_name = getattr(request, 'model', 'mock-model') for i in range(max_tokens): - token = f"test_{i} " + if i == 0: + token = f"{lora_prefix}test_{i} " + else: + token = f"test_{i} " if i == max_tokens - 1: # no space for the last token token = f"test_{i}" @@ -162,6 +170,7 @@ async def _generate_chat_response( else: # Non-streaming response - return response object generated_text = " ".join([f"test_{i}" for i in range(max_tokens)]) + generated_text = f"{lora_prefix}{generated_text}" choice = { "index": 0, @@ -196,13 +205,17 @@ async def _generate_completion_response( """Generate mock completion response.""" request_id = request.request_id or f"cmpl-{random.randint(1000, 9999)}" + lora_prefix = "" if request.model not in self._current_lora_model else f"[lora_model] {request.model}: " if request.stream: # Streaming response - return SSE formatted strings created_time = int(asyncio.get_event_loop().time()) model_name = getattr(request, 'model', 'mock-model') - + for i in range(max_tokens): - token = f"test_{i} " + if i == 0: + token = f"{lora_prefix}test_{i} " + else: + token = f"test_{i} " if i == max_tokens - 1: # no space for the last token token = f"test_{i}" @@ -230,6 +243,7 @@ async def _generate_completion_response( else: # Non-streaming response - return response object generated_text = " ".join([f"test_{i}" for i in range(max_tokens)]) + generated_text = f"{lora_prefix}{generated_text}" choice = { "index": 0, @@ -665,17 +679,17 @@ async def _generate_completion_response( # yield response -# class FakeLoraModelLoader: -# """Fake LoRA model loader for testing.""" +class FakeLoraModelLoader(LoraModelLoader): + """Fake LoRA model loader for testing.""" -# async def load_model(self, lora_model_id: str, llm_config: LLMConfig) -> DiskMultiplexConfig: -# """Load a fake LoRA model.""" -# return DiskMultiplexConfig( -# model_id=lora_model_id, -# max_total_tokens=llm_config.max_request_context_length, -# local_path="/fake/local/path", -# lora_assigned_int_id=random.randint(1, 100), -# ) + async def load_model(self, lora_model_id: str, llm_config: LLMConfig) -> DiskMultiplexConfig: + """Load a fake LoRA model.""" + return DiskMultiplexConfig( + model_id=lora_model_id, + max_total_tokens=llm_config.max_request_context_length, + local_path="/fake/local/path", + lora_assigned_int_id=random.randint(1, 100), + ) # # 
Utility functions for JSON generation and validation diff --git a/python/ray/llm/tests/serve/utils/testing_utils.py b/python/ray/llm/tests/serve/utils/testing_utils.py index e4bb2b9c7860..c8c967e19476 100644 --- a/python/ray/llm/tests/serve/utils/testing_utils.py +++ b/python/ray/llm/tests/serve/utils/testing_utils.py @@ -1,4 +1,7 @@ -"""Shared testing utilities for Ray LLM serve tests.""" +"""Shared testing utilities for Ray LLM serve tests. + +This is written with assumptions around how mocks for testing are expected to behave. +""" import json import re @@ -15,18 +18,22 @@ class LLMResponseValidator: """Reusable validation logic for LLM responses.""" @staticmethod - def get_expected_content(api_type: str, max_tokens: int) -> str: + def get_expected_content(api_type: str, max_tokens: int, lora_model_id: str = "") -> str: """Get expected content based on API type.""" - return " ".join(f"test_{i}" for i in range(max_tokens)) + expected_content = " ".join(f"test_{i}" for i in range(max_tokens)) + if lora_model_id: + expected_content = f"[lora_model] {lora_model_id}: {expected_content}" + return expected_content @staticmethod def validate_non_streaming_response( response: Union[ChatCompletionResponse, CompletionResponse], api_type: str, - max_tokens: int + max_tokens: int, + lora_model_id: str = "" ): """Validate non-streaming responses.""" - expected_content = LLMResponseValidator.get_expected_content(api_type, max_tokens) + expected_content = LLMResponseValidator.get_expected_content(api_type, max_tokens, lora_model_id) if api_type == "chat": assert isinstance(response, ChatCompletionResponse) @@ -39,7 +46,8 @@ def validate_non_streaming_response( def validate_streaming_chunks( chunks: List[str], api_type: str, - max_tokens: int + max_tokens: int, + lora_model_id: str = "" ): """Validate streaming response chunks.""" # Should have max_tokens + 1 chunks (tokens + [DONE]) @@ -52,16 +60,20 @@ def validate_streaming_chunks( assert match is not None chunk_data = json.loads(match.group(1)) + expected_chunk = f"test_{chunk_iter}" + if lora_model_id and chunk_iter == 0: + expected_chunk = f"[lora_model] {lora_model_id}: {expected_chunk}" + if api_type == "chat": delta = chunk_data["choices"][0]["delta"] if chunk_iter == 0: assert delta["role"] == "assistant" else: assert delta["role"] is None - assert delta["content"].strip() == f"test_{chunk_iter}" + assert delta["content"].strip() == expected_chunk elif api_type == "completion": text = chunk_data["choices"][0]["text"] - assert text.strip() == f"test_{chunk_iter}" + assert text.strip() == expected_chunk @staticmethod def validate_embedding_response( From 88a45e03b5c1c5e2d8e2a486cb9528c07e01d804 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 2 Jul 2025 13:56:18 -0700 Subject: [PATCH 26/37] wip Signed-off-by: Kourosh Hakhamaneshi --- .../cpu/deployments/llm/test_llm_server.py | 33 +++++++++---------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py index f51fd2f67abc..4dc3d292343f 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py @@ -10,7 +10,10 @@ from ray.llm._internal.serve.configs.server_models import LoraConfig @pytest.fixture -def serve_handle(mock_llm_config): +def serve_handle(mock_llm_config, stream_batching_interval_ms = 0): + mock_llm_config.experimental_configs = { + 
"stream_batching_interval_ms": stream_batching_interval_ms, + } app = serve.deployment(LLMServer).bind(mock_llm_config, engine_cls=MockVLLMEngine) handle = serve.run(app) @@ -19,9 +22,10 @@ def serve_handle(mock_llm_config): handle = handle.options(stream=True) yield handle serve.shutdown() + @pytest.fixture -def multiplexed_serve_handle(mock_llm_config, stream_batching_interval_ms): +def multiplexed_serve_handle(mock_llm_config, stream_batching_interval_ms = 0): mock_llm_config.experimental_configs = { "stream_batching_interval_ms": stream_batching_interval_ms, } @@ -45,37 +49,30 @@ class TestLLMServer: @pytest.mark.parametrize("api_type", ["chat", "completion"]) @pytest.mark.parametrize("stream", [False, True]) @pytest.mark.parametrize("max_tokens", [5]) - @pytest.mark.parametrize("batching_interval_ms", [0, 10000]) + @pytest.mark.parametrize("stream_batching_interval_ms", [0, 10000]) @pytest.mark.asyncio async def test_unified_llm_server( self, - create_server, + serve_handle, mock_llm_config, mock_chat_request, mock_completion_request, api_type: str, stream: bool, max_tokens: int, - batching_interval_ms: int + stream_batching_interval_ms: int ): """Unified test for both chat and completion APIs, streaming and non-streaming.""" - # Override the batching interval config (only matters for streaming) - if stream: - mock_llm_config.experimental_configs = { - "stream_batching_interval_ms": batching_interval_ms, - } - - server = await create_server(mock_llm_config, engine_cls=MockVLLMEngine) # Create request based on API type if api_type == "chat": request = mock_chat_request - batched_chunks = await server.chat(request) + batched_chunks = serve_handle.chat.remote(request) elif api_type == "completion": request = mock_completion_request - batched_chunks = await server.completions(request) + batched_chunks = serve_handle.completions.remote(request) - print(f"\n\n_____ {api_type.upper()} ({'STREAMING' if stream else 'NON-STREAMING'}) max_tokens={max_tokens} batching_interval_ms={batching_interval_ms} _____\n\n") + print(f"\n\n_____ {api_type.upper()} ({'STREAMING' if stream else 'NON-STREAMING'}) max_tokens={max_tokens} batching_interval_ms={stream_batching_interval_ms} _____\n\n") if stream: # Collect responses from the stream @@ -104,13 +101,13 @@ async def test_unified_llm_server( @pytest.mark.asyncio async def test_embedding_llm_server( self, - create_server, + serve_handle, mock_llm_config, mock_embedding_request, dimensions: Optional[int] ): """Test embedding API from LLMServer perspective.""" - server = await create_server(mock_llm_config, engine_cls=MockVLLMEngine) + # Create embedding request request = mock_embedding_request @@ -118,7 +115,7 @@ async def test_embedding_llm_server( print(f"\n\n_____ EMBEDDING SERVER dimensions={dimensions} _____\n\n") # Get the response - batched_chunks = await server.embeddings(request) + batched_chunks = serve_handle.embeddings.remote(request) # Collect responses (should be just one) chunks = [] From 4e9a3d298e7d73837b394aedf41498f6da8329be Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 2 Jul 2025 15:59:40 -0700 Subject: [PATCH 27/37] added telemetry tests Signed-off-by: Kourosh Hakhamaneshi --- .../ray/llm/_internal/serve/configs/server_models.py | 2 +- .../_internal/serve/deployments/llm/llm_server.py | 5 +---- .../serve/deployments/llm/vllm/vllm_engine.py | 12 +++++++++++- .../deployments/utils/node_initialization_utils.py | 4 ---- .../serve/cpu/deployments/llm/test_llm_server.py | 10 ++++++++++ 5 files changed, 23 insertions(+), 10 
deletions(-) diff --git a/python/ray/llm/_internal/serve/configs/server_models.py b/python/ray/llm/_internal/serve/configs/server_models.py index c8fd87c93bc4..631a671abff0 100644 --- a/python/ray/llm/_internal/serve/configs/server_models.py +++ b/python/ray/llm/_internal/serve/configs/server_models.py @@ -242,7 +242,7 @@ class LLMConfig(BaseModelExtended): ) _supports_vision: bool = PrivateAttr(False) - _model_architecture: str = PrivateAttr("") + _model_architecture: str = PrivateAttr("UNSPECIFIED") _engine_config: EngineConfigType = PrivateAttr(None) def _infer_supports_vision(self, model_id_or_path: str) -> None: diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py index 8eb476066055..7783e3d4af16 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py @@ -174,10 +174,7 @@ async def _start_engine(self): await self.engine.start() # Push telemetry reports for the model in the current deployment. - # Note: the model architecture is only available after node initialized and the - # engine is started. - if self._llm_config.model_architecture: - push_telemetry_report_for_all_models(all_models=[self._llm_config]) + push_telemetry_report_for_all_models(all_models=[self._llm_config]) def _get_batch_interval_ms(self, stream: bool = True) -> int: diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index 807ce7d762c1..beda088a196e 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -182,13 +182,23 @@ async def start(self) -> None: from vllm.entrypoints.openai.api_server import init_app_state - node_initialization = await initialize_node(self.llm_config) + node_initialization = await initialize_node(self.llm_config) + ( vllm_engine_args, vllm_frontend_args, vllm_engine_config, ) = self._prepare_engine_config(node_initialization) + # Apply checkpoint info to the llm_config. + # This is needed for capturing model capabilities + # (e.g. supports vision, etc.) on the llm_config. 
+ config = self.llm_config.get_engine_config() + self.llm_config.apply_checkpoint_info( + config.actual_hf_model_id, + trust_remote_code=config.trust_remote_code, + ) + self._engine_client = self._start_async_llm_engine( vllm_engine_args, vllm_engine_config, diff --git a/python/ray/llm/_internal/serve/deployments/utils/node_initialization_utils.py b/python/ray/llm/_internal/serve/deployments/utils/node_initialization_utils.py index 893778024801..601bb97e3cec 100644 --- a/python/ray/llm/_internal/serve/deployments/utils/node_initialization_utils.py +++ b/python/ray/llm/_internal/serve/deployments/utils/node_initialization_utils.py @@ -116,10 +116,6 @@ async def initialize_node(llm_config: LLMConfig) -> InitializeNodeOutput: download_extra_files=True, ) - llm_config.apply_checkpoint_info( - engine_config.actual_hf_model_id, - trust_remote_code=engine_config.trust_remote_code, - ) return InitializeNodeOutput( placement_group=pg, runtime_env=runtime_env, extra_init_kwargs=extra_init_kwargs diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py index 4dc3d292343f..d2c8a7bfdb3e 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py @@ -2,6 +2,7 @@ from typing import Optional import pytest +from unittest.mock import patch from ray.llm.tests.serve.mocks.mock_vllm_engine import MockVLLMEngine, FakeLoraModelLoader from ray.llm.tests.serve.utils.testing_utils import LLMResponseValidator @@ -234,6 +235,15 @@ async def test_multiplexed_request_handling( # Validate non-streaming response with LoRA model ID LLMResponseValidator.validate_non_streaming_response(chunks[0], api_type, max_tokens, lora_model_id=request.model) + + + @pytest.mark.asyncio + async def test_push_telemetry(self, create_server, mock_llm_config): + """Test that the telemetry push is called properly.""" + with patch("ray.llm._internal.serve.deployments.llm.llm_server.push_telemetry_report_for_all_models") as mock_push_telemetry: + await create_server(mock_llm_config, engine_cls=MockVLLMEngine) + mock_push_telemetry.assert_called_once() + From 343a39594937136cf7883316ac2d221086e16e68 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 2 Jul 2025 16:18:22 -0700 Subject: [PATCH 28/37] remove tests that we already had a good coverage on Signed-off-by: Kourosh Hakhamaneshi --- .../serve/deployments/llm/llm_server.py | 3 - .../test_lora_deployment_base_client.py | 8 +- .../multiplex/test_multiplex_deployment.py | 83 -------- .../cpu/deployments/llm/test_llm_engine.py | 16 +- .../deployments/llm/vllm/test_vllm_engine.py | 197 ------------------ 5 files changed, 6 insertions(+), 301 deletions(-) delete mode 100644 python/ray/llm/tests/serve/cpu/deployments/llm/multiplex/test_multiplex_deployment.py delete mode 100644 python/ray/llm/tests/serve/cpu/deployments/llm/vllm/test_vllm_engine.py diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py index 7783e3d4af16..31089d6148d7 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py @@ -3,11 +3,9 @@ from abc import ABC, abstractmethod from typing import Any, Dict, Optional, Type, Union, AsyncGenerator, List -# Third-party imports from ray import serve from ray._common.utils import import_attr -# Local imports from 
ray.llm._internal.serve.configs.constants import ( DEFAULT_HEALTH_CHECK_PERIOD_S, DEFAULT_HEALTH_CHECK_TIMEOUT_S, @@ -24,7 +22,6 @@ EmbeddingResponse, LLMChatResponse, LLMCompletionsResponse, - LLMEmbeddingsResponse, ) from ray.llm._internal.serve.deployments.llm.multiplex.lora_model_loader import ( LoraModelLoader, diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/multiplex/test_lora_deployment_base_client.py b/python/ray/llm/tests/serve/cpu/deployments/llm/multiplex/test_lora_deployment_base_client.py index ec2e75bbf267..7c806cade746 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/multiplex/test_lora_deployment_base_client.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/multiplex/test_lora_deployment_base_client.py @@ -8,7 +8,7 @@ from ray import serve from ray.llm._internal.serve.configs.openai_api_models import ModelCard from ray.llm._internal.serve.deployments.llm.llm_server import LLMDeployment -from ray.llm.tests.serve.mocks.mock_vllm_engine import MockMultiplexEngine +from ray.llm.tests.serve.mocks.mock_vllm_engine import MockVLLMEngine from ray.serve.handle import DeploymentHandle from ray.serve.llm import LLMConfig, LLMRouter, LoraConfig @@ -57,7 +57,7 @@ def get_mocked_llm_deployments(llm_configs) -> List[DeploymentHandle]: llm_deployments.append( deployment.bind( llm_config=llm_config, - engine_cls=MockMultiplexEngine, + engine_cls=MockVLLMEngine, ) ) return llm_deployments @@ -100,7 +100,7 @@ async def test_lora_get_model(shutdown_ray_and_serve, disable_placement_bundles) assert isinstance(base_model_config, ModelCard) base_model_data = base_model_config.model_dump() assert base_model_data["id"] == base_model_id - base_model_config = base_model_data["rayllm_metadata"] + base_model_config = base_model_data["metadata"] # Case 3: model has a multiplex config in the cloud. 
llm_config = VLLM_APP.model_copy(deep=True) @@ -125,7 +125,7 @@ async def fake_get_lora_model_metadata(*args, **kwargs): assert isinstance(lora_model_config, ModelCard) lora_model_data = lora_model_config.model_dump() assert lora_model_data["id"] == lora_model - lora_metadata = lora_model_data["rayllm_metadata"] + lora_metadata = lora_model_data["metadata"] assert lora_metadata["model_id"] == lora_model assert lora_metadata["base_model_id"] == base_model_id assert lora_metadata["max_request_context_length"] == 4096 diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/multiplex/test_multiplex_deployment.py b/python/ray/llm/tests/serve/cpu/deployments/llm/multiplex/test_multiplex_deployment.py deleted file mode 100644 index 4680ad8b273f..000000000000 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/multiplex/test_multiplex_deployment.py +++ /dev/null @@ -1,83 +0,0 @@ -import sys - -import pytest - -from ray import serve -from ray.llm._internal.serve.configs.prompt_formats import ( - Prompt, -) -from ray.llm._internal.serve.configs.server_models import ( - LLMConfig, -) -from ray.llm._internal.serve.deployments.llm.llm_server import LLMDeployment -from ray.llm.tests.serve.mocks.mock_vllm_engine import ( - FakeLoraModelLoader, - MockMultiplexEngine, -) - - -@pytest.fixture(name="handle") -def handle(shutdown_ray_and_serve): - - llm_config = LLMConfig( - model_loading_config={ - "model_id": "meta-llama/Llama-2-7b-hf", - }, - lora_config={ - "max_num_adapters_per_replica": 16, - "dynamic_lora_loading_path": "s3://my/s3/path_here", - }, - ) - - handle = serve.run( - LLMDeployment.options(placement_group_bundles=[{"CPU": 1}],).bind( - llm_config, - engine_cls=MockMultiplexEngine, - model_downloader=FakeLoraModelLoader(), - ), - ) - - return handle - - -@pytest.mark.asyncio -@pytest.mark.parametrize("stream_tokens", [True, False]) -@pytest.mark.parametrize("multiplexed_model_id", ["test_model", None]) -async def test_multiplex_deployment( - handle, - stream_tokens: bool, - multiplexed_model_id: str, -): - - gen = handle.options( - stream=True, multiplexed_model_id=multiplexed_model_id - )._predict.remote( - "req_id", - Prompt(prompt="Generate some sql please.", use_prompt_format=False), - stream=stream_tokens, - ) - - # gen is an async generator - # we need to convert it to a list of outputs in one line - outputs = [] - async for x in gen: - outputs.append(x) - - assert len(outputs) == 1 - output = outputs[0] - - assert output.stream == stream_tokens - - if multiplexed_model_id is None: - assert output.disk_multiplex_config is None - else: - assert output.disk_multiplex_config.model_dump() == { - "model_id": multiplexed_model_id, - "max_total_tokens": None, - "local_path": "/local/path", - "lora_assigned_int_id": 1, - } - - -if __name__ == "__main__": - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py index cdc44c95077d..736c1aeff379 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py @@ -15,21 +15,9 @@ from ray.llm.tests.serve.mocks.mock_vllm_engine import MockVLLMEngine from ray.llm.tests.serve.utils.testing_utils import LLMResponseValidator -from ray.serve.llm import LLMConfig, ModelLoadingConfig -from ray.llm._internal.serve.configs.openai_api_models import ( - ChatCompletionRequest, - ChatCompletionResponse, - CompletionRequest, - CompletionResponse, 
- EmbeddingCompletionRequest, - EmbeddingResponse -) -import pytest -from typing import AsyncGenerator, Optional - - - +import pytest +from typing import Optional class TestMockLLMEngine: diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/vllm/test_vllm_engine.py b/python/ray/llm/tests/serve/cpu/deployments/llm/vllm/test_vllm_engine.py deleted file mode 100644 index 8d244efba444..000000000000 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/vllm/test_vllm_engine.py +++ /dev/null @@ -1,197 +0,0 @@ -import asyncio -import json -import sys -from types import SimpleNamespace -from typing import List -from unittest.mock import Mock - -import pytest - -from ray.llm._internal.serve.configs.server_models import ( - FinishReason, - LLMConfig, -) -from ray.llm._internal.serve.deployments.llm.vllm.vllm_engine import ( - VLLMEngine, -) -from ray.llm._internal.serve.deployments.llm.vllm.vllm_models import ( - VLLMGenerationRequest, - VLLMSamplingParams, -) - - -class FakeVLLMEngine: - def __init__(self, mock: Mock, output=None): - self.engine = mock - - self._output = output or [] - self.num_generated = 0 - - async def generate(self, *args, **kwargs): - # Record the call - self.engine.generate(*args, **kwargs) - - for x in self._output: - await asyncio.sleep(0.01) - self.num_generated += 1 - yield x - - async def abort(self, request_id: str): - # Record the call - self.engine.abort(request_id) - - def _abort(self, request_id: str, **kwargs): - # Record the call - self.engine.abort(request_id) - - -def get_fake_responses(*tokens: List[str]): - total = "" - output = [] - - for token in tokens: - total += token - # For some reason vLLM appears to return the full text on each iteration - # We should fix this in vllm - output.append( - SimpleNamespace( - outputs=[ - SimpleNamespace( - text=total, - finish_reason="stop", # for some reason, vllm returns a finish reason on all tokens. We should fix this too. 
- token_ids=[0], - logprobs=[], - ) - ], - prompt_token_ids=[0], - metrics=SimpleNamespace(time_in_queue=0.01), - ) - ) - - return output - - -def get_fake_engine_and_request(llm_config: LLMConfig, expected_out: List[str]): - vllm_engine = VLLMEngine(llm_config) - # We normally set the model config when calling VLLMEngine.start() - vllm_engine.model_config = Mock() - vllm_engine.model_config.max_model_len = 1 - - engine_mock = Mock() - vllm_engine.engine = FakeVLLMEngine(engine_mock, get_fake_responses(*expected_out)) - - req = VLLMGenerationRequest( - prompt="prompt", - request_id="req_id", - sampling_params=VLLMSamplingParams(), - disk_multiplex_config=None, - stream=True, - ) - return vllm_engine, req, engine_mock - - -class TestVLLMEngine: - """Test the VLLMEngine.""" - - @pytest.mark.asyncio - async def test_generate(self, llm_config): - expected_out = ["hi ", "i ", "am ", "vllm."] - vllm_engine, req, engine_mock = get_fake_engine_and_request( - llm_config, expected_out - ) - - cur_idx = 0 - async for x in vllm_engine.generate(req): - if cur_idx < len(expected_out): - assert x.generated_text == expected_out[cur_idx] - cur_idx += 1 - assert x.generation_time == pytest.approx( - 0.01, abs=0.01 - ), "We are sleeping for this long before returning tokens in the fake" - assert ( - x.num_input_tokens == 1 - ), "We are setting the num input tokens to len 1 in the fake output" - else: - assert x.finish_reason == FinishReason.STOP - - await asyncio.sleep(0.02) # wait for asyncio task scheduling - - # Abort should be called - engine_mock.abort.assert_called_once_with("req_id") - - @pytest.mark.asyncio - async def test_vllm_engine_error_in_caller(self, llm_config): - expected_out = ["hi ", "i ", "am ", "vllm."] - vllm_engine, req, engine_mock = get_fake_engine_and_request( - llm_config, expected_out - ) - - with pytest.raises(RuntimeError): - async for _x in vllm_engine.generate(req): - raise RuntimeError() - - await asyncio.sleep(0.02) # wait for asyncio task scheduling - # Abort should be called - engine_mock.abort.assert_called_once_with("req_id") - - @pytest.mark.asyncio - async def test_vllm_engine_caller_cancellation(self, llm_config): - expected_out = ["hi ", "i ", "am ", "vllm.", "and more"] * 10 # many tokens - vllm_engine, req, engine_mock = get_fake_engine_and_request( - llm_config, expected_out - ) - - async def run(): - async for x in vllm_engine.generate(req): - print(x) - - task = asyncio.create_task(run()) - await asyncio.sleep(0.02) # wait for some tokens to be returned - - # Cancel the task - task.cancel() - - await asyncio.sleep(0.02) # wait for asyncio task scheduling - # Abort should be called - engine_mock.abort.assert_called_once_with("req_id") - assert ( - vllm_engine.engine.num_generated <= 4 - ), "We should have generated not more than 4 tokens" - - @pytest.mark.parametrize("enable_json_mode", [True, False]) - def test_parse_sampling_params_json_mode( - self, llm_config: LLMConfig, enable_json_mode: bool - ): - # Make a deep copy to avoid modifying the session-scoped fixture - llm_config = llm_config.model_copy(deep=True) - vllm_engine = VLLMEngine(llm_config) - - # Mock model_config to avoid None errors - vllm_engine.model_config = Mock() - vllm_engine.model_config.max_model_len = 1000 - - # Create sampling params with response format - sampling_params = VLLMSamplingParams( - response_format={ - "type": "json_object", - "schema": { - "type": "object", - "properties": {"name": {"type": "string"}}, - }, - } - ) - - # Parse the sampling params - parsed_params = 
vllm_engine._parse_sampling_params(sampling_params) - - # For both cases we should now have guided decoding since we are using oss vllm. - # When json_mode is disabled, guided_decoding should be used instead - assert hasattr(parsed_params, "guided_decoding") - # Parse the JSON string from guided_decoding into a dict - guided_json = json.loads(parsed_params.guided_decoding.json) - assert guided_json == sampling_params.response_format.json_schema - assert getattr(parsed_params, "response_format", None) is None - - -if __name__ == "__main__": - sys.exit(pytest.main(["-v", __file__])) From e0470cc49af15ffa3f4409829aa914bfa06db192 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 2 Jul 2025 16:29:01 -0700 Subject: [PATCH 29/37] fix test_router Signed-off-by: Kourosh Hakhamaneshi --- .../llm/tests/serve/cpu/deployments/routers/test_router.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/ray/llm/tests/serve/cpu/deployments/routers/test_router.py b/python/ray/llm/tests/serve/cpu/deployments/routers/test_router.py index 5ba14036df08..4204231fd069 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/routers/test_router.py +++ b/python/ray/llm/tests/serve/cpu/deployments/routers/test_router.py @@ -86,7 +86,7 @@ async def test_chat(self, stream_batching_interval_ms, client, stream): role = response.choices[0].message.role assert role == "assistant" - assert text == "".join([f"test_{i} " for i in range(n_tokens)]) + assert text.strip() == " ".join([f"test_{i}" for i in range(n_tokens)]) @pytest.mark.asyncio @pytest.mark.parametrize("stream_batching_interval_ms", [None, 0, 10000]) @@ -112,8 +112,8 @@ async def test_completion(self, stream_batching_interval_ms, client, stream): text = response.choices[0].text # The mock engine produces "test_0 test_1 test_2 ..." 
pattern - expected_text = "".join([f"test_{i} " for i in range(n_tokens)]) - assert text == expected_text + expected_text = " ".join([f"test_{i}" for i in range(n_tokens)]) + assert text.strip() == expected_text def test_router_with_num_router_replicas_config(self): """Test the router with num_router_replicas config.""" From e9725c3c7909e89586ae9cf63a215740a8468696 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 2 Jul 2025 18:45:29 -0700 Subject: [PATCH 30/37] pd Signed-off-by: Kourosh Hakhamaneshi --- .../prefill_decode_disagg.py | 121 +++++++++------- .../test_prefill_decode_disagg.py | 132 +----------------- 2 files changed, 71 insertions(+), 182 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/prefill_decode_disagg/prefill_decode_disagg.py b/python/ray/llm/_internal/serve/deployments/prefill_decode_disagg/prefill_decode_disagg.py index 399ddbba584b..3d94377a0688 100644 --- a/python/ray/llm/_internal/serve/deployments/prefill_decode_disagg/prefill_decode_disagg.py +++ b/python/ray/llm/_internal/serve/deployments/prefill_decode_disagg/prefill_decode_disagg.py @@ -8,12 +8,9 @@ from vllm.config import KVTransferConfig from ray import serve -from ray.llm._internal.serve.configs.prompt_formats import Prompt from ray.llm._internal.serve.configs.server_models import ( - LLMRawResponse, parse_args as parse_llm_configs, ) -from ray.llm._internal.serve.deployments.llm.llm_server import ResponsePostprocessor from ray.llm._internal.serve.deployments.llm.vllm.vllm_models import ( KV_TRANSFER_PARAMS_KEY, ) @@ -26,8 +23,18 @@ ModelLoadingConfig, build_llm_deployment, ) +from ray.llm._internal.serve.configs.openai_api_models import ( + ChatCompletionRequest, + CompletionRequest, + ChatCompletionResponse, + CompletionResponse, + ErrorResponse, + EmbeddingRequest, + EmbeddingResponse, +) logger = logging.getLogger(__name__) +RequestType = Union[ChatCompletionRequest, CompletionRequest] class PDServingArgs(BaseModel): @@ -92,27 +99,19 @@ async def __init__( llm_config, ) - self.prefill_server = prefill_server - self.decode_server = decode_server - - async def _predict( - self, - request_id: str, - prompt: Prompt, - stream: bool, - ) -> AsyncGenerator[LLMRawResponse, None]: - """ - Disaggregate the P/D requests: - 1. Send the request to the prefill server. - 2. Parse the response and forward necessary fields to the decode server. - 3. Return the response from the decode server. 
- """ - + self.prefill_server = prefill_server.options(stream=True) + self.decode_server = decode_server.options(stream=True) + + async def embeddings(self, request: EmbeddingRequest) -> AsyncGenerator[EmbeddingResponse, None]: + raise NotImplementedError("Embedding is not supported for P/D disaggregation") + + + def _prepare_prefill_request(self, request: RequestType) -> RequestType: assert ( - prompt.parameters.get(KV_TRANSFER_PARAMS_KEY, None) is None - ), f"{KV_TRANSFER_PARAMS_KEY} should be empty before proxy" - prefill_prompt = prompt.model_copy(deep=True) - prefill_prompt.parameters[KV_TRANSFER_PARAMS_KEY] = { + getattr(request, "kv_transfer_params", None) is None + ), f"kv_transfer_params should be empty before proxy" + prefill_request = request.model_copy(deep=True) + prefill_request.kv_transfer_params = { "do_remote_decode": True, "do_remote_prefill": False, "remote_engine_id": None, @@ -120,37 +119,55 @@ async def _predict( "remote_host": None, "remote_port": None, } - prefill_prompt.parameters["max_tokens"] = 1 - - prefill_response_gen: AsyncGenerator[ - LLMRawResponse, None - ] = self.prefill_server.options( - # _predict returns generator, we have to set stream=True - stream=True - )._predict.remote( - request_id=request_id, prompt=prefill_prompt, stream=False - ) - - prefill_response = await ResponsePostprocessor.merge_stream( - prefill_response_gen - ) - - if prefill_response.error: - logger.error(f"Prefill server returned error: {prefill_response.error}") - yield prefill_response + prefill_request.max_tokens = 1 + prefill_request.stream = False + + return prefill_request + + + def _prepare_decode_request(self, request: RequestType, prefill_chunk: Union[ChatCompletionResponse, CompletionResponse]) -> RequestType: + decode_request = request.model_copy(deep=True) + decode_request.kv_transfer_params = prefill_chunk.kv_transfer_params + + return decode_request + + async def _handle_request( + self, + request: RequestType, + ) -> AsyncGenerator[Union[str, ChatCompletionResponse, CompletionResponse, ErrorResponse], None]: + + if isinstance(request, ChatCompletionRequest): + method = "chat" + elif isinstance(request, CompletionRequest): + method = "completions" + else: + raise ValueError(f"Unsupported request type: {type(request)}") + + prefill_request = self._prepare_prefill_request(request) + prefill_gen = getattr(self.prefill_server, method).remote(prefill_request) + + prefill_chunk = await anext(prefill_gen) + + if isinstance(prefill_chunk, ErrorResponse): + logger.error(f"Prefill returned error: {prefill_chunk.error}") + yield prefill_chunk return - - kv_transfer_params = prefill_response.metadata[KV_TRANSFER_PARAMS_KEY] - logger.debug( - f"Prefill metadata[{KV_TRANSFER_PARAMS_KEY}]: {kv_transfer_params}" - ) - prompt.parameters[KV_TRANSFER_PARAMS_KEY] = kv_transfer_params - - async for chunk in self.decode_server.options(stream=True)._predict.remote( - request_id=request_id, prompt=prompt, stream=stream - ): + + decode_request = self._prepare_decode_request(request, prefill_chunk) + decode_gen = self.decode_server.chat.remote(decode_request) + + + async for chunk in decode_gen: yield chunk - + + + async def chat(self, request: ChatCompletionRequest) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: + return self._handle_request(request) + + + async def completions(self, request: CompletionRequest) -> AsyncGenerator[Union[str, CompletionResponse, ErrorResponse], None]: + return self._handle_request(request) + @classmethod def as_deployment(cls) -> 
serve.Deployment: """Turns PDProxyServer into a Ray Serve deployment.""" diff --git a/python/ray/llm/tests/serve/cpu/deployments/prefill_decode_disagg/test_prefill_decode_disagg.py b/python/ray/llm/tests/serve/cpu/deployments/prefill_decode_disagg/test_prefill_decode_disagg.py index c73e8d3cfa6f..c6cd17b3f66a 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/prefill_decode_disagg/test_prefill_decode_disagg.py +++ b/python/ray/llm/tests/serve/cpu/deployments/prefill_decode_disagg/test_prefill_decode_disagg.py @@ -1,18 +1,12 @@ import sys -from unittest.mock import patch import pytest -from vllm.config import KVTransferConfig -from vllm.platforms.interface import UnspecifiedPlatform -from ray.llm._internal.serve.configs.prompt_formats import Prompt -from ray.llm._internal.serve.configs.server_models import LLMRawResponse +from ray.serve.llm import LLMConfig + from ray.llm._internal.serve.deployments.prefill_decode_disagg.prefill_decode_disagg import ( build_app, ) -from ray.llm.tests.serve.mocks.mock_vllm_engine import MockPDDisaggVLLMEngine -from ray.serve.llm import LLMConfig, ModelLoadingConfig -from ray.serve.llm.openai_api_models import ChatCompletionRequest class TestServingArgsParsing: @@ -55,127 +49,5 @@ def test_parse_dict(self): assert app is not None -class FakePlatform(UnspecifiedPlatform): - """ - vllm UnspecifiedPlatform has some interfaces that's left unimplemented, which - could trigger exception in following tests. So we implement needed interfaces - and patch. - """ - - def is_async_output_supported(self, enforce_eager: bool) -> bool: - return True - - -class TestPDDisaggLLMServer: - """Test PD-disaggregated LLM server. - - A real P/D disaggregation use case will spawn multiple LLM servers, - so this test suite just does smoke test and verifies certain expected - parameters exist in responses. - """ - - @pytest.mark.asyncio - @patch("vllm.platforms.current_platform", FakePlatform()) - async def test_chat_non_streaming( - self, - create_server, - # model_pixtral_12b is a fixture that only contains config files without weights - model_pixtral_12b, - ): - """This is smoke testing that normal chat completion works.""" - llm_config = LLMConfig( - # Here we - # 1. want to skip GPU placement in cpu test cases (https://github.com/ray-project/ray/blob/945b9d5dd55c9215d0aeb94a66cfda3b71c2fd43/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py#L330) - # 2. cannot set it to None, otherwise it defaults to use_gpu=True (https://github.com/ray-project/ray/blob/c7e07328c9efbd0d67bf2da4fa098d6492478ef4/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py#L159) - # 3. cannot use "CPU" or anything random, which violates the check (https://github.com/ray-project/ray/blob/945b9d5dd55c9215d0aeb94a66cfda3b71c2fd43/python/ray/llm/_internal/serve/configs/server_models.py#L325) - # so we select a non-NVIDIA type here: Intel-GAUDI. 
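For reference, the request handoff that PDProxyServer._handle_request implements above reduces to the following sketch. Method and field names are taken from this patch; the two Ray Serve deployment handles are assumed to already be configured with stream=True, and the sketch dispatches both the prefill and the decode phase through the same method name ("chat" or "completions"):

    async def proxy_request(prefill_server, decode_server, request, method: str):
        # 1. Prefill: ask for a single token so only the KV cache is produced,
        #    and seed kv_transfer_params so the connector knows to hand off.
        prefill_request = request.model_copy(deep=True)
        prefill_request.kv_transfer_params = {
            "do_remote_decode": True,
            "do_remote_prefill": False,
        }
        prefill_request.max_tokens = 1
        prefill_request.stream = False

        prefill_gen = getattr(prefill_server, method).remote(prefill_request)
        prefill_chunk = await anext(prefill_gen)  # single non-streamed response

        # 2. Decode: forward the kv_transfer_params returned by the prefill
        #    replica so the decode replica can pull the remote KV blocks.
        decode_request = request.model_copy(deep=True)
        decode_request.kv_transfer_params = prefill_chunk.kv_transfer_params

        async for chunk in getattr(decode_server, method).remote(decode_request):
            yield chunk
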
- accelerator_type="Intel-GAUDI", - model_loading_config=ModelLoadingConfig( - model_id=model_pixtral_12b, - ), - engine_kwargs={ - "kv_transfer_config": KVTransferConfig( - kv_connector="NixlConnector", - kv_role="kv_both", - ), - }, - ) - - server = await create_server(llm_config, engine_cls=MockPDDisaggVLLMEngine) - - # Create a chat completion request - request = ChatCompletionRequest( - model="test_model", - messages=[dict(role="user", content="Hello")], - stream=False, - max_tokens=5, - ) - - # Get the response - response_stream = await server.chat(request) - - # Collect responses (should be just one) - responses = [r async for r in response_stream] - - # Check that we got one response - assert len(responses) == 1 - assert responses[0].choices[0].message.role == "assistant" - assert ( - responses[0].choices[0].message.content - == "mock_pd_client_response_0 mock_pd_client_response_1 mock_pd_client_response_2 mock_pd_client_response_3 mock_pd_client_response_4 " - ) - - @pytest.mark.asyncio - @patch("vllm.platforms.current_platform", FakePlatform()) - async def test_predict_non_streaming( - self, - create_server, - # model_pixtral_12b is a fixture that only contains config files without weights - model_pixtral_12b, - ): - """Test non-streaming predict.""" - llm_config = LLMConfig( - # Here we - # 1. want to skip GPU placement in cpu test cases (https://github.com/ray-project/ray/blob/945b9d5dd55c9215d0aeb94a66cfda3b71c2fd43/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py#L330) - # 2. cannot set it to None, otherwise it defaults to use_gpu=True (https://github.com/ray-project/ray/blob/c7e07328c9efbd0d67bf2da4fa098d6492478ef4/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py#L159) - # 3. cannot use "CPU" or anything random, which violates the check (https://github.com/ray-project/ray/blob/945b9d5dd55c9215d0aeb94a66cfda3b71c2fd43/python/ray/llm/_internal/serve/configs/server_models.py#L325) - # so we select a non-NVIDIA type here: Intel-GAUDI. 
- accelerator_type="Intel-GAUDI", - model_loading_config=ModelLoadingConfig( - model_id=model_pixtral_12b, - ), - engine_kwargs={ - "kv_transfer_config": KVTransferConfig( - kv_connector="NixlConnector", - kv_role="kv_both", - ), - }, - ) - - server = await create_server(llm_config, engine_cls=MockPDDisaggVLLMEngine) - - # Create a predict request - request = Prompt( - prompt="test prompt", - parameters=dict( - max_tokens=1, - stream=False, - kv_transfer_params=dict(field_that_does_not_matter="1"), - ), - ) - - # Get the response - responses: list[LLMRawResponse] = [] - async for response in server._predict( - request_id="test_request_id", prompt=request, stream=False - ): - responses.append(response) - - # Collect responses (should be just one) - assert len(responses) == 1 - assert responses[0].generated_text == "mock_pd_client_response_0 " - assert responses[0].metadata is not None - - if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) From d4d8a8d276781dbac82d65b747c32cc67f6ddc5a Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 2 Jul 2025 18:54:51 -0700 Subject: [PATCH 31/37] delete dead code Signed-off-by: Kourosh Hakhamaneshi --- .../_internal/serve/configs/prompt_formats.py | 119 ------------------ .../cpu/configs/test_openai_api_models.py | 29 ----- .../serve/cpu/configs/test_prompt_formats.py | 83 ------------ 3 files changed, 231 deletions(-) delete mode 100644 python/ray/llm/_internal/serve/configs/prompt_formats.py delete mode 100644 python/ray/llm/tests/serve/cpu/configs/test_openai_api_models.py delete mode 100644 python/ray/llm/tests/serve/cpu/configs/test_prompt_formats.py diff --git a/python/ray/llm/_internal/serve/configs/prompt_formats.py b/python/ray/llm/_internal/serve/configs/prompt_formats.py deleted file mode 100644 index fe1bdf47527b..000000000000 --- a/python/ray/llm/_internal/serve/configs/prompt_formats.py +++ /dev/null @@ -1,119 +0,0 @@ -from typing import ( - Any, - Dict, - List, - Literal, - Optional, - Union, -) - -from pydantic import ( - BaseModel, - field_validator, - model_validator, -) - -from ray.llm._internal.common.utils.import_utils import try_import - -transformers = try_import("transformers") - - -class Text(BaseModel): - type: str = "text" - text: str - - -# Ref: https://huggingface.co/mistral-community/pixtral-12b -# -# Community version of pixtral uses the key `content` instead of `text` in the content. -# This is to support the "content" content type in the prompt format, as opposite of -# the "text" content from the above which most other model uses. -class Content(BaseModel): - type: str = "text" - content: str - - -class Image(BaseModel): - type: str = "image_url" - image_url: Dict - - @field_validator("image_url") - @classmethod - def check_image_url(cls, value): - """Checks if the image_url is a dict with a 'url' key. - Example: - image_url = { - "url": "https://example.com/image.png" - } - """ - if "url" not in value or not value["url"] or not isinstance(value["url"], str): - raise ValueError( - # TODO(xwjiang): Link to doc. - "Expecting 'url' string to be provided under 'image_url' dict." 
- ) - return value - - -ContentList = List[Union[Image, Text, Content]] - - -class Message(BaseModel): - role: Literal["system", "assistant", "user"] - content: Optional[Union[str, ContentList]] = None - - def __str__(self): - return self.model_dump_json() - - @model_validator(mode="after") - def check_fields(self): - if self.role == "system": - if not isinstance(self.content, str): - raise ValueError("System content must be a string") - if self.role == "user" and self.content is None: - raise ValueError("User content must not be None.") - if self.role == "assistant": - # passing a regular assistant message - if self.content is not None and not isinstance(self.content, str): - raise ValueError("content must be a string or None") - return self - - -class Prompt(BaseModel): - prompt: Union[str, List[Message]] - use_prompt_format: bool = True - parameters: Optional[Dict[str, Any]] = None - - @field_validator("parameters", mode="before") - @classmethod - def parse_parameters(cls, value): - if isinstance(value, BaseModel): - # Use exclude_unset so that we can distinguish unset values from default values - return value.model_dump(exclude_unset=True) - return value - - @field_validator("prompt") - @classmethod - def check_prompt(cls, value): - if isinstance(value, list) and not value: - raise ValueError("Messages cannot be an empty list.") - return value - - def to_unformatted_string(self) -> str: - if isinstance(self.prompt, list): - return ", ".join(str(message.content) for message in self.prompt) - return self.prompt - - -class ImageInput(BaseModel): - """Prompt output that contains image info.""" - - image_url: str - - -class EngineInput(BaseModel): - """Input to the engine. - - Which is also output from `PromptFormat.generate_prompt()`.""" - - text: str - image: Optional[List[ImageInput]] = None diff --git a/python/ray/llm/tests/serve/cpu/configs/test_openai_api_models.py b/python/ray/llm/tests/serve/cpu/configs/test_openai_api_models.py deleted file mode 100644 index ff92ecea0a7b..000000000000 --- a/python/ray/llm/tests/serve/cpu/configs/test_openai_api_models.py +++ /dev/null @@ -1,29 +0,0 @@ -from ray.llm._internal.serve.configs.openai_api_models import DeltaMessage - - -def test_delta_message_null_content(): - """Test that the DeltaMessage class is correctly constructed. - - When the content is passed as None, it should be set to an empty string. 
- """ - role = "user" - delta_message_implicitly_null_content = DeltaMessage( - role=role, - ) - - delta_message_explicitly_null_content = DeltaMessage( - role=role, - content=None, - ) - - delta_message_empty_string_content = DeltaMessage( - role=role, - content="", - ) - - assert delta_message_implicitly_null_content.role == role - assert delta_message_explicitly_null_content.role == role - assert delta_message_empty_string_content.role == role - assert delta_message_implicitly_null_content.content == "" - assert delta_message_explicitly_null_content.content == "" - assert delta_message_empty_string_content.content == "" diff --git a/python/ray/llm/tests/serve/cpu/configs/test_prompt_formats.py b/python/ray/llm/tests/serve/cpu/configs/test_prompt_formats.py deleted file mode 100644 index e120d7c1f5f5..000000000000 --- a/python/ray/llm/tests/serve/cpu/configs/test_prompt_formats.py +++ /dev/null @@ -1,83 +0,0 @@ -import sys - -import pytest -from pydantic import ValidationError - -from ray.llm._internal.serve.configs.prompt_formats import ( - Image, - Message, - Prompt, - Text, -) - - -def test_validation_message(): - # check that message with assistant role can have content that - # is a string or none, but nothing else - Message.model_validate({"role": "assistant", "content": "Hello, World!"}) - - Message.model_validate({"role": "assistant", "content": ""}) - - Message.model_validate({"role": "assistant", "content": None}) - - with pytest.raises(ValueError): - Message.model_validate( - { - "role": "assistant", - "content": { - "NOT_VALID", - }, - } - ) - - # Test system and user roles - for role in ["system", "user"]: - # this should pass - Message.model_validate({"role": role, "content": "Hello, World!"}) - - Message.model_validate({"role": role, "content": ""}) - - # a non string content should raise an error - - with pytest.raises(ValueError): - Message.model_validate( - { - "role": role, - "content": { - "NOT_VALID", - }, - } - ) - - with pytest.raises(ValueError): - Message.model_validate({"role": role, "content": None}) - - # test message with image. 
- Message( - role="user", - content=[ - Text(type="text", text="This is a test."), - Image(type="image_url", image_url={"url": "foo"}), - ], - ) - - -def test_prompt_validation(): - # Test valid prompt creation - Prompt(prompt="This is a test message.") - - Prompt( - prompt=[ - Message(role="system", content="You are a helpful assistant."), - Message(role="user", content="Hello!"), - ] - ) - - # Test invalid prompt creation - with pytest.raises(ValidationError): - # Empty list should raise error - Prompt(prompt=[]) - - -if __name__ == "__main__": - sys.exit(pytest.main(["-v", __file__])) From bc3e7bc975744a47ff46722cc550cb20f3769759 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 2 Jul 2025 20:13:46 -0700 Subject: [PATCH 32/37] wip Signed-off-by: Kourosh Hakhamaneshi --- .../_internal/serve/configs/server_models.py | 112 +------------- .../serve/deployments/llm/llm_engine.py | 5 +- .../serve/deployments/llm/vllm/vllm_engine.py | 8 +- .../serve/deployments/llm/vllm/vllm_models.py | 98 +----------- .../serve/deployments/routers/middleware.py | 4 +- .../serve/deployments/utils/server_utils.py | 21 +-- .../config_generator/test_text_completion.py | 2 +- .../serve/cpu/configs/test_server_models.py | 96 ------------ .../llm/vllm/test_vllm_engine_gpu.py | 20 --- .../integration/test_openai_compatibility.py | 143 +----------------- ...penai_compatibility_no_accelerator_type.py | 2 +- release/llm_tests/serve/probes/models.py | 6 +- release/llm_tests/serve/probes/test_models.py | 2 +- 13 files changed, 24 insertions(+), 495 deletions(-) delete mode 100644 python/ray/llm/tests/serve/cpu/configs/test_server_models.py delete mode 100644 python/ray/llm/tests/serve/gpu/deployments/llm/vllm/test_vllm_engine_gpu.py diff --git a/python/ray/llm/_internal/serve/configs/server_models.py b/python/ray/llm/_internal/serve/configs/server_models.py index 631a671abff0..6773b0435524 100644 --- a/python/ray/llm/_internal/serve/configs/server_models.py +++ b/python/ray/llm/_internal/serve/configs/server_models.py @@ -7,9 +7,7 @@ List, Optional, Sequence, - Set, Tuple, - Type, TypeVar, Union, ) @@ -37,20 +35,9 @@ DEFAULT_MULTIPLEX_DOWNLOAD_TIMEOUT_S, DEFAULT_MULTIPLEX_DOWNLOAD_TRIES, ENABLE_WORKER_PROCESS_SETUP_HOOK, - MAX_NUM_STOPPING_SEQUENCES, MODEL_RESPONSE_BATCH_TIMEOUT_MS, ) -from ray.llm._internal.serve.configs.error_handling import TooManyStoppingSequences -from ray.llm._internal.serve.configs.openai_api_models_patch import ( - ErrorResponse, - ResponseFormatType, -) -from ray.llm._internal.serve.configs.openai_api_models import ( - ModelCard, -) -from ray.llm._internal.serve.configs.prompt_formats import ( - Prompt, -) +from ray.llm._internal.serve.configs.openai_api_models import ErrorResponse from ray.llm._internal.serve.observability.logging import get_logger from ray.serve._private.config import DeploymentConfig @@ -844,100 +831,3 @@ def merge_dicts(base: Dict, overwrite: Dict) -> Dict: else: base[key] = overwrite[key] return base - - -class SamplingParams(BaseModelExtended): - """Parameters for controlling text generation sampling. - - Args: - max_tokens: The maximum number of tokens to generate. Defaults to inf. - temperature: What sampling temperature to use. - top_p: An alternative to sampling with temperature, called nucleus sampling. - n: How many completions to generate for each prompt. - logprobs: Include the log probabilities on the `logprobs` most likely - tokens, as well the chosen tokens. - top_logprobs: The number of logprobs to return. Defaults to 1. 
`logprobs` - must be set to `True` in order to use top_logprobs. - stop: Up to 4 sequences where the API will stop generating further tokens. - The returned text will not contain the stop sequence. - stop_tokens: Tokens to stop on (applied before detokenization). - presence_penalty: Number between -2.0 and 2.0. - Positive values penalize new tokens based on whether they appear in - the text so far, increasing the model's likelihood to talk about - new topics. - frequency_penalty: Number between -2.0 and 2.0. Positive values penalize - new tokens based on their existing frequency in the text so far, - decreasing the model's likelihood to repeat the same line verbatim. - best_of: Generates `best_of` completions server-side and returns the "best". - logit_bias: Modify the likelihood of specified tokens appearing in - the completion. - response_format: Format to return the final response in. Can be for ex: - response_format={"type": "json", "schema": "{...}"} - """ - - _ignored_fields: Set[str] = set() - - max_tokens: Optional[int] = None - temperature: Optional[float] = None - top_p: Optional[float] = None - n: int = 1 - logprobs: Optional[bool] = None - top_logprobs: Optional[int] = None - logit_bias: Optional[Dict[str, float]] = None - stop: Optional[List[str]] = None - stop_tokens: Optional[List[int]] = None - ignore_eos: Optional[bool] = None - presence_penalty: Optional[float] = None - frequency_penalty: Optional[float] = None - best_of: int = 1 - response_format: Optional[ResponseFormatType] = None - - def model_dump(self, **kwargs) -> Dict[str, Any]: - if kwargs.get("exclude", None) is None: - kwargs["exclude"] = self._ignored_fields - return super().model_dump(**kwargs) - - @field_validator("stop", mode="before") - @classmethod - def validate_stopping_sequences(cls, values): - if not values: - return values - - unique_val = sorted(set(values)) - - if len(unique_val) > MAX_NUM_STOPPING_SEQUENCES: - TooManyStoppingSequences( - len(unique_val), MAX_NUM_STOPPING_SEQUENCES - ).raise_exception() - - return list(unique_val) - - @field_validator("stop_tokens", mode="before") - @classmethod - def validate_stop_tokens(cls, values): - if not values: - return values - return sorted(set(values)) - - @classmethod - def _get_model_validate_kwargs(cls: Type[ModelT], prompt: Prompt) -> Dict[str, Any]: - generate_kwargs = prompt.parameters or {} - if not isinstance(generate_kwargs, dict): - generate_kwargs = generate_kwargs.model_dump(exclude_unset=True) - - return generate_kwargs - - @classmethod - def from_prompt(cls: Type[ModelT], prompt: Prompt) -> ModelT: - # Extract parameters object from prompt - generate_kwargs = cls._get_model_validate_kwargs(prompt) - return cls.model_validate(generate_kwargs) - - -class GenerationRequest(BaseModelExtended): - prompt: Union[str, List[int], List[str]] - prompt_token_ids: Optional[List[int]] = None - request_id: Union[str, List[str]] - sampling_params: Optional[Union[SamplingParams, List[SamplingParams]]] = None - stream: bool = False - metadata: Optional[Dict[str, Any]] = None diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/llm_engine.py index 6bbdc444350f..d92d7e577302 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_engine.py @@ -1,12 +1,9 @@ import abc -from typing import AsyncGenerator, Optional, Any +from typing import AsyncGenerator, Any from ray.llm._internal.serve.configs.server_models import ( 
DiskMultiplexConfig, - GenerationRequest, LLMConfig, - LLMRawResponse, - Prompt, ) diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index beda088a196e..216f59c104ac 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -3,13 +3,10 @@ import argparse from starlette.datastructures import State -from typing import TYPE_CHECKING, AsyncGenerator, List, Tuple, Union +from typing import TYPE_CHECKING, AsyncGenerator, Tuple, Union import ray from ray.llm._internal.common.utils.import_utils import try_import -from ray.llm._internal.serve.configs.constants import ( - RAYLLM_ENABLE_REQUEST_PROMPT_LOGS, -) from ray.llm._internal.serve.configs.openai_api_models import ( CompletionRequest, CompletionResponse, @@ -31,14 +28,12 @@ VLLMEngineStatTracker, ) from ray.llm._internal.serve.deployments.llm.vllm.vllm_models import ( - VLLMEmbeddingRequest, VLLMEngineConfig, ) from ray.llm._internal.serve.deployments.utils.node_initialization_utils import ( InitializeNodeOutput, initialize_node, ) -from ray.llm._internal.serve.deployments.utils.server_utils import floats_to_base64 from ray.llm._internal.serve.observability.logging import get_logger from ray.util.placement_group import PlacementGroup from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @@ -50,7 +45,6 @@ if TYPE_CHECKING: from vllm.config import VllmConfig from vllm.engine.protocol import EngineClient - from vllm.outputs import PoolingRequestOutput vllm = try_import("vllm") logger = get_logger(__name__) diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py index e30e142bae9b..53d71571a898 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py @@ -1,8 +1,8 @@ import os -from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional import dataclasses -from pydantic import ConfigDict, Field, ValidationError, field_validator +from pydantic import ConfigDict, Field from ray.llm._internal.common.base_pydantic import BaseModelExtended from ray.llm._internal.common.utils.cloud_utils import CloudMirrorConfig @@ -11,13 +11,9 @@ ALLOW_NEW_PLACEMENT_GROUPS_IN_DEPLOYMENT, ENV_VARS_TO_PROPAGATE, ) -from ray.llm._internal.serve.configs.prompt_formats import Prompt from ray.llm._internal.serve.configs.server_models import ( - DiskMultiplexConfig, - GenerationRequest, GPUType, LLMConfig, - SamplingParams, ) from ray.llm._internal.serve.observability.logging import get_logger from ray.util.placement_group import ( @@ -78,9 +74,6 @@ def actual_hf_model_id(self) -> str: def trust_remote_code(self) -> bool: return self.engine_kwargs.get("trust_remote_code", False) - @property - def sampling_params_model(self): - return VLLMSamplingParams def get_initialization_kwargs(self) -> dict: """ @@ -254,90 +247,3 @@ def get_or_create_pg(self) -> PlacementGroup: return pg -class VLLMSamplingParams(SamplingParams): - """Sampling parameters specific to vLLM engine. - - Args: - top_k: The number of highest probability vocabulary tokens to keep for top-k-filtering. - seed: Seed for deterministic sampling with temperature>0. 
- repetition_penalty: Float that penalizes new tokens based on whether they - appear in the prompt and the generated text so far. Values > 1 encourage - the model to use new tokens, while values < 1 encourage the model to repeat - tokens. - """ - - _ignored_fields = {"best_of", "n", "logit_bias"} - - top_k: Optional[int] = None - repetition_penalty: Optional[float] = None - seed: Optional[int] = None - kv_transfer_params: Optional[Dict[str, Any]] = None - - @field_validator("n", mode="before") - @classmethod - def validate_n(cls, values): - if values != 1: - raise ValidationError("n>1 is not supported yet in rayllm.") - return values - - @classmethod - def _get_model_validate_kwargs(cls, prompt: Prompt) -> Dict[str, Any]: - """ - Extend the base class's `_get_model_validate_kwargs` to include vllm-specific parameters. - """ - generate_kwargs = super()._get_model_validate_kwargs(prompt) - if ( - prompt.parameters is not None - and KV_TRANSFER_PARAMS_KEY in prompt.parameters - ): - generate_kwargs[KV_TRANSFER_PARAMS_KEY] = prompt.parameters[ - KV_TRANSFER_PARAMS_KEY - ] - return generate_kwargs - - -class VLLMGenerationRequest(GenerationRequest): - model_config = ConfigDict(arbitrary_types_allowed=True) - - # Intentionally override the base class's `sampling_params` field. - sampling_params: Optional[ - Union[ - VLLMSamplingParams, - List[VLLMSamplingParams], - ] - ] = None - multi_modal_data: Optional[Dict[str, Any]] = None - disk_multiplex_config: Optional[DiskMultiplexConfig] = None - - @property - def lora_request(self) -> "LoRARequest": - disk_vllm_config = self.disk_multiplex_config - if not disk_vllm_config: - return None - else: - return vllm.lora.request.LoRARequest( - lora_name=disk_vllm_config.model_id, - lora_int_id=disk_vllm_config.lora_assigned_int_id, - lora_local_path=disk_vllm_config.local_path, - long_lora_max_len=disk_vllm_config.max_total_tokens, - ) - - -class VLLMEmbeddingRequest(GenerationRequest): - model_config = ConfigDict(arbitrary_types_allowed=True) - encoding_format: Optional[Literal["float", "base64"]] = "float" - dimensions: Optional[int] = None - disk_multiplex_config: Optional[DiskMultiplexConfig] = None - - @property - def lora_request(self) -> "LoRARequest": - disk_vllm_config = self.disk_multiplex_config - if not disk_vllm_config: - return None - else: - return vllm.lora.request.LoRARequest( - lora_name=disk_vllm_config.model_id, - lora_int_id=disk_vllm_config.lora_assigned_int_id, - lora_local_path=disk_vllm_config.local_path, - long_lora_max_len=disk_vllm_config.max_total_tokens, - ) diff --git a/python/ray/llm/_internal/serve/deployments/routers/middleware.py b/python/ray/llm/_internal/serve/deployments/routers/middleware.py index d2c2a7a2abde..961e199332ff 100644 --- a/python/ray/llm/_internal/serve/deployments/routers/middleware.py +++ b/python/ray/llm/_internal/serve/deployments/routers/middleware.py @@ -70,7 +70,7 @@ def _uncaught_exception_handler(request: Request, e: Exception): response_payload = get_response_for_error(e, request_id) return JSONResponse( - content=response_payload.model_dump(), status_code=response_payload.error.code + content=response_payload.model_dump(), status_code=response_payload.code ) @@ -115,7 +115,7 @@ async def _handle_application_exceptions( return JSONResponse( content=response_payload.model_dump(), - status_code=response_payload.error.code, + status_code=response_payload.code, ) # This adds last-resort uncaught exception handler into Starlette diff --git 
a/python/ray/llm/_internal/serve/deployments/utils/server_utils.py b/python/ray/llm/_internal/serve/deployments/utils/server_utils.py index b54b4cb6d5b5..3dd3f9aa0b7e 100644 --- a/python/ray/llm/_internal/serve/deployments/utils/server_utils.py +++ b/python/ray/llm/_internal/serve/deployments/utils/server_utils.py @@ -11,12 +11,7 @@ from ray import serve from ray.llm._internal.serve.configs.openai_api_models import OpenAIHTTPException -from ray.llm._internal.serve.configs.openai_api_models_patch import ( - ErrorResponse, -) -from ray.llm._internal.serve.configs.server_models import ( - LLMRawResponse, -) +from ray.llm._internal.serve.configs.openai_api_models import ErrorResponse from ray.llm._internal.serve.observability.logging import get_logger logger = get_logger(__name__) @@ -78,7 +73,7 @@ def _extract_message(e): def get_response_for_error( e: Exception, request_id: str, -) -> LLMRawResponse: +) -> ErrorResponse: if isinstance(e, HTTPException): status_code = e.status_code elif isinstance(e, OpenAIHTTPException): @@ -116,13 +111,11 @@ def get_response_for_error( internal_message += f" (Request ID: {request_id})" error_response = ErrorResponse( - message=message, + message=f"Message: {message}, Internal exception: {internal_message}, original exception: {str(e)}", code=status_code, - internal_message=internal_message, type=exc_type, - original_exception=e, ) - return LLMRawResponse(error=error_response) + return error_response def get_serve_request_id() -> str: @@ -141,9 +134,3 @@ def replace_prefix(model: str) -> str: """Replace -- with / in model name to handle slashes within the URL path segment""" return model.replace("--", "/") - -def floats_to_base64(float_list: List[float]) -> str: - """Encode a list of floats as base64 as needed for the embedding API response.""" - binary = struct.pack(f"{len(float_list)}f", *float_list) - encoded = base64.b64encode(binary).decode("utf-8") - return encoded diff --git a/python/ray/llm/tests/serve/cpu/config_generator/test_text_completion.py b/python/ray/llm/tests/serve/cpu/config_generator/test_text_completion.py index d8464402fd19..306594caad43 100644 --- a/python/ray/llm/tests/serve/cpu/config_generator/test_text_completion.py +++ b/python/ray/llm/tests/serve/cpu/config_generator/test_text_completion.py @@ -101,7 +101,7 @@ def test_populate_custom_model( model_config = populate_text_completion_model_config(input_model_config) self._assert_models(model_config, input_model_config) - serve_config = get_serve_config(input_model_config, "./file.yaml") + serve_config = get_serve_config("./file.yaml") assert len(serve_config["applications"][0]["args"]["llm_configs"]) == 1 def _assert_models( diff --git a/python/ray/llm/tests/serve/cpu/configs/test_server_models.py b/python/ray/llm/tests/serve/cpu/configs/test_server_models.py deleted file mode 100644 index a885a88e2b11..000000000000 --- a/python/ray/llm/tests/serve/cpu/configs/test_server_models.py +++ /dev/null @@ -1,96 +0,0 @@ -import sys - -import pytest - -from ray.llm._internal.serve.configs.prompt_formats import Prompt -from ray.llm._internal.serve.configs.server_models import SamplingParams - - -class TestSamplingParams: - def test_default_initialization(self): - """Test that SamplingParams can be initialized with default values.""" - params = SamplingParams() - - assert params.max_tokens is None - assert params.temperature is None - assert params.top_p is None - assert params.n == 1 - assert params.logprobs is None - assert params.top_logprobs is None - assert params.logit_bias is None - 
assert params.stop is None - assert params.stop_tokens is None - assert params.ignore_eos is None - assert params.presence_penalty is None - assert params.frequency_penalty is None - assert params.best_of == 1 - assert params.response_format is None - - def test_initialization_with_values(self): - """Test that SamplingParams can be initialized with specific values.""" - params = SamplingParams( - max_tokens=100, - temperature=0.7, - top_p=0.9, - n=2, - logprobs=True, - top_logprobs=5, - stop=["END", "STOP"], - stop_tokens=[1, 2, 3], - presence_penalty=0.5, - frequency_penalty=0.3, - best_of=3, - ) - - assert params.max_tokens == 100 - assert params.temperature == 0.7 - assert params.top_p == 0.9 - assert params.n == 2 - assert params.logprobs is True - assert params.top_logprobs == 5 - assert params.stop == ["END", "STOP"] - assert params.stop_tokens == [1, 2, 3] - assert params.presence_penalty == 0.5 - assert params.frequency_penalty == 0.3 - assert params.best_of == 3 - - def test_stop_valid_sequences(self): - """Test that valid stop sequences are processed correctly.""" - stop_sequences = ["END", "STOP", "FINISH", "END"] - params = SamplingParams(stop=stop_sequences) - assert params.stop == ["END", "FINISH", "STOP"] # Should be unique - - def test_idempotency(self): - params = SamplingParams() - new_params = SamplingParams.model_validate(params.model_dump()) - assert params.model_dump() == new_params.model_dump() - - @pytest.mark.parametrize( - "stop, stop_tokens", - [ - (["B-END", "A-End"], None), - (["B-END", "A-End"], []), - (None, [100, 50]), - (None, None), - ], - ) - def test_from_prompt_with_dict_parameters(self, stop, stop_tokens): - """Test from_prompt method with dictionary parameters.""" - prompt = Prompt( - prompt="Test prompt", - parameters={ - "stop": stop, - "stop_tokens": stop_tokens, - }, - ) - - params = SamplingParams.from_prompt(prompt) - - assert params.stop == (sorted(stop) if stop is not None else None) - assert params.stop_tokens == ( - sorted(stop_tokens) if stop_tokens is not None else None - ) - - -if __name__ == "__main__": - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/llm/tests/serve/gpu/deployments/llm/vllm/test_vllm_engine_gpu.py b/python/ray/llm/tests/serve/gpu/deployments/llm/vllm/test_vllm_engine_gpu.py deleted file mode 100644 index 5ca24ac025ab..000000000000 --- a/python/ray/llm/tests/serve/gpu/deployments/llm/vllm/test_vllm_engine_gpu.py +++ /dev/null @@ -1,20 +0,0 @@ -import sys - -import pytest - -from ray.llm._internal.serve.configs.server_models import ( - LLMConfig, -) -from ray.llm._internal.serve.deployments.llm.vllm.vllm_engine import ( - VLLMEngine, - _get_vllm_engine_config, -) - - -class TestVLLMEngine: - """Test the VLLMEngine.""" - pass - - -if __name__ == "__main__": - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/llm/tests/serve/gpu/integration/test_openai_compatibility.py b/python/ray/llm/tests/serve/gpu/integration/test_openai_compatibility.py index a5405cbded72..704988470d6d 100644 --- a/python/ray/llm/tests/serve/gpu/integration/test_openai_compatibility.py +++ b/python/ray/llm/tests/serve/gpu/integration/test_openai_compatibility.py @@ -3,11 +3,6 @@ import openai import pytest -from ray.llm._internal.serve.configs.constants import ( - MAX_NUM_TOPLOGPROBS_ALLOWED, - MIN_NUM_TOPLOGPROBS_ALLOWED, -) - class TestOpenAICompatibility: """Test that the rayllm are compatible with the OpenAI API""" @@ -17,7 +12,7 @@ def test_models(self, testing_model): # noqa: F811 models = client.models.list() 
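The streaming behavior that the updated test_chat_stream relies on later in this file can be exercised directly against the OpenAI-compatible endpoint. Illustrative sketch only; the base URL, API key, and model name are placeholders and not part of this patch:

    import openai

    client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")
    stream = client.chat.completions.create(
        model="meta-llama/Llama-2-7b-hf",
        messages=[{"role": "user", "content": "Hello world"}],
        stream=True,
        # Ask the server to append a final chunk that carries token usage.
        stream_options={"include_usage": True},
        max_tokens=5,
    )
    for chunk in stream:
        if chunk.choices:
            print(chunk.choices[0].delta.content or "", end="")
        elif chunk.usage is not None:
            print("\ntotal tokens:", chunk.usage.total_tokens)
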
assert len(models.data) == 1, "Only the test model should be returned" assert models.data[0].id == model, "The test model id should match" - assert models.data[0].rayllm_metadata["input_modality"] == "text" + assert models.data[0].metadata["input_modality"] == "text" def test_completions(self, testing_model): # noqa: F811 client, model = testing_model @@ -28,7 +23,7 @@ def test_completions(self, testing_model): # noqa: F811 ) assert completion.model == model assert completion.model - assert completion.choices[0].text == "test_0 test_1 " + assert completion.choices[0].text == "test_0 test_1" def test_chat(self, testing_model): # noqa: F811 client, model = testing_model @@ -43,97 +38,6 @@ def test_chat(self, testing_model): # noqa: F811 assert isinstance(chat_completion.choices, list) assert chat_completion.choices[0].message.content - def test_chat_logprobs(self, testing_model): - client, model = testing_model - num_tokens = 5 - # test logprobs for non-streaming chat completions - for top_logprobs in range(5): - chat_completion = client.chat.completions.create( - model=model, - max_tokens=num_tokens, - messages=[{"role": "user", "content": "Hello world"}], - logprobs=True, - top_logprobs=top_logprobs, - ) - logprobs = chat_completion.choices[0].logprobs.content - assert logprobs, "Logprobs should be not be None or Empty" - assert len(logprobs) == num_tokens - assert all( - len(logprob.top_logprobs) == top_logprobs for logprob in logprobs - ) - text_from_logprobs = [] - for logprob in logprobs: - text_from_logprobs.append(logprob.token) - if logprob.top_logprobs: - assert logprob.token == logprob.top_logprobs[0].token - text_from_logprobs = "".join(text_from_logprobs) - assert ( - text_from_logprobs == chat_completion.choices[0].message.content - ), "Text from logprobs should match text from completion" - - for num_top_logprobs in range(5): - chat_completion = client.chat.completions.create( - model=model, - max_tokens=num_tokens, - messages=[{"role": "user", "content": "Hello world"}], - logprobs=True, - top_logprobs=num_top_logprobs, - stream=True, - ) - - for c in chat_completion: - choice_logprobs = c.choices[0].logprobs - if choice_logprobs and choice_logprobs.content: - for chat_completion_token_logprob in choice_logprobs.content: - top_logprobs_res = chat_completion_token_logprob.top_logprobs - assert len(top_logprobs_res) == num_top_logprobs - if top_logprobs_res: - assert ( - top_logprobs_res[0].token - == chat_completion_token_logprob.token - ) - - # try to send logprobs request with invalid number of toplogprobs - with pytest.raises(openai.BadRequestError): - for top_logprobs in [ - MAX_NUM_TOPLOGPROBS_ALLOWED + 1, - MIN_NUM_TOPLOGPROBS_ALLOWED - 1, - ]: - client.chat.completions.create( - model=model, - max_tokens=num_tokens, - messages=[{"role": "user", "content": "Hello world"}], - logprobs=True, - top_logprobs=top_logprobs, - ) - - def test_completions_bad_request(self, testing_model): # noqa: F811 - client, model = testing_model - with pytest.raises(openai.BadRequestError) as exc_info: - client.completions.create( - model=model, - prompt="Hello world", - temperature=-0.1, - ) - assert "temperature" in str(exc_info.value) - - def test_chat_bad_request(self, testing_model): # noqa: F811 - client, model = testing_model - with pytest.raises(openai.BadRequestError) as exc_info: - client.chat.completions.create( - model=model, - messages=[{"role": "user", "content": "Hello world"}], - temperature=-0.1, - ) - assert "temperature" in str(exc_info.value) - - with 
pytest.raises(openai.BadRequestError) as exc_info: - client.chat.completions.create( - model=model, - messages=[], - ) - assert "least 1 item" in str(exc_info.value) - def test_completions_missing_model(self, testing_model): # noqa: F811 client, _ = testing_model with pytest.raises(openai.NotFoundError) as exc_info: @@ -174,9 +78,14 @@ def test_chat_stream(self, testing_model): # noqa: F811 model=model, messages=[{"role": "user", "content": "Hello world"}], stream=True, + stream_options=dict( + include_usage=True, + ), temperature=0.4, frequency_penalty=0.02, + max_tokens=5 ): + print(chat_completion) if i == 0: assert chat_completion assert chat_completion.id @@ -190,45 +99,7 @@ def test_chat_stream(self, testing_model): # noqa: F811 chat_completion.choices[0].delta, "content" ) i += 1 - assert chat_completion - assert chat_completion.id - assert isinstance(chat_completion.choices, list) - assert not chat_completion.choices[0].delta.content - assert chat_completion.choices[0].finish_reason - assert i > 4 - - def test_completions_stream_bad_request(self, testing_model): # noqa: F811 - client, model = testing_model - with pytest.raises(openai.BadRequestError) as exc_info: - for _ in client.completions.create( - model=model, - prompt="Hello world", - stream=True, - temperature=-0.1, - ): - pass - assert "temperature" in str(exc_info.value) - def test_chat_stream_bad_request(self, testing_model): # noqa: F811 - client, model = testing_model - with pytest.raises(openai.BadRequestError) as exc_info: - for _chat_completion in client.chat.completions.create( - model=model, - messages=[{"role": "user", "content": "Hello world"}], - stream=True, - temperature=-0.1, - ): - pass - assert "temperature" in str(exc_info.value) - - with pytest.raises(openai.BadRequestError) as exc_info: - for _chat_completion in client.chat.completions.create( - model=model, - messages=[], - stream=True, - ): - pass - assert "least 1 item" in str(exc_info.value) def test_completions_stream_missing_model(self, testing_model): # noqa: F811 client, _ = testing_model diff --git a/python/ray/llm/tests/serve/gpu/integration/test_openai_compatibility_no_accelerator_type.py b/python/ray/llm/tests/serve/gpu/integration/test_openai_compatibility_no_accelerator_type.py index 549f655da85b..1142700b34ed 100644 --- a/python/ray/llm/tests/serve/gpu/integration/test_openai_compatibility_no_accelerator_type.py +++ b/python/ray/llm/tests/serve/gpu/integration/test_openai_compatibility_no_accelerator_type.py @@ -27,7 +27,7 @@ def test_completions_no_accelerator_type( ) assert completion.model == model assert completion.model - assert completion.choices[0].text == "test_0 test_1 " + assert completion.choices[0].text == "test_0 test_1" def test_chat_no_accelerator_type(self, testing_model_no_accelerator): # noqa: F811 """Check chat completions without accelerator_type""" diff --git a/release/llm_tests/serve/probes/models.py b/release/llm_tests/serve/probes/models.py index 27d073a8b1cf..384a69ccaebb 100644 --- a/release/llm_tests/serve/probes/models.py +++ b/release/llm_tests/serve/probes/models.py @@ -97,11 +97,11 @@ def is_release_test_model(model: "openai.types.model.Model") -> bool: def is_finetuned_model(model: "openai.types.model.Model") -> bool: # If base_model_id is set, this is a finetuned model - return model.rayllm_metadata.get("base_model_id") is not None + return model.metadata.get("base_model_id") is not None def is_vision_language_model(model: "openai.types.model.Model") -> bool: - return 
model.rayllm_metadata.get("input_modality") == "image" + return model.metadata.get("input_modality") == "image" def is_rate_liming_test_model(model: "openai.types.model.Model") -> bool: @@ -130,7 +130,7 @@ def is_completions_only_model(model: "openai.types.model.Model") -> bool: def supports_function_calling_via_prompt(model: "openai.types.model.Model") -> bool: # True if tool template is specified in the generation config - gen_config = model.rayllm_metadata.get("generation", False) + gen_config = model.metadata.get("generation", False) if not gen_config: return False diff --git a/release/llm_tests/serve/probes/test_models.py b/release/llm_tests/serve/probes/test_models.py index 84d1207da673..f2ecc4a076a6 100644 --- a/release/llm_tests/serve/probes/test_models.py +++ b/release/llm_tests/serve/probes/test_models.py @@ -8,4 +8,4 @@ def test_get_model(model: str): model_description = openai_client.models.retrieve(model) assert model_description.id == model - assert "rayllm_metadata" in model_description.model_dump() + assert "metadata" in model_description.model_dump() From a4790e3b4e98084fe5dc55d0fb39e991cc3bd233 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 2 Jul 2025 20:15:27 -0700 Subject: [PATCH 33/37] delete more dead code Signed-off-by: Kourosh Hakhamaneshi --- .../cpu/deployments/llm/test_llm_engine.py | 6 +- .../llm/tests/serve/mocks/mock_vllm_engine.py | 596 ------------------ 2 files changed, 1 insertion(+), 601 deletions(-) diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py index 736c1aeff379..9b88394dfa9e 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py @@ -4,13 +4,9 @@ Also tests that our Mock is behaving as expected to ensure that the downstream tests using Mocks are correct from Mock implementation perspective. -We have the following Mocks: +We have the following Mock: - An engine that returns a string of form "test_i" for i in range(max_tokens) -- An engine that echos the sent request in its response -- An engine that excercises the multiplexing logic (e.g. LoRA) -- An engine that excercise the structured output logic (e.g. JSON mode) -- An engine that excercises the prefill-disaggregation logic """ from ray.llm.tests.serve.mocks.mock_vllm_engine import MockVLLMEngine diff --git a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py index fd75c13debe3..a3d56f7e2f12 100644 --- a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py +++ b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py @@ -267,418 +267,6 @@ async def _generate_completion_response( yield response -# class MockEchoVLLMEngine(MockVLLMEngine): -# """Mock engine that responds with information about the request sent to it. - -# Useful for testing the contents of requests created in data plane code. 
-# """ - -# async def chat(self, request: ChatCompletionRequest) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: -# """Echo the chat request information.""" -# if not self.started: -# raise RuntimeError("Engine not started") - -# # Convert request to JSON for echoing -# request_info = { -# "request_type": "chat", -# "model": getattr(request, 'model', None), -# "messages": getattr(request, 'messages', []), -# "max_tokens": getattr(request, 'max_tokens', None), -# "temperature": getattr(request, 'temperature', None), -# "stream": getattr(request, 'stream', False), -# "current_lora_model": self._current_lora_model.model_dump() if self._current_lora_model else None -# } - -# echo_text = json.dumps(request_info, indent=2) - -# if request.stream: -# # Return as SSE for streaming -# chunk_data = { -# "id": f"chatcmpl-echo-{random.randint(1000, 9999)}", -# "object": "chat.completion.chunk", -# "created": int(asyncio.get_event_loop().time()), -# "model": getattr(request, 'model', 'mock-echo-model'), -# "choices": [{ -# "index": 0, -# "delta": { -# "role": "assistant", -# "content": echo_text -# }, -# "finish_reason": "stop" -# }] -# } -# yield f"data: {json.dumps(chunk_data)}\n\n" -# yield "data: [DONE]\n\n" -# else: -# # Return as response object -# choice = { -# "index": 0, -# "message": { -# "role": "assistant", -# "content": echo_text -# }, -# "finish_reason": "stop" -# } - -# response = ChatCompletionResponse( -# id=f"chatcmpl-echo-{random.randint(1000, 9999)}", -# object="chat.completion", -# created=int(asyncio.get_event_loop().time()), -# model=getattr(request, 'model', 'mock-echo-model'), -# choices=[choice] -# ) - -# yield response - -# async def completions(self, request: CompletionRequest) -> AsyncGenerator[Union[str, CompletionResponse, ErrorResponse], None]: -# """Echo the completion request information.""" -# if not self.started: -# raise RuntimeError("Engine not started") - -# request_info = { -# "request_type": "completion", -# "model": getattr(request, 'model', None), -# "prompt": getattr(request, 'prompt', None), -# "max_tokens": getattr(request, 'max_tokens', None), -# "temperature": getattr(request, 'temperature', None), -# "stream": getattr(request, 'stream', False), -# "current_lora_model": self._current_lora_model.model_dump() if self._current_lora_model else None -# } - -# echo_text = json.dumps(request_info, indent=2) - -# if request.stream: -# # Return as SSE for streaming -# chunk_data = { -# "id": f"cmpl-echo-{random.randint(1000, 9999)}", -# "object": "text_completion", -# "created": int(asyncio.get_event_loop().time()), -# "model": getattr(request, 'model', 'mock-echo-model'), -# "choices": [{ -# "index": 0, -# "text": echo_text, -# "finish_reason": "stop" -# }] -# } -# yield f"data: {json.dumps(chunk_data)}\n\n" -# yield "data: [DONE]\n\n" -# else: -# # Return as response object -# choice = { -# "index": 0, -# "text": echo_text, -# "finish_reason": "stop" -# } - -# response = CompletionResponse( -# id=f"cmpl-echo-{random.randint(1000, 9999)}", -# object="text_completion", -# created=int(asyncio.get_event_loop().time()), -# model=getattr(request, 'model', 'mock-echo-model'), -# choices=[choice] -# ) - -# yield response - -# async def embeddings(self, request: EmbeddingRequest) -> AsyncGenerator[Union[str, EmbeddingResponse, ErrorResponse], None]: -# """Echo the embedding request information.""" -# if not self.started: -# raise RuntimeError("Engine not started") - -# request_info = { -# "request_type": "embedding", -# "model": 
getattr(request, 'model', None), -# "input": getattr(request, 'input', None), -# "encoding_format": getattr(request, 'encoding_format', None), -# "dimensions": getattr(request, 'dimensions', None), -# "current_lora_model": self._current_lora_model.model_dump() if self._current_lora_model else None -# } - -# # Return request info as mock embedding -# echo_text = json.dumps(request_info, indent=2) -# mock_embedding = [float(ord(c)) for c in echo_text[:10]] # Mock embedding from first 10 chars - -# response = EmbeddingResponse( -# object="list", -# data=[{ -# "object": "embedding", -# "embedding": mock_embedding, -# "index": 0 -# }], -# model=getattr(request, 'model', 'mock-echo-model'), -# usage={ -# "prompt_tokens": len(str(request.input).split()), -# "total_tokens": len(str(request.input).split()) -# } -# ) - -# yield response - - -# class MockMultiplexEngine(MockVLLMEngine): -# """Mock engine for testing multiplex/LoRA functionality.""" - -# def __init__(self, llm_config: LLMConfig): -# super().__init__(llm_config) -# self.loaded_lora_models: List[DiskMultiplexConfig] = [] - -# async def resolve_lora(self, lora_model: DiskMultiplexConfig): -# """Mock LoRA model loading.""" -# self._current_lora_model = lora_model -# # Keep track of loaded models -# if lora_model not in self.loaded_lora_models: -# self.loaded_lora_models.append(lora_model) - -# async def chat(self, request: ChatCompletionRequest) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: -# """Chat with multiplex information.""" -# if not self.started: -# raise RuntimeError("Engine not started") - -# # Include multiplex info in response -# lora_info = "" -# if self._current_lora_model: -# lora_info = f" [LoRA: {self._current_lora_model.model_id}]" - -# generated_text = f"Mock multiplex response{lora_info}" - -# if request.stream: -# # Return as SSE for streaming -# chunk_data = { -# "id": f"chatcmpl-multiplex-{random.randint(1000, 9999)}", -# "object": "chat.completion.chunk", -# "created": int(asyncio.get_event_loop().time()), -# "model": getattr(request, 'model', 'mock-multiplex-model'), -# "choices": [{ -# "index": 0, -# "delta": { -# "role": "assistant", -# "content": generated_text -# }, -# "finish_reason": "stop" -# }] -# } -# yield f"data: {json.dumps(chunk_data)}\n\n" -# yield "data: [DONE]\n\n" -# else: -# # Return as response object -# choice = { -# "index": 0, -# "message": { -# "role": "assistant", -# "content": generated_text -# }, -# "finish_reason": "stop" -# } - -# response = ChatCompletionResponse( -# id=f"chatcmpl-multiplex-{random.randint(1000, 9999)}", -# object="chat.completion", -# created=int(asyncio.get_event_loop().time()), -# model=getattr(request, 'model', 'mock-multiplex-model'), -# choices=[choice] -# ) - -# yield response - - -# class MockJSONModeVLLMEngine(MockVLLMEngine): -# """Mock engine that generates valid JSON responses when JSON mode is requested.""" - -# async def chat(self, request: ChatCompletionRequest) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: -# """Generate JSON or text response based on request format.""" -# if not self.started: -# raise RuntimeError("Engine not started") - -# # Check if JSON mode is requested -# response_format = getattr(request, 'response_format', None) -# is_json_mode = ( -# response_format and -# hasattr(response_format, 'type') and -# response_format.type == "json_object" -# ) - -# if is_json_mode: -# # Generate valid JSON based on schema if provided -# if hasattr(response_format, 'json_schema') and 
response_format.json_schema: -# try: -# # Use the schema to generate a valid response -# json_response = generate_from_schema(response_format.json_schema) -# generated_text = json.dumps(json_response, ensure_ascii=False) -# except Exception as e: -# # Fallback to default JSON if schema generation fails -# json_response = { -# "error": f"Schema generation failed: {str(e)}", -# "schema_provided": bool(response_format.json_schema), -# "fallback_response": True -# } -# generated_text = json.dumps(json_response, indent=2) -# else: -# # Default JSON response when no schema is provided -# json_response = { -# "message": "This is a mock JSON response", -# "timestamp": int(asyncio.get_event_loop().time()), -# "request_info": { -# "model": getattr(request, 'model', 'unknown'), -# "has_messages": bool(getattr(request, 'messages', [])), -# "lora_model": self._current_lora_model.model_id if self._current_lora_model else None -# } -# } -# generated_text = json.dumps(json_response, indent=2) -# else: -# # Generate regular text -# generated_text = "Mock response from JSON mode engine" - -# if request.stream: -# # Return as SSE for streaming with realistic JSON chunking -# request_id = f"chatcmpl-json-{random.randint(1000, 9999)}" -# created_time = int(asyncio.get_event_loop().time()) -# model_name = getattr(request, 'model', 'mock-json-model') - -# if is_json_mode: -# # For JSON streaming, split the JSON into realistic chunks -# # This simulates how a real LLM would generate JSON token by token -# max_chunk_size = 10 # Characters per chunk -# chunks = [generated_text[i:i+max_chunk_size] for i in range(0, len(generated_text), max_chunk_size)] - -# for i, chunk in enumerate(chunks): -# chunk_data = { -# "id": request_id, -# "object": "chat.completion.chunk", -# "created": created_time, -# "model": model_name, -# "choices": [{ -# "index": 0, -# "delta": { -# "content": chunk, -# "role": "assistant" if i == 0 else None -# }, -# "finish_reason": "stop" if i == len(chunks) - 1 else None -# }] -# } -# yield f"data: {json.dumps(chunk_data)}\n\n" -# await asyncio.sleep(0.01) # Simulate processing time -# else: -# # For non-JSON streaming, return as single chunk -# chunk_data = { -# "id": request_id, -# "object": "chat.completion.chunk", -# "created": created_time, -# "model": model_name, -# "choices": [{ -# "index": 0, -# "delta": { -# "role": "assistant", -# "content": generated_text -# }, -# "finish_reason": "stop" -# }] -# } -# yield f"data: {json.dumps(chunk_data)}\n\n" - -# # Send final [DONE] message -# yield "data: [DONE]\n\n" -# else: -# # Return as response object -# choice = { -# "index": 0, -# "message": { -# "role": "assistant", -# "content": generated_text -# }, -# "finish_reason": "stop" -# } - -# response = ChatCompletionResponse( -# id=f"chatcmpl-json-{random.randint(1000, 9999)}", -# object="chat.completion", -# created=int(asyncio.get_event_loop().time()), -# model=getattr(request, 'model', 'mock-json-model'), -# choices=[choice] -# ) - -# yield response - - -# class MockPDDisaggVLLMEngine(MockVLLMEngine): -# """Mock engine for testing Prefill/Decode disaggregated functionality.""" - -# def __init__(self, llm_config: LLMConfig): -# super().__init__(llm_config) -# self.prefill_cache = {} -# self.kv_transfer_enabled = False - -# async def start(self): -# """Start with disaggregation support.""" -# await super().start() -# # Mock enabling KV transfer -# self.kv_transfer_enabled = True - -# async def chat(self, request: ChatCompletionRequest) -> AsyncGenerator[Union[str, ChatCompletionResponse, 
ErrorResponse], None]: -# """Chat with disaggregation simulation.""" -# if not self.started: -# raise RuntimeError("Engine not started") - -# # Simulate prefill/decode disaggregation -# request_id = getattr(request, 'request_id', f"req-{random.randint(1000, 9999)}") - -# # Mock prefill phase -# prompt_text = "" -# if hasattr(request, 'messages') and request.messages: -# for message in request.messages: -# if hasattr(message, 'content') and message.content: -# prompt_text += str(message.content) + " " - -# # Cache prefill result -# self.prefill_cache[request_id] = { -# "prompt": prompt_text.strip(), -# "kv_cache": f"mock_kv_cache_{len(prompt_text)}" -# } - -# # Mock decode phase -# generated_text = f"Mock PD disagg response [cached: {request_id}]" -# if self.kv_transfer_enabled: -# generated_text += " [KV transfer enabled]" - -# if request.stream: -# # Return as SSE for streaming -# chunk_data = { -# "id": f"chatcmpl-pd-{request_id}", -# "object": "chat.completion.chunk", -# "created": int(asyncio.get_event_loop().time()), -# "model": getattr(request, 'model', 'mock-pd-model'), -# "choices": [{ -# "index": 0, -# "delta": { -# "role": "assistant", -# "content": generated_text -# }, -# "finish_reason": "stop" -# }] -# } -# yield f"data: {json.dumps(chunk_data)}\n\n" -# yield "data: [DONE]\n\n" -# else: -# # Return as response object -# choice = { -# "index": 0, -# "message": { -# "role": "assistant", -# "content": generated_text -# }, -# "finish_reason": "stop" -# } - -# response = ChatCompletionResponse( -# id=f"chatcmpl-pd-{request_id}", -# object="chat.completion", -# created=int(asyncio.get_event_loop().time()), -# model=getattr(request, 'model', 'mock-pd-model'), -# choices=[choice] -# ) - -# yield response - - class FakeLoraModelLoader(LoraModelLoader): """Fake LoRA model loader for testing.""" @@ -691,187 +279,3 @@ async def load_model(self, lora_model_id: str, llm_config: LLMConfig) -> DiskMul lora_assigned_int_id=random.randint(1, 100), ) - -# # Utility functions for JSON generation and validation -# def generate_from_schema(schema: dict) -> Any: -# """Generate mock data from JSON schema.""" -# if "type" not in schema: -# raise ValueError("Schema must have a 'type' property") - -# # Handle enum values first (takes precedence over type) -# if "enum" in schema: -# return random.choice(schema["enum"]) - -# # Handle const values -# if "const" in schema: -# return schema["const"] - -# schema_type = schema["type"] - -# if schema_type == "object": -# obj = {} -# properties = schema.get("properties", {}) -# required = schema.get("required", []) - -# # Generate required properties first -# for prop in required: -# if prop in properties: -# obj[prop] = generate_from_schema(properties[prop]) - -# # Generate optional properties (randomly include some) -# for prop, prop_schema in properties.items(): -# if prop not in obj and random.choice([True, False]): -# obj[prop] = generate_from_schema(prop_schema) - -# return obj - -# elif schema_type == "array": -# item_schema = schema.get("items", {"type": "string"}) -# min_items = schema.get("minItems", 1) -# max_items = schema.get("maxItems", 5) -# array_length = random.randint(min_items, max_items) - -# return [generate_from_schema(item_schema) for _ in range(array_length)] - -# elif schema_type == "string": -# # Handle string patterns and formats -# if "pattern" in schema: -# # For testing purposes, return a string that might match common patterns -# pattern = schema["pattern"] -# if "email" in pattern.lower() or "@" in pattern: -# return 
"test@example.com" -# elif "phone" in pattern.lower() or "\\d" in pattern: -# return "123-456-7890" -# else: -# return "pattern_match_string" - -# if "format" in schema: -# format_type = schema["format"] -# if format_type == "email": -# return "test@example.com" -# elif format_type == "date": -# return "2024-01-15" -# elif format_type == "date-time": -# return "2024-01-15T10:30:00Z" -# elif format_type == "uri": -# return "https://example.com" -# elif format_type == "uuid": -# return "550e8400-e29b-41d4-a716-446655440000" - -# # Handle string length constraints -# min_length = schema.get("minLength", 1) -# max_length = schema.get("maxLength", 20) -# base_string = "mock_string_value" - -# if max_length < len(base_string): -# return base_string[:max_length] -# elif min_length > len(base_string): -# return base_string + "x" * (min_length - len(base_string)) -# else: -# return base_string - -# elif schema_type == "integer": -# minimum = schema.get("minimum", 0) -# maximum = schema.get("maximum", 100) -# return random.randint(minimum, maximum) - -# elif schema_type == "number": -# minimum = schema.get("minimum", 0.0) -# maximum = schema.get("maximum", 100.0) -# return random.uniform(minimum, maximum) - -# elif schema_type == "boolean": -# return random.choice([True, False]) - -# elif schema_type == "null": -# return None - -# # Handle multiple types (anyOf, oneOf) -# elif isinstance(schema_type, list): -# chosen_type = random.choice(schema_type) -# return generate_from_schema({"type": chosen_type}) - -# else: -# raise ValueError(f"Unsupported schema type: {schema_type}") - - -# def validate_json_schema_response(response_text: str, schema: dict) -> bool: -# """ -# Validate that a JSON response conforms to the provided schema. -# This is a simple validation for testing purposes. 
-# """ -# try: -# data = json.loads(response_text) -# # Basic validation - in a real implementation you'd use jsonschema library -# return _validate_against_schema(data, schema) -# except (json.JSONDecodeError, Exception): -# return False - - -# def _validate_against_schema(data: Any, schema: dict) -> bool: -# """Helper function for basic schema validation.""" -# schema_type = schema.get("type") - -# if schema_type == "object" and isinstance(data, dict): -# # Check required properties -# required = schema.get("required", []) -# for prop in required: -# if prop not in data: -# return False - -# # Check property types -# properties = schema.get("properties", {}) -# for prop, value in data.items(): -# if prop in properties: -# if not _validate_against_schema(value, properties[prop]): -# return False -# return True - -# elif schema_type == "array" and isinstance(data, list): -# item_schema = schema.get("items", {}) -# return all(_validate_against_schema(item, item_schema) for item in data) - -# elif schema_type == "string" and isinstance(data, str): -# return True - -# elif schema_type == "integer" and isinstance(data, int): -# return True - -# elif schema_type == "number" and isinstance(data, (int, float)): -# return True - -# elif schema_type == "boolean" and isinstance(data, bool): -# return True - -# elif schema_type == "null" and data is None: -# return True - -# return False - - -# def split_string_into_chunks(s: str, n: int) -> List[str]: -# """Split string into n chunks.""" -# if n <= 0: -# raise ValueError("Number of chunks must be greater than 0") - -# chunk_size = len(s) // n -# remainder = len(s) % n - -# chunks = [] -# start = 0 -# for i in range(n): -# end = start + chunk_size + (1 if i < remainder else 0) -# chunks.append(s[start:end]) -# start = end - -# return chunks - - -# def get_prompt_length(prompt: Union[str, List[str], List[int]]) -> int: -# """Get the length of a prompt.""" -# if isinstance(prompt, str): -# return len(prompt.split()) -# elif isinstance(prompt, list): -# return len(prompt) -# else: -# return 0 From a0ad5971db320a3b4953c7a94489f3ac100d0b6b Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 2 Jul 2025 23:09:25 -0700 Subject: [PATCH 34/37] wip Signed-off-by: Kourosh Hakhamaneshi --- .../integration/test_openai_compatibility.py | 1 - release/llm_tests/serve/probes/query_utils.py | 20 +++++++++++++++++-- release/llm_tests/serve/probes/test_basic.py | 2 +- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/python/ray/llm/tests/serve/gpu/integration/test_openai_compatibility.py b/python/ray/llm/tests/serve/gpu/integration/test_openai_compatibility.py index 704988470d6d..4293435a8820 100644 --- a/python/ray/llm/tests/serve/gpu/integration/test_openai_compatibility.py +++ b/python/ray/llm/tests/serve/gpu/integration/test_openai_compatibility.py @@ -85,7 +85,6 @@ def test_chat_stream(self, testing_model): # noqa: F811 frequency_penalty=0.02, max_tokens=5 ): - print(chat_completion) if i == 0: assert chat_completion assert chat_completion.id diff --git a/release/llm_tests/serve/probes/query_utils.py b/release/llm_tests/serve/probes/query_utils.py index e76d2338e3fc..9c265386a2f9 100644 --- a/release/llm_tests/serve/probes/query_utils.py +++ b/release/llm_tests/serve/probes/query_utils.py @@ -42,7 +42,12 @@ def _apply_delta(base, delta): # in order to merge them, not recursively merge them. 
if key == "logprobs": if delta[key]: - base[key]["content"].extend(delta[key]["content"]) + cur_val = (base[key] or {}).get("content", []) or [] + cur_val.extend(delta[key]["content"]) + if base[key]: + base[key]["content"] = cur_val + else: + base[key] = {"content": cur_val} continue if isinstance(base[key], dict): @@ -97,6 +102,8 @@ def messages(self): """In case of streamed response, what are the individual chunked messages? that contain the content we care about?""" vals = [] for r in self.response: + if len(r.choices) == 0: + continue v = r.choices[0].model_dump() if "message" in v and "content" in v["message"]: vals.append(v["message"]["content"] or "") @@ -128,7 +135,11 @@ def num_completion_tokens(self): def finish_reason(self): # This should be set on the last response. - return self.response[-1].choices[0].finish_reason + for chunk in self.response: + if len(chunk.choices) > 0: + if chunk.choices[0].finish_reason: + return chunk.choices[0].finish_reason + return None class BaseProbe: @@ -171,6 +182,11 @@ async def query( "stream": stream, **chat_args, } + + if stream: + args["stream_options"] = { + "include_usage": True, + } if chat: method = self.client.chat.completions.create else: diff --git a/release/llm_tests/serve/probes/test_basic.py b/release/llm_tests/serve/probes/test_basic.py index 0c35adaeba01..c1b34ba36137 100755 --- a/release/llm_tests/serve/probes/test_basic.py +++ b/release/llm_tests/serve/probes/test_basic.py @@ -160,7 +160,7 @@ async def test_too_long_completion_request( ) # XXX: AE-686 hack, should read model data instead - length = 20000 + length = 200000 if "8x22" in model: length = 70000 From c05c83f6bb4f270ef5edb19b6adffc9409000a98 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 7 Jul 2025 18:37:11 -0700 Subject: [PATCH 35/37] fixed the probes Signed-off-by: Kourosh Hakhamaneshi --- release/llm_tests/serve/probes/test_basic.py | 4 ++-- release/llm_tests/serve/probes/test_json_mode.py | 13 ++++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/release/llm_tests/serve/probes/test_basic.py b/release/llm_tests/serve/probes/test_basic.py index c1b34ba36137..7ef3873311ae 100755 --- a/release/llm_tests/serve/probes/test_basic.py +++ b/release/llm_tests/serve/probes/test_basic.py @@ -315,8 +315,8 @@ async def test_logprobs( running_str += logprob["token"] assert running_str == resp["message"]["content"] - # top logprobs have to be between 0 and 5 - invalid_num_logprobs = [-1, 6] + # top logprobs have to be positive integer + invalid_num_logprobs = [-1] bad_config = configuration.copy() for invalid_num_logprob in invalid_num_logprobs: bad_config["top_logprobs"] = invalid_num_logprob diff --git a/release/llm_tests/serve/probes/test_json_mode.py b/release/llm_tests/serve/probes/test_json_mode.py index a971be59c49a..0150db2f583a 100644 --- a/release/llm_tests/serve/probes/test_json_mode.py +++ b/release/llm_tests/serve/probes/test_json_mode.py @@ -101,8 +101,11 @@ def get_params_and_expected_type(response_type: str, test_id: str): params.update( { "response_format": { - "type": "json_object", - "schema": expected_type.schema_json(), + "type": "json_schema", + "json_schema": { + "name": "expected_schema", + "schema": expected_type.model_json_schema(), + }, } } ) @@ -118,7 +121,7 @@ def get_response_formats(): {"type": "json_object", "schema": json.dumps({})}, {"type": "json_object", "schema": json.loads(BasicResponse.schema_json())}, {"type": "json_object", "schema": BasicResponse.schema_json()}, - {"type": "grammar", "grammar": 
JSON_GRAMMAR_EBNF_STR}, + # {"type": "grammar", "grammar": JSON_GRAMMAR_EBNF_STR}, ] @@ -201,8 +204,8 @@ async def test_response_format_options( async def test_invalid_schema(model: str, openai_async_client): querier = TextGenerationProbeQuerier(openai_async_client, {"temperature": 0.0}) response_format = { - "type": "json_object", - "schema": {"type": "object", "properties": {"name": {"type": "str"}}}, + "type": "json_schema", + "json_schema": {"name": "expected_schema", "schema": {"type": "object", "properties": {"name": {"type": "str"}}}}, } params = { From e2c61716d3ab92a6a3be7290ed567b20cabbbf86 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Mon, 7 Jul 2025 18:39:29 -0700 Subject: [PATCH 36/37] wip Signed-off-by: Kourosh Hakhamaneshi --- .../serve/configs/openai_api_models.py | 16 +- .../_internal/serve/configs/server_models.py | 1 - .../serve/deployments/llm/llm_engine.py | 14 +- .../serve/deployments/llm/llm_server.py | 98 +++++----- .../serve/deployments/llm/vllm/vllm_engine.py | 130 ++++++++------ .../serve/deployments/llm/vllm/vllm_models.py | 40 +++-- .../prefill_decode_disagg.py | 57 +++--- .../serve/deployments/routers/router.py | 6 +- .../utils/node_initialization_utils.py | 1 - .../serve/deployments/utils/server_utils.py | 1 - python/ray/llm/tests/serve/conftest.py | 6 +- .../cpu/deployments/llm/test_llm_engine.py | 43 +++-- .../cpu/deployments/llm/test_llm_server.py | 127 +++++++------ .../integration/test_openai_compatibility.py | 3 +- .../llm/tests/serve/mocks/mock_vllm_engine.py | 170 +++++++++--------- python/ray/llm/tests/serve/utils/__init__.py | 2 +- .../llm/tests/serve/utils/testing_utils.py | 42 ++--- release/llm_tests/serve/probes/query_utils.py | 4 +- .../llm_tests/serve/probes/test_json_mode.py | 5 +- 19 files changed, 417 insertions(+), 349 deletions(-) diff --git a/python/ray/llm/_internal/serve/configs/openai_api_models.py b/python/ray/llm/_internal/serve/configs/openai_api_models.py index 76a1be4f65a0..98c3b9d491ce 100644 --- a/python/ray/llm/_internal/serve/configs/openai_api_models.py +++ b/python/ray/llm/_internal/serve/configs/openai_api_models.py @@ -19,42 +19,51 @@ ) from typing import TYPE_CHECKING + if TYPE_CHECKING: from ray.llm._internal.serve.configs.server_models import LLMConfig - - class ChatCompletionRequest(vLLMChatCompletionRequest): pass + class ChatCompletionResponse(vLLMChatCompletionResponse): pass + class ChatCompletionStreamResponse(vLLMChatCompletionStreamResponse): pass + class ErrorResponse(vLLMErrorResponse): pass + class CompletionRequest(vLLMCompletionRequest): pass + class CompletionResponse(vLLMCompletionResponse): pass + class CompletionStreamResponse(vLLMCompletionStreamResponse): pass + class EmbeddingCompletionRequest(vLLMEmbeddingCompletionRequest): pass + class EmbeddingChatRequest(vLLMEmbeddingChatRequest): pass + class EmbeddingResponse(vLLMEmbeddingResponse): pass + EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest] LLMEmbeddingsResponse = Union[ @@ -89,7 +98,7 @@ def __init__( # TODO: upstream metadata for ModelData -# Compared to vLLM this has a metadata field. +# Compared to vLLM this has a metadata field. 
class ModelCard(BaseModel): model_config = ConfigDict(protected_namespaces=tuple()) @@ -103,6 +112,7 @@ class ModelCard(BaseModel): def model_type(self) -> str: return self.metadata["engine_config"]["model_type"] + class ModelList(BaseModel): data: List[ModelCard] object: str = "list" diff --git a/python/ray/llm/_internal/serve/configs/server_models.py b/python/ray/llm/_internal/serve/configs/server_models.py index 6773b0435524..ccb67b260b5e 100644 --- a/python/ray/llm/_internal/serve/configs/server_models.py +++ b/python/ray/llm/_internal/serve/configs/server_models.py @@ -562,7 +562,6 @@ def parse_args(self) -> "LLMServingArgs": return LLMServingArgs(llm_configs=llm_configs) - class FinishReason(str, Enum): LENGTH = "length" STOP = "stop" diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/llm_engine.py index d92d7e577302..f0d0637990e0 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_engine.py @@ -19,24 +19,24 @@ def __init__(self, llm_config: LLMConfig): async def start(self): """Start the engine""" pass - + @abc.abstractmethod async def resolve_lora(self, lora_model: DiskMultiplexConfig): """Resolve the lora model""" pass - + @abc.abstractmethod - async def chat(self, request) -> AsyncGenerator[Any, None]: + async def chat(self, request) -> AsyncGenerator[Any, None]: """Chat with the engine""" pass - + @abc.abstractmethod - async def completions(self, request) -> AsyncGenerator[Any, None]: + async def completions(self, request) -> AsyncGenerator[Any, None]: """Completion with the engine""" pass - + @abc.abstractmethod - async def embeddings(self, request) -> AsyncGenerator[Any, None]: + async def embeddings(self, request) -> AsyncGenerator[Any, None]: """Embed with the engine""" pass diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py index 31089d6148d7..023ceec971fa 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py @@ -78,22 +78,23 @@ async def check_health(self) -> None: the engine is dead and needs to be restarted. """ ... - - # TODO (Kourosh): This does not belong here. + + # TODO (Kourosh): This does not belong here. async def llm_config(self) -> Optional[LLMConfig]: return None class LLMServer(_LLMServerBase): """This is a shm layer to decouple the LLM engine from the ingress deployment. - + It has a very similar API as the engine. Almost all of the abstractions are implemented by the engine. This class just a little bit more logic on top: - + 1. Logic for serve multiplexing (e.g. LoRA loading). 2. Request id handing from serve context. 3. Batching in case of streaming (only for chat and completions). 4. Telemetry reporting. 
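+    # Default engine implementation; callers (e.g. tests) can inject an
+    # alternative engine_cls, such as a mock engine, through the constructor.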
""" + _default_engine_cls = VLLMEngine async def __init__( @@ -123,36 +124,38 @@ async def __init__( if self._engine_cls is not None: self.engine = self._engine_cls(self._llm_config) await asyncio.wait_for(self._start_engine(), timeout=ENGINE_START_TIMEOUT_S) - - self._init_multiplex_loader(model_downloader) + self._init_multiplex_loader(model_downloader) - def _init_multiplex_loader(self, model_downloader_cls: Optional[Type[LoraModelLoader]] = None): + def _init_multiplex_loader( + self, model_downloader_cls: Optional[Type[LoraModelLoader]] = None + ): """Initialize the multiplex loader.""" - + model_downloader_cls = model_downloader_cls or LoraModelLoader mx_config = self._llm_config.multiplex_config() - + if mx_config is not None: model_downloader = model_downloader_cls( download_timeout_s=mx_config.download_timeout_s, max_tries=mx_config.max_download_tries, ) - + async def _load_model(lora_model_id: str) -> DiskMultiplexConfig: return await model_downloader.load_model( lora_model_id=lora_model_id, llm_config=self._llm_config, ) - - self._load_model = serve.multiplexed(max_num_models_per_replica=mx_config.max_num_models_per_replica)(_load_model) + + self._load_model = serve.multiplexed( + max_num_models_per_replica=mx_config.max_num_models_per_replica + )(_load_model) else: + async def _load_model(lora_model_id: str) -> DiskMultiplexConfig: raise ValueError("LoRA config is not set in the LLMConfig") - + self._load_model = _load_model - - def _get_default_engine_class(self) -> Type[LLMEngine]: """Helper to load the engine class from the environment variable. @@ -173,7 +176,6 @@ async def _start_engine(self): # Push telemetry reports for the model in the current deployment. push_telemetry_report_for_all_models(all_models=[self._llm_config]) - def _get_batch_interval_ms(self, stream: bool = True) -> int: """Calculate the batching interval for responses.""" stream_batching_interval_ms = self._llm_config.experimental_configs.get( @@ -182,14 +184,15 @@ def _get_batch_interval_ms(self, stream: bool = True) -> int: if stream_batching_interval_ms is None: stream_batching_interval_ms = MODEL_RESPONSE_BATCH_TIMEOUT_MS return stream_batching_interval_ms if stream else None - - async def _maybe_add_request_id_to_request(self, request: Union[ChatCompletionRequest, CompletionRequest, EmbeddingRequest]): + + async def _maybe_add_request_id_to_request( + self, request: Union[ChatCompletionRequest, CompletionRequest, EmbeddingRequest] + ): """Add the request id to the request.""" request_id = get_serve_request_id() if request_id: request.request_id = request_id - - + async def _maybe_resolve_lora_from_multiplex(self) -> None: """Handle the lora model for the request.""" multiplexed_model_id = serve.get_multiplexed_model_id() @@ -198,28 +201,33 @@ async def _maybe_resolve_lora_from_multiplex(self) -> None: raise ValueError("Must setup lora config for multiplexed requests.") disk_lora_model = await self._load_model(multiplexed_model_id) await self.engine.resolve_lora(disk_lora_model) - + def _batch_output_stream(self, generator): return OpenAIResponseBatcher( generator, interval_ms=self._get_batch_interval_ms(), ).stream() - - - async def _run_request(self, request: Union[ChatCompletionRequest, CompletionRequest, EmbeddingRequest], *, engine_method: str, batch_output_stream: bool = False) -> AsyncGenerator[Any, None]: + + async def _run_request( + self, + request: Union[ChatCompletionRequest, CompletionRequest, EmbeddingRequest], + *, + engine_method: str, + batch_output_stream: bool = False, + ) -> 
AsyncGenerator[Any, None]: """Run the engine method on the request + perform batching when stream=True. - + Args: request: The request to run. engine_method: The method to call on the engine. batch_output_stream: Whether to batch the output stream. - + Returns: - An AsyncGenerator of the response. If stream is True and batching is enabled, then the generator will yield a list of streaming responses (strings of the format data: {response_json}\n\n). Otherwise, it will yield the non-streaming response from engine directly. + An AsyncGenerator of the response. If stream is True and batching is enabled, then the generator will yield a list of streaming responses (strings of the format data: {response_json}\n\n). Otherwise, it will yield the non-streaming response from engine directly. """ await self._maybe_add_request_id_to_request(request) await self._maybe_resolve_lora_from_multiplex() - + is_stream = hasattr(request, "stream") and request.stream if is_stream and batch_output_stream: stream = self._batch_output_stream( @@ -227,11 +235,12 @@ async def _run_request(self, request: Union[ChatCompletionRequest, CompletionReq ) else: stream = getattr(self.engine, engine_method)(request) - + return stream - async def chat(self, request: ChatCompletionRequest) -> \ - AsyncGenerator[Union[List[str], ChatCompletionResponse], None]: + async def chat( + self, request: ChatCompletionRequest + ) -> AsyncGenerator[Union[List[str], ChatCompletionResponse], None]: """Runs a chat request to the LLM engine and returns the response. Args: @@ -240,10 +249,13 @@ async def chat(self, request: ChatCompletionRequest) -> \ Returns: An AsyncGenerator of the response. If stream is True and batching is enabled, then the generator will yield a list of chat streaming responses (strings of the format data: {response_json}\n\n). Otherwise, it will yield the ChatCompletionResponse object directly. """ - return await self._run_request(request, engine_method="chat", batch_output_stream=True) + return await self._run_request( + request, engine_method="chat", batch_output_stream=True + ) - async def completions(self, request: CompletionRequest) -> \ - AsyncGenerator[Union[List[str], CompletionResponse], None]: + async def completions( + self, request: CompletionRequest + ) -> AsyncGenerator[Union[List[str], CompletionResponse], None]: """Runs a completion request to the LLM engine and returns the response. Args: @@ -252,12 +264,15 @@ async def completions(self, request: CompletionRequest) -> \ Returns: An AsyncGenerator of the response. If stream is True and batching is enabled, then the generator will yield a list of completion streaming responses (strings of the format data: {response_json}\n\n). Otherwise, it will yield the CompletionResponse object directly. """ - return await self._run_request(request, engine_method="completions", batch_output_stream=True) - + return await self._run_request( + request, engine_method="completions", batch_output_stream=True + ) - async def embeddings(self, request: EmbeddingRequest) -> AsyncGenerator[EmbeddingResponse, None]: + async def embeddings( + self, request: EmbeddingRequest + ) -> AsyncGenerator[EmbeddingResponse, None]: """Runs an embeddings request to the engine and returns the response. - + Returns an AsyncGenerator over the EmbeddingResponse object. This is so that the caller can have a consistent interface across all the methods of chat, completions, and embeddings. 
Args: @@ -267,7 +282,9 @@ async def embeddings(self, request: EmbeddingRequest) -> AsyncGenerator[Embeddin An AsyncGenerator over the EmbeddingResponse object. """ # NOTE: Embeddings does not need batching. - return await self._run_request(request, engine_method="embeddings", batch_output_stream=False) + return await self._run_request( + request, engine_method="embeddings", batch_output_stream=False + ) async def check_health(self) -> None: """ @@ -282,10 +299,9 @@ async def check_health(self) -> None: logger.error("Engine health check failed in LLMServer.check_health: %s", e) raise e - async def llm_config(self) -> Optional[LLMConfig]: return self._llm_config - + @classmethod def as_deployment( cls, deployment_options: Optional[Dict[str, Any]] = None diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index 216f59c104ac..2e12c087232a 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -16,7 +16,7 @@ EmbeddingResponse, ErrorResponse, ) - + from ray.llm._internal.serve.configs.server_models import ( DiskMultiplexConfig, LLMConfig, @@ -54,10 +54,12 @@ def _get_vllm_engine_config( llm_config: LLMConfig, ) -> Tuple["AsyncEngineArgs", "VllmConfig"]: engine_config = llm_config.get_engine_config() - async_engine_args = vllm.engine.arg_utils.AsyncEngineArgs(**engine_config.get_initialization_kwargs()) + async_engine_args = vllm.engine.arg_utils.AsyncEngineArgs( + **engine_config.get_initialization_kwargs() + ) vllm_engine_config = async_engine_args.create_engine_config() return async_engine_args, vllm_engine_config - + def _clear_current_platform_cache(): """Clear the cache of the current platform. @@ -91,7 +93,6 @@ def _clear_current_platform_cache(): current_platform.get_device_capability.cache_clear() - class VLLMEngine(LLMEngine): def __init__( self, @@ -104,21 +105,21 @@ def __init__( """ super().__init__(llm_config) - # Ensure transformers_modules is initialized early in worker processes. # This is critical for models with trust_remote_code=True to avoid pickle errors. init_hf_modules() self.llm_config = llm_config - if vllm is None: raise ImportError( "vLLM is not installed. Please install it with `pip install ray[llm]`." ) - + if not vllm.envs.VLLM_USE_V1: - logger.warning("vLLM v0 is getting fully deprecated. As a result in Ray Serve LLM only v1 is supported. Only when you know what you are doing, you can set VLLM_USE_V1=0") + logger.warning( + "vLLM v0 is getting fully deprecated. As a result in Ray Serve LLM only v1 is supported. Only when you know what you are doing, you can set VLLM_USE_V1=0" + ) # TODO (Kourosh): This validation logic belongs to the PDProxy module. # Pick a random port in P/D case. @@ -149,8 +150,7 @@ def __init__( port = vllm.envs.VLLM_NIXL_SIDE_CHANNEL_PORT kv_transfer_config.engine_id = "-".join([engine_id, host, str(port)]) - - # TODO (Kourosh): What do we do with this stats tracker? + # TODO (Kourosh): What do we do with this stats tracker? self._stats = VLLMEngineStatTracker() self._running = False @@ -161,33 +161,31 @@ def __init__( self._oai_serving_completion = None self._oai_serving_embedding = None - async def start(self) -> None: """Start the vLLM engine. If the engine is already running, do nothing. """ - + if self._running: # The engine is already running! 
logger.info("Skipping engine restart because the engine is already running") return from vllm.entrypoints.openai.api_server import init_app_state - - - node_initialization = await initialize_node(self.llm_config) - + + node_initialization = await initialize_node(self.llm_config) + ( vllm_engine_args, vllm_frontend_args, vllm_engine_config, ) = self._prepare_engine_config(node_initialization) - # Apply checkpoint info to the llm_config. - # This is needed for capturing model capabilities + # Apply checkpoint info to the llm_config. + # This is needed for capturing model capabilities # (e.g. supports vision, etc.) on the llm_config. - config = self.llm_config.get_engine_config() + config = self.llm_config.get_engine_config() self.llm_config.apply_checkpoint_info( config.actual_hf_model_id, trust_remote_code=config.trust_remote_code, @@ -199,10 +197,9 @@ async def start(self) -> None: node_initialization.placement_group, ) - state = State() args = argparse.Namespace( - **vllm_frontend_args.__dict__, + **vllm_frontend_args.__dict__, **vllm_engine_args.__dict__, ) @@ -217,8 +214,8 @@ async def start(self) -> None: self._oai_serving_chat = state.openai_serving_chat self._oai_serving_completion = state.openai_serving_completion self._oai_serving_embedding = state.openai_serving_embedding - - self._validate_openai_serving_models() + + self._validate_openai_serving_models() self._running = True @@ -227,25 +224,26 @@ async def start(self) -> None: def _validate_openai_serving_models(self): if not hasattr(self._oai_models, "lora_requests"): raise ValueError("oai_models must have a lora_requests attribute") - + if not hasattr(self._oai_models, "load_lora_adapter"): raise ValueError("oai_models must have a load_lora_adapter attribute") - + def _validate_openai_serving_chat(self): if not hasattr(self._oai_serving_chat, "create_chat_completion"): - raise ValueError("oai_serving_chat must have a create_chat_completion attribute") - + raise ValueError( + "oai_serving_chat must have a create_chat_completion attribute" + ) def _prepare_engine_config(self, node_initialization: InitializeNodeOutput): """Prepare the engine config to start the engine. Returns: engine_args: The vLLM's internal engine arguments that is flattened. - frontend_args: The vLLM's internal frontend arguments that is + frontend_args: The vLLM's internal frontend arguments that is flattened. engine_config: The vLLM's internal engine config that is nested. 
""" - + engine_config: VLLMEngineConfig = self.llm_config.get_engine_config() if engine_config.use_gpu: @@ -267,27 +265,34 @@ def _prepare_engine_config(self, node_initialization: InitializeNodeOutput): ) vllm_engine_args, vllm_engine_config = ray.get(ref) else: - vllm_engine_args, vllm_engine_config = _get_vllm_engine_config(self.llm_config) + vllm_engine_args, vllm_engine_config = _get_vllm_engine_config( + self.llm_config + ) vllm_frontend_args = FrontendArgs(**engine_config.frontend_kwargs) return vllm_engine_args, vllm_frontend_args, vllm_engine_config - def _start_async_llm_engine_v0(self, engine_args: "AsyncEngineArgs", vllm_config: "VllmConfig", placement_group: PlacementGroup) -> "EngineClient": - + def _start_async_llm_engine_v0( + self, + engine_args: "AsyncEngineArgs", + vllm_config: "VllmConfig", + placement_group: PlacementGroup, + ) -> "EngineClient": + from vllm.executor.ray_distributed_executor import RayDistributedExecutor from vllm.engine.async_llm_engine import AsyncLLMEngine + vllm_config.parallel_config.placement_group = placement_group - + _clear_current_platform_cache() - + engine = AsyncLLMEngine( vllm_config=vllm_config, executor_class=RayDistributedExecutor, log_stats=not engine_args.disable_log_stats, ) - + return engine - def _start_async_llm_engine( self, @@ -296,11 +301,13 @@ def _start_async_llm_engine( placement_group: PlacementGroup, ) -> "EngineClient": """Creates an async LLM engine from the engine arguments.""" - + # NOTE: This is a temporary solution untill vLLM v1 supports embeddings. if not vllm.envs.VLLM_USE_V1: - return self._start_async_llm_engine_v0(engine_args, vllm_config, placement_group) - + return self._start_async_llm_engine_v0( + engine_args, vllm_config, placement_group + ) + from vllm.v1.executor.abstract import Executor from vllm.v1.engine.async_llm import AsyncLLM @@ -334,7 +341,7 @@ async def resolve_lora(self, disk_lora_model: DiskMultiplexConfig): if disk_lora_model.model_id in self._oai_models.lora_requests: # Lora is already loaded, return return - + lora_request = await self._oai_models.load_lora_adapter( request=LoadLoRAAdapterRequest( lora_name=disk_lora_model.model_id, @@ -349,10 +356,10 @@ async def chat( self, request: ChatCompletionRequest ) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: """ - + input: Take a genric free form input type and cast it to the target engine request type inside the engine. - - output: + + output: - stream: True --> for each chunk, yield astring representing data: \n\n - stream: False --> yield only one string representing the response @@ -370,7 +377,9 @@ async def chat( if isinstance(chat_response, AsyncGenerator): async for response in chat_response: if not isinstance(response, str): - raise ValueError(f"Expected create_chat_completion to return a stream of strings, got and item with type {type(response)}") + raise ValueError( + f"Expected create_chat_completion to return a stream of strings, got and item with type {type(response)}" + ) yield response else: logger.info( @@ -380,15 +389,14 @@ async def chat( yield ErrorResponse(**chat_response.model_dump()) yield ChatCompletionResponse(**chat_response.model_dump()) - async def completions( self, request: CompletionRequest ) -> AsyncGenerator[Union[str, CompletionResponse, ErrorResponse], None]: """ - + input: Take a generic free form input type and cast it to the target engine request type inside the engine. 
- - output: + + output: - stream: True --> for each chunk, yield a string representing data: \n\n - stream: False --> yield only one string representing the response @@ -400,14 +408,20 @@ async def completions( """ if self._oai_serving_completion is None: - raise RuntimeError("Completion service is not available. Make sure the engine is started and supports completions.") + raise RuntimeError( + "Completion service is not available. Make sure the engine is started and supports completions." + ) - completion_response = await self._oai_serving_completion.create_completion(request) + completion_response = await self._oai_serving_completion.create_completion( + request + ) if isinstance(completion_response, AsyncGenerator): async for response in completion_response: if not isinstance(response, str): - raise ValueError(f"Expected create_completion to return a stream of strings, got and item with type {type(response)}") + raise ValueError( + f"Expected create_completion to return a stream of strings, got and item with type {type(response)}" + ) yield response else: logger.info( @@ -429,12 +443,14 @@ async def embeddings( Yields: An EmbeddingResponse or ErrorResponse object. """ - + if self._oai_serving_embedding is None: - raise RuntimeError("Embedding service is not available. Make sure the engine is started and supports embeddings.") - + raise RuntimeError( + "Embedding service is not available. Make sure the engine is started and supports embeddings." + ) + embedding_response = await self._oai_serving_embedding.create_embedding(request) - + if isinstance(embedding_response, VLLMErrorResponse): yield ErrorResponse(**embedding_response.model_dump()) else: @@ -442,7 +458,9 @@ async def embeddings( async def check_health(self) -> None: if not hasattr(self._engine_client, "check_health"): - raise RuntimeError(f"{type(self._engine_client)} does not support health check.") + raise RuntimeError( + f"{type(self._engine_client)} does not support health check." + ) try: await self._engine_client.check_health() diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py index 53d71571a898..910eb3c3498d 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py @@ -74,29 +74,40 @@ def actual_hf_model_id(self) -> str: def trust_remote_code(self) -> bool: return self.engine_kwargs.get("trust_remote_code", False) - def get_initialization_kwargs(self) -> dict: """ Get kwargs that will be actually passed to the LLMInitializer constructor. """ engine_kwargs = self.engine_kwargs.copy() - + if "model" in engine_kwargs or "served_model_name" in engine_kwargs: - raise ValueError("model or served_model_name is not allowed in engine_kwargs when using Ray Serve LLM. Please use `model_loading_config` in LLMConfig instead.") - + raise ValueError( + "model or served_model_name is not allowed in engine_kwargs when using Ray Serve LLM. Please use `model_loading_config` in LLMConfig instead." 
+ ) + engine_kwargs["model"] = self.actual_hf_model_id engine_kwargs["served_model_name"] = [self.model_id] - - if "distributed_executor_backend" in engine_kwargs and engine_kwargs["distributed_executor_backend"] != "ray": - raise ValueError("distributed_executor_backend != 'ray' is not allowed in engine_kwargs when using Ray Serve LLM Configs.") - else: + + if ( + "distributed_executor_backend" in engine_kwargs + and engine_kwargs["distributed_executor_backend"] != "ray" + ): + raise ValueError( + "distributed_executor_backend != 'ray' is not allowed in engine_kwargs when using Ray Serve LLM Configs." + ) + else: engine_kwargs["distributed_executor_backend"] = "ray" - - if "disable_log_stats" in engine_kwargs and engine_kwargs["disable_log_stats"] != False: - logger.warning("disable_log_stats = True is not allowed in engine_kwargs when using Ray Serve LLM Configs. Setting it to False.") + + if ( + "disable_log_stats" in engine_kwargs + and engine_kwargs["disable_log_stats"] != False + ): + logger.warning( + "disable_log_stats = True is not allowed in engine_kwargs when using Ray Serve LLM Configs. Setting it to False." + ) engine_kwargs["disable_log_stats"] = False - + return engine_kwargs def get_runtime_env_with_local_env_vars(self) -> dict: @@ -141,7 +152,6 @@ def from_llm_config(cls, llm_config: LLMConfig) -> "VLLMEngineConfig": engine_kwargs[key] = value else: raise ValueError(f"Unknown engine argument: {key}") - return VLLMEngineConfig( model_id=llm_config.model_id, @@ -153,8 +163,6 @@ def from_llm_config(cls, llm_config: LLMConfig) -> "VLLMEngineConfig": frontend_kwargs=frontend_kwargs, runtime_env=llm_config.runtime_env, ) - - def ray_accelerator_type(self) -> str: """Converts the accelerator type to the Ray Core format.""" @@ -245,5 +253,3 @@ def get_or_create_pg(self) -> PlacementGroup: logger.info(f"Using new placement group {pg}. 
{placement_group_table(pg)}") return pg - - diff --git a/python/ray/llm/_internal/serve/deployments/prefill_decode_disagg/prefill_decode_disagg.py b/python/ray/llm/_internal/serve/deployments/prefill_decode_disagg/prefill_decode_disagg.py index 3d94377a0688..25579d284f23 100644 --- a/python/ray/llm/_internal/serve/deployments/prefill_decode_disagg/prefill_decode_disagg.py +++ b/python/ray/llm/_internal/serve/deployments/prefill_decode_disagg/prefill_decode_disagg.py @@ -101,11 +101,12 @@ async def __init__( self.prefill_server = prefill_server.options(stream=True) self.decode_server = decode_server.options(stream=True) - - async def embeddings(self, request: EmbeddingRequest) -> AsyncGenerator[EmbeddingResponse, None]: + + async def embeddings( + self, request: EmbeddingRequest + ) -> AsyncGenerator[EmbeddingResponse, None]: raise NotImplementedError("Embedding is not supported for P/D disaggregation") - - + def _prepare_prefill_request(self, request: RequestType) -> RequestType: assert ( getattr(request, "kv_transfer_params", None) is None @@ -121,21 +122,26 @@ def _prepare_prefill_request(self, request: RequestType) -> RequestType: } prefill_request.max_tokens = 1 prefill_request.stream = False - + return prefill_request - - - def _prepare_decode_request(self, request: RequestType, prefill_chunk: Union[ChatCompletionResponse, CompletionResponse]) -> RequestType: + + def _prepare_decode_request( + self, + request: RequestType, + prefill_chunk: Union[ChatCompletionResponse, CompletionResponse], + ) -> RequestType: decode_request = request.model_copy(deep=True) decode_request.kv_transfer_params = prefill_chunk.kv_transfer_params - + return decode_request - + async def _handle_request( - self, + self, request: RequestType, - ) -> AsyncGenerator[Union[str, ChatCompletionResponse, CompletionResponse, ErrorResponse], None]: - + ) -> AsyncGenerator[ + Union[str, ChatCompletionResponse, CompletionResponse, ErrorResponse], None + ]: + if isinstance(request, ChatCompletionRequest): method = "chat" elif isinstance(request, CompletionRequest): @@ -145,29 +151,30 @@ async def _handle_request( prefill_request = self._prepare_prefill_request(request) prefill_gen = getattr(self.prefill_server, method).remote(prefill_request) - + prefill_chunk = await anext(prefill_gen) - + if isinstance(prefill_chunk, ErrorResponse): logger.error(f"Prefill returned error: {prefill_chunk.error}") yield prefill_chunk return - + decode_request = self._prepare_decode_request(request, prefill_chunk) decode_gen = self.decode_server.chat.remote(decode_request) - - + async for chunk in decode_gen: yield chunk - - - async def chat(self, request: ChatCompletionRequest) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: + + async def chat( + self, request: ChatCompletionRequest + ) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: return self._handle_request(request) - - - async def completions(self, request: CompletionRequest) -> AsyncGenerator[Union[str, CompletionResponse, ErrorResponse], None]: + + async def completions( + self, request: CompletionRequest + ) -> AsyncGenerator[Union[str, CompletionResponse, ErrorResponse], None]: return self._handle_request(request) - + @classmethod def as_deployment(cls) -> serve.Deployment: """Turns PDProxyServer into a Ray Serve deployment.""" diff --git a/python/ray/llm/_internal/serve/deployments/routers/router.py b/python/ray/llm/_internal/serve/deployments/routers/router.py index 6dc3f3ad005c..9782940dc9e5 100644 --- 
a/python/ray/llm/_internal/serve/deployments/routers/router.py +++ b/python/ray/llm/_internal/serve/deployments/routers/router.py @@ -45,12 +45,10 @@ to_model_metadata, ErrorResponse, ModelCard, - ModelList + ModelList, ) -from ray.llm._internal.serve.configs.server_models import ( - LLMConfig -) +from ray.llm._internal.serve.configs.server_models import LLMConfig from ray.llm._internal.serve.deployments.llm.multiplex.utils import ( get_base_model_id, get_lora_model_ids, diff --git a/python/ray/llm/_internal/serve/deployments/utils/node_initialization_utils.py b/python/ray/llm/_internal/serve/deployments/utils/node_initialization_utils.py index 601bb97e3cec..af1650fbe996 100644 --- a/python/ray/llm/_internal/serve/deployments/utils/node_initialization_utils.py +++ b/python/ray/llm/_internal/serve/deployments/utils/node_initialization_utils.py @@ -116,7 +116,6 @@ async def initialize_node(llm_config: LLMConfig) -> InitializeNodeOutput: download_extra_files=True, ) - return InitializeNodeOutput( placement_group=pg, runtime_env=runtime_env, extra_init_kwargs=extra_init_kwargs ) diff --git a/python/ray/llm/_internal/serve/deployments/utils/server_utils.py b/python/ray/llm/_internal/serve/deployments/utils/server_utils.py index 3dd3f9aa0b7e..e6628e266e38 100644 --- a/python/ray/llm/_internal/serve/deployments/utils/server_utils.py +++ b/python/ray/llm/_internal/serve/deployments/utils/server_utils.py @@ -133,4 +133,3 @@ def get_model_request_id(model: str): def replace_prefix(model: str) -> str: """Replace -- with / in model name to handle slashes within the URL path segment""" return model.replace("--", "/") - diff --git a/python/ray/llm/tests/serve/conftest.py b/python/ray/llm/tests/serve/conftest.py index b3780f96dad6..4b6c5a38390e 100644 --- a/python/ray/llm/tests/serve/conftest.py +++ b/python/ray/llm/tests/serve/conftest.py @@ -15,7 +15,7 @@ VLLMEngineConfig, ) from ray.llm._internal.serve.configs.openai_api_models import ( - ChatCompletionRequest, + ChatCompletionRequest, CompletionRequest, EmbeddingCompletionRequest, ) @@ -82,9 +82,7 @@ def mock_chat_request(stream, max_tokens): """Fixture for creating chat completion requests for mock testing.""" return ChatCompletionRequest( model="mock-model", - messages=[ - {"role": "user", "content": "Hello, world!"} - ], + messages=[{"role": "user", "content": "Hello, world!"}], max_tokens=max_tokens, stream=stream, ) diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py index 9b88394dfa9e..a7253dde1dec 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py @@ -17,25 +17,24 @@ class TestMockLLMEngine: - @pytest.mark.parametrize("api_type", ["chat", "completion"]) @pytest.mark.parametrize("stream", [False, True]) @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.asyncio async def test_unified_llm_engine( - self, - mock_llm_config, - mock_chat_request, + self, + mock_llm_config, + mock_chat_request, mock_completion_request, - api_type: str, - stream: bool, - max_tokens: int + api_type: str, + stream: bool, + max_tokens: int, ): """Unified test for both chat and completion APIs, streaming and non-streaming.""" # Create and start the engine engine = MockVLLMEngine(mock_llm_config) await engine.start() - + # Create request based on API type if api_type == "chat": request = mock_chat_request @@ -43,41 +42,41 @@ async def test_unified_llm_engine( elif api_type 
== "completion": request = mock_completion_request response_generator = engine.completions(request) - - print(f"\n\n_____ {api_type.upper()} ({'STREAMING' if stream else 'NON-STREAMING'}) max_tokens={max_tokens} _____\n\n") - + + print( + f"\n\n_____ {api_type.upper()} ({'STREAMING' if stream else 'NON-STREAMING'}) max_tokens={max_tokens} _____\n\n" + ) + if stream: # Collect streaming chunks chunks = [] async for chunk in response_generator: assert isinstance(chunk, str) chunks.append(chunk) - + # Validate streaming response LLMResponseValidator.validate_streaming_chunks(chunks, api_type, max_tokens) else: # Validate non-streaming response async for response in response_generator: - LLMResponseValidator.validate_non_streaming_response(response, api_type, max_tokens) + LLMResponseValidator.validate_non_streaming_response( + response, api_type, max_tokens + ) @pytest.mark.parametrize("dimensions", [None, 512]) - @pytest.mark.asyncio + @pytest.mark.asyncio async def test_embedding_mock_engine( - self, - mock_llm_config, - mock_embedding_request, - dimensions: Optional[int] + self, mock_llm_config, mock_embedding_request, dimensions: Optional[int] ): """Test embedding API with different dimensions.""" # Create and start the engine engine = MockVLLMEngine(mock_llm_config) await engine.start() - + # Create embedding request request = mock_embedding_request - + print(f"\n\n_____ EMBEDDING dimensions={dimensions} _____\n\n") - + async for response in engine.embeddings(request): LLMResponseValidator.validate_embedding_response(response, dimensions) - diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py index d2c8a7bfdb3e..dd16a4f094f2 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py @@ -4,29 +4,33 @@ import pytest from unittest.mock import patch -from ray.llm.tests.serve.mocks.mock_vllm_engine import MockVLLMEngine, FakeLoraModelLoader +from ray.llm.tests.serve.mocks.mock_vllm_engine import ( + MockVLLMEngine, + FakeLoraModelLoader, +) from ray.llm.tests.serve.utils.testing_utils import LLMResponseValidator from ray import serve from ray.llm._internal.serve.deployments.llm.llm_server import LLMServer from ray.llm._internal.serve.configs.server_models import LoraConfig + @pytest.fixture -def serve_handle(mock_llm_config, stream_batching_interval_ms = 0): +def serve_handle(mock_llm_config, stream_batching_interval_ms=0): mock_llm_config.experimental_configs = { "stream_batching_interval_ms": stream_batching_interval_ms, } - app = serve.deployment(LLMServer).bind(mock_llm_config, engine_cls=MockVLLMEngine) + app = serve.deployment(LLMServer).bind(mock_llm_config, engine_cls=MockVLLMEngine) handle = serve.run(app) - # We set stream=True because the interfaces are async generators regardless + # We set stream=True because the interfaces are async generators regardless # of the stream flag on request. 
handle = handle.options(stream=True) yield handle serve.shutdown() - + @pytest.fixture -def multiplexed_serve_handle(mock_llm_config, stream_batching_interval_ms = 0): +def multiplexed_serve_handle(mock_llm_config, stream_batching_interval_ms=0): mock_llm_config.experimental_configs = { "stream_batching_interval_ms": stream_batching_interval_ms, } @@ -36,35 +40,35 @@ def multiplexed_serve_handle(mock_llm_config, stream_batching_interval_ms = 0): max_download_tries=3, ) app = serve.deployment(LLMServer).bind( - mock_llm_config, + mock_llm_config, engine_cls=MockVLLMEngine, model_downloader=FakeLoraModelLoader, - ) + ) handle = serve.run(app) handle = handle.options(stream=True, multiplexed_model_id="test_model_id") yield handle serve.shutdown() -class TestLLMServer: +class TestLLMServer: @pytest.mark.parametrize("api_type", ["chat", "completion"]) @pytest.mark.parametrize("stream", [False, True]) @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("stream_batching_interval_ms", [0, 10000]) @pytest.mark.asyncio async def test_unified_llm_server( - self, - serve_handle, + self, + serve_handle, mock_llm_config, mock_chat_request, mock_completion_request, - api_type: str, + api_type: str, stream: bool, max_tokens: int, - stream_batching_interval_ms: int + stream_batching_interval_ms: int, ): """Unified test for both chat and completion APIs, streaming and non-streaming.""" - + # Create request based on API type if api_type == "chat": request = mock_chat_request @@ -72,9 +76,11 @@ async def test_unified_llm_server( elif api_type == "completion": request = mock_completion_request batched_chunks = serve_handle.completions.remote(request) - - print(f"\n\n_____ {api_type.upper()} ({'STREAMING' if stream else 'NON-STREAMING'}) max_tokens={max_tokens} batching_interval_ms={stream_batching_interval_ms} _____\n\n") - + + print( + f"\n\n_____ {api_type.upper()} ({'STREAMING' if stream else 'NON-STREAMING'}) max_tokens={max_tokens} batching_interval_ms={stream_batching_interval_ms} _____\n\n" + ) + if stream: # Collect responses from the stream chunks = [] @@ -94,30 +100,31 @@ async def test_unified_llm_server( # Check that we got one response assert len(chunks) == 1 - + # Validate non-streaming response - LLMResponseValidator.validate_non_streaming_response(chunks[0], api_type, max_tokens) + LLMResponseValidator.validate_non_streaming_response( + chunks[0], api_type, max_tokens + ) @pytest.mark.parametrize("dimensions", [None, 512]) - @pytest.mark.asyncio + @pytest.mark.asyncio async def test_embedding_llm_server( - self, + self, serve_handle, - mock_llm_config, - mock_embedding_request, - dimensions: Optional[int] + mock_llm_config, + mock_embedding_request, + dimensions: Optional[int], ): """Test embedding API from LLMServer perspective.""" - - + # Create embedding request request = mock_embedding_request - + print(f"\n\n_____ EMBEDDING SERVER dimensions={dimensions} _____\n\n") - + # Get the response batched_chunks = serve_handle.embeddings.remote(request) - + # Collect responses (should be just one) chunks = [] async for batch in batched_chunks: @@ -125,21 +132,20 @@ async def test_embedding_llm_server( # Check that we got one response assert len(chunks) == 1 - + # Validate embedding response LLMResponseValidator.validate_embedding_response(chunks[0], dimensions) @pytest.mark.asyncio async def test_check_health(self, create_server, mock_llm_config): """Test health check functionality.""" - + # Mock the engine's check_health method class LocalMockEngine(MockVLLMEngine): - def 
__init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.check_health_called = False - + async def check_health(self): self.check_health_called = True @@ -162,9 +168,16 @@ async def test_llm_config_property(self, create_server, mock_llm_config): @pytest.mark.parametrize("stream", [False]) @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.asyncio - async def test_request_id_handling(self, serve_handle, mock_llm_config, mock_chat_request, stream: bool, max_tokens: int): + async def test_request_id_handling( + self, + serve_handle, + mock_llm_config, + mock_chat_request, + stream: bool, + max_tokens: int, + ): """Test that the request id is handled correctly.""" - + # Create a chat completion request # We should patch get_server_request_id to return a test_request_id serve.context._serve_request_context.set( @@ -174,28 +187,27 @@ async def test_request_id_handling(self, serve_handle, mock_llm_config, mock_cha chunks = [] async for chunk in serve_handle.chat.remote(mock_chat_request): chunks.append(chunk) - + assert len(chunks) == 1 assert chunks[0].id == "test_request_id" - - + @pytest.mark.parametrize("api_type", ["chat", "completion"]) @pytest.mark.parametrize("stream", [False, True]) @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("stream_batching_interval_ms", [0, 10000]) @pytest.mark.asyncio async def test_multiplexed_request_handling( - self, + self, multiplexed_serve_handle, - mock_chat_request, - mock_completion_request, - api_type: str, - stream: bool, + mock_chat_request, + mock_completion_request, + api_type: str, + stream: bool, max_tokens: int, - stream_batching_interval_ms: int + stream_batching_interval_ms: int, ): """Unified test for multiplexed (LoRA) requests - both chat and completion APIs, streaming and non-streaming.""" - + # Create request based on API type and set model ID for multiplexing if api_type == "chat": request = mock_chat_request @@ -205,8 +217,10 @@ async def test_multiplexed_request_handling( batched_chunks = multiplexed_serve_handle.completions.remote(request) request.model = "test_model_id" - print(f"\n\n_____ MULTIPLEXED {api_type.upper()} ({'STREAMING' if stream else 'NON-STREAMING'}) max_tokens={max_tokens} batching_interval_ms={stream_batching_interval_ms} _____\n\n") - + print( + f"\n\n_____ MULTIPLEXED {api_type.upper()} ({'STREAMING' if stream else 'NON-STREAMING'}) max_tokens={max_tokens} batching_interval_ms={stream_batching_interval_ms} _____\n\n" + ) + if stream: # Collect responses from the stream chunks = [] @@ -220,7 +234,9 @@ async def test_multiplexed_request_handling( assert len(chunks) > 0 # Validate streaming response with LoRA model ID - LLMResponseValidator.validate_streaming_chunks(chunks, api_type, max_tokens, lora_model_id=request.model) + LLMResponseValidator.validate_streaming_chunks( + chunks, api_type, max_tokens, lora_model_id=request.model + ) else: # Collect non-streaming response chunks = [] @@ -232,19 +248,20 @@ async def test_multiplexed_request_handling( # Check that we got one response assert len(chunks) == 1 - + # Validate non-streaming response with LoRA model ID - LLMResponseValidator.validate_non_streaming_response(chunks[0], api_type, max_tokens, lora_model_id=request.model) - - + LLMResponseValidator.validate_non_streaming_response( + chunks[0], api_type, max_tokens, lora_model_id=request.model + ) + @pytest.mark.asyncio async def test_push_telemetry(self, create_server, mock_llm_config): """Test that the telemetry push is called properly.""" - with 
patch("ray.llm._internal.serve.deployments.llm.llm_server.push_telemetry_report_for_all_models") as mock_push_telemetry: + with patch( + "ray.llm._internal.serve.deployments.llm.llm_server.push_telemetry_report_for_all_models" + ) as mock_push_telemetry: await create_server(mock_llm_config, engine_cls=MockVLLMEngine) mock_push_telemetry.assert_called_once() - - if __name__ == "__main__": diff --git a/python/ray/llm/tests/serve/gpu/integration/test_openai_compatibility.py b/python/ray/llm/tests/serve/gpu/integration/test_openai_compatibility.py index 4293435a8820..e1a4f02b8c22 100644 --- a/python/ray/llm/tests/serve/gpu/integration/test_openai_compatibility.py +++ b/python/ray/llm/tests/serve/gpu/integration/test_openai_compatibility.py @@ -83,7 +83,7 @@ def test_chat_stream(self, testing_model): # noqa: F811 ), temperature=0.4, frequency_penalty=0.02, - max_tokens=5 + max_tokens=5, ): if i == 0: assert chat_completion @@ -99,7 +99,6 @@ def test_chat_stream(self, testing_model): # noqa: F811 ) i += 1 - def test_completions_stream_missing_model(self, testing_model): # noqa: F811 client, _ = testing_model with pytest.raises(openai.NotFoundError) as exc_info: diff --git a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py index a3d56f7e2f12..7f33b93d0ce6 100644 --- a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py +++ b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py @@ -18,13 +18,15 @@ LLMConfig, ) from ray.llm._internal.serve.deployments.llm.llm_engine import LLMEngine -from ray.llm._internal.serve.deployments.llm.multiplex.lora_model_loader import LoraModelLoader +from ray.llm._internal.serve.deployments.llm.multiplex.lora_model_loader import ( + LoraModelLoader, +) class MockVLLMEngine(LLMEngine): """Mock vLLM Engine that generates fake text responses. - - - In case of LoRA it generates a prefix with the model name in the text part of the response. + + - In case of LoRA it generates a prefix with the model name in the text part of the response. 
""" def __init__(self, llm_config: LLMConfig): @@ -50,90 +52,91 @@ async def check_health(self) -> None: if not self.started: raise RuntimeError("Engine not started") - async def chat(self, request: ChatCompletionRequest) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: + async def chat( + self, request: ChatCompletionRequest + ) -> AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None]: """Mock chat completion.""" if not self.started: raise RuntimeError("Engine not started") - + # Extract prompt text from messages prompt_text = "" if request.messages: for message in request.messages: - if hasattr(message, 'content') and message.content: + if hasattr(message, "content") and message.content: prompt_text += str(message.content) + " " - - max_tokens = getattr(request, 'max_tokens', None) or randint(1, 10) - + + max_tokens = getattr(request, "max_tokens", None) or randint(1, 10) + # Generate streaming response async for response in self._generate_chat_response( - request=request, - prompt_text=prompt_text.strip(), - max_tokens=max_tokens + request=request, prompt_text=prompt_text.strip(), max_tokens=max_tokens ): yield response - async def completions(self, request: CompletionRequest) -> AsyncGenerator[Union[str, CompletionResponse, ErrorResponse], None]: + async def completions( + self, request: CompletionRequest + ) -> AsyncGenerator[Union[str, CompletionResponse, ErrorResponse], None]: """Mock text completion.""" if not self.started: raise RuntimeError("Engine not started") - + prompt_text = str(request.prompt) if request.prompt else "" - max_tokens = getattr(request, 'max_tokens', None) or randint(5, 20) - + max_tokens = getattr(request, "max_tokens", None) or randint(5, 20) + # Generate streaming response async for response in self._generate_completion_response( - request=request, - prompt_text=prompt_text, - max_tokens=max_tokens + request=request, prompt_text=prompt_text, max_tokens=max_tokens ): yield response - async def embeddings(self, request: EmbeddingRequest) -> AsyncGenerator[Union[str, EmbeddingResponse, ErrorResponse], None]: + async def embeddings( + self, request: EmbeddingRequest + ) -> AsyncGenerator[Union[str, EmbeddingResponse, ErrorResponse], None]: """Mock embeddings generation.""" if not self.started: raise RuntimeError("Engine not started") - + # Generate a mock embedding response embedding_data = [] inputs = request.input if isinstance(request.input, list) else [request.input] - + for i, text in enumerate(inputs): # Generate random embedding vector - dimensions = getattr(request, 'dimensions', None) or 1536 + dimensions = getattr(request, "dimensions", None) or 1536 embedding = [random.uniform(-1, 1) for _ in range(dimensions)] - - embedding_data.append({ - "object": "embedding", - "embedding": embedding, - "index": i - }) - + + embedding_data.append( + {"object": "embedding", "embedding": embedding, "index": i} + ) + response = EmbeddingResponse( object="list", data=embedding_data, - model=getattr(request, 'model', 'mock-model'), + model=getattr(request, "model", "mock-model"), usage={ "prompt_tokens": len(str(request.input).split()), - "total_tokens": len(str(request.input).split()) - } + "total_tokens": len(str(request.input).split()), + }, ) yield response async def _generate_chat_response( - self, - request: ChatCompletionRequest, - prompt_text: str, - max_tokens: int + self, request: ChatCompletionRequest, prompt_text: str, max_tokens: int ) -> AsyncGenerator[Union[str, ChatCompletionResponse], None]: """Generate 
mock chat completion response.""" - + request_id = request.request_id or f"chatcmpl-{random.randint(1000, 9999)}" - lora_prefix = "" if request.model not in self._current_lora_model else f"[lora_model] {request.model}: " + lora_prefix = ( + "" + if request.model not in self._current_lora_model + else f"[lora_model] {request.model}: " + ) if request.stream: # Streaming response - return SSE formatted strings created_time = int(asyncio.get_event_loop().time()) - model_name = getattr(request, 'model', 'mock-model') - + model_name = getattr(request, "model", "mock-model") + for i in range(max_tokens): if i == 0: token = f"{lora_prefix}test_{i} " @@ -142,75 +145,73 @@ async def _generate_chat_response( if i == max_tokens - 1: # no space for the last token token = f"test_{i}" - + # Create streaming chunk choice = { "index": 0, "delta": { "content": token, - "role": "assistant" if i == 0 else None + "role": "assistant" if i == 0 else None, }, - "finish_reason": "stop" if i == max_tokens - 1 else None + "finish_reason": "stop" if i == max_tokens - 1 else None, } - + chunk_data = { "id": request_id, "object": "chat.completion.chunk", "created": created_time, "model": model_name, - "choices": [choice] + "choices": [choice], } - + # Format as SSE yield f"data: {json.dumps(chunk_data)}\n\n" await asyncio.sleep(0.01) # Simulate processing time - + # Send final [DONE] message yield "data: [DONE]\n\n" else: # Non-streaming response - return response object generated_text = " ".join([f"test_{i}" for i in range(max_tokens)]) generated_text = f"{lora_prefix}{generated_text}" - + choice = { "index": 0, - "message": { - "role": "assistant", - "content": generated_text - }, - "finish_reason": "stop" + "message": {"role": "assistant", "content": generated_text}, + "finish_reason": "stop", } - + response = ChatCompletionResponse( id=request_id, object="chat.completion", created=int(asyncio.get_event_loop().time()), - model=getattr(request, 'model', 'mock-model'), + model=getattr(request, "model", "mock-model"), choices=[choice], usage={ "prompt_tokens": len(prompt_text.split()), "completion_tokens": max_tokens, - "total_tokens": len(prompt_text.split()) + max_tokens - } + "total_tokens": len(prompt_text.split()) + max_tokens, + }, ) - + yield response async def _generate_completion_response( - self, - request: CompletionRequest, - prompt_text: str, - max_tokens: int + self, request: CompletionRequest, prompt_text: str, max_tokens: int ) -> AsyncGenerator[Union[str, CompletionResponse], None]: """Generate mock completion response.""" - + request_id = request.request_id or f"cmpl-{random.randint(1000, 9999)}" - lora_prefix = "" if request.model not in self._current_lora_model else f"[lora_model] {request.model}: " + lora_prefix = ( + "" + if request.model not in self._current_lora_model + else f"[lora_model] {request.model}: " + ) if request.stream: # Streaming response - return SSE formatted strings created_time = int(asyncio.get_event_loop().time()) - model_name = getattr(request, 'model', 'mock-model') - + model_name = getattr(request, "model", "mock-model") + for i in range(max_tokens): if i == 0: token = f"{lora_prefix}test_{i} " @@ -219,58 +220,56 @@ async def _generate_completion_response( if i == max_tokens - 1: # no space for the last token token = f"test_{i}" - + choice = { "index": 0, "text": token, - "finish_reason": "stop" if i == max_tokens - 1 else None + "finish_reason": "stop" if i == max_tokens - 1 else None, } - + chunk_data = { "id": request_id, "object": "text_completion", "created": 
created_time, "model": model_name, - "choices": [choice] + "choices": [choice], } - + # Format as SSE yield f"data: {json.dumps(chunk_data)}\n\n" await asyncio.sleep(0.01) - + # Send final [DONE] message yield "data: [DONE]\n\n" else: # Non-streaming response - return response object generated_text = " ".join([f"test_{i}" for i in range(max_tokens)]) generated_text = f"{lora_prefix}{generated_text}" - - choice = { - "index": 0, - "text": generated_text, - "finish_reason": "stop" - } - + + choice = {"index": 0, "text": generated_text, "finish_reason": "stop"} + response = CompletionResponse( id=request_id, object="text_completion", created=int(asyncio.get_event_loop().time()), - model=getattr(request, 'model', 'mock-model'), + model=getattr(request, "model", "mock-model"), choices=[choice], usage={ "prompt_tokens": len(prompt_text.split()), "completion_tokens": max_tokens, - "total_tokens": len(prompt_text.split()) + max_tokens - } + "total_tokens": len(prompt_text.split()) + max_tokens, + }, ) - + yield response class FakeLoraModelLoader(LoraModelLoader): """Fake LoRA model loader for testing.""" - async def load_model(self, lora_model_id: str, llm_config: LLMConfig) -> DiskMultiplexConfig: + async def load_model( + self, lora_model_id: str, llm_config: LLMConfig + ) -> DiskMultiplexConfig: """Load a fake LoRA model.""" return DiskMultiplexConfig( model_id=lora_model_id, @@ -278,4 +277,3 @@ async def load_model(self, lora_model_id: str, llm_config: LLMConfig) -> DiskMul local_path="/fake/local/path", lora_assigned_int_id=random.randint(1, 100), ) - diff --git a/python/ray/llm/tests/serve/utils/__init__.py b/python/ray/llm/tests/serve/utils/__init__.py index f6befe644317..e356527468b2 100644 --- a/python/ray/llm/tests/serve/utils/__init__.py +++ b/python/ray/llm/tests/serve/utils/__init__.py @@ -1 +1 @@ -# Testing utilities for Ray LLM serve tests \ No newline at end of file +# Testing utilities for Ray LLM serve tests diff --git a/python/ray/llm/tests/serve/utils/testing_utils.py b/python/ray/llm/tests/serve/utils/testing_utils.py index c8c967e19476..1cdab168418b 100644 --- a/python/ray/llm/tests/serve/utils/testing_utils.py +++ b/python/ray/llm/tests/serve/utils/testing_utils.py @@ -10,15 +10,17 @@ from ray.llm._internal.serve.configs.openai_api_models import ( ChatCompletionResponse, CompletionResponse, - EmbeddingResponse + EmbeddingResponse, ) class LLMResponseValidator: """Reusable validation logic for LLM responses.""" - + @staticmethod - def get_expected_content(api_type: str, max_tokens: int, lora_model_id: str = "") -> str: + def get_expected_content( + api_type: str, max_tokens: int, lora_model_id: str = "" + ) -> str: """Get expected content based on API type.""" expected_content = " ".join(f"test_{i}" for i in range(max_tokens)) if lora_model_id: @@ -27,14 +29,16 @@ def get_expected_content(api_type: str, max_tokens: int, lora_model_id: str = "" @staticmethod def validate_non_streaming_response( - response: Union[ChatCompletionResponse, CompletionResponse], - api_type: str, + response: Union[ChatCompletionResponse, CompletionResponse], + api_type: str, max_tokens: int, - lora_model_id: str = "" + lora_model_id: str = "", ): """Validate non-streaming responses.""" - expected_content = LLMResponseValidator.get_expected_content(api_type, max_tokens, lora_model_id) - + expected_content = LLMResponseValidator.get_expected_content( + api_type, max_tokens, lora_model_id + ) + if api_type == "chat": assert isinstance(response, ChatCompletionResponse) assert 
response.choices[0].message.content == expected_content @@ -44,26 +48,23 @@ def validate_non_streaming_response( @staticmethod def validate_streaming_chunks( - chunks: List[str], - api_type: str, - max_tokens: int, - lora_model_id: str = "" + chunks: List[str], api_type: str, max_tokens: int, lora_model_id: str = "" ): """Validate streaming response chunks.""" # Should have max_tokens + 1 chunks (tokens + [DONE]) assert len(chunks) == max_tokens + 1 - + # Validate each chunk except the last [DONE] chunk for chunk_iter, chunk in enumerate(chunks[:-1]): pattern = r"data: (.*)\n\n" match = re.match(pattern, chunk) assert match is not None chunk_data = json.loads(match.group(1)) - + expected_chunk = f"test_{chunk_iter}" if lora_model_id and chunk_iter == 0: expected_chunk = f"[lora_model] {lora_model_id}: {expected_chunk}" - + if api_type == "chat": delta = chunk_data["choices"][0]["delta"] if chunk_iter == 0: @@ -77,8 +78,7 @@ def validate_streaming_chunks( @staticmethod def validate_embedding_response( - response: EmbeddingResponse, - expected_dimensions: Optional[int] = None + response: EmbeddingResponse, expected_dimensions: Optional[int] = None ): """Validate embedding responses.""" assert isinstance(response, EmbeddingResponse) @@ -86,9 +86,11 @@ def validate_embedding_response( assert len(response.data) == 1 assert response.data[0].object == "embedding" assert isinstance(response.data[0].embedding, list) - assert len(response.data[0].embedding) > 0 # Should have some embedding dimensions + assert ( + len(response.data[0].embedding) > 0 + ) # Should have some embedding dimensions assert response.data[0].index == 0 - + # Check dimensions if specified if expected_dimensions: - assert len(response.data[0].embedding) == expected_dimensions \ No newline at end of file + assert len(response.data[0].embedding) == expected_dimensions diff --git a/release/llm_tests/serve/probes/query_utils.py b/release/llm_tests/serve/probes/query_utils.py index 9c265386a2f9..1026e303f19f 100644 --- a/release/llm_tests/serve/probes/query_utils.py +++ b/release/llm_tests/serve/probes/query_utils.py @@ -182,8 +182,8 @@ async def query( "stream": stream, **chat_args, } - - if stream: + + if stream: args["stream_options"] = { "include_usage": True, } diff --git a/release/llm_tests/serve/probes/test_json_mode.py b/release/llm_tests/serve/probes/test_json_mode.py index 0150db2f583a..1dc2eb51af0e 100644 --- a/release/llm_tests/serve/probes/test_json_mode.py +++ b/release/llm_tests/serve/probes/test_json_mode.py @@ -205,7 +205,10 @@ async def test_invalid_schema(model: str, openai_async_client): querier = TextGenerationProbeQuerier(openai_async_client, {"temperature": 0.0}) response_format = { "type": "json_schema", - "json_schema": {"name": "expected_schema", "schema": {"type": "object", "properties": {"name": {"type": "str"}}}}, + "json_schema": { + "name": "expected_schema", + "schema": {"type": "object", "properties": {"name": {"type": "str"}}}, + }, } params = { From 33de82ccaec45ecad720e179f72afafcefe969da Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Thu, 10 Jul 2025 18:21:28 -0700 Subject: [PATCH 37/37] wip Signed-off-by: Kourosh Hakhamaneshi --- release/llm_tests/serve/probes/models.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/release/llm_tests/serve/probes/models.py b/release/llm_tests/serve/probes/models.py index 2e347aa0da52..f0714c209ad9 100644 --- a/release/llm_tests/serve/probes/models.py +++ b/release/llm_tests/serve/probes/models.py @@ -97,15 +97,11 @@ def 
is_release_test_model(model: "openai.types.model.Model") -> bool: def is_finetuned_model(model: "openai.types.model.Model") -> bool: # If base_model_id is set, this is a finetuned model - return ( - model.model_dump().get("metadata", {}).get("base_model_id") is not None - ) + return model.model_dump().get("metadata", {}).get("base_model_id") is not None def is_vision_language_model(model: "openai.types.model.Model") -> bool: - return ( - model.model_dump().get("metadata", {}).get("input_modality") == "image" - ) + return model.model_dump().get("metadata", {}).get("input_modality") == "image" def is_rate_liming_test_model(model: "openai.types.model.Model") -> bool:
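
A minimal usage sketch for the probe helpers touched in the final hunk above (illustrative only, not part of any patch in this series): it shows how the metadata predicates might be combined to pick served models out of a `/v1/models` listing. The function name `select_image_finetunes` and the `client` wiring are assumptions made for the example; only `Model.model_dump()` and the `metadata` keys come from the code above.

    import openai

    def select_image_finetunes(client: openai.OpenAI) -> list[str]:
        """Return ids of served models whose metadata marks them as image-capable finetunes."""
        selected = []
        for model in client.models.list():
            metadata = model.model_dump().get("metadata", {}) or {}
            # Mirrors is_finetuned_model() and is_vision_language_model() above.
            if (
                metadata.get("base_model_id") is not None
                and metadata.get("input_modality") == "image"
            ):
                selected.append(model.id)
        return selected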