Commits (39)
037bd7f  wip (kouroshHakha, Jun 24, 2025)
0b0a5d8  prototype api server (kouroshHakha, Jun 25, 2025)
7dfabde  wip (kouroshHakha, Jun 25, 2025)
07d42fb  fixed error handling and lora (kouroshHakha, Jun 25, 2025)
eddc710  wip (kouroshHakha, Jun 25, 2025)
14e5263  mistral (kouroshHakha, Jun 25, 2025)
cdfb32c  batching is also done (kouroshHakha, Jun 28, 2025)
dbb2db7  wip (kouroshHakha, Jun 28, 2025)
9b9bbde  Merge branch 'master' into kh/proto-api-server (kouroshHakha, Jun 28, 2025)
2fc73d9  wip (kouroshHakha, Jun 28, 2025)
00ac868  wip (kouroshHakha, Jun 30, 2025)
02e5ecf  wip (kouroshHakha, Jun 30, 2025)
7640a92  wip (kouroshHakha, Jun 30, 2025)
8df78df  wip (kouroshHakha, Jun 30, 2025)
c7d67b5  wip (kouroshHakha, Jun 30, 2025)
0e97923  wip (kouroshHakha, Jul 1, 2025)
1d74fc9  simplify by reusing vllm apis (kouroshHakha, Jul 1, 2025)
59ac15a  wip (kouroshHakha, Jul 1, 2025)
89002a7  wip (kouroshHakha, Jul 1, 2025)
5db78c7  wip (kouroshHakha, Jul 1, 2025)
e39daf2  wip (kouroshHakha, Jul 1, 2025)
4fe3cef  for embedding user must set VLLM_USE_V1=0 (kouroshHakha, Jul 1, 2025)
b1c0163  added self contained test for first llm engine mock (kouroshHakha, Jul 1, 2025)
f385cf2  testing llm_server now with refactor testing utils for more consisten… (kouroshHakha, Jul 2, 2025)
ccd188b  added lora logic back and tested the request_id handling from serve (kouroshHakha, Jul 2, 2025)
61e8902  tested multiplexing (kouroshHakha, Jul 2, 2025)
88a45e0  wip (kouroshHakha, Jul 2, 2025)
4e9a3d2  added telemetry tests (kouroshHakha, Jul 2, 2025)
343a395  remove tests that we already had a good coverage on (kouroshHakha, Jul 2, 2025)
e0470cc  fix test_router (kouroshHakha, Jul 2, 2025)
e9725c3  pd (kouroshHakha, Jul 3, 2025)
d4d8a8d  delete dead code (kouroshHakha, Jul 3, 2025)
bc3e7bc  wip (kouroshHakha, Jul 3, 2025)
a4790e3  delete more dead code (kouroshHakha, Jul 3, 2025)
a0ad597  wip (kouroshHakha, Jul 3, 2025)
c05c83f  fixed the probes (kouroshHakha, Jul 8, 2025)
e2c6171  wip (kouroshHakha, Jul 8, 2025)
51f7deb  Merge branch 'master' into kh/proto-api-server (kouroshHakha, Jul 11, 2025)
33de82c  wip (kouroshHakha, Jul 11, 2025)
761 changes: 62 additions & 699 deletions python/ray/llm/_internal/serve/configs/openai_api_models.py

Large diffs are not rendered by default.

119 changes: 0 additions & 119 deletions python/ray/llm/_internal/serve/configs/prompt_formats.py

This file was deleted.

139 changes: 3 additions & 136 deletions python/ray/llm/_internal/serve/configs/server_models.py
@@ -7,9 +7,7 @@
List,
Optional,
Sequence,
Set,
Tuple,
Type,
TypeVar,
Union,
)
@@ -37,17 +35,9 @@
DEFAULT_MULTIPLEX_DOWNLOAD_TIMEOUT_S,
DEFAULT_MULTIPLEX_DOWNLOAD_TRIES,
ENABLE_WORKER_PROCESS_SETUP_HOOK,
MAX_NUM_STOPPING_SEQUENCES,
MODEL_RESPONSE_BATCH_TIMEOUT_MS,
)
from ray.llm._internal.serve.configs.error_handling import TooManyStoppingSequences
from ray.llm._internal.serve.configs.openai_api_models_patch import (
ErrorResponse,
ResponseFormatType,
)
from ray.llm._internal.serve.configs.prompt_formats import (
Prompt,
)
from ray.llm._internal.serve.configs.openai_api_models import ErrorResponse
from ray.llm._internal.serve.observability.logging import get_logger
from ray.serve._private.config import DeploymentConfig

@@ -239,7 +229,7 @@ class LLMConfig(BaseModelExtended):
)

_supports_vision: bool = PrivateAttr(False)
_model_architecture: str = PrivateAttr("")
_model_architecture: str = PrivateAttr("UNSPECIFIED")
_engine_config: EngineConfigType = PrivateAttr(None)

def _infer_supports_vision(self, model_id_or_path: str) -> None:
@@ -262,7 +252,7 @@ def _set_model_architecture(
"""
if model_id_or_path:
hf_config = transformers.PretrainedConfig.from_pretrained(model_id_or_path)
if hasattr(hf_config, "architectures"):
if hasattr(hf_config, "architectures") and hf_config.architectures:
self._model_architecture = hf_config.architectures[0]

if model_architecture:
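As context for the hunk above: `PretrainedConfig.from_pretrained` can return a config whose `architectures` attribute is missing, `None`, or an empty list, so the added truthiness check prevents an `IndexError` on `architectures[0]`. Below is a minimal standalone sketch of the same guarded lookup; the helper name is hypothetical and the `"UNSPECIFIED"` fallback mirrors the private attribute default shown earlier, so treat it as an illustration rather than the PR's code.

```python
import transformers


def infer_model_architecture(model_id_or_path: str) -> str:
    """Return the first reported architecture, or "UNSPECIFIED" if none is set."""
    hf_config = transformers.PretrainedConfig.from_pretrained(model_id_or_path)
    # `architectures` may be absent, None, or [], so hasattr() alone is not enough.
    if hasattr(hf_config, "architectures") and hf_config.architectures:
        return hf_config.architectures[0]
    return "UNSPECIFIED"
```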
@@ -572,32 +562,6 @@ def parse_args(self) -> "LLMServingArgs":
return LLMServingArgs(llm_configs=llm_configs)


TModel = TypeVar("TModel", bound="Model")


class ModelData(BaseModel):
model_config = ConfigDict(protected_namespaces=tuple())

id: str
object: str
owned_by: str
permission: List[str]
rayllm_metadata: Dict[str, Any]

@property
def model_type(self) -> str:
return self.rayllm_metadata["engine_config"]["model_type"]


class Model(BaseModel):
data: List[ModelData]
object: str = "list"

@classmethod
def list(cls) -> TModel:
pass


class FinishReason(str, Enum):
LENGTH = "length"
STOP = "stop"
@@ -866,100 +830,3 @@ def merge_dicts(base: Dict, overwrite: Dict) -> Dict:
else:
base[key] = overwrite[key]
return base


class SamplingParams(BaseModelExtended):
"""Parameters for controlling text generation sampling.

Args:
max_tokens: The maximum number of tokens to generate. Defaults to inf.
temperature: What sampling temperature to use.
top_p: An alternative to sampling with temperature, called nucleus sampling.
n: How many completions to generate for each prompt.
logprobs: Include the log probabilities on the `logprobs` most likely
tokens, as well the chosen tokens.
top_logprobs: The number of logprobs to return. Defaults to 1. `logprobs`
must be set to `True` in order to use top_logprobs.
stop: Up to 4 sequences where the API will stop generating further tokens.
The returned text will not contain the stop sequence.
stop_tokens: Tokens to stop on (applied before detokenization).
presence_penalty: Number between -2.0 and 2.0.
Positive values penalize new tokens based on whether they appear in
the text so far, increasing the model's likelihood to talk about
new topics.
frequency_penalty: Number between -2.0 and 2.0. Positive values penalize
new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
best_of: Generates `best_of` completions server-side and returns the "best".
logit_bias: Modify the likelihood of specified tokens appearing in
the completion.
response_format: Format to return the final response in. Can be for ex:
response_format={"type": "json", "schema": "{...}"}
"""

_ignored_fields: Set[str] = set()

max_tokens: Optional[int] = None
temperature: Optional[float] = None
top_p: Optional[float] = None
n: int = 1
logprobs: Optional[bool] = None
top_logprobs: Optional[int] = None
logit_bias: Optional[Dict[str, float]] = None
stop: Optional[List[str]] = None
stop_tokens: Optional[List[int]] = None
ignore_eos: Optional[bool] = None
presence_penalty: Optional[float] = None
frequency_penalty: Optional[float] = None
best_of: int = 1
response_format: Optional[ResponseFormatType] = None

def model_dump(self, **kwargs) -> Dict[str, Any]:
if kwargs.get("exclude", None) is None:
kwargs["exclude"] = self._ignored_fields
return super().model_dump(**kwargs)

@field_validator("stop", mode="before")
@classmethod
def validate_stopping_sequences(cls, values):
if not values:
return values

unique_val = sorted(set(values))

if len(unique_val) > MAX_NUM_STOPPING_SEQUENCES:
TooManyStoppingSequences(
len(unique_val), MAX_NUM_STOPPING_SEQUENCES
).raise_exception()

return list(unique_val)

@field_validator("stop_tokens", mode="before")
@classmethod
def validate_stop_tokens(cls, values):
if not values:
return values
return sorted(set(values))

@classmethod
def _get_model_validate_kwargs(cls: Type[ModelT], prompt: Prompt) -> Dict[str, Any]:
generate_kwargs = prompt.parameters or {}
if not isinstance(generate_kwargs, dict):
generate_kwargs = generate_kwargs.model_dump(exclude_unset=True)

return generate_kwargs

@classmethod
def from_prompt(cls: Type[ModelT], prompt: Prompt) -> ModelT:
# Extract parameters object from prompt
generate_kwargs = cls._get_model_validate_kwargs(prompt)
return cls.model_validate(generate_kwargs)


class GenerationRequest(BaseModelExtended):
prompt: Union[str, List[int], List[str]]
prompt_token_ids: Optional[List[int]] = None
request_id: Union[str, List[str]]
sampling_params: Optional[Union[SamplingParams, List[SamplingParams]]] = None
stream: bool = False
metadata: Optional[Dict[str, Any]] = None
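For reference, the `SamplingParams` model deleted above de-duplicated and sorted `stop` and `stop_tokens`, rejected more than `MAX_NUM_STOPPING_SEQUENCES` stop strings, and excluded `_ignored_fields` from `model_dump()` by default. A small usage sketch of that pre-PR behavior, with illustrative values only:

```python
# Illustrative only: exercises the SamplingParams model removed in this diff.
params = SamplingParams(
    max_tokens=128,
    temperature=0.7,
    stop=["</s>", "</s>", "\n\n"],  # duplicates are dropped by the validator
    stop_tokens=[2, 2, 13],         # also de-duplicated and sorted
)
assert params.stop == ["\n\n", "</s>"]  # sorted(set(...)) applied
assert params.stop_tokens == [2, 13]
payload = params.model_dump()           # _ignored_fields excluded by default
```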
44 changes: 19 additions & 25 deletions python/ray/llm/_internal/serve/deployments/llm/llm_engine.py
@@ -1,49 +1,43 @@
import abc
from typing import AsyncGenerator, Optional

from transformers.dynamic_module_utils import init_hf_modules
from typing import AsyncGenerator, Any

from ray.llm._internal.serve.configs.server_models import (
DiskMultiplexConfig,
GenerationRequest,
LLMConfig,
LLMRawResponse,
Prompt,
)


class LLMEngine(abc.ABC):
"""Base class for all LLM engines"""
"""Base protocal class for all LLM engines"""

@abc.abstractmethod
def __init__(self, llm_config: LLMConfig):
self._llm_config = llm_config

# Ensure transformers_modules is initialized early in worker processes.
# This is critical for models with trust_remote_code=True to avoid pickle errors.
init_hf_modules()
"""Initialize the engine with the llm config"""
pass

@abc.abstractmethod
async def start(self):
"""Start the engine"""
pass

@abc.abstractmethod
async def prepare_request(
self,
request_id: str,
prompt: Prompt,
stream: bool,
disk_lora_model: Optional[DiskMultiplexConfig] = None,
**kwargs,
) -> GenerationRequest:
"""Prepare a GenerationRequest for the engine"""
async def resolve_lora(self, lora_model: DiskMultiplexConfig):
"""Resolve the lora model"""
pass

@abc.abstractmethod
async def chat(self, request) -> AsyncGenerator[Any, None]:
"""Chat with the engine"""
pass

@abc.abstractmethod
async def completions(self, request) -> AsyncGenerator[Any, None]:
"""Completion with the engine"""
pass

@abc.abstractmethod
async def generate(
self, request: GenerationRequest
) -> AsyncGenerator[LLMRawResponse, None]:
"""Generate an LLMRawResponse stream based on the GenerationRequest"""
async def embeddings(self, request) -> AsyncGenerator[Any, None]:
"""Embed with the engine"""
pass

async def check_health(self) -> None:
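To make the reshaped interface concrete: after this change an engine exposes OpenAI-style `chat`, `completions`, and `embeddings` streams plus `resolve_lora`, instead of the old `prepare_request`/`generate` pair. The sketch below is a hypothetical mock subclass (`EchoEngine` and its canned outputs are not part of the PR) and assumes the abstract methods visible in this hunk are the complete set; the real PR wires these methods to vLLM.

```python
from typing import Any, AsyncGenerator

from ray.llm._internal.serve.configs.server_models import DiskMultiplexConfig, LLMConfig
from ray.llm._internal.serve.deployments.llm.llm_engine import LLMEngine


class EchoEngine(LLMEngine):
    """Toy engine that streams canned responses; useful only as an interface sketch."""

    def __init__(self, llm_config: LLMConfig):
        self._llm_config = llm_config

    async def start(self):
        # A real engine would boot vLLM here; the mock has nothing to start.
        pass

    async def resolve_lora(self, lora_model: DiskMultiplexConfig):
        # A real engine would load/register the LoRA adapter from local disk.
        pass

    async def chat(self, request) -> AsyncGenerator[Any, None]:
        # Yield a single canned chunk; a real engine streams many response chunks.
        yield {"role": "assistant", "content": "echo"}

    async def completions(self, request) -> AsyncGenerator[Any, None]:
        yield {"text": "echo"}

    async def embeddings(self, request) -> AsyncGenerator[Any, None]:
        yield {"embedding": [0.0, 0.0, 0.0]}
```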