Merged
3 changes: 2 additions & 1 deletion tests/entrypoints/openai/test_serving_models.py
@@ -57,7 +57,8 @@ async def test_load_lora_adapter_success():
response = await serving_models.load_lora_adapter(request)
assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter')
assert len(serving_models.lora_requests) == 1
assert serving_models.lora_requests[0].lora_name == "adapter"
assert "adapter" in serving_models.lora_requests
assert serving_models.lora_requests["adapter"].lora_name == "adapter"


@pytest.mark.asyncio
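The rewritten assertions above rely on `serving_models.lora_requests` being a mapping keyed by adapter name instead of a list. A minimal sketch of the membership and lookup semantics the test exercises, using a simplified stand-in for `LoRARequest` (the real class in `vllm.lora.request` has more fields):

```python
from dataclasses import dataclass


# Simplified stand-in for vllm.lora.request.LoRARequest, for illustration only.
@dataclass
class LoRARequest:
    lora_name: str
    lora_int_id: int
    lora_path: str


# Registry keyed by adapter name, mirroring the new lora_requests layout.
lora_requests: dict[str, LoRARequest] = {
    "adapter": LoRARequest("adapter", 1, "/tmp/adapter"),
}

# The updated test assertions: membership by name, lookup by key.
assert len(lora_requests) == 1
assert "adapter" in lora_requests
assert lora_requests["adapter"].lora_name == "adapter"
```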
2 changes: 2 additions & 0 deletions vllm/entrypoints/openai/api_server.py
@@ -1129,6 +1129,8 @@ async def init_app_state(
served_model_names = args.served_model_name
else:
served_model_names = [args.model]

logger.info(f"[Kourosh] init_app_state, {served_model_names=}")

if args.disable_log_requests:
request_logger = None
359 changes: 166 additions & 193 deletions vllm/entrypoints/openai/cli_args.py

Large diffs are not rendered by default.

15 changes: 13 additions & 2 deletions vllm/entrypoints/openai/serving_chat.py
@@ -153,6 +153,8 @@ async def create_chat_completion(
tokenizer = await self.engine_client.get_tokenizer(lora_request)

tool_parser = self.tool_parser

logger.info(f"[Kourosh] 1")

if isinstance(tokenizer, MistralTokenizer):
# because of issues with pydantic we need to potentially
@@ -176,6 +178,8 @@
tool.model_dump() for tool in request.tools
]

logger.info(f"[Kourosh] 2")

(
conversation,
request_prompts,
@@ -270,9 +274,11 @@
conversation, tokenizer, request_metadata)
except ValueError as e:
# TODO: Use a vllm-specific Validation Error
logger.error(f"[Kourosh] error in chat_completion_full_generator: {e}")
return self.create_error_response(str(e))

def get_chat_request_role(self, request: ChatCompletionRequest) -> str:
logger.info(f"[Kourosh] get_chat_request_role, {request.add_generation_prompt=}, {request.messages=}")
if request.add_generation_prompt:
return self.response_role
return request.messages[-1]["role"]
Expand Down Expand Up @@ -942,6 +948,7 @@ async def chat_completion_full_generator(
choices: list[ChatCompletionResponseChoice] = []

role = self.get_chat_request_role(request)
logger.info(f"[Kourosh] role: {role}")
for output in final_res.outputs:
token_ids = output.token_ids
out_logprobs = output.logprobs
@@ -980,6 +987,7 @@
(not isinstance(request.tool_choice,
ChatCompletionNamedToolChoiceParam
) and request.tool_choice != "required"):
logger.info(f"[Kourosh] auto tool is not enabled, {role=}, {reasoning_content=}, {content=}")
message = ChatMessage(role=role,
reasoning_content=reasoning_content,
content=content)
@@ -1009,6 +1017,7 @@
assert content is not None
tool_calls = TypeAdapter(
list[FunctionDefinition]).validate_json(content)
logger.info(f"[Kourosh] tool_calls: {tool_calls}")
message = ChatMessage(
role=role,
content="",
@@ -1023,7 +1032,7 @@
# if the request doesn't use tool choice
# OR specifies to not use a tool
elif not request.tool_choice or request.tool_choice == "none":

logger.info(f"[Kourosh] no tool choice")
message = ChatMessage(role=role,
reasoning_content=reasoning_content,
content=content)
@@ -1033,7 +1042,7 @@
request.tool_choice == "auto"
or request.tool_choice is None) and self.enable_auto_tools \
and self.tool_parser:

logger.info(f"[Kourosh] tool choice is auto")
try:
tool_parser = self.tool_parser(tokenizer)
except RuntimeError as e:
@@ -1047,6 +1056,7 @@
# call. The same is not true for named function calls
auto_tools_called = tool_call_info.tools_called
if tool_call_info.tools_called:
logger.info(f"[Kourosh] tool_call_info.tools_called: {tool_call_info.tools_called}")
message = ChatMessage(role=role,
reasoning_content=reasoning_content,
content=tool_call_info.content,
@@ -1055,6 +1065,7 @@
else:
# FOR NOW make it a chat message; we will have to detect
# the type to make it later.
logger.info(f"[Kourosh] no tool call info")
message = ChatMessage(role=role,
reasoning_content=reasoning_content,
content=content)
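Most of the serving_chat.py changes are temporary `[Kourosh]` debug logs dropped into the tool-choice branches of `chat_completion_full_generator`. For orientation, here is a hedged sketch of the decision the surrounding context makes in the "auto" path: the tool parser's `tools_called` flag determines whether the response message carries `tool_calls` or falls back to a plain chat message (the class shapes below are simplified stand-ins, not the actual vLLM protocol types):

```python
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ExtractedToolCallInformation:  # stand-in for the tool parser's result type
    tools_called: bool
    tool_calls: list = field(default_factory=list)
    content: Optional[str] = None


@dataclass
class ChatMessage:  # stand-in for the OpenAI-style response message
    role: str
    content: Optional[str] = None
    reasoning_content: Optional[str] = None
    tool_calls: list = field(default_factory=list)


def build_auto_message(role: str, reasoning_content: Optional[str],
                       content: Optional[str],
                       info: ExtractedToolCallInformation) -> ChatMessage:
    if info.tools_called:
        # Parser found tool calls: surface them alongside any leftover content.
        return ChatMessage(role=role, reasoning_content=reasoning_content,
                           content=info.content, tool_calls=info.tool_calls)
    # Otherwise fall back to a plain chat message, mirroring the branch above.
    return ChatMessage(role=role, reasoning_content=reasoning_content,
                       content=content)
```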
14 changes: 8 additions & 6 deletions vllm/entrypoints/openai/serving_engine.py
@@ -426,9 +426,7 @@ async def _check_model(

if self._is_model_supported(request.model):
return None
if request.model in [
lora.lora_name for lora in self.models.lora_requests
]:
if request.model in self.models.lora_requests:
return None
if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING and request.model and (
load_result := await self.models.resolve_lora(request.model)):
@@ -454,9 +452,8 @@ def _maybe_get_adapters(
None, PromptAdapterRequest]]:
if self._is_model_supported(request.model):
return None, None
for lora in self.models.lora_requests:
if request.model == lora.lora_name:
return lora, None
if request.model in self.models.lora_requests:
return self.models.lora_requests[request.model], None
for prompt_adapter in self.models.prompt_adapter_requests:
if request.model == prompt_adapter.prompt_adapter_name:
return None, prompt_adapter
@@ -816,6 +813,8 @@ async def _preprocess_chat(
)

mm_data = await mm_data_future

logger.info(f"[Kourosh] 3")

# tool parsing is done only if a tool_parser has been set and if
# tool_choice is not "none" (if tool_choice is "none" but a tool_parser
@@ -846,6 +845,7 @@
prompt_inputs = TextTokensPrompt(
prompt=tokenizer.decode(request_prompt),
prompt_token_ids=request_prompt)
logger.info(f"[Kourosh] 4, {prompt_inputs=}")

engine_prompt = EngineTokensPrompt(
prompt_token_ids=prompt_inputs["prompt_token_ids"])
@@ -857,6 +857,8 @@
if hasattr(request, "cache_salt") and request.cache_salt is not None:
engine_prompt["cache_salt"] = request.cache_salt

logger.info(f"[Kourosh] 5, {conversation=}, {request_prompt=}, {engine_prompt=}")

return conversation, [request_prompt], [engine_prompt]

def _load_prompt_embeds(
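With `lora_requests` now a dict keyed by adapter name, `_check_model` and `_maybe_get_adapters` replace linear scans over `LoRARequest` objects with a membership test and a key lookup. A hedged sketch of the before/after shape (simplified types, hypothetical helper names):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class LoRARequest:  # simplified stand-in
    lora_name: str
    lora_int_id: int
    lora_path: str


# Before: scan a list on every request, O(n) in the number of loaded adapters.
def find_lora_in_list(requests: list[LoRARequest],
                      model: str) -> Optional[LoRARequest]:
    for lora in requests:
        if lora.lora_name == model:
            return lora
    return None


# After: index a dict keyed by lora_name, O(1) per request.
def find_lora_in_dict(requests: dict[str, LoRARequest],
                      model: str) -> Optional[LoRARequest]:
    return requests.get(model)
```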
26 changes: 11 additions & 15 deletions vllm/entrypoints/openai/serving_models.py
@@ -65,12 +65,14 @@ def __init__(
super().__init__()

self.base_model_paths = base_model_paths
logger.info(f"[Kourosh] is_base_model, {self.base_model_paths=}")

self.max_model_len = model_config.max_model_len
self.engine_client = engine_client
self.model_config = model_config

self.static_lora_modules = lora_modules
self.lora_requests: list[LoRARequest] = []
self.lora_requests: dict[str, LoRARequest] = {}
self.lora_id_counter = AtomicCounter(0)

self.lora_resolvers: list[LoRAResolver] = []
@@ -138,7 +140,7 @@ async def show_available_models(self) -> ModelList:
parent=lora.base_model_name if lora.base_model_name else
self.base_model_paths[0].name,
permission=[ModelPermission()])
for lora in self.lora_requests
for lora in self.lora_requests.values()
]
prompt_adapter_cards = [
ModelCard(id=prompt_adapter.prompt_adapter_name,
@@ -182,7 +184,7 @@ async def load_lora_adapter(
err_type=error_type,
status_code=status_code)

self.lora_requests.append(lora_request)
self.lora_requests[lora_name] = lora_request
logger.info("Loaded new LoRA adapter: name '%s', path '%s'", lora_name,
lora_path)
return f"Success: LoRA adapter '{lora_name}' added successfully."
@@ -196,10 +198,7 @@ async def unload_lora_adapter(
return error_check_ret

lora_name = request.lora_name
self.lora_requests = [
lora_request for lora_request in self.lora_requests
if lora_request.lora_name != lora_name
]
del self.lora_requests[lora_name]
logger.info("Removed LoRA adapter: name '%s'", lora_name)
return f"Success: LoRA adapter '{lora_name}' removed successfully."

@@ -213,8 +212,7 @@ async def _check_load_lora_adapter_request(
status_code=HTTPStatus.BAD_REQUEST)

# Check if the lora adapter with the given name already exists
if any(lora_request.lora_name == request.lora_name
for lora_request in self.lora_requests):
if request.lora_name in self.lora_requests:
return create_error_response(
message=
f"The lora adapter '{request.lora_name}' has already been "
@@ -236,8 +234,7 @@ async def _check_unload_lora_adapter_request(
status_code=HTTPStatus.BAD_REQUEST)

# Check if the lora adapter with the given name exists
if not any(lora_request.lora_name == request.lora_name
for lora_request in self.lora_requests):
if request.lora_name not in self.lora_requests:
return create_error_response(
message=
f"The lora adapter '{request.lora_name}' cannot be found.",
@@ -260,9 +257,8 @@ async def resolve_lora(
"""
async with self.lora_resolver_lock[lora_name]:
# First check if this LoRA is already loaded
for existing in self.lora_requests:
if existing.lora_name == lora_name:
return existing
if lora_name in self.lora_requests:
return self.lora_requests[lora_name]

base_model_name = self.model_config.model
unique_id = self.lora_id_counter.inc(1)
@@ -279,7 +275,7 @@

try:
await self.engine_client.add_lora(lora_request)
self.lora_requests.append(lora_request)
self.lora_requests[lora_name] = lora_request
logger.info(
"Resolved and loaded LoRA adapter '%s' using %s",
lora_name, resolver.__class__.__name__)
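The registry itself is the source of the change: `OpenAIServingModels.lora_requests` becomes a `dict[str, LoRARequest]`, so the duplicate check in `_check_load_lora_adapter_request`, the removal in `unload_lora_adapter`, and the fast path in `resolve_lora` all reduce to plain dict operations. A toy sketch of that life cycle under the same assumptions as above (simplified `LoRARequest`, no engine client):

```python
from dataclasses import dataclass


@dataclass
class LoRARequest:  # simplified stand-in
    lora_name: str
    lora_int_id: int
    lora_path: str


class ToyLoRARegistry:
    """Dict-backed registry mimicking the shape of OpenAIServingModels."""

    def __init__(self) -> None:
        self.lora_requests: dict[str, LoRARequest] = {}
        self._next_id = 0

    def load(self, name: str, path: str) -> str:
        if name in self.lora_requests:      # duplicate check is a key test
            raise ValueError(
                f"The lora adapter '{name}' has already been loaded.")
        self._next_id += 1
        self.lora_requests[name] = LoRARequest(name, self._next_id, path)
        return f"Success: LoRA adapter '{name}' added successfully."

    def unload(self, name: str) -> str:
        if name not in self.lora_requests:  # existence check guards the del
            raise ValueError(f"The lora adapter '{name}' cannot be found.")
        del self.lora_requests[name]
        return f"Success: LoRA adapter '{name}' removed successfully."

    def resolve(self, name: str, path: str) -> LoRARequest:
        # Already-loaded adapters are returned directly, as in resolve_lora.
        if name in self.lora_requests:
            return self.lora_requests[name]
        self.load(name, path)
        return self.lora_requests[name]


registry = ToyLoRARegistry()
registry.load("adapter", "/tmp/adapter")
assert "adapter" in registry.lora_requests
registry.unload("adapter")
assert "adapter" not in registry.lora_requests
```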