From 05838da39b98174ad2fb5bb25a6187ea11b0ca1f Mon Sep 17 00:00:00 2001 From: Christopher Albert Date: Wed, 25 Mar 2026 23:51:23 +0100 Subject: [PATCH 1/4] server: add OpenAI-compatible /v1/responses endpoint --- tests/test_chat_template_kwargs.py | 165 +++++ tests/test_responses_api.py | 505 +++++++++++++ vllm_mlx/api/__init__.py | 34 + vllm_mlx/api/models.py | 3 + vllm_mlx/api/responses_models.py | 316 ++++++++ vllm_mlx/engine/simple.py | 44 +- vllm_mlx/models/llm.py | 9 +- vllm_mlx/server.py | 1107 ++++++++++++++++++++++++++++ 8 files changed, 2179 insertions(+), 4 deletions(-) create mode 100644 tests/test_chat_template_kwargs.py create mode 100644 tests/test_responses_api.py create mode 100644 vllm_mlx/api/responses_models.py diff --git a/tests/test_chat_template_kwargs.py b/tests/test_chat_template_kwargs.py new file mode 100644 index 00000000..3cd88057 --- /dev/null +++ b/tests/test_chat_template_kwargs.py @@ -0,0 +1,165 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Focused regressions for chat_template_kwargs forwarding.""" + +from types import SimpleNamespace +from unittest.mock import MagicMock, patch + +import pytest +from fastapi.testclient import TestClient + +import vllm_mlx.server as srv +from vllm_mlx.engine.base import GenerationOutput + + +@pytest.fixture +def anyio_backend(): + return "asyncio" + + +def test_chat_completion_request_preserves_chat_template_kwargs(): + request = srv.ChatCompletionRequest( + model="test-model", + messages=[srv.Message(role="user", content="Hello")], + chat_template_kwargs={"enable_thinking": False}, + ) + + assert request.chat_template_kwargs == {"enable_thinking": False} + + +def test_chat_completion_endpoint_forwards_chat_template_kwargs(): + captured = {} + + class FakeEngine: + model_name = "test-model" + is_mllm = False + preserve_native_tool_format = False + + async def chat(self, messages, **kwargs): + captured["messages"] = messages + captured["kwargs"] = kwargs + return GenerationOutput( + text="ORBIT", + prompt_tokens=4, + completion_tokens=1, + finish_reason="stop", + ) + + client = TestClient(srv.app) + original_engine = srv._engine + original_model_name = srv._model_name + srv._engine = FakeEngine() + srv._model_name = "test-model" + try: + response = client.post( + "/v1/chat/completions", + json={ + "model": "test-model", + "messages": [{"role": "user", "content": "Reply with ORBIT."}], + "max_tokens": 8, + "chat_template_kwargs": {"enable_thinking": False}, + }, + ) + finally: + srv._engine = original_engine + srv._model_name = original_model_name + + assert response.status_code == 200 + assert captured["kwargs"]["chat_template_kwargs"] == {"enable_thinking": False} + assert response.json()["choices"][0]["message"]["content"] == "ORBIT" + + +def test_llm_chat_applies_chat_template_kwargs_before_generate(): + from vllm_mlx.models.llm import MLXLanguageModel + + model = MLXLanguageModel.__new__(MLXLanguageModel) + model._loaded = True + model.tokenizer = MagicMock() + model.tokenizer.apply_chat_template.return_value = "prompt" + model.generate = MagicMock(return_value="ok") + + result = model.chat( + [{"role": "user", "content": "Hello"}], + chat_template_kwargs={"enable_thinking": False}, + ) + + assert result == "ok" + model.tokenizer.apply_chat_template.assert_called_once() + assert ( + model.tokenizer.apply_chat_template.call_args.kwargs["enable_thinking"] is False + ) + model.generate.assert_called_once() + + +@pytest.mark.anyio +async def test_simple_engine_llm_chat_forwards_chat_template_kwargs(): + from 
vllm_mlx.engine.simple import SimpleEngine + + with patch("vllm_mlx.engine.simple.is_mllm_model", return_value=False): + engine = SimpleEngine("test-model") + engine._loaded = True + engine._is_mllm = False + engine._model = MagicMock() + engine._model.chat = MagicMock( + return_value=SimpleNamespace( + text="OK", + tokens=[1], + finish_reason="stop", + ) + ) + + await engine.chat( + [{"role": "user", "content": "Hello"}], + chat_template_kwargs={"enable_thinking": False}, + ) + + assert engine._model.chat.call_args.kwargs["chat_template_kwargs"] == { + "enable_thinking": False + } + + +@pytest.mark.anyio +async def test_simple_engine_tool_fallback_preserves_stream_state_and_kwargs(): + from vllm_mlx.engine.simple import SimpleEngine + + captured = {} + + async def fake_stream_chat(*args, **kwargs): + captured["kwargs"] = kwargs + yield SimpleNamespace( + text="partial", + tokens=[7], + prompt_tokens=11, + completion_tokens=1, + finish_reason=None, + finished=False, + ) + yield SimpleNamespace( + text="<|im_end|>{\"name\":\"bash\",\"arguments\":{\"command\":\"pwd\"}}", + tokens=[7, 8], + prompt_tokens=11, + completion_tokens=4, + finish_reason="stop", + finished=True, + ) + + with patch("vllm_mlx.engine.simple.is_mllm_model", return_value=False): + engine = SimpleEngine("test-model") + engine._loaded = True + engine._is_mllm = False + engine._model = MagicMock() + engine._model.tokenizer.encode = MagicMock(return_value=[99]) + engine.stream_chat = fake_stream_chat # type: ignore[method-assign] + + output = await engine.chat( + [{"role": "user", "content": "Hello"}], + tools=[{"type": "function", "function": {"name": "bash"}}], + chat_template_kwargs={"enable_thinking": False}, + ) + + assert captured["kwargs"]["chat_template_kwargs"] == { + "enable_thinking": False + } + assert output.tokens == [7, 8] + assert output.prompt_tokens == 11 + assert output.completion_tokens == 4 + assert output.finish_reason == "stop" diff --git a/tests/test_responses_api.py b/tests/test_responses_api.py new file mode 100644 index 00000000..206712d9 --- /dev/null +++ b/tests/test_responses_api.py @@ -0,0 +1,505 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Tests for the OpenAI-compatible Responses API.""" + +import platform +import sys +from types import SimpleNamespace +from unittest.mock import AsyncMock, MagicMock + +import pytest +from fastapi.testclient import TestClient + +pytestmark = pytest.mark.skipif( + sys.platform != "darwin" or platform.machine() != "arm64", + reason="Requires Apple Silicon", +) + + +@pytest.fixture() +def client(): + from vllm_mlx.server import app + + return TestClient(app) + + +@pytest.fixture(autouse=True) +def server_state(): + import vllm_mlx.server as srv + + original_engine = srv._engine + original_model_name = srv._model_name + original_store = srv._responses_store + original_api_key = srv._api_key + + srv._engine = None + srv._model_name = "test-model" + srv._responses_store = {} + srv._api_key = None + + try: + yield + finally: + srv._engine = original_engine + srv._model_name = original_model_name + srv._responses_store = original_store + srv._api_key = original_api_key + + +def _mock_engine(*outputs): + engine = MagicMock() + engine.model_name = "test-model" + engine.preserve_native_tool_format = False + engine.chat = AsyncMock(side_effect=list(outputs)) + stream_calls = [] + + async def _stream_chat(**kwargs): + stream_calls.append(kwargs) + for output in getattr(engine, "_stream_outputs", []): + yield output + + engine._stream_calls = stream_calls + 
engine._stream_outputs = [] + engine.stream_chat = _stream_chat + return engine + + +def _output(text: str, prompt_tokens: int = 7, completion_tokens: int = 3): + return SimpleNamespace( + text=text, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + finish_reason="stop", + ) + + +def _stream_output( + new_text: str, + prompt_tokens: int = 7, + completion_tokens: int = 1, + finish_reason: str | None = None, +): + return SimpleNamespace( + new_text=new_text, + text=new_text, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + finish_reason=finish_reason, + finished=finish_reason is not None, + ) + + +class TestResponsesEndpoint: + def test_basic_response(self, client): + import vllm_mlx.server as srv + + srv._engine = _mock_engine(_output("Hello there")) + + resp = client.post( + "/v1/responses", + json={"model": "test-model", "input": "Say hello"}, + ) + + assert resp.status_code == 200 + body = resp.json() + assert body["object"] == "response" + assert body["output_text"] == "Hello there" + assert body["output"][0]["type"] == "message" + assert body["output"][0]["content"][0]["type"] == "output_text" + assert body["usage"]["input_tokens"] == 7 + assert body["usage"]["output_tokens"] == 3 + + def test_previous_response_id_reuses_prior_context(self, client): + import vllm_mlx.server as srv + + engine = _mock_engine(_output("First answer"), _output("Second answer")) + srv._engine = engine + + first = client.post( + "/v1/responses", + json={"model": "test-model", "input": "First prompt"}, + ) + first_id = first.json()["id"] + + second = client.post( + "/v1/responses", + json={ + "model": "test-model", + "previous_response_id": first_id, + "input": "Follow-up prompt", + }, + ) + + assert second.status_code == 200 + second_messages = engine.chat.call_args_list[1].kwargs["messages"] + assert second_messages[0]["role"] == "user" + assert second_messages[0]["content"] == "First prompt" + assert second_messages[1]["role"] == "assistant" + assert second_messages[1]["content"] == "First answer" + assert second_messages[2]["role"] == "user" + assert second_messages[2]["content"] == "Follow-up prompt" + + def test_previous_response_id_chains_across_multiple_follow_ups(self, client): + import vllm_mlx.server as srv + + engine = _mock_engine( + _output("First answer"), + _output("Second answer"), + _output("Third answer"), + ) + srv._engine = engine + + first = client.post( + "/v1/responses", + json={"model": "test-model", "input": "First prompt"}, + ) + first_id = first.json()["id"] + + second = client.post( + "/v1/responses", + json={ + "model": "test-model", + "previous_response_id": first_id, + "input": "Second prompt", + }, + ) + second_id = second.json()["id"] + + third = client.post( + "/v1/responses", + json={ + "model": "test-model", + "previous_response_id": second_id, + "input": "Third prompt", + }, + ) + + assert third.status_code == 200 + third_messages = engine.chat.call_args_list[2].kwargs["messages"] + assert third_messages[0]["role"] == "user" + assert third_messages[0]["content"] == "First prompt" + assert third_messages[1]["role"] == "assistant" + assert third_messages[1]["content"] == "First answer" + assert third_messages[2]["role"] == "user" + assert third_messages[2]["content"] == "Second prompt" + assert third_messages[3]["role"] == "assistant" + assert third_messages[3]["content"] == "Second answer" + assert third_messages[4]["role"] == "user" + assert third_messages[4]["content"] == "Third prompt" + + def 
test_previous_response_id_does_not_carry_prior_instructions(self, client): + import vllm_mlx.server as srv + + engine = _mock_engine(_output("First answer"), _output("Second answer")) + srv._engine = engine + + first = client.post( + "/v1/responses", + json={ + "model": "test-model", + "instructions": "First system instruction", + "input": "First prompt", + }, + ) + first_id = first.json()["id"] + + second = client.post( + "/v1/responses", + json={ + "model": "test-model", + "instructions": "Second system instruction", + "previous_response_id": first_id, + "input": "Follow-up prompt", + }, + ) + + assert second.status_code == 200 + second_messages = engine.chat.call_args_list[1].kwargs["messages"] + assert second_messages[0]["role"] == "system" + assert second_messages[0]["content"] == "Second system instruction" + assert "First system instruction" not in second_messages[0]["content"] + assert second_messages[1]["role"] == "user" + assert second_messages[1]["content"] == "First prompt" + assert second_messages[2]["role"] == "assistant" + assert second_messages[3]["role"] == "user" + + def test_previous_response_id_preserves_prior_system_message_items(self, client): + import vllm_mlx.server as srv + + engine = _mock_engine(_output("First answer"), _output("Second answer")) + srv._engine = engine + + first = client.post( + "/v1/responses", + json={ + "model": "test-model", + "input": [ + {"type": "message", "role": "system", "content": "Persist me"}, + {"type": "message", "role": "user", "content": "First prompt"}, + ], + }, + ) + first_id = first.json()["id"] + + second = client.post( + "/v1/responses", + json={ + "model": "test-model", + "previous_response_id": first_id, + "input": "Follow-up prompt", + }, + ) + + assert second.status_code == 200 + second_messages = engine.chat.call_args_list[1].kwargs["messages"] + assert second_messages[0]["role"] == "system" + assert second_messages[0]["content"] == "Persist me" + assert second_messages[1]["role"] == "user" + assert second_messages[1]["content"] == "First prompt" + assert second_messages[2]["role"] == "assistant" + assert second_messages[3]["role"] == "user" + + def test_developer_role_is_normalized_to_system(self, client): + import vllm_mlx.server as srv + + engine = _mock_engine(_output("Ready")) + srv._engine = engine + + resp = client.post( + "/v1/responses", + json={ + "model": "test-model", + "input": [ + {"type": "message", "role": "user", "content": "Hi"}, + {"type": "message", "role": "developer", "content": "Be terse"}, + ], + }, + ) + + assert resp.status_code == 200 + messages = engine.chat.call_args.kwargs["messages"] + assert messages[0]["role"] == "system" + assert messages[0]["content"] == "Be terse" + assert messages[1]["role"] == "user" + assert messages[1]["content"] == "Hi" + + def test_instructions_and_developer_message_are_merged(self, client): + import vllm_mlx.server as srv + + engine = _mock_engine(_output("Ready")) + srv._engine = engine + + resp = client.post( + "/v1/responses", + json={ + "model": "test-model", + "instructions": "System instructions", + "input": [ + {"type": "message", "role": "developer", "content": "Developer note"}, + {"type": "message", "role": "user", "content": "Hi"}, + ], + }, + ) + + assert resp.status_code == 200 + messages = engine.chat.call_args.kwargs["messages"] + assert len([m for m in messages if m["role"] == "system"]) == 1 + assert messages[0]["role"] == "system" + assert "System instructions" in messages[0]["content"] + assert "Developer note" in messages[0]["content"] + 
assert messages[1]["role"] == "user" + + def test_function_call_output_input_is_mapped_cleanly(self, client): + import vllm_mlx.server as srv + + engine = _mock_engine(_output("Done")) + srv._engine = engine + + resp = client.post( + "/v1/responses", + json={ + "model": "test-model", + "input": [ + {"type": "message", "role": "user", "content": "Run it"}, + { + "type": "function_call", + "call_id": "call_1", + "name": "shell", + "arguments": "{\"cmd\":\"pwd\"}", + }, + { + "type": "function_call_output", + "call_id": "call_1", + "output": "/tmp/work", + }, + ], + }, + ) + + assert resp.status_code == 200 + messages = engine.chat.call_args.kwargs["messages"] + assert messages[1]["role"] == "assistant" + assert "[Calling tool: shell(" in messages[1]["content"] + assert messages[2]["role"] == "user" + assert "[Tool Result (call_1)]" in messages[2]["content"] + assert "/tmp/work" in messages[2]["content"] + + def test_unsupported_tools_and_items_do_not_fail(self, client): + import vllm_mlx.server as srv + + engine = _mock_engine(_output("Fallback answer")) + srv._engine = engine + + resp = client.post( + "/v1/responses", + json={ + "model": "test-model", + "input": [ + {"type": "message", "role": "user", "content": "Answer directly"}, + { + "type": "web_search_call", + "status": "completed", + "action": {"type": "search", "query": "ignored"}, + }, + ], + "tools": [ + {"type": "web_search_preview"}, + {"type": "file_search", "vector_store_ids": ["vs_123"]}, + { + "type": "function", + "name": "shell", + "parameters": {"type": "object", "properties": {}}, + }, + ], + }, + ) + + assert resp.status_code == 200 + messages = engine.chat.call_args.kwargs["messages"] + assert messages[0]["role"] == "system" + assert "not available on this backend" in messages[0]["content"] + assert messages[1]["role"] == "user" + assert engine.chat.call_args.kwargs["tools"][0]["type"] == "function" + + def test_function_call_response_item(self, client): + import vllm_mlx.server as srv + + srv._engine = _mock_engine( + _output('{"name":"shell","arguments":{"cmd":"pwd"}}') + ) + + resp = client.post( + "/v1/responses", + json={ + "model": "test-model", + "input": "Use a tool", + "tools": [ + { + "type": "function", + "name": "shell", + "parameters": {"type": "object", "properties": {}}, + } + ], + }, + ) + + assert resp.status_code == 200 + body = resp.json() + assert body["output"][0]["type"] == "function_call" + assert body["output"][0]["name"] == "shell" + assert body["output_text"] == "" + + def test_streaming_response_returns_sse_events(self, client): + import vllm_mlx.server as srv + + engine = _mock_engine(_output("unused")) + engine.chat = AsyncMock(side_effect=AssertionError("stream path should not call chat")) + engine._stream_outputs = [ + _stream_output("Hello ", completion_tokens=1), + _stream_output("stream", completion_tokens=2, finish_reason="stop"), + ] + srv._engine = engine + + with client.stream( + "POST", + "/v1/responses", + json={"model": "test-model", "input": "Hello", "stream": True}, + ) as resp: + body = "".join(resp.iter_text()) + + assert resp.status_code == 200 + assert "event: response.created" in body + assert "event: response.output_text.delta" in body + assert "Hello stream" in body + assert "event: response.completed" in body + assert len(engine._stream_calls) == 1 + engine.chat.assert_not_awaited() + + def test_json_object_response_format_is_rejected(self, client): + import vllm_mlx.server as srv + + srv._engine = _mock_engine(_output("Hello")) + + resp = client.post( + 
"/v1/responses", + json={ + "model": "test-model", + "input": "Hello", + "text": {"format": {"type": "json_object"}}, + }, + ) + + assert resp.status_code == 400 + assert "json_object" in resp.json()["detail"] + + def test_reasoning_configuration_is_rejected(self, client): + import vllm_mlx.server as srv + + srv._engine = _mock_engine(_output("Hello")) + + resp = client.post( + "/v1/responses", + json={ + "model": "test-model", + "input": "Hello", + "reasoning": {"effort": "xhigh"}, + }, + ) + + assert resp.status_code == 400 + assert "reasoning configuration" in resp.json()["detail"] + + def test_reasoning_input_item_is_rejected(self, client): + import vllm_mlx.server as srv + + srv._engine = _mock_engine(_output("Hello")) + + resp = client.post( + "/v1/responses", + json={ + "model": "test-model", + "input": [ + {"type": "message", "role": "user", "content": "Hello"}, + {"type": "reasoning", "content": [{"type": "reasoning_text", "text": "x"}]}, + ], + }, + ) + + assert resp.status_code == 400 + assert "reasoning input items are not supported" in resp.json()["detail"] + + def test_length_finish_reason_marks_response_incomplete(self, client): + import vllm_mlx.server as srv + + output = _output("Cut off", completion_tokens=5) + output.finish_reason = "length" + srv._engine = _mock_engine(output) + + resp = client.post( + "/v1/responses", + json={"model": "test-model", "input": "Hello", "max_output_tokens": 5}, + ) + + assert resp.status_code == 200 + body = resp.json() + assert body["status"] == "incomplete" + assert body["incomplete_details"] == {"reason": "max_output_tokens"} diff --git a/vllm_mlx/api/__init__.py b/vllm_mlx/api/__init__.py index cfb62f45..552f253f 100644 --- a/vllm_mlx/api/__init__.py +++ b/vllm_mlx/api/__init__.py @@ -53,6 +53,24 @@ EmbeddingUsage, EmbeddingResponse, ) +from .responses_models import ( + ResponseTextFormat, + ResponseTextConfig, + ResponseReasoningConfig, + ResponseTextContentPart, + ResponseReasoningTextPart, + ResponseReasoningSummaryTextPart, + ResponseMessageItem, + ResponseReasoningItem, + ResponseFunctionCallItem, + ResponseFunctionCallOutputItem, + ResponseFunctionTool, + ResponsesUsage, + ResponseError, + ResponseIncompleteDetails, + ResponsesRequest, + ResponseObject, +) from .utils import ( clean_output_text, @@ -111,6 +129,22 @@ "EmbeddingData", "EmbeddingUsage", "EmbeddingResponse", + "ResponseTextFormat", + "ResponseTextConfig", + "ResponseReasoningConfig", + "ResponseTextContentPart", + "ResponseReasoningTextPart", + "ResponseReasoningSummaryTextPart", + "ResponseMessageItem", + "ResponseReasoningItem", + "ResponseFunctionCallItem", + "ResponseFunctionCallOutputItem", + "ResponseFunctionTool", + "ResponsesUsage", + "ResponseError", + "ResponseIncompleteDetails", + "ResponsesRequest", + "ResponseObject", # Utils "clean_output_text", "is_mllm_model", diff --git a/vllm_mlx/api/models.py b/vllm_mlx/api/models.py index 32b26e03..f7bcaaaa 100644 --- a/vllm_mlx/api/models.py +++ b/vllm_mlx/api/models.py @@ -11,6 +11,7 @@ import time import uuid +from typing import Any from pydantic import BaseModel, Field, computed_field @@ -169,6 +170,8 @@ class ChatCompletionRequest(BaseModel): tool_choice: str | dict | None = None # "auto", "none", or specific tool # Structured output response_format: ResponseFormat | dict | None = None + # Extra kwargs forwarded to tokenizer.apply_chat_template + chat_template_kwargs: dict[str, Any] | None = None # MLLM-specific parameters video_fps: float | None = None video_max_frames: int | None = None diff --git 
a/vllm_mlx/api/responses_models.py b/vllm_mlx/api/responses_models.py new file mode 100644 index 00000000..9fdfae8f --- /dev/null +++ b/vllm_mlx/api/responses_models.py @@ -0,0 +1,316 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Pydantic models for the OpenAI-compatible Responses API. + +This intentionally implements the subset needed for local coding-agent +workflows: text messages, function tools, function call outputs, and SSE +streaming events. The object and event shapes follow the conventions used by +OpenAI's gpt-oss reference server and llama.cpp's OpenAI-compatible server. +""" + +import time +import uuid +from typing import Literal + +from pydantic import BaseModel, Field, computed_field + + +class ResponseTextFormat(BaseModel): + """Output text format configuration.""" + + type: Literal["text", "json_object"] = "text" + + +class ResponseTextConfig(BaseModel): + """Text output configuration.""" + + format: ResponseTextFormat = Field(default_factory=ResponseTextFormat) + + +class ResponseReasoningConfig(BaseModel): + """Reasoning configuration.""" + + effort: Literal["none", "minimal", "low", "medium", "high", "xhigh"] | None = None + + +class ResponseTextContentPart(BaseModel): + """A text content part for message items.""" + + type: Literal["text", "input_text", "output_text"] = "output_text" + text: str + annotations: list[dict] = Field(default_factory=list) + logprobs: list[dict] = Field(default_factory=list) + + +class ResponseReasoningTextPart(BaseModel): + """A reasoning text content part.""" + + type: Literal["reasoning_text"] = "reasoning_text" + text: str + + +class ResponseReasoningSummaryTextPart(BaseModel): + """A reasoning summary item.""" + + type: Literal["summary_text"] = "summary_text" + text: str + + +class ResponseMessageItem(BaseModel): + """A Responses API message item.""" + + id: str | None = None + type: Literal["message"] = "message" + role: Literal["system", "user", "assistant", "developer"] = "assistant" + content: str | list[ResponseTextContentPart] = Field(default_factory=list) + status: Literal["in_progress", "completed", "incomplete"] | None = "completed" + + +class ResponseReasoningItem(BaseModel): + """A reasoning output item.""" + + id: str | None = None + type: Literal["reasoning"] = "reasoning" + summary: list[ResponseReasoningSummaryTextPart] = Field(default_factory=list) + content: list[ResponseReasoningTextPart] = Field(default_factory=list) + status: Literal["in_progress", "completed", "incomplete"] | None = "completed" + + +class ResponseFunctionCallItem(BaseModel): + """A function call output item.""" + + id: str | None = None + type: Literal["function_call"] = "function_call" + call_id: str + name: str + arguments: str + status: Literal["in_progress", "completed", "incomplete"] = "completed" + + +class ResponseFunctionCallOutputItem(BaseModel): + """A tool result item passed back into a later request.""" + + type: Literal["function_call_output"] = "function_call_output" + call_id: str + output: str + + +class ResponseFunctionTool(BaseModel): + """A function tool definition.""" + + type: Literal["function"] = "function" + name: str + description: str | None = "" + parameters: dict = Field( + default_factory=lambda: {"type": "object", "properties": {}} + ) + strict: bool = False + + +class ResponsesInputTokenDetails(BaseModel): + """Input token breakdown.""" + + cached_tokens: int = 0 + + +class ResponsesOutputTokenDetails(BaseModel): + """Output token breakdown.""" + + reasoning_tokens: int = 0 + + +class ResponsesUsage(BaseModel): + 
"""Responses API token usage.""" + + input_tokens: int + output_tokens: int + total_tokens: int + input_tokens_details: ResponsesInputTokenDetails = Field( + default_factory=ResponsesInputTokenDetails + ) + output_tokens_details: ResponsesOutputTokenDetails = Field( + default_factory=ResponsesOutputTokenDetails + ) + + +class ResponseError(BaseModel): + """Error payload.""" + + code: str + message: str + + +class ResponseIncompleteDetails(BaseModel): + """Incomplete response details.""" + + reason: str + + +class ResponsesRequest(BaseModel): + """Request payload for /v1/responses.""" + + model: str + input: ( + str + | list[ + ResponseMessageItem + | ResponseReasoningItem + | ResponseFunctionCallItem + | ResponseFunctionCallOutputItem + | dict + ] + ) + instructions: str | None = None + max_output_tokens: int | None = None + stream: bool = False + tools: list[ResponseFunctionTool | dict] = Field(default_factory=list) + tool_choice: str | dict | None = "auto" + parallel_tool_calls: bool = True + previous_response_id: str | None = None + temperature: float | None = None + top_p: float | None = None + metadata: dict = Field(default_factory=dict) + text: ResponseTextConfig = Field(default_factory=ResponseTextConfig) + reasoning: ResponseReasoningConfig | None = None + store: bool = True + truncation: str = "disabled" + user: str | None = None + + +class ResponseObject(BaseModel): + """Response object for /v1/responses.""" + + id: str = Field(default_factory=lambda: f"resp_{uuid.uuid4().hex}") + object: Literal["response"] = "response" + created_at: int = Field(default_factory=lambda: int(time.time())) + status: Literal["completed", "failed", "incomplete", "in_progress"] = "completed" + background: bool = False + error: ResponseError | None = None + incomplete_details: ResponseIncompleteDetails | None = None + instructions: str | None = None + max_output_tokens: int | None = None + max_tool_calls: int | None = None + metadata: dict = Field(default_factory=dict) + model: str + output: list[ + ResponseMessageItem | ResponseReasoningItem | ResponseFunctionCallItem + ] = Field(default_factory=list) + parallel_tool_calls: bool = True + previous_response_id: str | None = None + text: ResponseTextConfig = Field(default_factory=ResponseTextConfig) + tool_choice: str | dict | None = "auto" + tools: list[ResponseFunctionTool | dict] = Field(default_factory=list) + top_p: float = 1.0 + temperature: float | None = None + truncation: str = "disabled" + usage: ResponsesUsage | None = None + user: str | None = None + store: bool = True + + @computed_field + @property + def output_text(self) -> str: + """Concatenate assistant text content into the convenience field.""" + text_parts: list[str] = [] + for item in self.output: + if not isinstance(item, ResponseMessageItem): + continue + if isinstance(item.content, str): + text_parts.append(item.content) + continue + for part in item.content: + if part.type == "output_text": + text_parts.append(part.text) + return "".join(text_parts) + + +class ResponsesEventBase(BaseModel): + """Base event fields.""" + + sequence_number: int + + +class ResponseCreatedEvent(ResponsesEventBase): + type: Literal["response.created"] = "response.created" + response: ResponseObject + + +class ResponseInProgressEvent(ResponsesEventBase): + type: Literal["response.in_progress"] = "response.in_progress" + response: ResponseObject + + +class ResponseCompletedEvent(ResponsesEventBase): + type: Literal["response.completed"] = "response.completed" + response: ResponseObject + + +class 
ResponseOutputItemAddedEvent(ResponsesEventBase): + type: Literal["response.output_item.added"] = "response.output_item.added" + output_index: int + item: ResponseMessageItem | ResponseReasoningItem | ResponseFunctionCallItem + + +class ResponseOutputItemDoneEvent(ResponsesEventBase): + type: Literal["response.output_item.done"] = "response.output_item.done" + output_index: int + item: ResponseMessageItem | ResponseReasoningItem | ResponseFunctionCallItem + + +class ResponseContentPartAddedEvent(ResponsesEventBase): + type: Literal["response.content_part.added"] = "response.content_part.added" + item_id: str + output_index: int + content_index: int + part: ResponseTextContentPart | ResponseReasoningTextPart + + +class ResponseContentPartDoneEvent(ResponsesEventBase): + type: Literal["response.content_part.done"] = "response.content_part.done" + item_id: str + output_index: int + content_index: int + part: ResponseTextContentPart | ResponseReasoningTextPart + + +class ResponseOutputTextDeltaEvent(ResponsesEventBase): + type: Literal["response.output_text.delta"] = "response.output_text.delta" + item_id: str + output_index: int + content_index: int + delta: str + logprobs: list[dict] = Field(default_factory=list) + + +class ResponseOutputTextDoneEvent(ResponsesEventBase): + type: Literal["response.output_text.done"] = "response.output_text.done" + item_id: str + output_index: int + content_index: int + text: str + logprobs: list[dict] = Field(default_factory=list) + + +class ResponseReasoningTextDeltaEvent(ResponsesEventBase): + type: Literal["response.reasoning_text.delta"] = "response.reasoning_text.delta" + item_id: str + output_index: int + content_index: int + delta: str + + +class ResponseReasoningTextDoneEvent(ResponsesEventBase): + type: Literal["response.reasoning_text.done"] = "response.reasoning_text.done" + item_id: str + output_index: int + content_index: int + text: str + + +class ResponseFunctionCallArgumentsDeltaEvent(ResponsesEventBase): + type: Literal["response.function_call_arguments.delta"] = ( + "response.function_call_arguments.delta" + ) + item_id: str + output_index: int + delta: str diff --git a/vllm_mlx/engine/simple.py b/vllm_mlx/engine/simple.py index e96317ef..bb48c7f6 100644 --- a/vllm_mlx/engine/simple.py +++ b/vllm_mlx/engine/simple.py @@ -437,11 +437,40 @@ async def chat( if not self._loaded: await self.start() + chat_template_kwargs = dict(kwargs.pop("chat_template_kwargs", {}) or {}) + + if tools and not self._is_mllm: + stream_kwargs = dict(kwargs) + if chat_template_kwargs: + stream_kwargs["chat_template_kwargs"] = chat_template_kwargs + final_output = GenerationOutput(text="") + async for output in self.stream_chat( + messages=messages, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + tools=tools, + images=images, + videos=videos, + **stream_kwargs, + ): + final_output = output + text = clean_output_text(final_output.text) + return GenerationOutput( + text=text, + tokens=list(final_output.tokens), + prompt_tokens=final_output.prompt_tokens, + completion_tokens=final_output.completion_tokens, + finish_reason=final_output.finish_reason, + ) + # Convert tools for template if provided template_tools = convert_tools_for_template(tools) if tools else None async with self._generation_lock: if self._is_mllm: + if chat_template_kwargs: + kwargs["chat_template_kwargs"] = chat_template_kwargs # For MLLM, use the chat method which handles images/videos # Run in thread pool to allow asyncio timeout to work output = await asyncio.to_thread( @@ 
-460,6 +489,8 @@ async def chat( finish_reason=output.finish_reason, ) else: + if chat_template_kwargs: + kwargs["chat_template_kwargs"] = chat_template_kwargs # For LLM, use the chat method # Run in thread pool to allow asyncio timeout to work output = await asyncio.to_thread( @@ -509,6 +540,8 @@ async def stream_chat( if not self._loaded: await self.start() + chat_template_kwargs = dict(kwargs.pop("chat_template_kwargs", {}) or {}) + # Convert tools for template template_tools = convert_tools_for_template(tools) if tools else None @@ -519,6 +552,8 @@ async def stream_chat( and not _has_media_content(messages) ): logger.info("Text-only request → LLM path (MTP=True)") + if chat_template_kwargs: + kwargs["chat_template_kwargs"] = chat_template_kwargs async for chunk in self._stream_generate_text( messages, max_tokens, @@ -543,13 +578,16 @@ async def stream_chat( # Run stream_chat in thread pool since it's synchronous def run_stream(): + local_kwargs = dict(kwargs) + if chat_template_kwargs: + local_kwargs["chat_template_kwargs"] = chat_template_kwargs return list( self._model.stream_chat( messages=messages, max_tokens=max_tokens, temperature=temperature, tools=template_tools, - **kwargs, + **local_kwargs, ) ) @@ -586,6 +624,8 @@ def run_stream(): "add_generation_prompt": True, "enable_thinking": enable_thinking, } + if chat_template_kwargs: + template_kwargs.update(chat_template_kwargs) if template_tools: template_kwargs["tools"] = template_tools @@ -593,7 +633,7 @@ def run_stream(): prompt = tokenizer.apply_chat_template(messages, **template_kwargs) except TypeError: # Some templates don't support all kwargs - for key in ["tools", "enable_thinking"]: + for key in ["tools", "enable_thinking", *chat_template_kwargs.keys()]: if key in template_kwargs: del template_kwargs[key] prompt = tokenizer.apply_chat_template(messages, **template_kwargs) diff --git a/vllm_mlx/models/llm.py b/vllm_mlx/models/llm.py index 72182037..537b5b87 100644 --- a/vllm_mlx/models/llm.py +++ b/vllm_mlx/models/llm.py @@ -253,6 +253,7 @@ def chat( temperature: float = 0.7, top_p: float = 0.9, tools: list | None = None, + chat_template_kwargs: dict | None = None, **kwargs, ) -> GenerationOutput: """ @@ -283,6 +284,8 @@ def chat( # Add tools if provided and supported if tools: template_kwargs["tools"] = tools + if chat_template_kwargs: + template_kwargs.update(chat_template_kwargs) try: prompt = self.tokenizer.apply_chat_template( @@ -290,8 +293,10 @@ def chat( **template_kwargs, ) except TypeError: - # Tokenizer doesn't support tools parameter - del template_kwargs["tools"] + # Tokenizer doesn't support all requested template kwargs + template_kwargs.pop("tools", None) + for key in (chat_template_kwargs or {}).keys(): + template_kwargs.pop(key, None) prompt = self.tokenizer.apply_chat_template( messages, **template_kwargs, diff --git a/vllm_mlx/server.py b/vllm_mlx/server.py index a0038d5f..838bb885 100644 --- a/vllm_mlx/server.py +++ b/vllm_mlx/server.py @@ -39,6 +39,7 @@ import argparse import asyncio +import copy import json import logging import os @@ -54,6 +55,7 @@ from fastapi import Depends, FastAPI, HTTPException, Request, UploadFile from fastapi.responses import Response, StreamingResponse from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer +from pydantic import BaseModel # Import from new modular API # Re-export for backwards compatibility with tests @@ -90,6 +92,31 @@ Usage, # noqa: F401 VideoUrl, # noqa: F401 ) +from .api.responses_models import ( + ResponseCompletedEvent, + 
ResponseContentPartAddedEvent, + ResponseContentPartDoneEvent, + ResponseCreatedEvent, + ResponseFunctionCallArgumentsDeltaEvent, + ResponseFunctionCallItem, + ResponseFunctionCallOutputItem, + ResponseFunctionTool, + ResponseIncompleteDetails, + ResponseInProgressEvent, + ResponseMessageItem, + ResponseObject, + ResponseOutputItemAddedEvent, + ResponseOutputItemDoneEvent, + ResponseOutputTextDeltaEvent, + ResponseOutputTextDoneEvent, + ResponseReasoningItem, + ResponseReasoningTextDeltaEvent, + ResponseReasoningTextDoneEvent, + ResponseReasoningTextPart, + ResponseTextContentPart, + ResponsesRequest, + ResponsesUsage, +) from .api.tool_calling import ( build_json_system_prompt, convert_tools_for_template, @@ -160,6 +187,7 @@ def _resolve_top_p(request_value: float | None) -> float: _enable_auto_tool_choice: bool = False _tool_call_parser: str | None = None # Parser name: auto, mistral, qwen, llama, hermes _tool_parser_instance = None # Instantiated parser +_responses_store: dict[str, dict] = {} def _load_prefix_cache_from_disk() -> None: @@ -419,6 +447,1062 @@ def _parse_tool_calls_with_parser( return parse_tool_calls(output_text, request_dict) +def _new_response_item_id(prefix: str) -> str: + """Generate stable OpenAI-style item ids.""" + return f"{prefix}_{uuid.uuid4().hex}" + + +def _response_content_to_text(content) -> str: + """Normalize Responses API content items into plain text.""" + if content is None: + return "" + if isinstance(content, str): + return content + + text_parts = [] + for part in content: + if isinstance(part, dict): + part_type = part.get("type") + text = part.get("text", "") + else: + part_type = getattr(part, "type", None) + text = getattr(part, "text", "") + if part_type in {"text", "input_text", "output_text"}: + text_parts.append(text) + return "\n".join(part for part in text_parts if part) + + +def _responses_tools_to_chat_tools( + tools: list[ResponseFunctionTool | dict], +) -> tuple[list[dict] | None, list[str]]: + """Convert supported Responses tools and report unsupported tool types.""" + if not tools: + return None, [] + + supported: list[dict] = [] + unsupported: list[str] = [] + + for tool in tools: + if isinstance(tool, ResponseFunctionTool): + tool_type = tool.type + tool_name = tool.name + tool_description = tool.description or "" + tool_parameters = tool.parameters + elif isinstance(tool, dict): + tool_type = tool.get("type", "unknown") + tool_name = tool.get("name", "") + tool_description = tool.get("description", "") + tool_parameters = tool.get("parameters", {}) + else: + unsupported.append(type(tool).__name__) + continue + + if tool_type == "function": + supported.append( + { + "type": "function", + "function": { + "name": tool_name, + "description": tool_description, + "parameters": tool_parameters + or {"type": "object", "properties": {}}, + }, + } + ) + else: + unsupported.append(tool_type) + + return supported or None, unsupported + + +def _responses_input_to_chat_messages(request: ResponsesRequest) -> list[dict]: + """Convert Responses API input items into chat-completions-style messages.""" + messages: list[dict] = [] + + if request.previous_response_id: + previous = _responses_store.get(request.previous_response_id) + if previous is None: + raise HTTPException( + status_code=404, + detail=f"Previous response `{request.previous_response_id}` not found", + ) + messages.extend(copy.deepcopy(previous["messages"])) + + if request.instructions: + messages.append({"role": "system", "content": request.instructions}) + + if 
isinstance(request.input, str): + messages.append({"role": "user", "content": request.input}) + return messages + + for item in request.input: + if isinstance(item, dict): + item_type = item.get("type", "") + if item_type == "message": + role = item.get("role", "user") + if role == "developer": + role = "system" + messages.append( + { + "role": role, + "content": _response_content_to_text(item.get("content")), + } + ) + elif item_type == "function_call": + messages.append( + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": item.get("call_id", _new_response_item_id("call")), + "type": "function", + "function": { + "name": item.get("name", ""), + "arguments": item.get("arguments", ""), + }, + } + ], + } + ) + elif item_type == "function_call_output": + messages.append( + { + "role": "tool", + "tool_call_id": item.get("call_id", ""), + "content": item.get("output", ""), + } + ) + elif item_type == "reasoning": + raise HTTPException( + status_code=400, + detail="Responses reasoning input items are not supported on this backend", + ) + else: + logger.info( + "Skipping unsupported Responses input item type %r", item_type + ) + continue + + if isinstance(item, ResponseMessageItem): + role = item.role + if role == "developer": + role = "system" + messages.append( + { + "role": role, + "content": _response_content_to_text(item.content), + } + ) + elif isinstance(item, ResponseFunctionCallItem): + messages.append( + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": item.call_id, + "type": "function", + "function": { + "name": item.name, + "arguments": item.arguments, + }, + } + ], + } + ) + elif isinstance(item, ResponseFunctionCallOutputItem): + messages.append( + { + "role": "tool", + "tool_call_id": item.call_id, + "content": item.output, + } + ) + elif isinstance(item, ResponseReasoningItem): + raise HTTPException( + status_code=400, + detail="Responses reasoning input items are not supported on this backend", + ) + else: + logger.info( + "Skipping unsupported Responses input item type %r", + getattr(item, "type", type(item).__name__), + ) + + return messages + + +def _responses_request_to_new_persisted_messages(request: ResponsesRequest) -> list[dict]: + """Persist only the current request's replayable input items.""" + request_without_history = request.model_copy( + update={"previous_response_id": None, "instructions": None}, + deep=True, + ) + return _responses_input_to_chat_messages(request_without_history) + + +def _responses_request_to_persisted_messages(request: ResponsesRequest) -> list[dict]: + """Persist replayable history for chained previous_response_id requests. + + Responses `instructions` are intentionally not replayed across + `previous_response_id`, but replayable message items are. 
+ """ + messages: list[dict] = [] + if request.previous_response_id: + previous = _responses_store.get(request.previous_response_id) + if previous is None: + raise HTTPException( + status_code=404, + detail=f"Previous response `{request.previous_response_id}` not found", + ) + messages.extend(copy.deepcopy(previous["messages"])) + messages.extend(_responses_request_to_new_persisted_messages(request)) + return messages + + +def _responses_request_to_chat_request(request: ResponsesRequest) -> ChatCompletionRequest: + """Build a ChatCompletionRequest from a ResponsesRequest.""" + if request.text.format.type == "json_object": + raise HTTPException( + status_code=400, + detail="Responses text.format.type='json_object' is not supported on this backend", + ) + if request.reasoning is not None: + raise HTTPException( + status_code=400, + detail="Responses reasoning configuration is not supported on this backend", + ) + + tools, unsupported_tools = _responses_tools_to_chat_tools(request.tools) + messages = _responses_input_to_chat_messages(request) + if unsupported_tools: + tool_list = ", ".join(sorted(set(unsupported_tools))) + messages.insert( + 0, + { + "role": "system", + "content": ( + "The following requested tool types are not available on this " + f"backend: {tool_list}. Do not call them." + ), + }, + ) + + system_messages = [msg for msg in messages if msg.get("role") == "system"] + non_system_messages = [msg for msg in messages if msg.get("role") != "system"] + merged_system_content = "\n\n".join( + str(msg.get("content", "")).strip() + for msg in system_messages + if str(msg.get("content", "")).strip() + ) + messages = ( + [{"role": "system", "content": merged_system_content}] + if merged_system_content + else [] + ) + non_system_messages + + return ChatCompletionRequest( + model=request.model, + messages=[Message(**msg) for msg in messages], + temperature=request.temperature, + top_p=request.top_p, + max_tokens=request.max_output_tokens, + stream=False, + tools=tools, + tool_choice=request.tool_choice, + ) + + +def _build_responses_output_items( + text: str | None, + reasoning: str | None, + tool_calls: list[ToolCall] | None, +) -> list[ResponseMessageItem | ResponseReasoningItem | ResponseFunctionCallItem]: + """Convert parsed assistant output into Responses API output items.""" + output_items: list[ + ResponseMessageItem | ResponseReasoningItem | ResponseFunctionCallItem + ] = [] + + if reasoning: + output_items.append( + ResponseReasoningItem( + id=_new_response_item_id("rs"), + content=[ResponseReasoningTextPart(text=reasoning)], + ) + ) + + if text: + output_items.append( + ResponseMessageItem( + id=_new_response_item_id("msg"), + role="assistant", + content=[ResponseTextContentPart(type="output_text", text=text)], + ) + ) + + for tool_call in tool_calls or []: + output_items.append( + ResponseFunctionCallItem( + id=_new_response_item_id("fc"), + call_id=tool_call.id, + name=tool_call.function.name, + arguments=tool_call.function.arguments, + ) + ) + + return output_items + + +def _response_output_items_to_chat_messages(output_items: list) -> list[dict]: + """Persist assistant output in chat-completions form for previous_response_id.""" + assistant_text_parts: list[str] = [] + assistant_tool_calls: list[dict] = [] + + for item in output_items: + if isinstance(item, ResponseMessageItem): + assistant_text_parts.append(_response_content_to_text(item.content)) + elif isinstance(item, ResponseFunctionCallItem): + assistant_tool_calls.append( + { + "id": item.call_id, + "type": 
"function", + "function": { + "name": item.name, + "arguments": item.arguments, + }, + } + ) + + if not assistant_text_parts and not assistant_tool_calls: + return [] + + return [ + { + "role": "assistant", + "content": "".join(assistant_text_parts), + "tool_calls": assistant_tool_calls or None, + } + ] + + +def _build_response_object( + request: ResponsesRequest, + output_items: list[ResponseMessageItem | ResponseReasoningItem | ResponseFunctionCallItem], + prompt_tokens: int, + completion_tokens: int, + finish_reason: str | None, + response_id: str | None = None, +) -> ResponseObject: + """Build a full Responses API object.""" + response = ResponseObject( + id=response_id or _new_response_item_id("resp"), + model=_model_name or request.model, + instructions=request.instructions, + max_output_tokens=request.max_output_tokens, + metadata=request.metadata, + output=output_items, + parallel_tool_calls=request.parallel_tool_calls, + previous_response_id=request.previous_response_id, + text=request.text, + tool_choice=request.tool_choice, + tools=request.tools, + top_p=_resolve_top_p(request.top_p), + temperature=_resolve_temperature(request.temperature), + truncation=request.truncation, + user=request.user, + store=request.store, + usage=ResponsesUsage( + input_tokens=prompt_tokens, + output_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ), + ) + if finish_reason == "length": + response.status = "incomplete" + response.incomplete_details = ResponseIncompleteDetails( + reason="max_output_tokens" + ) + return response + + +def _prepare_responses_request( + request: ResponsesRequest, +) -> tuple[BaseEngine, ChatCompletionRequest, list[dict], dict]: + """Prepare a Responses request for execution on the chat engine.""" + _validate_model_name(request.model) + engine = get_engine() + chat_request = _responses_request_to_chat_request(request) + + if chat_request.messages: + logger.info( + f"[REQUEST] POST /v1/responses stream={request.stream} " + f"model={request.model!r} items=" + f"{len(request.input) if isinstance(request.input, list) else 1} " + f"tools={len(request.tools)}" + ) + + messages, images, videos = extract_multimodal_content( + chat_request.messages, + preserve_native_format=engine.preserve_native_tool_format, + ) + + chat_kwargs = { + "max_tokens": chat_request.max_tokens or _default_max_tokens, + "temperature": _resolve_temperature(chat_request.temperature), + "top_p": _resolve_top_p(chat_request.top_p), + } + if request.tools: + chat_kwargs["tools"] = convert_tools_for_template(chat_request.tools) + if images: + chat_kwargs["images"] = images + if videos: + chat_kwargs["videos"] = videos + + return engine, chat_request, messages, chat_kwargs + + +async def _run_responses_request( + request: ResponsesRequest, + raw_request: Request, +) -> tuple[ResponseObject | None, list[dict]]: + """Execute a Responses API request against the backend chat engine.""" + engine, chat_request, messages, chat_kwargs = _prepare_responses_request(request) + + timeout = _default_timeout + output = await _wait_with_disconnect( + engine.chat(messages=messages, **chat_kwargs), + raw_request, + timeout=timeout, + ) + if output is None: + return None, [] + + cleaned_text, tool_calls = _parse_tool_calls_with_parser(output.text, chat_request) + reasoning_text = None + if _reasoning_parser and not tool_calls: + reasoning_text, cleaned_text = _reasoning_parser.extract_reasoning( + cleaned_text or output.text + ) + + output_items = _build_responses_output_items( + 
clean_output_text(cleaned_text) if cleaned_text else None, + reasoning_text, + tool_calls, + ) + response_object = _build_response_object( + request=request, + output_items=output_items, + prompt_tokens=output.prompt_tokens, + completion_tokens=output.completion_tokens, + finish_reason=output.finish_reason, + ) + + persisted_messages = _responses_request_to_persisted_messages(request) + persisted_messages.extend(_response_output_items_to_chat_messages(output_items)) + if request.store: + _responses_store[response_object.id] = { + "messages": copy.deepcopy(persisted_messages), + "response": response_object.model_copy(deep=True), + } + + return response_object, persisted_messages + + +async def _stream_responses_request(request: ResponsesRequest) -> AsyncIterator[str]: + """Execute a Responses API request and stream SSE events incrementally.""" + engine, chat_request, messages, chat_kwargs = _prepare_responses_request(request) + + response_id = _new_response_item_id("resp") + sequence = 1 + base_response = _build_response_object( + request=request, + output_items=[], + prompt_tokens=0, + completion_tokens=0, + finish_reason=None, + response_id=response_id, + ) + base_response.status = "in_progress" + base_response.usage = None + + yield _responses_sse_event( + "response.created", + ResponseCreatedEvent(sequence_number=sequence, response=base_response), + ) + sequence += 1 + yield _responses_sse_event( + "response.in_progress", + ResponseInProgressEvent(sequence_number=sequence, response=base_response), + ) + sequence += 1 + + prompt_tokens = 0 + completion_tokens = 0 + finish_reason = None + last_output = None + raw_accumulated_text = "" + accumulated_text = "" + accumulated_reasoning = "" + + text_item_id: str | None = None + text_output_index: int | None = None + reasoning_item_id: str | None = None + reasoning_output_index: int | None = None + next_output_index = 0 + + def _start_text_item() -> list[str]: + nonlocal text_item_id, text_output_index, next_output_index, sequence + events: list[str] = [] + if text_item_id is None: + text_item_id = _new_response_item_id("msg") + text_output_index = next_output_index + next_output_index += 1 + events.append( + _responses_sse_event( + "response.output_item.added", + ResponseOutputItemAddedEvent( + sequence_number=sequence, + output_index=text_output_index, + item=ResponseMessageItem( + id=text_item_id, + role="assistant", + status="in_progress", + content=[], + ), + ), + ) + ) + sequence += 1 + events.append( + _responses_sse_event( + "response.content_part.added", + ResponseContentPartAddedEvent( + sequence_number=sequence, + item_id=text_item_id, + output_index=text_output_index, + content_index=0, + part=ResponseTextContentPart(type="output_text", text=""), + ), + ) + ) + sequence += 1 + return events + + def _start_reasoning_item() -> list[str]: + nonlocal reasoning_item_id, reasoning_output_index, next_output_index, sequence + events: list[str] = [] + if reasoning_item_id is None: + reasoning_item_id = _new_response_item_id("rs") + reasoning_output_index = next_output_index + next_output_index += 1 + events.append( + _responses_sse_event( + "response.output_item.added", + ResponseOutputItemAddedEvent( + sequence_number=sequence, + output_index=reasoning_output_index, + item=ResponseReasoningItem( + id=reasoning_item_id, + status="in_progress", + content=[], + ), + ), + ) + ) + sequence += 1 + events.append( + _responses_sse_event( + "response.content_part.added", + ResponseContentPartAddedEvent( + sequence_number=sequence, + 
item_id=reasoning_item_id, + output_index=reasoning_output_index, + content_index=0, + part=ResponseReasoningTextPart(text=""), + ), + ) + ) + sequence += 1 + return events + + if _reasoning_parser: + _reasoning_parser.reset_state() + + global _tool_parser_instance + tool_parser = None + tool_accumulated_text = "" + tool_markup_possible = False + if _enable_auto_tool_choice and _tool_call_parser: + if _tool_parser_instance is None: + try: + parser_cls = ToolParserManager.get_tool_parser(_tool_call_parser) + tokenizer = None + if _engine is not None and hasattr(_engine, "_tokenizer"): + tokenizer = _engine._tokenizer + _tool_parser_instance = parser_cls(tokenizer) + logger.info( + "Initialized tool call parser for responses streaming: %s", + _tool_call_parser, + ) + except Exception as e: + logger.warning( + "Failed to init tool parser for responses streaming: %s", e + ) + if _tool_parser_instance is not None: + tool_parser = _tool_parser_instance + tool_parser.reset() + + async for output in engine.stream_chat(messages=messages, **chat_kwargs): + last_output = output + finish_reason = output.finish_reason + if hasattr(output, "prompt_tokens") and output.prompt_tokens: + prompt_tokens = output.prompt_tokens + if hasattr(output, "completion_tokens") and output.completion_tokens: + completion_tokens = output.completion_tokens + + delta_text = output.new_text or "" + if not delta_text: + continue + + previous_text = raw_accumulated_text + raw_accumulated_text += delta_text + + if _reasoning_parser: + delta_msg = _reasoning_parser.extract_reasoning_streaming( + previous_text, raw_accumulated_text, delta_text + ) + if delta_msg is None: + continue + + if delta_msg.reasoning: + for event in _start_reasoning_item(): + yield event + accumulated_reasoning += delta_msg.reasoning + yield _responses_sse_event( + "response.reasoning_text.delta", + ResponseReasoningTextDeltaEvent( + sequence_number=sequence, + item_id=reasoning_item_id, + output_index=reasoning_output_index, + content_index=0, + delta=delta_msg.reasoning, + ), + ) + sequence += 1 + + if delta_msg.content: + for event in _start_text_item(): + yield event + accumulated_text += delta_msg.content + yield _responses_sse_event( + "response.output_text.delta", + ResponseOutputTextDeltaEvent( + sequence_number=sequence, + item_id=text_item_id, + output_index=text_output_index, + content_index=0, + delta=delta_msg.content, + ), + ) + sequence += 1 + continue + + content = SPECIAL_TOKENS_PATTERN.sub("", delta_text) + if tool_parser and delta_text: + if not tool_markup_possible and "<" not in delta_text: + tool_accumulated_text += delta_text + else: + if not tool_markup_possible: + tool_markup_possible = True + tool_result = tool_parser.extract_tool_calls_streaming( + tool_accumulated_text, tool_accumulated_text + delta_text, delta_text + ) + tool_accumulated_text += delta_text + if tool_result is None: + continue + if "tool_calls" in tool_result: + continue + content = tool_result.get("content", "") + + if not content: + continue + + for event in _start_text_item(): + yield event + accumulated_text += content + yield _responses_sse_event( + "response.output_text.delta", + ResponseOutputTextDeltaEvent( + sequence_number=sequence, + item_id=text_item_id, + output_index=text_output_index, + content_index=0, + delta=content, + ), + ) + sequence += 1 + + cleaned_text, tool_calls = _parse_tool_calls_with_parser( + raw_accumulated_text, chat_request + ) + final_text = accumulated_text + if cleaned_text is not None and not final_text and not tool_calls: + 
final_text = clean_output_text(cleaned_text) + + reasoning_item = None + if reasoning_item_id is not None: + reasoning_item = ResponseReasoningItem( + id=reasoning_item_id, + status="completed", + content=[ResponseReasoningTextPart(text=accumulated_reasoning)], + ) + yield _responses_sse_event( + "response.reasoning_text.done", + ResponseReasoningTextDoneEvent( + sequence_number=sequence, + item_id=reasoning_item_id, + output_index=reasoning_output_index, + content_index=0, + text=accumulated_reasoning, + ), + ) + sequence += 1 + yield _responses_sse_event( + "response.content_part.done", + ResponseContentPartDoneEvent( + sequence_number=sequence, + item_id=reasoning_item_id, + output_index=reasoning_output_index, + content_index=0, + part=reasoning_item.content[0], + ), + ) + sequence += 1 + yield _responses_sse_event( + "response.output_item.done", + ResponseOutputItemDoneEvent( + sequence_number=sequence, + output_index=reasoning_output_index, + item=reasoning_item, + ), + ) + sequence += 1 + + text_item = None + if text_item_id is not None or final_text: + if text_item_id is None: + for event in _start_text_item(): + yield event + text_item = ResponseMessageItem( + id=text_item_id, + role="assistant", + status="completed", + content=[ResponseTextContentPart(type="output_text", text=final_text)], + ) + yield _responses_sse_event( + "response.output_text.done", + ResponseOutputTextDoneEvent( + sequence_number=sequence, + item_id=text_item_id, + output_index=text_output_index, + content_index=0, + text=final_text, + ), + ) + sequence += 1 + yield _responses_sse_event( + "response.content_part.done", + ResponseContentPartDoneEvent( + sequence_number=sequence, + item_id=text_item_id, + output_index=text_output_index, + content_index=0, + part=text_item.content[0], + ), + ) + sequence += 1 + yield _responses_sse_event( + "response.output_item.done", + ResponseOutputItemDoneEvent( + sequence_number=sequence, + output_index=text_output_index, + item=text_item, + ), + ) + sequence += 1 + + function_call_items: list[ResponseFunctionCallItem] = [] + for tool_call in tool_calls or []: + output_index = next_output_index + next_output_index += 1 + item = ResponseFunctionCallItem( + id=_new_response_item_id("fc"), + call_id=tool_call.id, + name=tool_call.function.name, + arguments=tool_call.function.arguments, + ) + function_call_items.append(item) + yield _responses_sse_event( + "response.output_item.added", + ResponseOutputItemAddedEvent( + sequence_number=sequence, + output_index=output_index, + item=item.model_copy(update={"status": "in_progress"}), + ), + ) + sequence += 1 + yield _responses_sse_event( + "response.function_call_arguments.delta", + ResponseFunctionCallArgumentsDeltaEvent( + sequence_number=sequence, + item_id=item.id, + output_index=output_index, + delta=item.arguments, + ), + ) + sequence += 1 + yield _responses_sse_event( + "response.output_item.done", + ResponseOutputItemDoneEvent( + sequence_number=sequence, + output_index=output_index, + item=item, + ), + ) + sequence += 1 + + output_items: list[ + ResponseMessageItem | ResponseReasoningItem | ResponseFunctionCallItem + ] = [] + if reasoning_item is not None: + output_items.append(reasoning_item) + if text_item is not None: + output_items.append(text_item) + output_items.extend(function_call_items) + + response_object = _build_response_object( + request=request, + output_items=output_items, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + finish_reason=finish_reason, + response_id=response_id, + ) + + 
if request.store and last_output is not None: + persisted_messages = _responses_request_to_persisted_messages(request) + persisted_messages.extend(_response_output_items_to_chat_messages(output_items)) + _responses_store[response_object.id] = { + "messages": copy.deepcopy(persisted_messages), + "response": response_object.model_copy(deep=True), + } + + yield _responses_sse_event( + "response.completed", + ResponseCompletedEvent(sequence_number=sequence, response=response_object), + ) + + +def _responses_sse_event(event_type: str, payload: BaseModel | dict) -> str: + """Encode a Responses API SSE event.""" + data = payload.model_dump_json() if isinstance(payload, BaseModel) else json.dumps(payload) + return f"event: {event_type}\ndata: {data}\n\n" + + +async def _stream_response_object(response: ResponseObject) -> AsyncIterator[str]: + """Stream a completed response object as OpenAI-style SSE events.""" + sequence = 1 + in_progress = response.model_copy(deep=True) + in_progress.status = "in_progress" + in_progress.usage = None + in_progress.output = [] + + yield _responses_sse_event( + "response.created", + ResponseCreatedEvent(sequence_number=sequence, response=in_progress), + ) + sequence += 1 + yield _responses_sse_event( + "response.in_progress", + ResponseInProgressEvent(sequence_number=sequence, response=in_progress), + ) + sequence += 1 + + for output_index, item in enumerate(response.output): + if isinstance(item, ResponseReasoningItem): + item_id = item.id or _new_response_item_id("rs") + in_progress_item = item.model_copy(update={"id": item_id, "status": "in_progress"}) + yield _responses_sse_event( + "response.output_item.added", + ResponseOutputItemAddedEvent( + sequence_number=sequence, + output_index=output_index, + item=in_progress_item, + ), + ) + sequence += 1 + part = item.content[0] if item.content else ResponseReasoningTextPart(text="") + yield _responses_sse_event( + "response.content_part.added", + ResponseContentPartAddedEvent( + sequence_number=sequence, + item_id=item_id, + output_index=output_index, + content_index=0, + part=ResponseReasoningTextPart(text=""), + ), + ) + sequence += 1 + yield _responses_sse_event( + "response.reasoning_text.delta", + ResponseReasoningTextDeltaEvent( + sequence_number=sequence, + item_id=item_id, + output_index=output_index, + content_index=0, + delta=part.text, + ), + ) + sequence += 1 + yield _responses_sse_event( + "response.reasoning_text.done", + ResponseReasoningTextDoneEvent( + sequence_number=sequence, + item_id=item_id, + output_index=output_index, + content_index=0, + text=part.text, + ), + ) + sequence += 1 + yield _responses_sse_event( + "response.content_part.done", + ResponseContentPartDoneEvent( + sequence_number=sequence, + item_id=item_id, + output_index=output_index, + content_index=0, + part=part, + ), + ) + sequence += 1 + yield _responses_sse_event( + "response.output_item.done", + ResponseOutputItemDoneEvent( + sequence_number=sequence, + output_index=output_index, + item=item, + ), + ) + sequence += 1 + continue + + if isinstance(item, ResponseMessageItem): + item_id = item.id or _new_response_item_id("msg") + in_progress_item = item.model_copy(update={"id": item_id, "status": "in_progress", "content": []}) + text_part = item.content[0] if isinstance(item.content, list) and item.content else ResponseTextContentPart(type="output_text", text="") + yield _responses_sse_event( + "response.output_item.added", + ResponseOutputItemAddedEvent( + sequence_number=sequence, + output_index=output_index, + 
item=in_progress_item, + ), + ) + sequence += 1 + yield _responses_sse_event( + "response.content_part.added", + ResponseContentPartAddedEvent( + sequence_number=sequence, + item_id=item_id, + output_index=output_index, + content_index=0, + part=ResponseTextContentPart(type="output_text", text=""), + ), + ) + sequence += 1 + yield _responses_sse_event( + "response.output_text.delta", + ResponseOutputTextDeltaEvent( + sequence_number=sequence, + item_id=item_id, + output_index=output_index, + content_index=0, + delta=text_part.text, + ), + ) + sequence += 1 + yield _responses_sse_event( + "response.output_text.done", + ResponseOutputTextDoneEvent( + sequence_number=sequence, + item_id=item_id, + output_index=output_index, + content_index=0, + text=text_part.text, + ), + ) + sequence += 1 + yield _responses_sse_event( + "response.content_part.done", + ResponseContentPartDoneEvent( + sequence_number=sequence, + item_id=item_id, + output_index=output_index, + content_index=0, + part=text_part, + ), + ) + sequence += 1 + yield _responses_sse_event( + "response.output_item.done", + ResponseOutputItemDoneEvent( + sequence_number=sequence, + output_index=output_index, + item=item, + ), + ) + sequence += 1 + continue + + if isinstance(item, ResponseFunctionCallItem): + in_progress_item = item.model_copy(update={"status": "in_progress"}) + yield _responses_sse_event( + "response.output_item.added", + ResponseOutputItemAddedEvent( + sequence_number=sequence, + output_index=output_index, + item=in_progress_item, + ), + ) + sequence += 1 + yield _responses_sse_event( + "response.function_call_arguments.delta", + ResponseFunctionCallArgumentsDeltaEvent( + sequence_number=sequence, + item_id=item.id or _new_response_item_id("fc"), + output_index=output_index, + delta=item.arguments, + ), + ) + sequence += 1 + yield _responses_sse_event( + "response.output_item.done", + ResponseOutputItemDoneEvent( + sequence_number=sequence, + output_index=output_index, + item=item, + ), + ) + sequence += 1 + + yield _responses_sse_event( + "response.completed", + ResponseCompletedEvent(sequence_number=sequence, response=response), + ) + + def _detect_native_tool_support() -> bool: """ Detect if the active tool parser supports native tool format. @@ -1420,6 +2504,8 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re chat_kwargs["specprefill"] = request.specprefill if request.specprefill_keep_pct is not None: chat_kwargs["specprefill_keep_pct"] = request.specprefill_keep_pct + if request.chat_template_kwargs: + chat_kwargs["chat_template_kwargs"] = dict(request.chat_template_kwargs) # Add tools if provided if request.tools: @@ -1496,6 +2582,27 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re ) +@app.post( + "/v1/responses", + dependencies=[Depends(verify_api_key), Depends(check_rate_limit)], +) +async def create_response(request: ResponsesRequest, raw_request: Request): + """Create a Responses API response.""" + if request.stream: + return StreamingResponse( + _disconnect_guard(_stream_responses_request(request), raw_request), + media_type="text/event-stream", + ) + + response_object, _persisted_messages = await _run_responses_request( + request, raw_request + ) + if response_object is None: + return Response(status_code=499) + + return response_object + + def _inject_json_instruction(messages: list, instruction: str) -> list: """ Inject JSON instruction into messages. 
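
A quick way to exercise the new endpoint end to end is a plain HTTP call. The
sketch below is illustrative only: it assumes a local server on
http://localhost:8000, a loaded model named "test-model", no API key, and the
third-party requests library; none of these values are fixed by this patch.

    # Minimal non-streaming call against the new /v1/responses route.
    import requests

    resp = requests.post(
        "http://localhost:8000/v1/responses",  # assumed default host/port
        json={
            "model": "test-model",             # assumed model name
            "input": "Reply with ORBIT.",
            "store": True,
        },
        timeout=60,
    )
    resp.raise_for_status()
    body = resp.json()
    print(body["id"], body["status"])
    # Output items follow the OpenAI Responses shape (reasoning / message /
    # function_call), matching the items built by _build_response_object above.
    for item in body["output"]:
        print(item["type"])

Setting "stream": true instead returns the SSE events emitted by
_stream_responses_request (response.created, response.output_text.delta, ...,
response.completed).
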
From 3432ed0d431c73bd41f046113196191500851533 Mon Sep 17 00:00:00 2001 From: Christopher Albert Date: Thu, 26 Mar 2026 01:15:15 +0100 Subject: [PATCH 2/4] fix: cap _responses_store with LRU eviction, remove dead _stream_response_object Replace unbounded dict with OrderedDict (max 1000 entries) to prevent memory leaks from accumulated stored responses. Evict oldest entries on insert when the cap is exceeded. Remove the _stream_response_object function (190 lines) which was never called anywhere in the codebase. --- tests/test_responses_api.py | 3 +- vllm_mlx/server.py | 201 ++---------------------------------- 2 files changed, 9 insertions(+), 195 deletions(-) diff --git a/tests/test_responses_api.py b/tests/test_responses_api.py index 206712d9..cbff5d5c 100644 --- a/tests/test_responses_api.py +++ b/tests/test_responses_api.py @@ -4,6 +4,7 @@ import platform import sys from types import SimpleNamespace +from collections import OrderedDict from unittest.mock import AsyncMock, MagicMock import pytest @@ -33,7 +34,7 @@ def server_state(): srv._engine = None srv._model_name = "test-model" - srv._responses_store = {} + srv._responses_store = OrderedDict() srv._api_key = None try: diff --git a/vllm_mlx/server.py b/vllm_mlx/server.py index 838bb885..6723f6e6 100644 --- a/vllm_mlx/server.py +++ b/vllm_mlx/server.py @@ -48,7 +48,7 @@ import threading import time import uuid -from collections import defaultdict +from collections import OrderedDict, defaultdict from collections.abc import AsyncIterator import uvicorn @@ -187,7 +187,8 @@ def _resolve_top_p(request_value: float | None) -> float: _enable_auto_tool_choice: bool = False _tool_call_parser: str | None = None # Parser name: auto, mistral, qwen, llama, hermes _tool_parser_instance = None # Instantiated parser -_responses_store: dict[str, dict] = {} +_responses_store: OrderedDict[str, dict] = OrderedDict() +_RESPONSES_STORE_MAX_SIZE: int = 1000 def _load_prefix_cache_from_disk() -> None: @@ -905,6 +906,8 @@ async def _run_responses_request( "messages": copy.deepcopy(persisted_messages), "response": response_object.model_copy(deep=True), } + while len(_responses_store) > _RESPONSES_STORE_MAX_SIZE: + _responses_store.popitem(last=False) return response_object, persisted_messages @@ -1297,6 +1300,8 @@ def _start_reasoning_item() -> list[str]: "messages": copy.deepcopy(persisted_messages), "response": response_object.model_copy(deep=True), } + while len(_responses_store) > _RESPONSES_STORE_MAX_SIZE: + _responses_store.popitem(last=False) yield _responses_sse_event( "response.completed", @@ -1310,198 +1315,6 @@ def _responses_sse_event(event_type: str, payload: BaseModel | dict) -> str: return f"event: {event_type}\ndata: {data}\n\n" -async def _stream_response_object(response: ResponseObject) -> AsyncIterator[str]: - """Stream a completed response object as OpenAI-style SSE events.""" - sequence = 1 - in_progress = response.model_copy(deep=True) - in_progress.status = "in_progress" - in_progress.usage = None - in_progress.output = [] - - yield _responses_sse_event( - "response.created", - ResponseCreatedEvent(sequence_number=sequence, response=in_progress), - ) - sequence += 1 - yield _responses_sse_event( - "response.in_progress", - ResponseInProgressEvent(sequence_number=sequence, response=in_progress), - ) - sequence += 1 - - for output_index, item in enumerate(response.output): - if isinstance(item, ResponseReasoningItem): - item_id = item.id or _new_response_item_id("rs") - in_progress_item = item.model_copy(update={"id": item_id, "status": 
"in_progress"}) - yield _responses_sse_event( - "response.output_item.added", - ResponseOutputItemAddedEvent( - sequence_number=sequence, - output_index=output_index, - item=in_progress_item, - ), - ) - sequence += 1 - part = item.content[0] if item.content else ResponseReasoningTextPart(text="") - yield _responses_sse_event( - "response.content_part.added", - ResponseContentPartAddedEvent( - sequence_number=sequence, - item_id=item_id, - output_index=output_index, - content_index=0, - part=ResponseReasoningTextPart(text=""), - ), - ) - sequence += 1 - yield _responses_sse_event( - "response.reasoning_text.delta", - ResponseReasoningTextDeltaEvent( - sequence_number=sequence, - item_id=item_id, - output_index=output_index, - content_index=0, - delta=part.text, - ), - ) - sequence += 1 - yield _responses_sse_event( - "response.reasoning_text.done", - ResponseReasoningTextDoneEvent( - sequence_number=sequence, - item_id=item_id, - output_index=output_index, - content_index=0, - text=part.text, - ), - ) - sequence += 1 - yield _responses_sse_event( - "response.content_part.done", - ResponseContentPartDoneEvent( - sequence_number=sequence, - item_id=item_id, - output_index=output_index, - content_index=0, - part=part, - ), - ) - sequence += 1 - yield _responses_sse_event( - "response.output_item.done", - ResponseOutputItemDoneEvent( - sequence_number=sequence, - output_index=output_index, - item=item, - ), - ) - sequence += 1 - continue - - if isinstance(item, ResponseMessageItem): - item_id = item.id or _new_response_item_id("msg") - in_progress_item = item.model_copy(update={"id": item_id, "status": "in_progress", "content": []}) - text_part = item.content[0] if isinstance(item.content, list) and item.content else ResponseTextContentPart(type="output_text", text="") - yield _responses_sse_event( - "response.output_item.added", - ResponseOutputItemAddedEvent( - sequence_number=sequence, - output_index=output_index, - item=in_progress_item, - ), - ) - sequence += 1 - yield _responses_sse_event( - "response.content_part.added", - ResponseContentPartAddedEvent( - sequence_number=sequence, - item_id=item_id, - output_index=output_index, - content_index=0, - part=ResponseTextContentPart(type="output_text", text=""), - ), - ) - sequence += 1 - yield _responses_sse_event( - "response.output_text.delta", - ResponseOutputTextDeltaEvent( - sequence_number=sequence, - item_id=item_id, - output_index=output_index, - content_index=0, - delta=text_part.text, - ), - ) - sequence += 1 - yield _responses_sse_event( - "response.output_text.done", - ResponseOutputTextDoneEvent( - sequence_number=sequence, - item_id=item_id, - output_index=output_index, - content_index=0, - text=text_part.text, - ), - ) - sequence += 1 - yield _responses_sse_event( - "response.content_part.done", - ResponseContentPartDoneEvent( - sequence_number=sequence, - item_id=item_id, - output_index=output_index, - content_index=0, - part=text_part, - ), - ) - sequence += 1 - yield _responses_sse_event( - "response.output_item.done", - ResponseOutputItemDoneEvent( - sequence_number=sequence, - output_index=output_index, - item=item, - ), - ) - sequence += 1 - continue - - if isinstance(item, ResponseFunctionCallItem): - in_progress_item = item.model_copy(update={"status": "in_progress"}) - yield _responses_sse_event( - "response.output_item.added", - ResponseOutputItemAddedEvent( - sequence_number=sequence, - output_index=output_index, - item=in_progress_item, - ), - ) - sequence += 1 - yield _responses_sse_event( - 
"response.function_call_arguments.delta", - ResponseFunctionCallArgumentsDeltaEvent( - sequence_number=sequence, - item_id=item.id or _new_response_item_id("fc"), - output_index=output_index, - delta=item.arguments, - ), - ) - sequence += 1 - yield _responses_sse_event( - "response.output_item.done", - ResponseOutputItemDoneEvent( - sequence_number=sequence, - output_index=output_index, - item=item, - ), - ) - sequence += 1 - - yield _responses_sse_event( - "response.completed", - ResponseCompletedEvent(sequence_number=sequence, response=response), - ) - def _detect_native_tool_support() -> bool: """ From f4815bef8b72da11323dea1c7856132ae9592491 Mon Sep 17 00:00:00 2001 From: Christopher Albert Date: Thu, 26 Mar 2026 10:23:43 +0100 Subject: [PATCH 3/4] fix: convert reasoning input items to assistant messages in Responses API Codex sends ResponseReasoningItem in the Responses API input array during multi-turn conversations. Convert reasoning content to assistant messages so the model sees its prior chain-of-thought. Previously this raised an HTTPException, but since the streaming response had already started, this caused a RuntimeError that broke the SSE stream mid-flight. --- vllm_mlx/server.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/vllm_mlx/server.py b/vllm_mlx/server.py index 6723f6e6..ae643b23 100644 --- a/vllm_mlx/server.py +++ b/vllm_mlx/server.py @@ -575,10 +575,12 @@ def _responses_input_to_chat_messages(request: ResponsesRequest) -> list[dict]: } ) elif item_type == "reasoning": - raise HTTPException( - status_code=400, - detail="Responses reasoning input items are not supported on this backend", + parts = item.get("content", []) + reasoning_text = "\n".join( + p.get("text", "") for p in parts if isinstance(p, dict) ) + if reasoning_text: + messages.append({"role": "assistant", "content": reasoning_text}) else: logger.info( "Skipping unsupported Responses input item type %r", item_type @@ -621,10 +623,9 @@ def _responses_input_to_chat_messages(request: ResponsesRequest) -> list[dict]: } ) elif isinstance(item, ResponseReasoningItem): - raise HTTPException( - status_code=400, - detail="Responses reasoning input items are not supported on this backend", - ) + reasoning_text = "\n".join(part.text for part in (item.content or [])) + if reasoning_text: + messages.append({"role": "assistant", "content": reasoning_text}) else: logger.info( "Skipping unsupported Responses input item type %r", From c9f6bdcca43a4ec7311bbb72944cb85967821036 Mon Sep 17 00:00:00 2001 From: Christopher Albert Date: Fri, 27 Mar 2026 01:07:10 +0100 Subject: [PATCH 4/4] fix: stop rejecting reasoning input items and config in Responses API The reasoning input item rejection (HTTPException 400) was added in PR #28 but conflicts with the earlier fix that converts reasoning items to assistant messages in _responses_input_to_chat_messages. The rejection ran first, crashing the SSE stream mid-flight. Also downgrade reasoning config rejection to a debug log since raising inside the streaming generator causes "response already started" crashes. 
--- vllm_mlx/server.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm_mlx/server.py b/vllm_mlx/server.py index ae643b23..2ff57b97 100644 --- a/vllm_mlx/server.py +++ b/vllm_mlx/server.py @@ -671,10 +671,7 @@ def _responses_request_to_chat_request(request: ResponsesRequest) -> ChatComplet detail="Responses text.format.type='json_object' is not supported on this backend", ) if request.reasoning is not None: - raise HTTPException( - status_code=400, - detail="Responses reasoning configuration is not supported on this backend", - ) + logger.debug("Ignoring reasoning configuration (not supported on this backend)") tools, unsupported_tools = _responses_tools_to_chat_tools(request.tools) messages = _responses_input_to_chat_messages(request)
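
For reference, the capped-store pattern from patch 2 as a standalone sketch.
The names and the 1000-entry cap mirror the patch; the _store_response helper
is hypothetical (the patch inlines the trimming loop at both insertion sites):

    # Bounded response store: oldest-inserted entries are evicted once the
    # cap is exceeded, so stored responses cannot grow without limit.
    from collections import OrderedDict

    _RESPONSES_STORE_MAX_SIZE = 1000
    _responses_store: OrderedDict[str, dict] = OrderedDict()

    def _store_response(response_id: str, entry: dict) -> None:
        _responses_store[response_id] = entry
        while len(_responses_store) > _RESPONSES_STORE_MAX_SIZE:
            _responses_store.popitem(last=False)  # drop in insertion order

In this sketch eviction is strictly by insertion order; lookups do not refresh
an entry's position, so a long-lived response can be evicted while newer ones
remain.
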