Skip to content

Commit 5c9ad13

Browse files
[Frontend] supports interleaved thinking (#28531)
Signed-off-by: chaunceyjiang <[email protected]>
1 parent fa183e9 commit 5c9ad13

File tree

3 files changed

+135
-1
lines changed

3 files changed

+135
-1
lines changed
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# Interleaved Thinking
2+
3+
## Introduction
4+
5+
Interleaved thinking allows models to reason between tool calls, enabling more sophisticated decision-making after receiving tool results. This feature helps models chain multiple tool calls with reasoning steps in between and make nuanced decisions based on intermediate results.
6+
7+
!!! important
    Interleaved thinking increases token usage and response latency. Consider your budget and performance requirements when enabling this feature.
8+
9+
## How Interleaved Thinking Works
10+
11+
With interleaved thinking, the model can:
12+
13+
- Reason about the results of a tool call before deciding what to do next
14+
- Chain multiple tool calls with reasoning steps in between
15+
- Make more nuanced decisions based on intermediate results
16+
- Provide transparent reasoning for its tool selection process
17+
18+
## Supported Models
19+
20+
vLLM currently supports the following interleaved thinking models:
21+
22+
| Model Series | Reasoning Parser Name |
23+
|--------------|-----------------------|
24+
| moonshotai/Kimi-K2-Thinking | kimi_k2 |
25+
| MiniMaxAI/MiniMax-M2 | minimax_m2 |
26+
27+
## Example Usage
28+
29+
To use interleaved thinking with tool calls, specify a model that supports this feature and enable tool calls in your chat completion request. Here's an example:
30+
31+
??? code
32+
33+
```python
34+
"""
35+
vllm serve MiniMaxAI/MiniMax-M2 \
36+
--tensor-parallel-size 4 \
37+
--tool-call-parser minimax_m2 \
38+
--reasoning-parser minimax_m2 \
39+
--enable-auto-tool-choice
40+
"""
41+
import json
42+
43+
from openai import OpenAI
44+
45+
# Any API key works here — the local vLLM server does not validate it.
client = OpenAI(api_key="dummy", base_url="http://localhost:8000/v1")
46+
47+
48+
def get_current_weather(location: str, unit: str) -> str:
    """Simulate a weather lookup for *location*.

    Args:
        location: City/state string, e.g. "San Francisco, CA".
        unit: Either "celsius" or "fahrenheit"; any value other than
            "celsius" falls through to the Fahrenheit branch.

    Returns:
        A human-readable sentence with a fixed, fake temperature.
    """
    # NOTE: plain ``str`` annotation — the original quoted forward
    # reference ("str") was inconsistent with the ``location`` parameter.
    if unit == "celsius":
        return f"The current temperature in {location} is 22°C."
    else:
        return f"The current temperature in {location} is 72°F."
54+
55+
56+
# JSON schema for the single tool advertised to the model: a weather lookup
# taking a location plus a temperature unit. Both arguments are required.
_weather_parameters = {
    "type": "object",
    "properties": {
        "location": {
            "type": "string",
            "description": "City and state, e.g., 'San Francisco, CA'",
        },
        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
    },
    "required": ["location", "unit"],
}

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather in a given location",
            "parameters": _weather_parameters,
        },
    }
]
76+
# First request: with auto tool choice the model is expected to answer with
# one or more tool calls (plus its reasoning).
messages = [
    {
        "role": "user",
        "content": "What's the weather in Fahrenheit like in San Francisco?",
    }
]
response = client.chat.completions.create(
    model=client.models.list().data[0].id,
    messages=messages,
    tools=tools,
    tool_choice="auto",
)

tool_call = response.choices[0].message.tool_calls[0].function

# Echo the assistant turn back verbatim — including its reasoning — so the
# model can keep thinking between tool calls (interleaved thinking).
messages.append(
    {
        "role": "assistant",
        "tool_calls": response.choices[0].message.tool_calls,
        "reasoning": response.choices[0].message.reasoning,  # append reasoning
    }
)

# Simulate tool execution locally and return each result as a "tool" message.
available_tools = {"get_weather": get_current_weather}

completion_tool_calls = response.choices[0].message.tool_calls
for call in completion_tool_calls:
    handler = available_tools[call.function.name]
    call_args = json.loads(call.function.arguments)
    messages.append(
        {
            "role": "tool",
            "content": handler(**call_args),
            "tool_call_id": call.id,
            "name": call.function.name,
        }
    )

# Second request: the model reasons over the tool results before producing
# its final, user-facing answer.
response_2 = client.chat.completions.create(
    model=client.models.list().data[0].id,
    messages=messages,
    tools=tools,
    tool_choice="auto",
)
print(response_2.choices[0].message.content)
117+
```
118+
This example demonstrates how to set up interleaved thinking with tool calls using a weather retrieval function. The model reasons about the tool results before generating the final response.

examples/online_serving/openai_chat_completion_client_with_tools.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ def main():
161161
{
162162
"role": "assistant",
163163
"tool_calls": chat_completion.choices[0].message.tool_calls,
164+
"reasoning": chat_completion.choices[0].message.reasoning,
164165
}
165166
)
166167

vllm/entrypoints/chat_utils.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,9 @@ class CustomChatCompletionMessageParam(TypedDict, total=False):
240240
tool_calls: Iterable[ChatCompletionMessageToolCallParam] | None
241241
"""The tool calls generated by the model, such as function calls."""
242242

243+
reasoning: str | None
244+
"""The reasoning content for interleaved thinking."""
245+
243246

244247
ChatCompletionMessageParam: TypeAlias = (
245248
OpenAIChatCompletionMessageParam
@@ -265,6 +268,12 @@ class ConversationMessage(TypedDict, total=False):
265268
tool_calls: Iterable[ChatCompletionMessageToolCallParam] | None
266269
"""The tool calls generated by the model, such as function calls."""
267270

271+
reasoning: str | None
272+
"""The reasoning content for interleaved thinking."""
273+
274+
reasoning_content: str | None
275+
"""Deprecated: The reasoning content for interleaved thinking."""
276+
268277

269278
# Passed in by user
270279
ChatTemplateContentFormatOption = Literal["auto", "string", "openai"]
@@ -1374,7 +1383,7 @@ def _parse_chat_message_content(
13741383
) -> list[ConversationMessage]:
13751384
role = message["role"]
13761385
content = message.get("content")
1377-
1386+
reasoning = message.get("reasoning") or message.get("reasoning_content")
13781387
if content is None:
13791388
content = []
13801389
elif isinstance(content, str):
@@ -1396,6 +1405,12 @@ def _parse_chat_message_content(
13961405
# follow the OpenAI spec.
13971406
if "tool_calls" in parsed_msg and parsed_msg["tool_calls"] is not None:
13981407
result_msg["tool_calls"] = list(parsed_msg["tool_calls"])
1408+
# Include reasoning if present for interleaved thinking.
1409+
if reasoning is not None:
1410+
result_msg["reasoning"] = cast(str, reasoning)
1411+
result_msg["reasoning_content"] = cast(
1412+
str, reasoning
1413+
) # keep compatibility
13991414
elif role == "tool":
14001415
parsed_msg = _ToolParser(message)
14011416
if "tool_call_id" in parsed_msg:

0 commit comments

Comments
 (0)