Commit b7ac12f: Add smollm3 docs

1 parent db63ff3

16 files changed: +332 -160 lines

.typos.toml (1 addition, 0 deletions)

```diff
@@ -13,6 +13,7 @@ extend-ignore-identifiers-re = [
     "tese",
     "seperable",
     "Seperable",
+    "setp",
 ]

 [files]
```

README.md (1 addition, 1 deletion)

```diff
@@ -68,7 +68,7 @@ Please submit requests for new models [here](https://github.com/EricLBuehler/mis

 *After following installation instructions*

-- 🤗🤗🤗 Run the **SmolLM 3** long-context hybrid-reasoning model:
+- 🤗🤗🤗 Run the **SmolLM 3** long-context hybrid-reasoning model: [documentation](docs/SMOLLM3.md)
 <details>
   <summary>Show command</summary>

```

docs/QWEN3.md (4 additions, 1 deletion)

```diff
@@ -12,7 +12,7 @@ The Qwen 3 family is a collection of hybrid reasoning MoE and non-MoE models ran
 > Note: tool calling support is fully implemented for the Qwen 3 models, including agentic web search.

 ## Enabling thinking
-The Qwen 3 models are hybrid reasoning models which can be controlled at inference-time. By default, reasoning is enabled for these models. To dynamically control this, it is recommended to either add `/no_think` or `/think` to your prompt. Alternatively, you can specify the `enable_thinking` flag as detailed by the API-specific examples.
+The Qwen 3 models are hybrid reasoning models which can be controlled at inference time. **By default, reasoning is enabled for these models.** To dynamically control this, it is recommended to either add `/no_think` or `/think` to your prompt. Alternatively, you can specify the `enable_thinking` flag as detailed in the API-specific examples.

 ## HTTP API
 You can find a more detailed example demonstrating enabling/disabling thinking [here](../examples/server/qwen3.py).
@@ -40,6 +40,7 @@ while True:
         frequency_penalty=1.0,
         top_p=0.1,
         temperature=0,
+        # enable_thinking=False,
     )
     resp = completion.choices[0].message.content
     print(resp)
@@ -69,6 +70,7 @@ res = runner.send_chat_completion_request(
         presence_penalty=1.0,
         top_p=0.1,
         temperature=0.1,
+        # enable_thinking=False,
     )
 )
 print(res.choices[0].message.content)
@@ -94,6 +96,7 @@ async fn main() -> Result<()> {
         .await?;

     let messages = TextMessages::new()
+        // .enable_thinking(false)
         .add_message(
             TextMessageRole::System,
             "You are an AI agent with a specialty in programming.",
```

docs/SMOLLM3.md (new file, 124 additions)

# SmolLM3: [`HuggingFaceTB/SmolLM3-3B`](https://huggingface.co/HuggingFaceTB/SmolLM3-3B)

SmolLM3 is a fully open, 3B-parameter, long-context hybrid reasoning language model. It supports six languages and offers strong performance at the 3B–4B scale.

**Default, easiest:**
```bash
./mistralrs-server -i --isq 8 run -m HuggingFaceTB/SmolLM3-3B
```

**UQFF prequantized:**
```bash
./mistralrs-server -i run -m EricB/SmolLM3-3B-UQFF -f smollm33b-q4k-0.uqff
```

> Note: tool calling support is fully implemented for the SmolLM3 models, including agentic web search.

> Check out prequantized UQFF SmolLM3 here: https://huggingface.co/EricB/SmolLM3-3B-UQFF

## Enabling thinking
The SmolLM3 models are hybrid reasoning models which can be controlled at inference time. **By default, reasoning is enabled for these models.** To dynamically control this, it is recommended to either add `/no_think` or `/think` to your prompt. Alternatively, you can specify the `enable_thinking` flag as detailed in the API-specific examples.

## HTTP API
You can find a more detailed example demonstrating enabling/disabling thinking [here](../examples/server/smollm3.py).

```bash
./mistralrs-server --isq 8 --port 1234 plain -m HuggingFaceTB/SmolLM3-3B
```

```py
from openai import OpenAI

client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")

messages = []
prompt = input("Enter system prompt >>> ")
if len(prompt) > 0:
    messages.append({"role": "system", "content": prompt})


while True:
    prompt = input(">>> ")
    messages.append({"role": "user", "content": prompt})
    completion = client.chat.completions.create(
        model="ignore",
        messages=messages,
        max_tokens=256,
        frequency_penalty=1.0,
        top_p=0.1,
        temperature=0,
        # enable_thinking=False,
    )
    resp = completion.choices[0].message.content
    print(resp)
    messages.append({"role": "assistant", "content": resp})
```

## Python API
You can find a more detailed example demonstrating enabling/disabling thinking [here](../examples/python/smollm3.py).

```py
from mistralrs import Runner, Which, ChatCompletionRequest, Architecture

runner = Runner(
    which=Which.Plain(
        model_id="HuggingFaceTB/SmolLM3-3B",
        arch=Architecture.SmolLm3,
    ),
)

res = runner.send_chat_completion_request(
    ChatCompletionRequest(
        model="ignore",
        messages=[
            {"role": "user", "content": "Tell me a story about the Rust type system."}
        ],
        max_tokens=256,
        presence_penalty=1.0,
        top_p=0.1,
        temperature=0.1,
        # enable_thinking=False,
    )
)
print(res.choices[0].message.content)
print(res.usage)
```

## Rust API
You can find a more detailed example demonstrating enabling/disabling thinking [here](../mistralrs/examples/smollm3/main.rs).

```rust
use anyhow::Result;
use mistralrs::{
    IsqType, PagedAttentionMetaBuilder, TextMessageRole, TextMessages, TextModelBuilder,
};

#[tokio::main]
async fn main() -> Result<()> {
    let model = TextModelBuilder::new("HuggingFaceTB/SmolLM3-3B")
        .with_isq(IsqType::Q8_0)
        .with_logging()
        .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())?
        .build()
        .await?;

    let messages = TextMessages::new()
        // .enable_thinking(false)
        .add_message(
            TextMessageRole::System,
            "You are an AI agent with a specialty in programming.",
        )
        .add_message(
            TextMessageRole::User,
            "Hello! How are you? Please write a generic binary search function in Rust.",
        );

    let response = model.send_chat_request(messages).await?;

    println!("{}", response.choices[0].message.content.as_ref().unwrap());
    dbg!(
        response.usage.avg_prompt_tok_per_sec,
        response.usage.avg_compl_tok_per_sec
    );

    Ok(())
}
```
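The `/think` and `/no_think` markers work per message, with no client-side flag at all. A minimal sketch condensing the fuller server example further below, assuming the server from the HTTP API section is listening on port 1234:

```py
from openai import OpenAI

client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")

# Ask the same question with reasoning explicitly enabled, then disabled.
for marker in ("/think", "/no_think"):
    completion = client.chat.completions.create(
        model="ignore",
        messages=[{"role": "user", "content": f"How many rs in blueberry? {marker}"}],
        max_tokens=512,
    )
    print(marker, "->", completion.choices[0].message.content)
```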

examples/server/qwen3.py (2 additions, 6 deletions)

```diff
@@ -75,9 +75,7 @@ def log_response(response: httpx.Response):
     frequency_penalty=1.0,
     top_p=0.1,
     temperature=0,
-    # extra_body={
-    #     "enable_thinking": False
-    # }
+    # enable_thinking=False,
 )
 resp = completion.choices[0].message.content
 print(resp)
@@ -102,9 +100,7 @@ def log_response(response: httpx.Response):
     frequency_penalty=1.0,
     top_p=0.1,
     temperature=0,
-    # extra_body={
-    #     "enable_thinking": True
-    # }
+    # enable_thinking=True,
 )
 resp = completion.choices[0].message.content
 print(resp)
```

examples/server/smollm3.py (new file, 110 additions)

```py
from openai import OpenAI
import httpx
import textwrap
import json


def log_response(response: httpx.Response):
    request = response.request
    print(f"Request: {request.method} {request.url}")
    print("  Headers:")
    for key, value in request.headers.items():
        if key.lower() == "authorization":
            value = "[...]"
        if key.lower() == "cookie":
            value = value.split("=")[0] + "=..."
        print(f"    {key}: {value}")
    print("  Body:")
    try:
        request_body = json.loads(request.content)
        print(textwrap.indent(json.dumps(request_body, indent=2), "    "))
    except json.JSONDecodeError:
        print(textwrap.indent(request.content.decode(), "    "))
    print(f"Response: status_code={response.status_code}")
    print("  Headers:")
    for key, value in response.headers.items():
        if key.lower() == "set-cookie":
            value = value.split("=")[0] + "=..."
        print(f"    {key}: {value}")


client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")

# Enable this to log requests and responses
# client._client = httpx.Client(
#     event_hooks={"request": [print], "response": [log_response]}
# )

messages = [
    {
        "role": "user",
        "content": "Hello! How many rs in strawberry?",
    },
]

# ------------------------------------------------------------------
# First question, thinking mode is enabled by default
# ------------------------------------------------------------------
completion = client.chat.completions.create(
    model="ignore",
    messages=messages,
    max_tokens=1024,
    frequency_penalty=1.0,
    top_p=0.1,
    temperature=0,
)
resp = completion.choices[0].message.content
print(resp)

messages.append({"role": "assistant", "content": resp})
messages.append(
    {
        "role": "user",
        "content": "How many rs in blueberry? /no_think",
    }
)

# ------------------------------------------------------------------
# Second question, disable thinking mode with extra_body or /no_think
# ------------------------------------------------------------------
completion = client.chat.completions.create(
    model="ignore",
    messages=messages,
    max_tokens=1024,
    frequency_penalty=1.0,
    top_p=0.1,
    temperature=0,
    # extra_body={
    #     "enable_thinking": False
    # }
)
resp = completion.choices[0].message.content
print(resp)

messages.append({"role": "assistant", "content": resp})
messages.append(
    {
        "role": "user",
        "content": "Are you sure? /think",
    }
)

# ------------------------------------------------------------------
# Third question, re-enable thinking mode with extra_body or /think
# ------------------------------------------------------------------
completion = client.chat.completions.create(
    model="ignore",
    messages=messages,
    max_tokens=1024,
    frequency_penalty=1.0,
    top_p=0.1,
    temperature=0,
    # extra_body={
    #     "enable_thinking": True
    # }
)
resp = completion.choices[0].message.content
print(resp)
```

mistralrs-core/src/pipeline/chat_template.rs (6 additions, 1 deletion)

```diff
@@ -309,10 +309,15 @@ pub fn apply_chat_template_to(
         .into_owned();

     if template.contains("{{ meta }}") {
-        //fix for GLM4 models
+        // Fix for GLM4 models
         template = template.replace("{%- set meta = message.get(\"metadata\", \"\") %}", "");
         template = template.replace("{{ meta }}", "");
     }
+    if template.contains("{% generation %}") && template.contains("{% endgeneration %}") {
+        // Strip the `{% generation %}` markers for SmolLM3 models
+        template = template.replace("{% generation %}", "");
+        template = template.replace("{% endgeneration %}", "");
+    }

     env.add_template("chat_template", &template)?;
     env.add_function("raise_exception", raise_exception);
```
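The stripped `{% generation %}`/`{% endgeneration %}` markers are training-time annotations that some chat templates carry to delimit assistant spans; they are not part of the Jinja language, so an unpatched template fails to parse. A minimal sketch of the failure and the fix, in Python/jinja2 terms rather than the minijinja used above:

```py
import jinja2

template = "{% generation %}{{ content }}{% endgeneration %}"

# Unpatched, the template does not even parse: 'generation' is not a Jinja tag.
try:
    jinja2.Template(template)
except jinja2.TemplateSyntaxError as e:
    print("unpatched template fails:", e)

# Same replacement as apply_chat_template_to: drop the markers, keep the body.
template = template.replace("{% generation %}", "")
template = template.replace("{% endgeneration %}", "")

print(jinja2.Template(template).render(content="assistant reply"))  # -> assistant reply
```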
