Commit b7ac12f: Add smollm3 docs

1 parent db63ff3

16 files changed: +332 -160 lines

.typos.toml (1 addition, 0 deletions)

```diff
@@ -13,6 +13,7 @@ extend-ignore-identifiers-re = [
     "tese",
     "seperable",
     "Seperable",
+    "setp",
 ]

 [files]
```

README.md (1 addition, 1 deletion)

```diff
@@ -68,7 +68,7 @@ Please submit requests for new models [here](https://github.com/EricLBuehler/mis

 *After following installation instructions*

-- 🤗🤗🤗 Run the **SmolLM 3** long-context hybrid-reasoning model:
+- 🤗🤗🤗 Run the **SmolLM 3** long-context hybrid-reasoning model: [documentation](docs/SMOLLM3.md)
 <details>
   <summary>Show command</summary>

```

docs/QWEN3.md (4 additions, 1 deletion)

```diff
@@ -12,7 +12,7 @@ The Qwen 3 family is a collection of hybrid reasoning MoE and non-MoE models ran
 > Note: tool calling support is fully implemented for the Qwen 3 models, including agentic web search.

 ## Enabling thinking
-The Qwen 3 models are hybrid reasoning models which can be controlled at inference-time. By default, reasoning is enabled for these models. To dynamically control this, it is recommended to either add `/no_think` or `/think` to your prompt. Alternatively, you can specify the `enable_thinking` flag as detailed by the API-specific examples.
+The Qwen 3 models are hybrid reasoning models which can be controlled at inference time. **By default, reasoning is enabled for these models.** To dynamically control this, it is recommended to either add `/no_think` or `/think` to your prompt. Alternatively, you can specify the `enable_thinking` flag as detailed in the API-specific examples.

 ## HTTP API
 You can find a more detailed example demonstrating enabling/disabling thinking [here](../examples/server/qwen3.py).
@@ -40,6 +40,7 @@ while True:
         frequency_penalty=1.0,
         top_p=0.1,
         temperature=0,
+        # enable_thinking=False,
     )
     resp = completion.choices[0].message.content
     print(resp)
@@ -69,6 +70,7 @@ res = runner.send_chat_completion_request(
         presence_penalty=1.0,
         top_p=0.1,
         temperature=0.1,
+        # enable_thinking=False,
     )
 )
 print(res.choices[0].message.content)
@@ -94,6 +96,7 @@ async fn main() -> Result<()> {
         .await?;

     let messages = TextMessages::new()
+        // .enable_thinking(false)
         .add_message(
             TextMessageRole::System,
             "You are an AI agent with a specialty in programming.",
```

docs/SMOLLM3.md (new file, 124 additions)

# SmolLM3: [`HuggingFaceTB/SmolLM3-3B`](https://huggingface.co/HuggingFaceTB/SmolLM3-3B)

SmolLM3 is a fully open, 3B-parameter, long-context hybrid reasoning language model. It supports six languages and offers strong performance at the 3B–4B scale.

**Default, easiest:**
```bash
./mistralrs-server -i --isq 8 run -m HuggingFaceTB/SmolLM3-3B
```

**UQFF prequantized:**
```bash
./mistralrs-server -i run -m EricB/SmolLM3-3B-UQFF -f smollm33b-q4k-0.uqff
```

> Note: tool calling support is fully implemented for the SmolLM3 models, including agentic web search.

> Check out prequantized UQFF SmolLM3 here: https://huggingface.co/EricB/SmolLM3-3B-UQFF

## Enabling thinking
The SmolLM3 models are hybrid reasoning models which can be controlled at inference time. **By default, reasoning is enabled for these models.** To dynamically control this, it is recommended to either add `/no_think` or `/think` to your prompt. Alternatively, you can specify the `enable_thinking` flag as detailed in the API-specific examples.

## HTTP API
You can find a more detailed example demonstrating enabling/disabling thinking [here](../examples/server/smollm3.py).

```bash
./mistralrs-server --isq 8 --port 1234 plain -m HuggingFaceTB/SmolLM3-3B
```

```py
from openai import OpenAI

client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")

messages = []
prompt = input("Enter system prompt >>> ")
if len(prompt) > 0:
    messages.append({"role": "system", "content": prompt})


while True:
    prompt = input(">>> ")
    messages.append({"role": "user", "content": prompt})
    completion = client.chat.completions.create(
        model="ignore",
        messages=messages,
        max_tokens=256,
        frequency_penalty=1.0,
        top_p=0.1,
        temperature=0,
        # enable_thinking=False,
    )
    resp = completion.choices[0].message.content
    print(resp)
    messages.append({"role": "assistant", "content": resp})
```

## Python API
You can find a more detailed example demonstrating enabling/disabling thinking [here](../examples/python/smollm3.py).

```py
from mistralrs import Runner, Which, ChatCompletionRequest, Architecture

runner = Runner(
    which=Which.Plain(
        model_id="HuggingFaceTB/SmolLM3-3B",
        arch=Architecture.SmolLm3,
    ),
)

res = runner.send_chat_completion_request(
    ChatCompletionRequest(
        model="ignore",
        messages=[
            {"role": "user", "content": "Tell me a story about the Rust type system."}
        ],
        max_tokens=256,
        presence_penalty=1.0,
        top_p=0.1,
        temperature=0.1,
        # enable_thinking=False,
    )
)
print(res.choices[0].message.content)
print(res.usage)
```

## Rust API
You can find a more detailed example demonstrating enabling/disabling thinking [here](../mistralrs/examples/smollm3/main.rs).

```rust
use anyhow::Result;
use mistralrs::{
    IsqType, PagedAttentionMetaBuilder, TextMessageRole, TextMessages, TextModelBuilder,
};

#[tokio::main]
async fn main() -> Result<()> {
    let model = TextModelBuilder::new("HuggingFaceTB/SmolLM3-3B")
        .with_isq(IsqType::Q8_0)
        .with_logging()
        .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())?
        .build()
        .await?;

    let messages = TextMessages::new()
        // .enable_thinking(false)
        .add_message(
            TextMessageRole::System,
            "You are an AI agent with a specialty in programming.",
        )
        .add_message(
            TextMessageRole::User,
            "Hello! How are you? Please write a generic binary search function in Rust.",
        );

    let response = model.send_chat_request(messages).await?;

    println!("{}", response.choices[0].message.content.as_ref().unwrap());
    dbg!(
        response.usage.avg_prompt_tok_per_sec,
        response.usage.avg_compl_tok_per_sec
    );

    Ok(())
}
```
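The `/think` and `/no_think` markers work per message, with no client-side flag at all. A minimal sketch condensing the fuller server example further below, assuming the server from the HTTP API section is listening on port 1234:

```py
from openai import OpenAI

client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")

# Ask the same question with reasoning explicitly enabled, then disabled.
for marker in ("/think", "/no_think"):
    completion = client.chat.completions.create(
        model="ignore",
        messages=[{"role": "user", "content": f"How many rs in blueberry? {marker}"}],
        max_tokens=512,
    )
    print(marker, "->", completion.choices[0].message.content)
```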

examples/server/qwen3.py (2 additions, 6 deletions)

```diff
@@ -75,9 +75,7 @@ def log_response(response: httpx.Response):
     frequency_penalty=1.0,
     top_p=0.1,
     temperature=0,
-    # extra_body={
-    #     "enable_thinking": False
-    # }
+    # enable_thinking=False,
 )
 resp = completion.choices[0].message.content
 print(resp)
@@ -102,9 +100,7 @@ def log_response(response: httpx.Response):
     frequency_penalty=1.0,
     top_p=0.1,
     temperature=0,
-    # extra_body={
-    #     "enable_thinking": True
-    # }
+    # enable_thinking=True,
 )
 resp = completion.choices[0].message.content
 print(resp)
```

examples/server/smollm3.py (new file, 110 additions)

```py
from openai import OpenAI
import httpx
import textwrap
import json


def log_response(response: httpx.Response):
    request = response.request
    print(f"Request: {request.method} {request.url}")
    print("  Headers:")
    for key, value in request.headers.items():
        if key.lower() == "authorization":
            value = "[...]"
        if key.lower() == "cookie":
            value = value.split("=")[0] + "=..."
        print(f"    {key}: {value}")
    print("  Body:")
    try:
        request_body = json.loads(request.content)
        print(textwrap.indent(json.dumps(request_body, indent=2), "    "))
    except json.JSONDecodeError:
        print(textwrap.indent(request.content.decode(), "    "))
    print(f"Response: status_code={response.status_code}")
    print("  Headers:")
    for key, value in response.headers.items():
        if key.lower() == "set-cookie":
            value = value.split("=")[0] + "=..."
        print(f"    {key}: {value}")


client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")

# Enable this to log requests and responses
# client._client = httpx.Client(
#     event_hooks={"request": [print], "response": [log_response]}
# )

messages = [
    {
        "role": "user",
        "content": "Hello! How many rs in strawberry?",
    },
]

# ------------------------------------------------------------------
# First question, thinking mode is enabled by default
# ------------------------------------------------------------------
completion = client.chat.completions.create(
    model="ignore",
    messages=messages,
    max_tokens=1024,
    frequency_penalty=1.0,
    top_p=0.1,
    temperature=0,
)
resp = completion.choices[0].message.content
print(resp)

messages.append({"role": "assistant", "content": resp})
messages.append(
    {
        "role": "user",
        "content": "How many rs in blueberry? /no_think",
    }
)

# ------------------------------------------------------------------
# Second question, disable thinking mode with extra_body or /no_think
# ------------------------------------------------------------------
completion = client.chat.completions.create(
    model="ignore",
    messages=messages,
    max_tokens=1024,
    frequency_penalty=1.0,
    top_p=0.1,
    temperature=0,
    # extra_body={
    #     "enable_thinking": False
    # }
)
resp = completion.choices[0].message.content
print(resp)

messages.append({"role": "assistant", "content": resp})
messages.append(
    {
        "role": "user",
        "content": "Are you sure? /think",
    }
)

# ------------------------------------------------------------------
# Third question, re-enable thinking mode with extra_body or /think
# ------------------------------------------------------------------
completion = client.chat.completions.create(
    model="ignore",
    messages=messages,
    max_tokens=1024,
    frequency_penalty=1.0,
    top_p=0.1,
    temperature=0,
    # extra_body={
    #     "enable_thinking": True
    # }
)
resp = completion.choices[0].message.content
print(resp)
```

mistralrs-core/src/pipeline/chat_template.rs (6 additions, 1 deletion)

```diff
@@ -309,10 +309,15 @@ pub fn apply_chat_template_to(
         .into_owned();

     if template.contains("{{ meta }}") {
-        //fix for GLM4 models
+        // Fix for GLM4 models
         template = template.replace("{%- set meta = message.get(\"metadata\", \"\") %}", "");
         template = template.replace("{{ meta }}", "");
     }
+    if template.contains("{% generation %}") && template.contains("{% endgeneration %}") {
+        // Strip the `{% generation %}` markers for SmolLM3 models
+        template = template.replace("{% generation %}", "");
+        template = template.replace("{% endgeneration %}", "");
+    }

     env.add_template("chat_template", &template)?;
     env.add_function("raise_exception", raise_exception);
```
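The stripped `{% generation %}`/`{% endgeneration %}` markers are training-time annotations that some chat templates carry to delimit assistant spans; they are not part of the Jinja language, so an unpatched template fails to parse. A minimal sketch of the failure and the fix, in Python/jinja2 terms rather than the minijinja used above:

```py
import jinja2

template = "{% generation %}{{ content }}{% endgeneration %}"

# Unpatched, the template does not even parse: 'generation' is not a Jinja tag.
try:
    jinja2.Template(template)
except jinja2.TemplateSyntaxError as e:
    print("unpatched template fails:", e)

# Same replacement as apply_chat_template_to: drop the markers, keep the body.
template = template.replace("{% generation %}", "")
template = template.replace("{% endgeneration %}", "")

print(jinja2.Template(template).render(content="assistant reply"))  # -> assistant reply
```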
