|
| 1 | +# SmolLM3: [`HuggingFaceTB/SmolLM3-3B`](https://huggingface.co/HuggingFaceTB/SmolLM3-3B) |
| 2 | + |
| 3 | +SmolLM3 is a 3B parameter long-context hybrid reasoning language model. It supports 6 languages, advanced reasoning and long context. SmolLM3 is a fully open model that offers strong performance at the 3B–4B scale. |
| 4 | + |
| 5 | +**Default, easiest:** |
| 6 | +```bash |
| 7 | +./mistralrs-server -i --isq 8 run -m HuggingFaceTB/SmolLM3-3B |
| 8 | +``` |
| 9 | + |
| 10 | +**UQFF prequantized:** |
| 11 | +```bash |
| 12 | +./mistralrs-server -i run -m EricB/SmolLM3-3B-UQFF -f smollm33b-q4k-0.uqff |
| 13 | +``` |
| 14 | + |
| 15 | +> Note: tool calling support is fully implemented for the SmolLM3 models, including agentic web search. |
| 16 | +
| 17 | +> Check out prequantized UQFF SmolLM3 here: https://huggingface.co/EricB/SmolLM3-3B-UQFF |
| 18 | +
| 19 | +## Enabling thinking |
| 20 | +The SmolLM3 models are hybrid reasoning models which can be controlled at inference-time. **By default, reasoning is enabled for these models.** To dynamically control this, it is recommended to either add `/no_think` or `/think` to your prompt. Alternatively, you can specify the `enable_thinking` flag as detailed by the API-specific examples. |
| 21 | + |
| 22 | +## HTTP API |
| 23 | +You can find a more detailed example demonstrating enabling/disabling thinking [here](../examples/server/smollm3.py). |
| 24 | + |
| 25 | +``` |
| 26 | +./mistralrs-server --isq 8 --port 1234 plain -m HuggingFaceTB/SmolLM3-3B |
| 27 | +``` |
| 28 | + |
| 29 | +```py |
| 30 | +import openai
| 30 | +
| 30 | +client = openai.OpenAI(api_key="EMPTY", base_url="http://localhost:1234/v1")
| 31 | + |
| 32 | +messages = [] |
| 33 | +prompt = input("Enter system prompt >>> ") |
| 34 | +if len(prompt) > 0: |
| 35 | + messages.append({"role": "system", "content": prompt}) |
| 36 | + |
| 37 | + |
| 38 | +while True: |
| 39 | + prompt = input(">>> ") |
| 40 | + messages.append({"role": "user", "content": prompt}) |
| 41 | + completion = client.chat.completions.create( |
| 42 | + model="ignore", |
| 43 | + messages=messages, |
| 44 | + max_tokens=256, |
| 45 | + frequency_penalty=1.0, |
| 46 | + top_p=0.1, |
| 47 | + temperature=0, |
| 48 | + # enable_thinking=False, |
| 49 | + ) |
| 50 | + resp = completion.choices[0].message.content |
| 51 | + print(resp) |
| 52 | + messages.append({"role": "assistant", "content": resp}) |
| 53 | +``` |
| 54 | + |
| 55 | +## Python API |
| 56 | +You can find a more detailed example demonstrating enabling/disabling thinking [here](../examples/python/smollm3.py). |
| 57 | + |
| 58 | +```py |
| 59 | +from mistralrs import Runner, Which, ChatCompletionRequest, Architecture |
| 60 | + |
| 61 | +runner = Runner( |
| 62 | + which=Which.Plain( |
| 63 | + model_id="HuggingFaceTB/SmolLM3-3B", |
| 64 | + arch=Architecture.SmolLm3, |
| 65 | + ), |
| 66 | +) |
| 67 | + |
| 68 | +res = runner.send_chat_completion_request( |
| 69 | + ChatCompletionRequest( |
| 70 | + model="ignore", |
| 71 | + messages=[ |
| 72 | + {"role": "user", "content": "Tell me a story about the Rust type system."} |
| 73 | + ], |
| 74 | + max_tokens=256, |
| 75 | + presence_penalty=1.0, |
| 76 | + top_p=0.1, |
| 77 | + temperature=0.1, |
| 78 | + # enable_thinking=False, |
| 79 | + ) |
| 80 | +) |
| 81 | +print(res.choices[0].message.content) |
| 82 | +print(res.usage) |
| 83 | +``` |
| 84 | + |
| 85 | +## Rust API |
| 86 | +You can find a more detailed example demonstrating enabling/disabling thinking [here](../mistralrs/examples/smollm3/main.rs). |
| 87 | + |
| 88 | +```rust |
| 89 | +use anyhow::Result; |
| 90 | +use mistralrs::{ |
| 91 | + IsqType, PagedAttentionMetaBuilder, TextMessageRole, TextMessages, TextModelBuilder, |
| 92 | +}; |
| 93 | + |
| 94 | +#[tokio::main] |
| 95 | +async fn main() -> Result<()> { |
| 96 | + let model = TextModelBuilder::new("HuggingFaceTB/SmolLM3-3B") |
| 97 | + .with_isq(IsqType::Q8_0) |
| 98 | + .with_logging() |
| 99 | + .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())? |
| 100 | + .build() |
| 101 | + .await?; |
| 102 | + |
| 103 | + let messages = TextMessages::new() |
| 104 | + // .enable_thinking(false) |
| 105 | + .add_message( |
| 106 | + TextMessageRole::System, |
| 107 | + "You are an AI agent with a specialty in programming.", |
| 108 | + ) |
| 109 | + .add_message( |
| 110 | + TextMessageRole::User, |
| 111 | +        "Hello! How are you? Please write a generic binary search function in Rust.",
| 112 | + ); |
| 113 | + |
| 114 | + let response = model.send_chat_request(messages).await?; |
| 115 | + |
| 116 | + println!("{}", response.choices[0].message.content.as_ref().unwrap()); |
| 117 | + dbg!( |
| 118 | + response.usage.avg_prompt_tok_per_sec, |
| 119 | + response.usage.avg_compl_tok_per_sec |
| 120 | + ); |
| 121 | + |
| 122 | + Ok(()) |
| 123 | +} |
| 124 | +``` |
0 commit comments