276 changes: 274 additions & 2 deletions Cargo.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -160,6 +160,7 @@ mime_guess = "2.0.5"
include_dir = "0.7.4"
http = "1.3.1"
hyper = "1.6.0"
rust-mcp-sdk = { version = "0.4.2", default-features = false, features = ["server", "hyper-server", "2025_03_26"] }

💡 Verification agent

🧩 Analysis chain

Consider documenting the purpose of the date-based feature flag.

The feature flag "2025_03_26" appears to be date-based, which could make future maintenance challenging. Consider adding a comment explaining what this feature represents or check if there's a more semantic feature name available.


🌐 Web query:

What does the "2025_03_26" feature flag mean in rust-mcp-sdk version 0.4.2?

💡 Result:

The "2025_03_26" feature flag in the rust-mcp-sdk version 0.4.2 likely corresponds to a specific version of the Model Context Protocol (MCP) schema. In the Rust MCP ecosystem, feature flags are often used to select different versions of the MCP schema, allowing developers to work with various schema versions as needed.

For instance, the rust-mcp-schema crate provides multiple schema versions, including "2024_11_05" and "draft". To use a specific schema version, you can enable the corresponding feature in your Cargo.toml file. While the "2025_03_26" version isn't explicitly listed in the available versions, it's possible that this feature flag is intended to select that specific schema version.

To utilize the "2025_03_26" schema version, you would typically add the following to your Cargo.toml:

[dependencies]
rust-mcp-schema = { version = "0.1.0", features = ["2025_03_26"] }

This configuration instructs Cargo to compile the rust-mcp-schema crate with the "2025_03_26" feature enabled, thereby selecting that specific schema version.

It's important to note that the availability of the "2025_03_26" schema version depends on its inclusion in the rust-mcp-schema crate. If this version isn't listed among the available features, it may not be supported or released yet. In such cases, you might need to consult the official MCP documentation or the maintainers of the rust-mcp-schema crate for further information.

For more details on the rust-mcp-schema crate and its available schema versions, you can visit the GitHub repository: (github.com)


Document the MCP schema version feature flag

The feature flag "2025_03_26" in rust-mcp-sdk = { version = "0.4.2", … } selects the Model Context Protocol schema v2025-03-26. To improve maintainability:

  • Add an inline comment in Cargo.toml explaining that this flag pins the MCP schema to version 2025-03-26.
  • Confirm that the rust-mcp-schema crate exposes a 2025_03_26 feature; if it doesn’t, coordinate with its maintainers or choose an available schema version.
🤖 Prompt for AI Agents
In Cargo.toml at line 163, the feature flag "2025_03_26" used in rust-mcp-sdk
dependency selects the MCP schema version 2025-03-26 but lacks documentation.
Add an inline comment next to this feature flag explaining that it pins the
Model Context Protocol schema to version 2025-03-26 for clarity and
maintainability. Also verify that the rust-mcp-schema crate supports this
feature flag; if not, coordinate with its maintainers or select a supported
schema version.
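
A minimal sketch of the suggested annotation (the comment wording is illustrative, not taken from the repository):

```toml
# Pins the Model Context Protocol (MCP) schema to revision 2025-03-26;
# keep this in sync with the schema versions exposed by rust-mcp-schema.
rust-mcp-sdk = { version = "0.4.2", default-features = false, features = ["server", "hyper-server", "2025_03_26"] }
```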

bindgen_cuda = { git = "https://github.com/guoqingbao/bindgen_cuda.git", version = "0.1.6" }
rubato = "0.16.2"
rustfft = "6.3.0"
11 changes: 11 additions & 0 deletions README.md
@@ -127,6 +127,7 @@ Please submit requests for new models [here](https://github.com/EricLBuehler/mis
- [Rust API](https://ericlbuehler.github.io/mistral.rs/mistralrs/) & [Python API](mistralrs-pyo3/API.md)
- [Automatic device mapping](docs/DEVICE_MAPPING.md) (multi-GPU, CPU)
- [Chat templates](docs/CHAT_TOK.md) & tokenizer auto-detection
- [MCP protocol](docs/MCP.md) for structured, real-time tool calls

2. **Performance**
- CPU acceleration (MKL, AVX, [NEON](docs/DEVICE_MAPPING.md#arm-neon), [Accelerate](docs/DEVICE_MAPPING.md#apple-accelerate))
@@ -184,6 +185,16 @@ OpenAI API compatible API server
- [Example](examples/server/chat.py)
- [Use or extend the server in other axum projects](https://ericlbuehler.github.io/mistral.rs/mistralrs_server_core/)

### MCP Protocol

Serve the same models over the open [MCP](docs/MCP.md) (Model Context Protocol) in parallel to the HTTP API:

```bash
./mistralrs-server --mcp-port 4321 plain -m Qwen/Qwen3-4B
```

See the [docs](docs/MCP.md) for feature flags, examples and limitations.


### Llama Index integration

3 changes: 3 additions & 0 deletions docs/HTTP.md
@@ -4,6 +4,9 @@ Mistral.rs provides a lightweight OpenAI API compatible HTTP server based on [ax

The API consists of the following endpoints. They can be viewed in your browser interactively by going to `http://localhost:<port>/docs`.

> ℹ️ Besides the HTTP endpoints described below, `mistralrs-server` can also expose the same functionality via the **MCP protocol**.
> Enable it with `--mcp-port <port>` and see [MCP.md](MCP.md) for details.

## Additional object keys

To support additional features, we have extended the completion and chat completion request objects. Both have the same keys added:
202 changes: 202 additions & 0 deletions docs/MCP.md
@@ -0,0 +1,202 @@
# MCP protocol support

`mistralrs-server` can serve **MCP (Model Context Protocol)** traffic next to the regular OpenAI-compatible HTTP interface!

MCP is an open, tool-based protocol that lets clients interact with models through structured *tool calls* instead of free-form HTTP routes.

Under the hood the server uses [`rust-mcp-sdk`](https://crates.io/crates/rust-mcp-sdk) and exposes tools based on the supported modalities of the loaded model.

Exposed tools:

| Tool | Minimum `input` -> `output` modalities | Description |
| -- | -- | -- |
| `chat` | `Text` -> `Text` | Wraps the OpenAI `/v1/chat/completions` endpoint. |


---

## ToC
- [MCP protocol support](#mcp-protocol-support)
- [ToC](#toc)
- [Running](#running)
- [Check if it's working](#check-if-its-working)
- [Example clients](#example-clients)
- [Rust](#rust)
- [Python](#python)
- [HTTP](#http)
- [Limitations](#limitations)

---

## Running

Start the normal HTTP server and add the `--mcp-port` flag to spin up an MCP server on a separate port:

```bash
# --port:     OpenAI-compatible HTTP API
# --mcp-port: MCP protocol endpoint (Streamable HTTP)
./target/release/mistralrs-server \
  --port 1234 \
  --mcp-port 4321 \
  plain -m mistralai/Mistral-7B-Instruct-v0.3
```

## Check if it's working

Run this `curl` command to check the available tools:

```bash
curl -X POST http://localhost:4321/mcp \
  -H "Content-Type: application/json" \
  -d '{
    "jsonrpc": "2.0",
    "id": 2,
    "method": "tools/list",
    "params": {}
  }'
```
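
If the server is up, the response should be a JSON-RPC result listing the available tools, roughly of the following shape (a sketch; the exact `description` and `inputSchema` contents depend on the loaded model and SDK version):

```json
{
  "jsonrpc": "2.0",
  "id": 2,
  "result": {
    "tools": [
      {
        "name": "chat",
        "description": "Wraps the OpenAI /v1/chat/completions endpoint.",
        "inputSchema": { "type": "object" }
      }
    ]
  }
}
```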

## Example clients

### Rust

```rust
use anyhow::Result;
use rust_mcp_sdk::{
    mcp_client::client_runtime,
    schema::{
        CallToolRequestParams, ClientCapabilities, CreateMessageRequest,
        Implementation, InitializeRequestParams, Message, LATEST_PROTOCOL_VERSION,
    },
    ClientSseTransport, ClientSseTransportOptions,
};

struct Handler;
#[async_trait::async_trait]
impl rust_mcp_sdk::mcp_client::ClientHandler for Handler {}

#[tokio::main]
async fn main() -> Result<()> {
    let transport = ClientSseTransport::new(
        "http://localhost:4321/mcp",
        ClientSseTransportOptions::default(),
    )?;

    let details = InitializeRequestParams {
        capabilities: ClientCapabilities::default(),
        client_info: Implementation { name: "mcp-client".into(), version: "0.1".into() },
        protocol_version: LATEST_PROTOCOL_VERSION.into(),
    };

    let client = client_runtime::create_client(details, transport, Handler);
    client.clone().start().await?;

    let req = CreateMessageRequest {
        model: "mistralai/Mistral-7B-Instruct-v0.3".into(),
        messages: vec![Message::user("Explain Rust ownership.")],
        ..Default::default()
    };

    let result = client
        .call_tool(CallToolRequestParams::new("chat", req.into()))
        .await?;

    println!("{}", result.content[0].as_text_content()?.text);
    client.shut_down().await?;
    Ok(())
}
```
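
Building this client requires the SDK's client-side features. A hypothetical `Cargo.toml` entry (the `client` feature name is an assumption; check the `rust-mcp-sdk` docs for the exact feature set of your version):

```toml
[dependencies]
# Assumed feature names; verify against the rust-mcp-sdk documentation.
rust-mcp-sdk = { version = "0.4.2", default-features = false, features = ["client", "2025_03_26"] }
anyhow = "1"
async-trait = "0.1"
tokio = { version = "1", features = ["full"] }
```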

### Python

```py
import asyncio
from mcp import ClientSession
from mcp.client.streamable_http import streamablehttp_client

SERVER_URL = "http://localhost:4321/mcp"

async def main() -> None:
    async with streamablehttp_client(SERVER_URL) as (read, write, _):
        async with ClientSession(read, write) as session:

            # --- INITIALIZE ---
            init_result = await session.initialize()
            print("Server info:", init_result.serverInfo)

            # --- LIST TOOLS ---
            tools = await session.list_tools()
            print("Available tools:", [t.name for t in tools.tools])

            # --- CALL TOOL ---
            resp = await session.call_tool(
                "chat",
                arguments={
                    "messages": [
                        {"role": "user", "content": "Hello MCP 👋"},
                        {"role": "assistant", "content": "Hi there!"}
                    ],
                    "maxTokens": 50,
                    "temperature": 0.7,
                },
            )
            # resp.content is a list[CallToolResultContentItem]; extract text parts
            text = "\n".join(c.text for c in resp.content if c.type == "text")
            print("Model replied:", text)

if __name__ == "__main__":
    asyncio.run(main())
```
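
The example uses the official MCP Python SDK, published on PyPI as `mcp`:

```bash
pip install mcp
```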

### HTTP

**Call a tool:**
```bash
curl -X POST http://localhost:4321/mcp \
  -H "Content-Type: application/json" \
  -d '{
    "jsonrpc": "2.0",
    "id": 3,
    "method": "tools/call",
    "params": {
      "name": "chat",
      "arguments": {
        "messages": [
          { "role": "system", "content": "You are a helpful assistant." },
          { "role": "user", "content": "Hello, what’s the time?" }
        ],
        "maxTokens": 50,
        "temperature": 0.7
      }
    }
  }'
```
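
A successful call returns a JSON-RPC result whose `content` array carries the model output, roughly of the form below (the reply text is illustrative):

```json
{
  "jsonrpc": "2.0",
  "id": 3,
  "result": {
    "content": [
      { "type": "text", "text": "I don't have access to a real-time clock, but..." }
    ]
  }
}
```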

**Initialize:**
```bash
curl -X POST http://localhost:4321/mcp \
  -H "Content-Type: application/json" \
  -d '{
    "jsonrpc": "2.0",
    "id": 1,
    "method": "initialize",
    "params": {}
  }'
```

**List tools:**
```bash
curl -X POST http://localhost:4321/mcp \
  -H "Content-Type: application/json" \
  -d '{
    "jsonrpc": "2.0",
    "id": 2,
    "method": "tools/list",
    "params": {}
  }'
```

## Limitations

- Streaming requests are not implemented.
- No authentication layer is provided – run the MCP port behind a reverse proxy if you need auth (see the sketch below).
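
A minimal nginx sketch of that setup (the domain, certificate paths, and htpasswd file are placeholders):

```nginx
server {
    listen 443 ssl;
    server_name mcp.example.com;            # placeholder domain

    ssl_certificate     /etc/ssl/mcp.crt;   # placeholder certificate
    ssl_certificate_key /etc/ssl/mcp.key;

    location /mcp {
        auth_basic           "MCP";
        auth_basic_user_file /etc/nginx/htpasswd;  # created with htpasswd
        proxy_pass           http://127.0.0.1:4321;
    }
}
```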

Contributions to extend MCP coverage (streaming, more tools, auth hooks) are welcome!
1 change: 1 addition & 0 deletions docs/README.md
@@ -38,6 +38,7 @@
- [Sampling](SAMPLING.md)
- [TOML selector](TOML_SELECTOR.md)
- [Tool calling](TOOL_CALLING.md)
- [MCP protocol](MCP.md)

## Cross-device inference
- [Device mapping](DEVICE_MAPPING.md)
4 changes: 2 additions & 2 deletions mistralrs-core/src/lib.rs
@@ -94,8 +94,8 @@ pub use pipeline::{
MultimodalPromptPrefixer, NormalLoader, NormalLoaderBuilder, NormalLoaderType,
NormalSpecificConfig, Phi2Loader, Phi3Loader, Phi3VLoader, Qwen2Loader, SpeculativeConfig,
SpeculativeLoader, SpeculativePipeline, SpeechLoader, SpeechPipeline, Starcoder2Loader,
TokenSource, VisionLoader, VisionLoaderBuilder, VisionLoaderType, VisionSpecificConfig,
UQFF_MULTI_FILE_DELIMITER,
SupportedModality, TokenSource, VisionLoader, VisionLoaderBuilder, VisionLoaderType,
VisionSpecificConfig, UQFF_MULTI_FILE_DELIMITER,
};
pub use request::{
ApproximateUserLocation, Constraint, DetokenizationRequest, ImageGenerationResponseFormat,
2 changes: 1 addition & 1 deletion mistralrs-core/src/pipeline/mod.rs
@@ -77,7 +77,7 @@ pub use crate::kv_cache::{
Cache, CacheManager, EitherCache, KvCache, LayerCaches, NormalCache, NormalCacheType,
};

#[derive(Clone)]
#[derive(Clone, PartialEq, Eq)]
pub enum SupportedModality {
Text,
Audio,
3 changes: 3 additions & 0 deletions mistralrs-server/Cargo.toml
@@ -29,6 +29,8 @@ serde.workspace = true
serde_json.workspace = true
tokio.workspace = true
tracing.workspace = true
rust-mcp-sdk.workspace = true
async-trait.workspace = true

[features]
cuda = ["mistralrs-core/cuda", "mistralrs-server-core/cuda"]
@@ -43,3 +45,4 @@ accelerate = ["mistralrs-core/accelerate", "mistralrs-server-core/accelerate"]
mkl = ["mistralrs-core/mkl", "mistralrs-server-core/mkl"]
nccl = ["mistralrs-core/nccl", "mistralrs-server-core/nccl"]
ring = ["mistralrs-core/ring", "mistralrs-server-core/ring"]
mcp-server = ["rust-mcp-sdk/server", "rust-mcp-sdk/hyper-server"]
66 changes: 50 additions & 16 deletions mistralrs-server/src/main.rs
@@ -1,6 +1,8 @@
use anyhow::Result;
use clap::Parser;
use mistralrs_core::{initialize_logging, ModelSelected, TokenSource};
use rust_mcp_sdk::schema::LATEST_PROTOCOL_VERSION;
Collaborator:

@EricLBuehler for my own curiosity, since there is a feature flag for version "2025_03_26" on the crate dep, will this LATEST_PROTOCOL_VERSION constant be pinned to that or will it be whatever the latest schema / proto version is (and could change in the future)?

Owner Author:

@matthewhaynesonline we will need to keep the version of rust_mcp_sdk in sync with the latest schema/proto version.

use tokio::join;
use tracing::info;

use mistralrs_server_core::{
@@ -10,6 +12,7 @@ use mistralrs_server_core::{

mod interactive_mode;
use interactive_mode::interactive_mode;
mod mcp_server;

#[derive(Parser)]
#[command(version, about, long_about = None)]
@@ -24,7 +27,7 @@ struct Args {

/// Port to serve on.
#[arg(short, long)]
port: Option<String>,
port: Option<u16>,

/// Log all responses and requests to this file
#[clap(long, short)]
@@ -134,6 +137,10 @@ struct Args {
/// Enable thinking for interactive mode and models that support it.
#[arg(long = "enable-thinking")]
enable_thinking: bool,

/// Port to serve MCP protocol on
#[arg(long)]
mcp_port: Option<u16>,
}

fn parse_token_source(s: &str) -> Result<TokenSource, String> {
@@ -185,27 +192,54 @@ async fn main() -> Result<()> {
return Ok(());
}

// Needs to be after the .build call as that is where the daemon waits.
let setting_server = if !args.interactive_mode {
let port = args.port.expect("Interactive mode was not specified, so expected port to be specified. Perhaps you forgot `-i` or `--port`?");
let ip = args.serve_ip.unwrap_or_else(|| "0.0.0.0".to_string());
if !args.interactive_mode && args.port.is_none() && args.mcp_port.is_none() {
anyhow::bail!("Interactive mode was not specified, so expected port to be specified. Perhaps you forgot `-i` or `--port` or `--mcp-port`?")
}

// Create listener early to validate address before model loading
let listener = tokio::net::TcpListener::bind(format!("{ip}:{port}")).await?;
Some((listener, ip, port))
let mcp_port = if let Some(port) = args.mcp_port {
let host = args
.serve_ip
.clone()
.unwrap_or_else(|| "0.0.0.0".to_string());
info!("MCP server listening on http://{host}:{port}/mcp.");
info!("MCP protocol version is {}.", LATEST_PROTOCOL_VERSION);
let mcp_server = mcp_server::create_http_mcp_server(mistralrs.clone(), host, port);

tokio::spawn(async move {
if let Err(e) = mcp_server.await {
eprintln!("MCP server error: {e}");
}
})
} else {
None
tokio::spawn(async {})
};

let app = MistralRsServerRouterBuilder::new()
.with_mistralrs(mistralrs)
.build()
.await?;
let oai_port = if let Some(port) = args.port {
let ip = args
.serve_ip
.clone()
.unwrap_or_else(|| "0.0.0.0".to_string());

if let Some((listener, ip, port)) = setting_server {
info!("Serving on http://{ip}:{}.", port);
axum::serve(listener, app).await?;
// Create listener early to validate address before model loading
let listener = tokio::net::TcpListener::bind(format!("{ip}:{port}")).await?;

let app = MistralRsServerRouterBuilder::new()
.with_mistralrs(mistralrs)
.build()
.await?;

info!("OpenAI-compatible server listening on http://{ip}:{port}.");

tokio::spawn(async move {
if let Err(e) = axum::serve(listener, app).await {
eprintln!("OpenAI server error: {e}");
}
})
} else {
tokio::spawn(async {})
};

let (_, _) = join!(oai_port, mcp_port);

Ok(())
}