276 changes: 274 additions & 2 deletions Cargo.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -160,6 +160,7 @@ mime_guess = "2.0.5"
include_dir = "0.7.4"
http = "1.3.1"
hyper = "1.6.0"
rust-mcp-sdk = { version = "0.4.2", default-features = false, features = ["server", "hyper-server", "2025_03_26"] }

💡 Verification agent

🧩 Analysis chain

Consider documenting the purpose of the date-based feature flag.

The feature flag "2025_03_26" appears to be date-based, which could make future maintenance challenging. Consider adding a comment explaining what this feature represents, or check whether a more semantic feature name is available.


🌐 Web query:

What does the "2025_03_26" feature flag mean in rust-mcp-sdk version 0.4.2?

💡 Result:

The "2025_03_26" feature flag in the rust-mcp-sdk version 0.4.2 likely corresponds to a specific version of the Model Context Protocol (MCP) schema. In the Rust MCP ecosystem, feature flags are often used to select different versions of the MCP schema, allowing developers to work with various schema versions as needed.

For instance, the rust-mcp-schema crate provides multiple schema versions, including "2024_11_05" and "draft". To use a specific schema version, you can enable the corresponding feature in your Cargo.toml file. While the "2025_03_26" version isn't explicitly listed in the available versions, it's possible that this feature flag is intended to select that specific schema version.

To utilize the "2025_03_26" schema version, you would typically add the following to your Cargo.toml:

```toml
[dependencies]
rust-mcp-schema = { version = "0.1.0", features = ["2025_03_26"] }
```

This configuration instructs Cargo to compile the rust-mcp-schema crate with the "2025_03_26" feature enabled, thereby selecting that specific schema version.

It's important to note that the availability of the "2025_03_26" schema version depends on its inclusion in the rust-mcp-schema crate. If this version isn't listed among the available features, it may not be supported or released yet. In such cases, you might need to consult the official MCP documentation or the maintainers of the rust-mcp-schema crate for further information.

For more details on the rust-mcp-schema crate and its available schema versions, you can visit the GitHub repository: (github.com)


Document the MCP schema version feature flag

The feature flag "2025_03_26" in rust-mcp-sdk = { version = "0.4.2", … } selects the Model Context Protocol schema v2025-03-26. To improve maintainability:

  • Add an inline comment in Cargo.toml explaining that this flag pins the MCP schema to version 2025-03-26.
  • Confirm that the rust-mcp-schema crate exposes a 2025_03_26 feature; if it doesn’t, coordinate with its maintainers or choose an available schema version.
🤖 Prompt for AI Agents
In Cargo.toml at line 163, the feature flag "2025_03_26" used in the rust-mcp-sdk
dependency selects the MCP schema version 2025-03-26 but lacks documentation.
Add an inline comment next to this feature flag explaining that it pins the
Model Context Protocol schema to version 2025-03-26 for clarity and
maintainability. Also verify that the rust-mcp-schema crate supports this
feature flag; if not, coordinate with its maintainers or select a supported
schema version.
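
For illustration, a minimal sketch of what that inline comment could look like; the mapping of `2025_03_26` to the 2025-03-26 MCP schema revision is the reviewer's assumption and should be confirmed against the features actually exposed by `rust-mcp-schema`:

```toml
# "2025_03_26" pins the Model Context Protocol schema to the 2025-03-26
# revision re-exported by rust-mcp-sdk; bump this feature deliberately when
# moving to a newer MCP spec.
rust-mcp-sdk = { version = "0.4.2", default-features = false, features = ["server", "hyper-server", "2025_03_26"] }
```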

bindgen_cuda = { git = "https://github.com/guoqingbao/bindgen_cuda.git", version = "0.1.6" }
rubato = "0.16.2"
rustfft = "6.3.0"
11 changes: 11 additions & 0 deletions README.md
@@ -127,6 +127,7 @@ Please submit requests for new models [here](https://github.com/EricLBuehler/mis
- [Rust API](https://ericlbuehler.github.io/mistral.rs/mistralrs/) & [Python API](mistralrs-pyo3/API.md)
- [Automatic device mapping](docs/DEVICE_MAPPING.md) (multi-GPU, CPU)
- [Chat templates](docs/CHAT_TOK.md) & tokenizer auto-detection
- [MCP protocol](docs/MCP.md) for structured, realtime tool calls

2. **Performance**
- CPU acceleration (MKL, AVX, [NEON](docs/DEVICE_MAPPING.md#arm-neon), [Accelerate](docs/DEVICE_MAPPING.md#apple-accelerate))
@@ -184,6 +185,16 @@ OpenAI API compatible API server
- [Example](examples/server/chat.py)
- [Use or extend the server in other axum projects](https://ericlbuehler.github.io/mistral.rs/mistralrs_server_core/)

### MCP Protocol

Serve the same models over the open [MCP](docs/MCP.md) (Model Context Protocol) in parallel to the HTTP API:

```bash
./mistralrs-server --mcp-port 4321 plain -m Qwen/Qwen3-4B
```

See the [docs](docs/MCP.md) for feature flags, examples and limitations.


### Llama Index integration

3 changes: 3 additions & 0 deletions docs/HTTP.md
@@ -4,6 +4,9 @@ Mistral.rs provides a lightweight OpenAI API compatible HTTP server based on [ax

The API consists of the following endpoints. They can be viewed in your browser interactively by going to `http://localhost:<port>/docs`.

> ℹ️ Besides the HTTP endpoints described below, `mistralrs-server` can also expose the same functionality via the **MCP protocol**.
> Enable it with `--mcp-port <port>` and see [MCP.md](MCP.md) for details.

## Additional object keys

To support additional features, we have extended the completion and chat completion request objects. Both have the same keys added:
100 changes: 100 additions & 0 deletions docs/MCP.md
@@ -0,0 +1,100 @@
# MCP protocol support

`mistralrs-server` can serve **MCP (Model Context Protocol)** traffic next to the regular OpenAI-compatible HTTP interface.
MCP is an open, tool-based protocol that lets clients interact with models through structured *tool calls* instead of free-form HTTP routes.
Under the hood the server uses [`rust-mcp-sdk`](https://crates.io/crates/rust-mcp-sdk) and exposes a single tool called **`chat`** that mirrors the behaviour of the `/v1/chat/completions` endpoint.

---

## 1. Building

Support for MCP is compiled in by default because the workspace enables the `server` and `hyper-server` features of `rust-mcp-sdk`.
When compiling the `mistralrs-server` crate on its own, outside the workspace, enable the `mcp-server` Cargo feature manually:

```bash
cargo build -p mistralrs-server --release --features "mcp-server"
```

## 2. Running

Start the normal HTTP server and add the `--mcp-port` flag to spin up an MCP server on a separate port:

```bash
# --port:     OpenAI-compatible HTTP API
# --mcp-port: MCP protocol endpoint (SSE over HTTP)
./target/release/mistralrs-server \
    --port 1234 \
    --mcp-port 4321 \
    plain -m mistralai/Mistral-7B-Instruct-v0.3
```

*`--mcp-port` is independent of `--port` – you can run the HTTP and MCP servers on entirely separate ports, or omit `--port` when you only need MCP.*

The server prints an extra line such as

```
MCP - listening on http://0.0.0.0:4321
```

## 3. Capabilities announced to clients

At start-up the MCP handler advertises the following `InitializeResult` (abridged):

```jsonc
{
"server_info": { "name": "mistralrs", "version": "<crate-version>" },
"protocol_version": "2025-03-26", // latest spec version from rust-mcp-sdk
"instructions": "use tool 'chat'",
"capabilities": {
"tools": {}
}
}
```

Only one tool is currently exposed:

| tool | description |
|------|------------------------------------------------------|
| `chat` | Wraps the OpenAI `/v1/chat/completions` endpoint. |

## 4. Calling the `chat` tool

Clients send a [`CallToolRequest`](https://docs.rs/rust-mcp-schema/latest/rust_mcp_schema/struct.CallToolRequest.html) event where `params.name` is `"chat"` and `params.arguments` contains a standard MCP [`CreateMessageRequest`](https://docs.rs/rust-mcp-schema/latest/rust_mcp_schema/struct.CreateMessageRequest.html).

Example request (sent as SSE `POST /mcp/stream` or via the convenience helpers in `rust-mcp-sdk`):

```jsonc
{
"kind": "callToolRequest",
"id": "123",
"params": {
"name": "chat",
"arguments": {
"model": "mistralai/Mistral-7B-Instruct-v0.3",
"messages": [
{ "role": "user", "content": "Explain Rust ownership." }
]
}
}
}
```

The response is a `CallToolResult` event whose `content` array contains a single `TextContent` item with the assistant response.

```jsonc
{
"kind": "callToolResult",
"id": "123",
"content": [
{ "type": "text", "text": "Rust’s ownership system ..." }
]
}
```

Error cases are mapped to `CallToolError` with `is_error = true`.
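
For comparison, a hypothetical error payload could look like the sketch below; the exact field names depend on how `rust-mcp-sdk` serializes `CallToolError`, so treat this as illustrative rather than a wire-format guarantee:

```jsonc
{
  "kind": "callToolResult",
  "id": "123",
  "is_error": true,
  "content": [
    { "type": "text", "text": "ModelError: <description from the engine>" }
  ]
}
```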

## 5. Limitations & future work

- Only synchronous, single-shot requests are supported right now.
- Streaming responses (`partialCallToolResult`) are not yet implemented.
- No authentication layer is provided – run the MCP port behind a reverse proxy if you need auth.

Contributions to extend MCP coverage (streaming, more tools, auth hooks) are welcome!
1 change: 1 addition & 0 deletions docs/README.md
@@ -38,6 +38,7 @@
- [Sampling](SAMPLING.md)
- [TOML selector](TOML_SELECTOR.md)
- [Tool calling](TOOL_CALLING.md)
- [MCP protocol](MCP.md)

## Cross-device inference
- [Device mapping](DEVICE_MAPPING.md)
3 changes: 3 additions & 0 deletions mistralrs-server/Cargo.toml
@@ -29,6 +29,8 @@ serde.workspace = true
serde_json.workspace = true
tokio.workspace = true
tracing.workspace = true
rust-mcp-sdk.workspace = true
async-trait.workspace = true

[features]
cuda = ["mistralrs-core/cuda", "mistralrs-server-core/cuda"]
@@ -43,3 +45,4 @@ accelerate = ["mistralrs-core/accelerate", "mistralrs-server-core/accelerate"]
mkl = ["mistralrs-core/mkl", "mistralrs-server-core/mkl"]
nccl = ["mistralrs-core/nccl", "mistralrs-server-core/nccl"]
ring = ["mistralrs-core/ring", "mistralrs-server-core/ring"]
mcp-server = ["rust-mcp-sdk/server", "rust-mcp-sdk/hyper-server"]
23 changes: 22 additions & 1 deletion mistralrs-server/src/main.rs
@@ -10,6 +10,7 @@ use mistralrs_server_core::{

mod interactive_mode;
use interactive_mode::interactive_mode;
mod mcp_server;

#[derive(Parser)]
#[command(version, about, long_about = None)]
@@ -134,6 +135,10 @@ struct Args {
/// Enable thinking for interactive mode and models that support it.
#[arg(long = "enable-thinking")]
enable_thinking: bool,

/// Port to serve MCP protocol on
#[arg(long)]
mcp_port: Option<u16>,
}

fn parse_token_source(s: &str) -> Result<TokenSource, String> {
@@ -188,7 +193,10 @@ async fn main() -> Result<()> {
// Needs to be after the .build call as that is where the daemon waits.
let setting_server = if !args.interactive_mode {
let port = args.port.expect("Interactive mode was not specified, so expected port to be specified. Perhaps you forgot `-i` or `--port`?");
let ip = args.serve_ip.unwrap_or_else(|| "0.0.0.0".to_string());
let ip = args
.serve_ip
.clone()
.unwrap_or_else(|| "0.0.0.0".to_string());

// Create listener early to validate address before model loading
let listener = tokio::net::TcpListener::bind(format!("{ip}:{port}")).await?;
@@ -197,6 +205,19 @@
None
};

if let Some(port) = args.mcp_port {
let host = args
.serve_ip
.clone()
.unwrap_or_else(|| "0.0.0.0".to_string());
let mcp_server = mcp_server::create_mcp_server(mistralrs.clone(), host, port);
tokio::spawn(async move {
if let Err(e) = mcp_server.start().await {
eprintln!("MCP server error: {e}");
}
});
}

let app = MistralRsServerRouterBuilder::new()
.with_mistralrs(mistralrs)
.build()
95 changes: 95 additions & 0 deletions mistralrs-server/src/mcp_server.rs
@@ -0,0 +1,95 @@
use async_trait::async_trait;
use rust_mcp_sdk::{
mcp_server::{hyper_server, HyperServerOptions, ServerHandler},
schema::{
schema_utils::CallToolError, CallToolRequest, CallToolResult, CallToolResultContentItem,
Implementation, InitializeResult, ServerCapabilities, ServerCapabilitiesTools, TextContent,
LATEST_PROTOCOL_VERSION,
},
};
use std::io;

use mistralrs_server_core::{
chat_completion::{create_response_channel, parse_request},
types::SharedMistralRsState,
};

pub struct MistralMcpHandler {
pub state: SharedMistralRsState,
}

#[async_trait]
impl ServerHandler for MistralMcpHandler {
async fn handle_call_tool_request(
&self,
request: CallToolRequest,
_runtime: &dyn rust_mcp_sdk::McpServer,
) -> std::result::Result<CallToolResult, CallToolError> {
if request.params.name != "chat" {
return Err(CallToolError::unknown_tool(request.params.name));
}
let args = request.params.arguments.into();
let req: rust_mcp_sdk::schema::CreateMessageRequest =
serde_json::from_value(args).map_err(|e| CallToolError::new(io::Error::other(e)))?;
// Translate to ChatCompletionRequest
let chat_req: mistralrs_server_core::openai::ChatCompletionRequest =
serde_json::from_value(serde_json::to_value(req).unwrap())
.map_err(CallToolError::new)?;

🛠️ Refactor suggestion

Optimize the request conversion to avoid unnecessary serialization.

The current implementation serializes to JSON and then deserializes, which is inefficient. Consider implementing a direct conversion method or using a more efficient mapping approach.

```diff
-        let args = request.params.arguments.into();
-        let req: rust_mcp_sdk::schema::CreateMessageRequest =
-            serde_json::from_value(args).map_err(|e| CallToolError::new(io::Error::other(e)))?;
-        // Translate to ChatCompletionRequest
-        let chat_req: mistralrs_server_core::openai::ChatCompletionRequest =
-            serde_json::from_value(serde_json::to_value(req).unwrap())
-                .map_err(CallToolError::new)?;
+        let args = request.params.arguments.into();
+        let req: rust_mcp_sdk::schema::CreateMessageRequest =
+            serde_json::from_value(args).map_err(|e| CallToolError::new(io::Error::other(e)))?;
+        // TODO: Implement direct conversion from CreateMessageRequest to ChatCompletionRequest
+        // to avoid the overhead of JSON serialization/deserialization
+        let chat_req: mistralrs_server_core::openai::ChatCompletionRequest =
+            serde_json::from_value(serde_json::to_value(req).unwrap())
+                .map_err(CallToolError::new)?;
```

Would you like me to help implement a direct conversion method between these request types?

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
```diff
-        let req: rust_mcp_sdk::schema::CreateMessageRequest =
-            serde_json::from_value(args).map_err(|e| CallToolError::new(io::Error::other(e)))?;
-        // Translate to ChatCompletionRequest
-        let chat_req: mistralrs_server_core::openai::ChatCompletionRequest =
-            serde_json::from_value(serde_json::to_value(req).unwrap())
-                .map_err(CallToolError::new)?;
+        let args = request.params.arguments.into();
+        let req: rust_mcp_sdk::schema::CreateMessageRequest =
+            serde_json::from_value(args).map_err(|e| CallToolError::new(io::Error::other(e)))?;
+        // TODO: Implement direct conversion from CreateMessageRequest to ChatCompletionRequest
+        // to avoid the overhead of JSON serialization/deserialization
+        let chat_req: mistralrs_server_core::openai::ChatCompletionRequest =
+            serde_json::from_value(serde_json::to_value(req).unwrap())
+                .map_err(CallToolError::new)?;
```
🤖 Prompt for AI Agents
In mistralrs-server/src/mcp_server.rs around lines 32 to 37, the code
inefficiently converts CreateMessageRequest to ChatCompletionRequest by
serializing to JSON and then deserializing. To fix this, implement a direct
conversion method such as From or TryFrom trait between these types or manually
map fields from CreateMessageRequest to ChatCompletionRequest without
intermediate JSON serialization.


let (tx, mut rx) = create_response_channel(None);
let (request, _is_streaming) = parse_request(chat_req, self.state.clone(), tx)
.await
.map_err(|e| CallToolError::new(io::Error::other(e.to_string())))?;
mistralrs_server_core::chat_completion::send_request(&self.state, request)
.await
.map_err(|e| CallToolError::new(io::Error::other(e.to_string())))?;
match rx.recv().await {
Some(mistralrs_core::Response::Done(resp)) => {
let content = resp
.choices
.iter()
.filter_map(|c| c.message.content.clone())
.collect::<Vec<_>>()
.join("\n");
Ok(CallToolResult {
content: vec![CallToolResultContentItem::TextContent(TextContent::new(
content, None,
))],
is_error: None,
meta: None,
})
}
Some(mistralrs_core::Response::ModelError(msg, _)) => {
Err(CallToolError::new(io::Error::other(msg)))
}
Some(_) | None => Err(CallToolError::new(io::Error::other("no response"))),
}

💡 Verification agent

🛠️ Refactor suggestion

🧩 Analysis chain

Review the response handling for completeness and structure preservation.

The current implementation has potential issues:

  1. It only receives one response, which might miss subsequent streaming responses
  2. Joining multiple choice contents with newlines loses the original response structure

Consider preserving the response structure or documenting the expected behavior.


🏁 Script executed:

```bash
#!/bin/bash
# Check if streaming responses are used elsewhere in the codebase
rg -A 5 "Response::Chunk|is_streaming.*true" --type rust
```

Length of output: 10753


Ensure mcp_server handles streaming responses and preserves choice structure

The current match in mistralrs-server/src/mcp_server.rs (lines 46–66) only handles Response::Done and treats any other variant (including streaming chunks) as an error. Since the core and examples use Response::Chunk for streaming, this will break any tool calls configured for streaming:

  • Streaming tool responses will hit the Some(_) arm and return a “no response” error.
  • Joining all choice contents with "\n" flattens multiple messages and loses per-choice metadata.

Please update this handler to either:

  • Accumulate Response::Chunk variants (e.g. buffer incoming chunk.choices until a terminal signal) and then emit a single CallToolResult (a sketch follows below), or
  • Forward each chunk as it arrives (e.g. streaming CallToolResultContentItem::TextContent), or
  • Clearly document that mcp_server does not support streaming tool responses and ensure the engine never emits chunks for this endpoint.
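
As a non-authoritative sketch of the first option (buffering chunks until a terminal signal), here is what the receive loop in `handle_call_tool_request` could become. The chunk-side field names (`choices`, `delta.content`, `finish_reason`) are assumed to mirror the OpenAI-style types used elsewhere in mistralrs and must be verified against `mistralrs_core` before adopting this:

```rust
// Hedged sketch: accumulate streamed chunks into a single CallToolResult.
// `Response::Chunk`, `choice.delta.content` and `choice.finish_reason` are
// assumed names; check them against the real mistralrs_core definitions.
let mut buffered = String::new();
loop {
    match rx.recv().await {
        // Non-streaming requests still resolve with a single `Done`.
        Some(mistralrs_core::Response::Done(resp)) => {
            let content = resp
                .choices
                .iter()
                .filter_map(|c| c.message.content.clone())
                .collect::<Vec<_>>()
                .join("\n");
            buffered.push_str(&content);
            break;
        }
        // Streaming requests deliver incremental deltas; stop once any
        // choice reports a finish reason.
        Some(mistralrs_core::Response::Chunk(chunk)) => {
            let mut finished = false;
            for choice in &chunk.choices {
                if let Some(text) = &choice.delta.content {
                    buffered.push_str(text);
                }
                finished |= choice.finish_reason.is_some();
            }
            if finished {
                break;
            }
        }
        Some(mistralrs_core::Response::ModelError(msg, _)) => {
            return Err(CallToolError::new(io::Error::other(msg)));
        }
        Some(_) | None => return Err(CallToolError::new(io::Error::other("no response"))),
    }
}
Ok(CallToolResult {
    content: vec![CallToolResultContentItem::TextContent(TextContent::new(
        buffered, None,
    ))],
    is_error: None,
    meta: None,
})
```

Forwarding chunks as they arrive would instead require emitting multiple results per request, which depends on `rust-mcp-sdk` exposing a streaming-capable result type.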

}
}

pub fn create_mcp_server(
state: SharedMistralRsState,
host: String,
port: u16,
) -> rust_mcp_sdk::mcp_server::HyperServer {
let server_details = InitializeResult {
server_info: Implementation {
name: "mistralrs".to_string(),
version: env!("CARGO_PKG_VERSION").to_string(),
},
capabilities: ServerCapabilities {
tools: Some(ServerCapabilitiesTools { list_changed: None }),
..Default::default()
},
meta: None,
instructions: Some("use tool 'chat'".to_string()),
protocol_version: LATEST_PROTOCOL_VERSION.to_string(),
};
let handler = MistralMcpHandler { state };
let opts = HyperServerOptions {
host,
port,
..Default::default()
};
hyper_server::create_server(server_details, handler, opts)
}