
Commit 1b29479

More WIP consolidation of server core handlers
1 parent b9c9604 commit 1b29479

7 files changed: +366 −278 lines changed

mistralrs-server-core/src/chat_completion.rs

Lines changed: 23 additions & 35 deletions
@@ -23,17 +23,18 @@ use serde_json::Value;
 use tokio::sync::mpsc::{Receiver, Sender};
 
 use crate::{
-    completion_base::{base_handle_completion_error, BaseCompletionResponder},
+    completion_base::{
+        base_handle_completion_error, base_process_non_streaming_response, create_response_channel,
+        send_model_request, BaseCompletionResponder, BaseJsonModelError, ErrorToResponse,
+        JsonError, ModelErrorMessage,
+    },
     openai::{
         ChatCompletionRequest, Grammar, JsonSchemaResponseFormat, MessageInnerContent,
         ResponseFormat, StopTokens,
     },
-    streaming::{get_keep_alive_interval, BaseStreamer, DoneState},
+    streaming::{base_create_streamer, get_keep_alive_interval, BaseStreamer, DoneState},
     types::{ExtractedMistralRsState, SharedMistralRsState},
-    util::{
-        create_response_channel, parse_image_url, send_model_request, BaseJsonModelError,
-        ErrorToResponse, JsonError, ModelErrorMessage,
-    },
+    util::parse_image_url,
 };
@@ -53,7 +54,7 @@ use crate::{
 /// chunk
 /// });
 /// ```
-pub type OnChunkCallback =
+pub type ChatCompletionOnChunkCallback =
     Box<dyn Fn(ChatCompletionChunkResponse) -> ChatCompletionChunkResponse + Send + Sync>;
 
 /// A callback function that is executed when the streaming response completes.
@@ -71,15 +72,19 @@ pub type OnChunkCallback =
 /// // Process all chunks for analytics
 /// });
 /// ```
-pub type OnDoneCallback = Box<dyn Fn(&[ChatCompletionChunkResponse]) + Send + Sync>;
+pub type ChatCompletionOnDoneCallback = Box<dyn Fn(&[ChatCompletionChunkResponse]) + Send + Sync>;
 
 /// A streaming response handler.
 ///
 /// It processes incoming response chunks from a model and converts them
 /// into Server-Sent Events (SSE) format for real-time streaming to clients.
-pub type Streamer = BaseStreamer<ChatCompletionChunkResponse, OnChunkCallback, OnDoneCallback>;
+pub type ChatCompletionStreamer = BaseStreamer<
+    ChatCompletionChunkResponse,
+    ChatCompletionOnChunkCallback,
+    ChatCompletionOnDoneCallback,
+>;
 
-impl futures::Stream for Streamer {
+impl futures::Stream for ChatCompletionStreamer {
     type Item = Result<Event, axum::Error>;
 
     /// Polls the stream for the next Server-Sent Event.
@@ -158,7 +163,8 @@ impl futures::Stream for Streamer {
 }
 
 /// Represents different types of chat completion responses.
-pub type ChatCompletionResponder = BaseCompletionResponder<ChatCompletionResponse, Streamer>;
+pub type ChatCompletionResponder =
+    BaseCompletionResponder<ChatCompletionResponse, ChatCompletionStreamer>;
 
 type JsonModelError = BaseJsonModelError<ChatCompletionResponse>;
 impl ErrorToResponse for JsonModelError {}
@@ -475,21 +481,10 @@ pub fn handle_chat_completion_error(
 pub fn create_chat_streamer(
     rx: Receiver<Response>,
     state: SharedMistralRsState,
-    on_chunk: Option<OnChunkCallback>,
-    on_done: Option<OnDoneCallback>,
-) -> Sse<Streamer> {
-    let store_chunks = on_done.is_some();
-
-    let streamer = Streamer {
-        rx,
-        done_state: DoneState::Running,
-        store_chunks,
-        state,
-        chunks: Vec::new(),
-        on_chunk,
-        on_done,
-    };
-
+    on_chunk: Option<ChatCompletionOnChunkCallback>,
+    on_done: Option<ChatCompletionOnDoneCallback>,
+) -> Sse<ChatCompletionStreamer> {
+    let streamer = base_create_streamer(rx, state, on_chunk, on_done);
     let keep_alive_interval = get_keep_alive_interval();
 
     Sse::new(streamer)
@@ -501,15 +496,8 @@ pub async fn process_non_streaming_chat_response(
     rx: &mut Receiver<Response>,
     state: SharedMistralRsState,
 ) -> ChatCompletionResponder {
-    let response = match rx.recv().await {
-        Some(response) => response,
-        None => {
-            let e = anyhow::Error::msg("No response received from the model.");
-            return handle_chat_completion_error(state, e.into());
-        }
-    };
-
-    match_responses(state, response)
+    base_process_non_streaming_response(rx, state, match_responses, handle_chat_completion_error)
+        .await
 }
 
 /// Matches and processes different types of model responses into appropriate chat completion responses.
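
For orientation, the renamed aliases are drop-in replacements for the old `OnChunkCallback`/`OnDoneCallback`/`Streamer` names. A minimal sketch of a caller after this commit — assuming the crate's public module paths, and with the `make_sse` wrapper and both closure bodies invented for illustration:

```rust
use axum::response::Sse;
use mistralrs_core::Response;
use tokio::sync::mpsc::Receiver;

use mistralrs_server_core::{
    chat_completion::{
        create_chat_streamer, ChatCompletionOnChunkCallback, ChatCompletionOnDoneCallback,
        ChatCompletionStreamer,
    },
    types::SharedMistralRsState,
};

// Hypothetical helper: wire illustrative callbacks into the SSE streamer.
fn make_sse(rx: Receiver<Response>, state: SharedMistralRsState) -> Sse<ChatCompletionStreamer> {
    // Pass each chunk through unchanged; a real callback could redact or
    // transform it before it reaches the client.
    let on_chunk: ChatCompletionOnChunkCallback = Box::new(|chunk| chunk);

    // Supplying an on_done callback makes the streamer buffer chunks
    // (store_chunks = on_done.is_some()), so the full sequence is available here.
    let on_done: ChatCompletionOnDoneCallback =
        Box::new(|chunks| println!("stream completed with {} chunks", chunks.len()));

    create_chat_streamer(rx, state, Some(on_chunk), Some(on_done))
}
```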

mistralrs-server-core/src/completion_base.rs

Lines changed: 117 additions & 2 deletions
@@ -2,11 +2,87 @@
 
 use std::error::Error;
 
-use axum::response::Sse;
-use mistralrs_core::MistralRs;
+use anyhow::{Context, Result};
+use axum::{
+    extract::Json,
+    http::StatusCode,
+    response::{IntoResponse, Sse},
+};
+use mistralrs_core::{MistralRs, Request, Response};
+use serde::Serialize;
+use tokio::sync::mpsc::{channel, Receiver, Sender};
 
 use crate::types::SharedMistralRsState;
 
+/// Default buffer size for the response channel used in streaming operations.
+///
+/// This constant defines the maximum number of response messages that can be buffered
+/// in the channel before backpressure is applied. A larger buffer reduces the likelihood
+/// of blocking but uses more memory.
+pub const DEFAULT_CHANNEL_BUFFER_SIZE: usize = 10_000;
+
+/// Trait for converting errors to HTTP responses with appropriate status codes.
+pub(crate) trait ErrorToResponse: Serialize {
+    /// Converts the error to an HTTP response with the specified status code.
+    fn to_response(&self, code: StatusCode) -> axum::response::Response {
+        let mut r = Json(self).into_response();
+        *r.status_mut() = code;
+        r
+    }
+}
+
+/// Standard JSON error response structure.
+#[derive(Serialize, Debug)]
+pub(crate) struct JsonError {
+    pub(crate) message: String,
+}
+
+impl JsonError {
+    /// Creates a new JSON error with the specified message.
+    pub(crate) fn new(message: String) -> Self {
+        Self { message }
+    }
+}
+
+impl std::fmt::Display for JsonError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.message)
+    }
+}
+
+impl std::error::Error for JsonError {}
+
+impl ErrorToResponse for JsonError {}
+
+/// Internal error type for model-related errors with a descriptive message.
+///
+/// This struct wraps error messages from the underlying model and implements
+/// the standard error traits for proper error handling and display.
+#[derive(Debug)]
+pub(crate) struct ModelErrorMessage(pub(crate) String);
+impl std::fmt::Display for ModelErrorMessage {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+impl std::error::Error for ModelErrorMessage {}
+
+/// Generic JSON error response structure
+#[derive(Serialize, Debug)]
+pub(crate) struct BaseJsonModelError<T> {
+    pub(crate) message: String,
+    pub(crate) partial_response: T,
+}
+
+impl<T> BaseJsonModelError<T> {
+    pub(crate) fn new(message: String, partial_response: T) -> Self {
+        Self {
+            message,
+            partial_response,
+        }
+    }
+}
+
 /// Generic responder enum for different completion types
 #[derive(Debug)]
 pub enum BaseCompletionResponder<R, S> {
@@ -22,6 +98,15 @@ pub enum BaseCompletionResponder<R, S> {
     ValidationError(Box<dyn Error>),
 }
 
+/// Creates a channel for response communication.
+pub fn create_response_channel(
+    buffer_size: Option<usize>,
+) -> (Sender<Response>, Receiver<Response>) {
+    let channel_buffer_size = buffer_size.unwrap_or(DEFAULT_CHANNEL_BUFFER_SIZE);
+
+    channel(channel_buffer_size)
+}
+
 /// Generic function to handle completion errors and logging them.
 pub(crate) fn base_handle_completion_error<R, S>(
     state: SharedMistralRsState,
@@ -31,3 +116,33 @@ pub(crate) fn base_handle_completion_error<R, S>(
     MistralRs::maybe_log_error(state, &*e);
     BaseCompletionResponder::InternalError(e.into())
 }
+
+/// Sends a request to the model processing pipeline.
+pub async fn send_model_request(state: &SharedMistralRsState, request: Request) -> Result<()> {
+    let sender = state
+        .get_sender()
+        .context("mistral.rs sender not available.")?;
+
+    sender.send(request).await.map_err(|e| e.into())
+}
+
+/// Generic function to process non-streaming responses.
+pub(crate) async fn base_process_non_streaming_response<R>(
+    rx: &mut Receiver<Response>,
+    state: SharedMistralRsState,
+    match_fn: fn(SharedMistralRsState, Response) -> R,
+    error_handler: fn(
+        SharedMistralRsState,
+        Box<dyn std::error::Error + Send + Sync + 'static>,
+    ) -> R,
+) -> R {
+    let response = match rx.recv().await {
+        Some(response) => response,
+        None => {
+            let e = anyhow::Error::msg("No response received from the model.");
+            return error_handler(state, e.into());
+        }
+    };
+
+    match_fn(state, response)
+}
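
With these helpers in place, a non-streaming endpoint handler inside the crate collapses to one shape: open a channel, send the request, then delegate to the shared processor with endpoint-specific match and error functions. A hedged sketch of that pattern — `MyResponder`, `build_request`, `match_my_responses`, and `handle_my_error` are placeholders, not items from this commit:

```rust
use crate::{
    completion_base::{
        base_process_non_streaming_response, create_response_channel, send_model_request,
    },
    types::SharedMistralRsState,
};

// Placeholder sketch of an endpoint handler built on the consolidated helpers.
async fn respond_non_streaming(state: SharedMistralRsState) -> MyResponder {
    // Channel with the default 10_000-message buffer; pass Some(n) to override.
    let (tx, mut rx) = create_response_channel(None);

    // Endpoint-specific (placeholder): build the engine request, embedding `tx`
    // so the engine can send responses back on this channel.
    let request = build_request(tx);

    if let Err(e) = send_model_request(&state, request).await {
        return handle_my_error(state, e.into());
    }

    base_process_non_streaming_response(&mut rx, state, match_my_responses, handle_my_error).await
}
```

This mirrors what `process_non_streaming_chat_response` now does for the chat endpoint, with `match_responses` and `handle_chat_completion_error` as the concrete functions.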
