Commit b9c9604

More work on consolidating completions and chat completions
1 parent f83952e · commit b9c9604

File tree: 6 files changed, 91 additions & 79 deletions

mistralrs-server-core/src/chat_completion.rs

Lines changed: 9 additions & 54 deletions
@@ -1,6 +1,6 @@
 //! ## Chat Completions functionality and route handler.
 
-use std::{error::Error, ops::Deref, pin::Pin, task::Poll, time::Duration};
+use std::{ops::Deref, pin::Pin, task::Poll, time::Duration};
 
 use anyhow::{Context, Result};
 use axum::{
@@ -19,20 +19,20 @@ use mistralrs_core::{
     NormalRequest, Request, RequestMessage, Response, SamplingParams,
     StopTokens as InternalStopTokens,
 };
-use serde::Serialize;
 use serde_json::Value;
 use tokio::sync::mpsc::{Receiver, Sender};
 
 use crate::{
+    completion_base::{base_handle_completion_error, BaseCompletionResponder},
     openai::{
         ChatCompletionRequest, Grammar, JsonSchemaResponseFormat, MessageInnerContent,
         ResponseFormat, StopTokens,
     },
-    streaming::{get_keep_alive_interval, DoneState},
+    streaming::{get_keep_alive_interval, BaseStreamer, DoneState},
     types::{ExtractedMistralRsState, SharedMistralRsState},
     util::{
-        create_response_channel, parse_image_url, send_model_request, ErrorToResponse, JsonError,
-        ModelErrorMessage,
+        create_response_channel, parse_image_url, send_model_request, BaseJsonModelError,
+        ErrorToResponse, JsonError, ModelErrorMessage,
     },
 };
 
@@ -77,22 +77,7 @@ pub type OnDoneCallback = Box<dyn Fn(&[ChatCompletionChunkResponse]) + Send + Sy
 ///
 /// It processes incoming response chunks from a model and converts them
 /// into Server-Sent Events (SSE) format for real-time streaming to clients.
-pub struct Streamer {
-    /// Channel receiver for incoming model responses
-    rx: Receiver<Response>,
-    /// Current state of the streaming operation
-    done_state: DoneState,
-    /// Underlying mistral.rs instance
-    state: SharedMistralRsState,
-    /// Whether to store chunks for the completion callback
-    store_chunks: bool,
-    /// All chunks received during streaming (if `store_chunks` is true)
-    chunks: Vec<ChatCompletionChunkResponse>,
-    /// Optional callback to process each chunk before sending
-    on_chunk: Option<OnChunkCallback>,
-    /// Optional callback to execute when streaming completes
-    on_done: Option<OnDoneCallback>,
-}
+pub type Streamer = BaseStreamer<ChatCompletionChunkResponse, OnChunkCallback, OnDoneCallback>;
 
 impl futures::Stream for Streamer {
     type Item = Result<Event, axum::Error>;
@@ -173,37 +158,9 @@ impl futures::Stream for Streamer {
 }
 
 /// Represents different types of chat completion responses.
-pub enum ChatCompletionResponder {
-    /// Server-Sent Events streaming response
-    Sse(Sse<Streamer>),
-    /// Complete JSON response for non-streaming requests
-    Json(ChatCompletionResponse),
-    /// Model error with partial response data
-    ModelError(String, ChatCompletionResponse),
-    /// Internal server error
-    InternalError(Box<dyn Error>),
-    /// Request validation error
-    ValidationError(Box<dyn Error>),
-}
-
-/// JSON error response structure for model errors.
-#[derive(Serialize)]
-struct JsonModelError {
-    message: String,
-    /// Partial response data that was generated before the error occurred
-    partial_response: ChatCompletionResponse,
-}
-
-impl JsonModelError {
-    /// Creates a new JSON model error with message and partial response.
-    fn new(message: String, partial_response: ChatCompletionResponse) -> Self {
-        Self {
-            message,
-            partial_response,
-        }
-    }
-}
+pub type ChatCompletionResponder = BaseCompletionResponder<ChatCompletionResponse, Streamer>;
 
+type JsonModelError = BaseJsonModelError<ChatCompletionResponse>;
 impl ErrorToResponse for JsonModelError {}
 
 impl IntoResponse for ChatCompletionResponder {
@@ -511,9 +468,7 @@ pub fn handle_chat_completion_error(
     state: SharedMistralRsState,
     e: Box<dyn std::error::Error + Send + Sync + 'static>,
 ) -> ChatCompletionResponder {
-    let e = anyhow::Error::msg(e.to_string());
-    MistralRs::maybe_log_error(state, &*e);
-    ChatCompletionResponder::InternalError(e.into())
+    base_handle_completion_error(state, e)
 }
 
 /// Creates a SSE streamer for chat completions with optional callbacks.

mistralrs-server-core/src/completion_base.rs

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+//! Base functionality for completions.
+
+use std::error::Error;
+
+use axum::response::Sse;
+use mistralrs_core::MistralRs;
+
+use crate::types::SharedMistralRsState;
+
+/// Generic responder enum for different completion types
+#[derive(Debug)]
+pub enum BaseCompletionResponder<R, S> {
+    /// Server-Sent Events streaming response
+    Sse(Sse<S>),
+    /// Complete JSON response for non-streaming requests
+    Json(R),
+    /// Model error with partial response data
+    ModelError(String, R),
+    /// Internal server error
+    InternalError(Box<dyn Error>),
+    /// Request validation error
+    ValidationError(Box<dyn Error>),
+}
+
+/// Generic function to handle completion errors and logging them.
+pub(crate) fn base_handle_completion_error<R, S>(
+    state: SharedMistralRsState,
+    e: Box<dyn std::error::Error + Send + Sync + 'static>,
+) -> BaseCompletionResponder<R, S> {
+    let e = anyhow::Error::msg(e.to_string());
+    MistralRs::maybe_log_error(state, &*e);
+    BaseCompletionResponder::InternalError(e.into())
+}
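
What this new module buys is visible in the other files of this commit: each route module now instantiates these generics instead of carrying its own responder enum and error path. A minimal sketch of the pattern, taken from the chat_completion.rs hunks above (completions.rs follows the same shape):

// The route-specific responder becomes a thin alias over the generic enum:
pub type ChatCompletionResponder = BaseCompletionResponder<ChatCompletionResponse, Streamer>;

// ...and the route-specific error handler delegates to the shared helper, which
// logs via MistralRs::maybe_log_error and wraps the error as InternalError:
pub fn handle_chat_completion_error(
    state: SharedMistralRsState,
    e: Box<dyn std::error::Error + Send + Sync + 'static>,
) -> ChatCompletionResponder {
    base_handle_completion_error(state, e)
}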

mistralrs-server-core/src/completions.rs

Lines changed: 6 additions & 25 deletions
@@ -1,5 +1,4 @@
 use std::{
-    error::Error,
     pin::Pin,
     sync::Arc,
     task::{Context, Poll},
@@ -19,16 +18,17 @@ use mistralrs_core::{
     CompletionResponse, Constraint, DrySamplingParams, MistralRs, NormalRequest, Request,
     RequestMessage, Response, SamplingParams, StopTokens as InternalStopTokens,
 };
-use serde::Serialize;
 use tokio::sync::mpsc::{Receiver, Sender};
 use tracing::warn;
 
 use crate::{
+    completion_base::BaseCompletionResponder,
     openai::{CompletionRequest, Grammar, StopTokens},
     streaming::{get_keep_alive_interval, DoneState},
     types::ExtractedMistralRsState,
     util::{
-        create_response_channel, send_model_request, ErrorToResponse, JsonError, ModelErrorMessage,
+        create_response_channel, send_model_request, BaseJsonModelError, ErrorToResponse,
+        JsonError, ModelErrorMessage,
     },
 };
 
@@ -94,29 +94,10 @@ impl futures::Stream for Streamer {
     }
 }
 
-pub enum CompletionResponder {
-    Sse(Sse<Streamer>),
-    Json(CompletionResponse),
-    ModelError(String, CompletionResponse),
-    InternalError(Box<dyn Error>),
-    ValidationError(Box<dyn Error>),
-}
-
-#[derive(Serialize)]
-struct JsonModelError {
-    message: String,
-    partial_response: CompletionResponse,
-}
-
-impl JsonModelError {
-    fn new(message: String, partial_response: CompletionResponse) -> Self {
-        Self {
-            message,
-            partial_response,
-        }
-    }
-}
+pub type CompletionResponder = BaseCompletionResponder<CompletionResponse, Streamer>;
 
+/// JSON error response structure for model errors.
+type JsonModelError = BaseJsonModelError<CompletionResponse>;
 impl ErrorToResponse for JsonModelError {}
 
 impl IntoResponse for CompletionResponder {

mistralrs-server-core/src/lib.rs

Lines changed: 1 addition & 0 deletions
@@ -215,6 +215,7 @@
 //! ```
 
 pub mod chat_completion;
+pub mod completion_base;
 mod completions;
 mod handlers;
 mod image_generation;

mistralrs-server-core/src/streaming.rs

Lines changed: 26 additions & 0 deletions
@@ -2,6 +2,11 @@
 
 use std::env;
 
+use mistralrs_core::Response;
+use tokio::sync::mpsc::Receiver;
+
+use crate::types::SharedMistralRsState;
+
 /// Default keep-alive interval for Server-Sent Events (SSE) streams in milliseconds.
 pub const DEFAULT_KEEP_ALIVE_INTERVAL_MS: u64 = 10_000;
 
@@ -15,6 +20,27 @@ pub enum DoneState {
     Done,
 }
 
+/// A streaming response handler.
+///
+/// It processes incoming response chunks from a model and converts them
+/// into Server-Sent Events (SSE) format for real-time streaming to clients.
+pub struct BaseStreamer<R, C, D> {
+    /// Channel receiver for incoming model responses
+    pub rx: Receiver<Response>,
+    /// Current state of the streaming operation
+    pub done_state: DoneState,
+    /// Underlying mistral.rs instance
+    pub state: SharedMistralRsState,
+    /// Whether to store chunks for the completion callback
+    pub store_chunks: bool,
+    /// All chunks received during streaming (if `store_chunks` is true)
+    pub chunks: Vec<R>,
+    /// Optional callback to process each chunk before sending
+    pub on_chunk: Option<C>,
+    /// Optional callback to execute when streaming completes
+    pub on_done: Option<D>,
+}
+
 /// Gets the keep-alive interval for SSE streams from environment or default.
 pub fn get_keep_alive_interval() -> u64 {
     env::var("KEEP_ALIVE_INTERVAL")
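
BaseStreamer is the chat Streamer struct that previously lived in chat_completion.rs, with the chunk and callback types made generic (R is the chunk type collected in `chunks`, C the per-chunk callback, D the completion callback) and the fields made public so route modules can build it themselves. A minimal construction sketch under that assumption; the helper name, parameter names, and the idea of passing in the starting DoneState are hypothetical, since the actual construction sites are not part of this diff:

// Hypothetical helper, not in this commit. `Streamer` is the chat alias
// BaseStreamer<ChatCompletionChunkResponse, OnChunkCallback, OnDoneCallback>.
fn make_chat_streamer(
    rx: Receiver<Response>,
    state: SharedMistralRsState,
    initial_state: DoneState, // stand-in: which variant to start from is not shown here
) -> Streamer {
    Streamer {
        rx,
        done_state: initial_state,
        state,
        store_chunks: false,
        chunks: Vec::new(),
        on_chunk: None,
        on_done: None,
    }
}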

mistralrs-server-core/src/util.rs

Lines changed: 16 additions & 0 deletions
@@ -57,6 +57,22 @@ impl std::fmt::Display for ModelErrorMessage {
 }
 impl std::error::Error for ModelErrorMessage {}
 
+/// Generic JSON error response structure
+#[derive(Serialize, Debug)]
+pub(crate) struct BaseJsonModelError<T> {
+    pub(crate) message: String,
+    pub(crate) partial_response: T,
+}
+
+impl<T> BaseJsonModelError<T> {
+    pub(crate) fn new(message: String, partial_response: T) -> Self {
+        Self {
+            message,
+            partial_response,
+        }
+    }
+}
+
 /// Creates a channel for response communication.
 pub fn create_response_channel(
     buffer_size: Option<usize>,
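
BaseJsonModelError<T> replaces the two per-module JsonModelError structs deleted above; the serialized JSON shape is unchanged. A minimal sketch of that shape, using a String payload purely for illustration (in the server, T is ChatCompletionResponse or CompletionResponse, per the type aliases in the other files):

// Hypothetical within-crate usage; the payload value here is a stand-in.
let err = BaseJsonModelError::new(
    "model failed".to_string(),
    "partial output so far".to_string(),
);
// serde_json::to_string(&err) serializes both fields:
// {"message":"model failed","partial_response":"partial output so far"}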
