
Commit 1d40a25

Cache the user prompt states too.
1 parent: 9cced37

6 files changed (+137, -111 lines)

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [package]
 name = "ai00_server"
-version = "0.3.14"
+version = "0.3.15"
 edition = "2021"
 authors = ["Gu ZhenNiu <[email protected]>", "Zhang Zhenyuan <[email protected]>"]
 license = "MIT OR Apache-2.0"

assets/configs/Config.toml

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@ quant_type = "Int8" # Quantization t
 turbo = true              # Whether to use alternative GEMM kernel to speed-up long prompts.
 token_chunk_size = 32     # Size of token chunk that is inferred at once. For high end GPUs, this could be 64 or 128 (faster).
 head_chunk_size = 8192    # DO NOT modify this if you don't know what you are doing.
+state_chunk_size = 4      # The chunk size of layers in model state.
 max_runtime_batch = 8     # The maximum batches that can be scheduled for inference at the same time.
 max_batch = 16            # The maximum batches that are cached on GPU.
 embed_layer = 2           # The (reversed) layer number whose output is used as embedding.
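
The new state_chunk_size key slots into the existing model section of the config. As a minimal sketch of how that key maps into Rust (assuming the config is deserialized with serde and the toml crate, and trimming the struct to just the neighboring keys; the real struct lives in src/config.rs and carries more fields):

use serde::Deserialize;

// Hypothetical, trimmed-down mirror of the model config section.
#[derive(Debug, Deserialize)]
struct ModelConfig {
    token_chunk_size: usize,
    head_chunk_size: usize,
    state_chunk_size: usize, // newly added knob
    max_runtime_batch: usize,
}

fn main() -> Result<(), toml::de::Error> {
    let text = r#"
        token_chunk_size = 32
        head_chunk_size = 8192
        state_chunk_size = 4
        max_runtime_batch = 8
    "#;
    let cfg: ModelConfig = toml::from_str(text)?;
    assert_eq!(cfg.state_chunk_size, 4);
    println!("{cfg:?}");
    Ok(())
}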

src/config.rs

Lines changed: 5 additions & 0 deletions
@@ -25,6 +25,7 @@ impl From<Config> for ReloadRequest {
             turbo,
             token_chunk_size,
             head_chunk_size,
+            state_chunk_size,
             max_runtime_batch,
             max_batch,
             embed_layer,
@@ -45,6 +46,7 @@ impl From<Config> for ReloadRequest {
             turbo,
             token_chunk_size,
             head_chunk_size,
+            state_chunk_size,
             max_runtime_batch,
             max_batch,
             embed_layer,
@@ -70,6 +72,8 @@ pub struct Model {
     pub token_chunk_size: usize,
     /// The chunk size for each split of the head matrix.
     pub head_chunk_size: usize,
+    /// The chunk size of layers in model state.
+    pub state_chunk_size: usize,
     /// Maximum number of batches that are active at once.
     pub max_runtime_batch: usize,
     /// Number of states that are cached on GPU.
@@ -89,6 +93,7 @@ impl Default for Model {
             turbo: true,
             token_chunk_size: 32,
             head_chunk_size: 8192,
+            state_chunk_size: 4,
             max_runtime_batch: 8,
             max_batch: 16,
             embed_layer: 2,
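
Because Default for Model sets state_chunk_size: 4, the value matches the hard-coded STATE_CHUNK_SIZE constant removed from src/main.rs below, so configs that omit the new key keep their previous behavior. A reduced sketch of the pattern (struct names and fields trimmed for illustration; the real conversion is From<Config> for ReloadRequest):

// Illustrative only: the reload request simply forwards the configured value.
#[derive(Debug, Clone)]
struct Model {
    head_chunk_size: usize,
    state_chunk_size: usize,
}

impl Default for Model {
    fn default() -> Self {
        Self {
            head_chunk_size: 8192,
            state_chunk_size: 4, // same value the removed constant used
        }
    }
}

#[derive(Debug)]
struct ReloadRequest {
    head_chunk_size: usize,
    state_chunk_size: usize,
}

impl From<Model> for ReloadRequest {
    fn from(model: Model) -> Self {
        let Model { head_chunk_size, state_chunk_size } = model;
        Self { head_chunk_size, state_chunk_size }
    }
}

fn main() {
    let request = ReloadRequest::from(Model::default());
    assert_eq!(request.state_chunk_size, 4);
}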

src/main.rs

Lines changed: 8 additions & 2 deletions
@@ -45,7 +45,6 @@ mod sampler;
 mod utils;
 
 pub const MAX_TOKENS: usize = 4096;
-pub const STATE_CHUNK_SIZE: usize = 4;
 
 #[derive(Debug)]
 pub enum Token {
@@ -182,6 +181,8 @@ pub struct ReloadRequest {
     pub token_chunk_size: usize,
     /// The chunk size for each split of the head matrix.
     pub head_chunk_size: usize,
+    /// The chunk size of layers in model state.
+    pub state_chunk_size: usize,
     /// Maximum number of batches that are active at once.
     pub max_runtime_batch: usize,
     /// Number of states that are cached on GPU.
@@ -290,7 +291,7 @@ where
 
     let state: S = StateBuilder::new(context, model.info())
         .with_num_batch(request.max_batch)
-        .with_chunk_size(STATE_CHUNK_SIZE)
+        .with_chunk_size(request.state_chunk_size)
         .build();
     Ok((model, state))
 }
@@ -397,6 +398,7 @@ async fn model_route(receiver: Receiver<ThreadRequest>) -> Result<()> {
     let reload = async move {
         let sender = sender.clone();
         let max_runtime_batch = request.max_runtime_batch;
+        let state_chunk_size = request.state_chunk_size;
         let embed_layer = request.embed_layer;
 
         let file = File::open(&request.model_path)?;
@@ -419,6 +421,7 @@ async fn model_route(receiver: Receiver<ThreadRequest>) -> Result<()> {
             model,
             state,
             max_runtime_batch,
+            state_chunk_size,
             embed_layer,
         ))
     }
@@ -429,6 +432,7 @@ async fn model_route(receiver: Receiver<ThreadRequest>) -> Result<()> {
             model,
             state,
             max_runtime_batch,
+            state_chunk_size,
             embed_layer,
         ))
     }
@@ -439,6 +443,7 @@ async fn model_route(receiver: Receiver<ThreadRequest>) -> Result<()> {
             model,
             state,
             max_runtime_batch,
+            state_chunk_size,
             embed_layer,
         ))
     }
@@ -492,6 +497,7 @@ async fn model_route(receiver: Receiver<ThreadRequest>) -> Result<()> {
 
         let context = GenerateContext {
             prompt_tokens: tokens.to_vec(),
+            prompt_cached: false,
             prefix: Default::default(),
             suffix: tokens,
             penalties,
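
Two things happen in src/main.rs: the configured state_chunk_size now reaches StateBuilder::with_chunk_size instead of the removed constant, and every fresh GenerateContext starts out with prompt_cached: false. The cache lookup that flips this flag lives in a file not rendered in this diff, so the following is only an illustrative sketch of the idea behind "cache the user prompt states too" (a cache keyed by prompt tokens; all types and names are hypothetical, not the repo's API):

use std::collections::HashMap;

// Hypothetical stand-in for a backed-up model state.
type CachedState = Vec<f32>;

#[derive(Default)]
struct StateCache {
    // Keyed by the exact prompt tokens whose state was computed earlier.
    states: HashMap<Vec<u16>, CachedState>,
}

struct GenerateContext {
    prompt_tokens: Vec<u16>,
    prompt_cached: bool,
}

impl StateCache {
    // Try to reuse a previously computed prompt state; mark the context so the
    // runtime knows it can skip re-inferring that prompt.
    fn checkout(&self, context: &mut GenerateContext) -> Option<CachedState> {
        let hit = self.states.get(&context.prompt_tokens).cloned();
        context.prompt_cached = hit.is_some();
        hit
    }

    // Store the state produced after a prompt was inferred, for later reuse.
    fn store(&mut self, tokens: Vec<u16>, state: CachedState) {
        self.states.insert(tokens, state);
    }
}

fn main() {
    let mut cache = StateCache::default();
    cache.store(vec![1, 2, 3], vec![0.0; 8]);

    let mut context = GenerateContext { prompt_tokens: vec![1, 2, 3], prompt_cached: false };
    let reused = cache.checkout(&mut context);
    assert!(context.prompt_cached && reused.is_some());
}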
