EricLBuehler
diff --git a/‎README.md‎
Lines changed: 10 additions & 0 deletions b/‎README.md‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎mistralrs-core/src/speech_models/dia/mod.rs‎
Lines changed: 4 additions & 4 deletions b/‎mistralrs-core/src/speech_models/dia/mod.rs‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎mistralrs-quant/kernels/marlin/marlin/marlin.cuh‎
Lines changed: 13 additions & 4 deletions b/‎mistralrs-quant/kernels/marlin/marlin/marlin.cuh‎
Lines changed: 13 additions & 4 deletions
@@ -63,6 +63,15 @@ Please submit requests for new models [here](https://github.com/EricLBuehler/mis
     ./mistralrs-server -i --isq 4 plain -m Qwen/Qwen3-8B
     ```
 
+- Run **AWQ format** models:
+    Step 1: Convert the AWQ model to a Marlin-compatible format
+    ```
+    python3 scripts/convert_awq_marlin.py --src /home/Meta-Llama-3.1-8B-Instruct-AWQ-INT4/ --dst /home/Meta-Llama-3.1-8B-Instruct-AWQ-INT4-Marlin/ --bits 4
+    ```
+    Step 2: Run the converted model
+    ```
+    ./mistralrs-server -i plain -m /home/Meta-Llama-3.1-8B-Instruct-AWQ-INT4-Marlin/
+    ```
 
 - 💎💎💎 Run the entire **Gemma 3** Model family (1b, 4b, 12b, 27b) with 128k context length and vision support: [documentation](docs/GEMMA3.md)
 
@@ -152,6 +161,7 @@ Mistral.rs supports several model categories:
 - [Details](docs/QUANTS.md)
 - GGML: 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit, with imatrix support
 - GPTQ: 2-bit, 3-bit, 4-bit and 8-bit, with [Marlin](https://github.com/IST-DASLab/marlin) kernel support in 4-bit and 8-bit.
+- AWQ: 4-bit and 8-bit (convert to a Marlin-compatible format first using the [conversion script](scripts/convert_awq_marlin.py))
 - AFQ: 🔥 2-bit, 3-bit, 4-bit, 6-bit and 8-bit, designed to be fast on Metal!
 - HQQ: 4-bit and 8-bit, with ISQ support
 - FP8
 
@@ -458,10 +458,10 @@ impl DiaPipeline {
             if let Some(eos_countdown) = &mut eos_countdown {
                 let step_after_eos = max_delay_pattern - *eos_countdown;
                 for (i, d) in delay_pattern.iter().enumerate() {
-                    if step_after_eos == *d as usize {
-                        pred_c[i] = audio_eos_value;
-                    } else if step_after_eos > *d as usize {
-                        pred_c[i] = audio_pad_value;
+                    match step_after_eos.cmp(&(*d as usize)) {
+                        std::cmp::Ordering::Equal => pred_c[i] = audio_eos_value,
+                        std::cmp::Ordering::Greater => pred_c[i] = audio_pad_value,
+                        std::cmp::Ordering::Less => {}
                     }
                 }
                 *eos_countdown -= 1;
 
@@ -18,19 +18,19 @@ namespace marlin {
 // than 1 warp per schedule allows some more latency hiding. At the same time,
 // we want relatively few warps to have many registers per warp and small tiles.
 
+static constexpr int default_threads = 256;
 static constexpr int repack_threads = 256;
 static constexpr int repack_stages = 8;
 static constexpr int min_thread_n = 64;
 static constexpr int min_thread_k = 64;
-
+static constexpr int max_thread_n = 256;
 static constexpr int tile_size = 16;
 static constexpr int max_par = 16;
 static constexpr int tile_k_size = tile_size;  // = 16 (same as tile_size)
 static constexpr int tile_n_size = tile_k_size * 4;  // = 64 (four times tile_k_size)
+static constexpr int pipe_stages = 4;  // NOTE(review): presumably the software pipeline depth used by the kernels — confirm against usage
 
-__device__ inline constexpr int ceildiv(int a, int b) {
-  return (a + b - 1) / b;
-}
+constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }  // Integer division of a by b rounded up; assumes a >= 0 and b > 0 (not checked), and a + b - 1 must not overflow int.
 
 // Predicated asynchronous global->shared copy; used for inputs A where we apply
 // predication to handle batchsizes that are not multiples of 16.
@@ -115,4 +115,13 @@ struct Vec {
 
 using I4 = Vec<int, 4>;
 
+enum ScalarTypeID {  // Scalar type tags grouped by quantization scheme; with no explicit initializers the enumerators are 0..3 in declaration order.
+  // GPTQ
+  kU4B8,  // NOTE(review): name suggests unsigned 4-bit with a bias of 8 — confirm against kernel usage
+  kU8B128,  // NOTE(review): name suggests unsigned 8-bit with a bias of 128 — confirm
+  // AWQ
+  kU4,  // NOTE(review): name suggests plain unsigned 4-bit — confirm
+  kU8,  // NOTE(review): name suggests plain unsigned 8-bit — confirm
+};
+
 }  // namespace marlin