EricLBuehler · EricLBuehler · Apr 5, 2025 · Apr 4, 2025 · Apr 4, 2025 · Apr 5, 2025
diff --git a/.typos.toml b/.typos.toml
@@ -6,7 +6,10 @@ extend-ignore-identifiers-re = [
     "Nd",
     "nin",
     "cudaDevAttrMaxSharedMemoryPerBlockOptin",
-    "_thw"
+    "_thw",
+    "thr",
+    "nd",
+    "uneeded"
 ]
 
 [files]

diff --git a/README.md b/README.md
@@ -31,6 +31,12 @@ Please submit requests for new models [here](https://github.com/EricLBuehler/mis
 - Check out UQFF for prequantized models of various methods!
     - Models can be found [here](https://huggingface.co/collections/EricB/uqff-670e4a49d56ecdd3f7f0fd4c).
 
+- 🔥 Try out AFQ for blazingly fast Metal performance!
+
+    ```
+    ./mistralrs-server -i --isq afq8 plain -m meta-llama/Llama-3.2-3B-Instruct
+    ```
+
 - 🔍🌐 Easily add web search capabilities to your models! Compatible with OpenAI's `web_search_options` parameter: [documentation](docs/WEB_SEARCH.md)
 
     ```

diff --git a/docs/ISQ.md b/docs/ISQ.md
@@ -6,7 +6,14 @@ An API is exposed on the Python and Rust APIs which provide the ability to dynam
 
 To set the ISQ type for individual layers, use a model [`topology`](TOPOLOGY.md).
 
+> Note: 🔥 AFQ (affine) quantization is fast on **Metal**.
+
 ## ISQ quantization types
+- AFQ2
+- AFQ3
+- AFQ4
+- AFQ6
+- AFQ8
 - Q4_0
 - Q4_1
 - Q5_0

diff --git a/docs/UQFF.md b/docs/UQFF.md
@@ -54,6 +54,13 @@ The following quantization formats are supported in UQFF. One can, of course, be
 - FP8:
     - FP8 E4M3 (4-bit exponent, 3-bit mantissa)
 
+- AFQ quantized (🔥 AFQ is fast on **Metal**):
+    - AFQ2
+    - AFQ3
+    - AFQ4
+    - AFQ6
+    - AFQ8
+
 ## Loading a UQFF model
 
 To load a UQFF model, one should specify the filename. This will be located based on the model ID, and can

diff --git a/mistralrs-core/src/pipeline/isq.rs b/mistralrs-core/src/pipeline/isq.rs
@@ -14,9 +14,9 @@ use candle_core::{quantized, Context, Device, Tensor};
 use indicatif::{MultiProgress, ParallelProgressIterator, ProgressBar, ProgressStyle};
 use itertools::Itertools;
 use mistralrs_quant::{
-    CollectedImatrixData, ColumnParallelLayer, DistributedKind, FP8Linear, GgufMatMul, HqqLayer,
-    IsqType, QuantMethod, QuantizeOntoGuard, QuantizedSerde, QuantizedSerdeType, ReplicatedLayer,
-    RowParallelLayer, UnquantLinear,
+    AfqLayer, CollectedImatrixData, ColumnParallelLayer, DistributedKind, FP8Linear, GgufMatMul,
+    HqqLayer, IsqType, QuantMethod, QuantizeOntoGuard, QuantizedSerde, QuantizedSerdeType,
+    ReplicatedLayer, RowParallelLayer, UnquantLinear,
 };
 use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
 use regex::Regex;
@@ -63,10 +63,15 @@ pub fn parse_isq_value(s: &str) -> Result<IsqType, String> {
         "hqq8" => IsqType::HQQ8,
         "hqq4" => IsqType::HQQ4,
         "fp8" => IsqType::F8E4M3,
+        "afq8" => IsqType::AFQ8,
+        "afq6" => IsqType::AFQ6,
+        "afq4" => IsqType::AFQ4,
+        "afq3" => IsqType::AFQ3,
+        "afq2" => IsqType::AFQ2,
         // "hqq3" => IsqType::HQQ3,
         // "hqq2" => IsqType::HQQ2,
         // "hqq1" => IsqType::HQQ1,
-        _ => return Err(format!("ISQ type {s} unknown, choose one of `Q4_0`, `Q4_1`, `Q5_0`, `Q5_1`, `Q8_0`, `Q8_1`, `Q2K`, `Q3K`, `Q4K`, `Q5K`, `Q6K`, `Q8K`, `HQQ8`, `HQQ4`, `FP8`.")),
+        _ => return Err(format!("ISQ type {s} unknown, choose one of `Q4_0`, `Q4_1`, `Q5_0`, `Q5_1`, `Q8_0`, `Q8_1`, `Q2K`, `Q3K`, `Q4K`, `Q5K`, `Q6K`, `Q8K`, `HQQ8`, `HQQ4`, `FP8`, `AFQ8`, `AFQ6`, `AFQ4`, `AFQ3`, `AFQ2`.")),
     };
     #[cfg(feature = "cuda")]
     {
@@ -442,19 +447,14 @@ pub trait IsqModel {
             // Get the MINIMUM of the max isq threads the quant method
             let mut minimum_max_threads = {
                 let current_rayon_threads = rayon::current_num_threads();
-                tensors
-                    .iter()
-                    .map(|(q, _)| {
-                        if let Some(dtype) = dtype {
-                            q.get_max_isq_cpu_threads(dtype)
-                                .map(usize::from)
-                                .unwrap_or(current_rayon_threads)
-                        } else {
-                            current_rayon_threads
-                        }
-                    })
-                    .min()
-                    .unwrap_or(current_rayon_threads)
+                if let Some(dtype) = dtype {
+                    dtype
+                        .get_max_isq_cpu_threads()
+                        .map(usize::from)
+                        .unwrap_or(current_rayon_threads)
+                } else {
+                    current_rayon_threads
+                }
             };
             if env::var("MISTRALRS_ISQ_SINGLETHREAD").is_ok() {
                 minimum_max_threads = 1;
@@ -807,6 +807,12 @@ pub trait IsqModel {
                                         &comm,
                                         guard.clone(),
                                     )?,
+                                    QuantizedSerdeType::Afq => AfqLayer::deserialize(
+                                        Cow::from(artifact),
+                                        &devices[i],
+                                        &comm,
+                                        guard.clone(),
+                                    )?,
                                 }
                             }
                         };
@@ -874,6 +880,12 @@ pub trait IsqModel {
                                         &comm,
                                         guard.clone(),
                                     )?,
+                                    QuantizedSerdeType::Afq => AfqLayer::deserialize(
+                                        Cow::from(artifact),
+                                        &devices[i],
+                                        &comm,
+                                        guard.clone(),
+                                    )?,
                                 }
                             }
                         };

diff --git a/mistralrs-core/src/pipeline/normal.rs b/mistralrs-core/src/pipeline/normal.rs
@@ -43,7 +43,7 @@ use candle_core::{Device, Tensor, Var};
 use hf_hub::Cache;
 use hf_hub::{api::sync::ApiBuilder, Repo, RepoType};
 use indicatif::MultiProgress;
-use mistralrs_quant::{GgufMatMul, HqqLayer, IsqType, QuantizedSerdeType};
+use mistralrs_quant::{AfqLayer, GgufMatMul, HqqLayer, IsqType, QuantizedSerdeType};
 use rand_isaac::Isaac64Rng;
 use regex_automata::meta::Regex;
 use std::any::Any;
@@ -365,6 +365,10 @@ impl Loader for NormalLoader {
                                 }
                                 QuantizedSerdeType::Fp8 => IsqType::F8E4M3.pack_factor(dtype),
                                 QuantizedSerdeType::Unquant => 1,
+                                QuantizedSerdeType::Afq => {
+                                    AfqLayer::get_isq_type_from_uqff(Cow::Borrowed(artifact))?
+                                        .pack_factor(dtype)
+                                }
                             };
                             total_pack_factors += pack_factor;
                         }

diff --git a/mistralrs-core/src/pipeline/vision.rs b/mistralrs-core/src/pipeline/vision.rs
@@ -38,7 +38,7 @@ use candle_core::{Device, Tensor, Var};
 use hf_hub::Cache;
 use hf_hub::{api::sync::ApiBuilder, Repo, RepoType};
 use indicatif::MultiProgress;
-use mistralrs_quant::{GgufMatMul, HqqLayer, IsqType, QuantizedSerdeType};
+use mistralrs_quant::{AfqLayer, GgufMatMul, HqqLayer, IsqType, QuantizedSerdeType};
 use rand_isaac::Isaac64Rng;
 use regex_automata::meta::Regex;
 use std::any::Any;
@@ -305,6 +305,10 @@ impl Loader for VisionLoader {
                                 }
                                 QuantizedSerdeType::Fp8 => IsqType::F8E4M3.pack_factor(dtype),
                                 QuantizedSerdeType::Unquant => 1,
+                                QuantizedSerdeType::Afq => {
+                                    AfqLayer::get_isq_type_from_uqff(Cow::Borrowed(artifact))?
+                                        .pack_factor(dtype)
+                                }
                             };
                             total_pack_factors += pack_factor;
                         }
-Original file line number
+Diff line change
@@ Expand Up / @@ -6,7 +6,10 @@ extend-ignore-identifiers-re = [ @@
         "Nd",
         "nin",
         "cudaDevAttrMaxSharedMemoryPerBlockOptin",
-        "_thw"
+        "_thw",
+        "thr",
+        "nd",
+        "uneeded"
     ]
     [files]
@@ Expand Down @@