Skip to content

Commit 99a035c

Browse files
committed
No warnings
1 parent aca6733 commit 99a035c

File tree

35 files changed

+58
-248
lines changed

35 files changed

+58
-248
lines changed

mistralrs-bench/src/main.rs

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ use mistralrs_core::{
1111
use std::sync::Arc;
1212
use std::{fmt::Display, num::NonZeroUsize};
1313
use tokio::sync::mpsc::channel;
14-
use tracing::{info, warn};
14+
use tracing::info;
1515

1616
enum TestName {
1717
Prompt(usize),
@@ -346,8 +346,6 @@ fn main() -> anyhow::Result<()> {
346346

347347
args.concurrency = Some(args.concurrency.unwrap_or(vec![1]));
348348

349-
let use_flash_attn = mistralrs_core::using_flash_attn();
350-
351349
let prompt_chunksize = match args.prompt_chunksize {
352350
Some(0) => {
353351
anyhow::bail!("`prompt_chunksize` must be a strictly positive integer, got 0.",)
@@ -362,7 +360,6 @@ fn main() -> anyhow::Result<()> {
362360
let max_seq_len = auto_device_map_params.max_seq_len();
363361

364362
let loader: Box<dyn Loader> = LoaderBuilder::new(args.model)
365-
.with_use_flash_attn(use_flash_attn)
366363
.with_prompt_chunksize(prompt_chunksize)
367364
.build()?;
368365
let model_name = loader.get_id();
@@ -389,12 +386,6 @@ fn main() -> anyhow::Result<()> {
389386
candle_core::utils::with_f16c()
390387
);
391388
info!("Sampling method: penalties -> temperature -> topk -> topp -> minp -> multinomial");
392-
if use_flash_attn {
393-
info!("Using flash attention.");
394-
}
395-
if use_flash_attn && loader.get_kind().is_quantized() {
396-
warn!("Using flash attention with a quantized model has no effect!")
397-
}
398389
info!("Model kind is: {}", loader.get_kind().to_string());
399390

400391
// Parse device mapper

mistralrs-core/src/attention.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,7 @@ impl Sdpa {
282282
/// - v: (b_sz, n_kv_heads, q_len, head_dim)
283283
///
284284
/// The attention implementation is dispatched as follows:
285-
/// 1) If `use_flash_attn == true` (CUDA), use a flash attention V2 kernel
285+
/// 1) If using flash attn (CUDA), use a flash attention V2/V3 kernel
286286
/// 2) If decoding and using a Metal device, use a fused kernel
287287
/// 3) Otherwise, use the "naive" SDPA implementation (with optimized mask+softmax+scale application)
288288
#[allow(unused_variables, clippy::too_many_arguments)]

mistralrs-core/src/lib.rs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -86,14 +86,14 @@ pub use paged_attention::{MemoryGpuConfig, PagedAttentionConfig};
8686
pub use pipeline::{
8787
chat_template::ChatTemplate, parse_isq_value, AdapterPaths, AnyMoeLoader, AnyMoePipeline,
8888
AutoDeviceMapParams, DiffusionGenerationParams, DiffusionLoader, DiffusionLoaderBuilder,
89-
DiffusionLoaderType, DiffusionSpecificConfig, GGMLLoader, GGMLLoaderBuilder,
90-
GGMLSpecificConfig, GGUFLoader, GGUFLoaderBuilder, GGUFSpecificConfig, GemmaLoader,
91-
Idefics2Loader, IsqOrganization, LLaVALoader, LLaVANextLoader, LlamaLoader, Loader,
92-
LocalModelPaths, LoraAdapterPaths, MistralLoader, MixtralLoader, ModelKind, ModelPaths,
93-
NormalLoader, NormalLoaderBuilder, NormalLoaderType, NormalSpecificConfig, Phi2Loader,
94-
Phi3Loader, Phi3VLoader, Qwen2Loader, SpeculativeConfig, SpeculativeLoader,
95-
SpeculativePipeline, Starcoder2Loader, TokenSource, VisionLoader, VisionLoaderBuilder,
96-
VisionLoaderType, VisionPromptPrefixer, VisionSpecificConfig, UQFF_MULTI_FILE_DELIMITER,
89+
DiffusionLoaderType, GGMLLoader, GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoader,
90+
GGUFLoaderBuilder, GGUFSpecificConfig, GemmaLoader, Idefics2Loader, IsqOrganization,
91+
LLaVALoader, LLaVANextLoader, LlamaLoader, Loader, LocalModelPaths, LoraAdapterPaths,
92+
MistralLoader, MixtralLoader, ModelKind, ModelPaths, NormalLoader, NormalLoaderBuilder,
93+
NormalLoaderType, NormalSpecificConfig, Phi2Loader, Phi3Loader, Phi3VLoader, Qwen2Loader,
94+
SpeculativeConfig, SpeculativeLoader, SpeculativePipeline, Starcoder2Loader, TokenSource,
95+
VisionLoader, VisionLoaderBuilder, VisionLoaderType, VisionPromptPrefixer,
96+
VisionSpecificConfig, UQFF_MULTI_FILE_DELIMITER,
9797
};
9898
pub use request::{
9999
ApproximateUserLocation, Constraint, DetokenizationRequest, ImageGenerationResponseFormat,

mistralrs-core/src/model_loader.rs

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ use crate::{
1111
get_toml_selected_model_dtype,
1212
pipeline::{GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoaderBuilder, NormalSpecificConfig},
1313
toml_selector::get_toml_selected_model_device_map_params,
14-
AutoDeviceMapParams, DiffusionLoaderBuilder, DiffusionSpecificConfig, GGUFSpecificConfig,
15-
Loader, ModelDType, ModelSelected, NormalLoaderBuilder, TomlLoaderArgs, TomlSelector, Topology,
14+
AutoDeviceMapParams, DiffusionLoaderBuilder, GGUFSpecificConfig, Loader, ModelDType,
15+
ModelSelected, NormalLoaderBuilder, TomlLoaderArgs, TomlSelector, Topology,
1616
VisionLoaderBuilder, VisionSpecificConfig, GGUF_MULTI_FILE_DELIMITER,
1717
UQFF_MULTI_FILE_DELIMITER,
1818
};
@@ -23,7 +23,6 @@ pub struct LoaderBuilder {
2323
no_kv_cache: bool,
2424
chat_template: Option<String>,
2525
jinja_explicit: Option<String>,
26-
use_flash_attn: bool,
2726
prompt_chunksize: Option<NonZeroUsize>,
2827
}
2928

@@ -33,7 +32,6 @@ impl LoaderBuilder {
3332
model,
3433
no_kv_cache: false,
3534
chat_template: None,
36-
use_flash_attn: false,
3735
prompt_chunksize: None,
3836
jinja_explicit: None,
3937
}
@@ -51,10 +49,6 @@ impl LoaderBuilder {
5149
self.jinja_explicit = jinja_explicit;
5250
self
5351
}
54-
pub fn with_use_flash_attn(mut self, use_flash_attn: bool) -> Self {
55-
self.use_flash_attn = use_flash_attn;
56-
self
57-
}
5852
pub fn with_prompt_chunksize(mut self, prompt_chunksize: Option<NonZeroUsize>) -> Self {
5953
self.prompt_chunksize = prompt_chunksize;
6054
self
@@ -188,15 +182,13 @@ pub fn get_auto_device_map_params(model: &ModelSelected) -> anyhow::Result<AutoD
188182
}
189183

190184
fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loader>> {
191-
let use_flash_attn = args.use_flash_attn;
192185
let loader: Box<dyn Loader> = match args.model {
193186
ModelSelected::Toml { file } => {
194187
let selector: TomlSelector = toml::from_str(
195188
&fs::read_to_string(file.clone())
196189
.unwrap_or_else(|_| panic!("Could not load toml selector file at {file}")),
197190
)?;
198191
let args = TomlLoaderArgs {
199-
use_flash_attn,
200192
chat_template: args.chat_template,
201193
no_kv_cache: args.no_kv_cache,
202194
prompt_chunksize: args.prompt_chunksize,
@@ -220,7 +212,6 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
220212
hf_cache_path,
221213
} => NormalLoaderBuilder::new(
222214
NormalSpecificConfig {
223-
use_flash_attn,
224215
prompt_chunksize: args.prompt_chunksize,
225216
topology: Topology::from_option_path(topology)?,
226217
organization: organization.unwrap_or_default(),
@@ -258,7 +249,6 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
258249
hf_cache_path,
259250
} => NormalLoaderBuilder::new(
260251
NormalSpecificConfig {
261-
use_flash_attn,
262252
prompt_chunksize: args.prompt_chunksize,
263253
topology: Topology::from_option_path(topology)?,
264254
organization: Default::default(),
@@ -303,7 +293,6 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
303293
hf_cache_path,
304294
} => NormalLoaderBuilder::new(
305295
NormalSpecificConfig {
306-
use_flash_attn,
307296
prompt_chunksize: args.prompt_chunksize,
308297
topology: Topology::from_option_path(topology)?,
309298
organization: Default::default(),
@@ -526,7 +515,6 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
526515
imatrix,
527516
} => VisionLoaderBuilder::new(
528517
VisionSpecificConfig {
529-
use_flash_attn,
530518
prompt_chunksize: args.prompt_chunksize,
531519
topology: Topology::from_option_path(topology)?,
532520
write_uqff,
@@ -551,10 +539,7 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
551539
model_id,
552540
arch,
553541
dtype: _,
554-
} => {
555-
DiffusionLoaderBuilder::new(DiffusionSpecificConfig { use_flash_attn }, Some(model_id))
556-
.build(arch)
557-
}
542+
} => DiffusionLoaderBuilder::new(Some(model_id)).build(arch),
558543
};
559544
Ok(loader)
560545
}

mistralrs-core/src/models/deepseek2.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ serde_default_fn!(bool, norm_topk_prob, false);
3939
serde_default_fn!(ScoringFunc, scoring_func, ScoringFunc::Softmax);
4040
serde_default_fn!(Activation, hidden_act, Activation::Silu);
4141
serde_default_fn!(bool, tie_word_embeddings, false);
42-
serde_default_fn!(bool, use_flash_attn_default, false);
4342

4443
#[derive(Deserialize, Clone, Debug)]
4544
enum TopkMethod {

mistralrs-core/src/models/deepseek3.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ serde_default_fn!(usize, first_k_dense_replace, 0);
3838
serde_default_fn!(ScoringFunc, scoring_func, ScoringFunc::Softmax);
3939
serde_default_fn!(Activation, hidden_act, Activation::Silu);
4040
serde_default_fn!(bool, tie_word_embeddings, false);
41-
serde_default_fn!(bool, use_flash_attn_default, false);
4241

4342
#[derive(Deserialize, Clone, Debug)]
4443
enum TopkMethod {

mistralrs-core/src/models/llama.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ use crate::{
3030
};
3131

3232
serde_default_fn!(bool, word_emb_default, false);
33-
serde_default_fn!(bool, use_flash_attn_default, false);
3433

3534
#[derive(Debug, Clone, Deserialize, Serialize, Default)]
3635
pub struct Config {

mistralrs-core/src/models/mistral.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ use crate::{
2626
utils::{progress::NiceProgressBar, unvarbuilder::UnVarBuilder},
2727
};
2828

29-
serde_default_fn!(bool, use_flash_attn, false);
3029
serde_default_fn!(bool, tie_word_embeddings, false);
3130

3231
#[derive(Debug, Clone, Default, Serialize, Deserialize)]

mistralrs-core/src/models/qwen2.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ use crate::{
2525
};
2626

2727
serde_default_fn!(bool, word_emb_default, false);
28-
serde_default_fn!(bool, use_flash_attn, false);
2928

3029
#[derive(Debug, Clone, serde::Deserialize, Default, serde::Serialize)]
3130
pub struct Config {

mistralrs-core/src/models/qwen3.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ macro_rules! sliding_window {
4040
};
4141
}
4242

43-
serde_default_fn!(bool, use_flash_attn, false);
4443
serde_default_fn!(bool, tie_word_embeddings, false);
4544

4645
#[derive(Debug, Clone, Default, Serialize, Deserialize)]

0 commit comments

Comments
 (0)