EricLBuehler · EricLBuehler · Jun 9, 2025 · Jun 6, 2025 · Jun 6, 2025 · Jun 6, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/examples/python/custom_tool_call.py b/examples/python/custom_tool_call.py
@@ -8,6 +8,7 @@
     ToolChoice,
 )
 
+
 def local_search(query: str):
     results = []
     for root, _, files in os.walk("."):
@@ -35,6 +36,7 @@ def tool_cb(name: str, args: dict) -> str:
         return json.dumps(local_search(args.get("query", "")))
     return ""
 
+
 schema = json.dumps(
     {
         "type": "function",
@@ -51,7 +53,9 @@ def tool_cb(name: str, args: dict) -> str:
 )
 
 runner = Runner(
-    which=Which.Plain(model_id="NousResearch/Hermes-3-Llama-3.1-8B", arch=Architecture.Llama),
+    which=Which.Plain(
+        model_id="NousResearch/Hermes-3-Llama-3.1-8B", arch=Architecture.Llama
+    ),
     tool_callbacks={"local_search": tool_cb},
 )
 

diff --git a/mistralrs-core/Cargo.toml b/mistralrs-core/Cargo.toml
@@ -94,6 +94,15 @@ ahash.workspace = true
 num-traits.workspace = true
 libc.workspace = true
 bm25.workspace = true
+mel_spec = "0.3.3"
+dasp = "0.11.0"
+rubato = "0.16.2"
+rustfft = "6.3.0"
+symphonia = "0.5.4"
+hound = "3.5.1"
+cpal = "0.15.3"
+apodize = "1.0.0"
+realfft = "3.4.0"
 
 [features]
 pyo3_macros = ["pyo3"]

diff --git a/mistralrs-core/src/layers.rs b/mistralrs-core/src/layers.rs
@@ -7,7 +7,8 @@ use candle_core::{
     Context, DType, Device, IndexOp, Result, Tensor, D,
 };
 use candle_nn::{
-    Conv2d, Conv2dConfig, Embedding, GroupNorm, LayerNorm, LayerNormConfig, Linear, Module,
+    BatchNorm, BatchNormConfig, Conv1d, Conv1dConfig, Conv2d, Conv2dConfig, Embedding, GroupNorm,
+    LayerNorm, LayerNormConfig, Linear, Module,
 };
 use float8::F8E4M3;
 use half::{bf16, f16};
@@ -67,6 +68,34 @@ pub fn layer_norm<C: Into<LayerNormConfig>>(
     }
 }
 
+/// Builds a [`BatchNorm`] layer whose tensors are read from `vb`.
+///
+/// `running_mean` and `running_var` are always loaded, each requested with
+/// `num_features` elements. `weight` and `bias` are loaded as well only when
+/// `config.affine` is set. Only `eps` and `affine` are read from the config here.
+///
+/// # Errors
+/// Fails if `config.eps` is negative, if any requested tensor cannot be fetched
+/// from `vb`, or if the `BatchNorm` constructor rejects its inputs.
+pub fn batch_norm<C: Into<BatchNormConfig>>(
+    num_features: usize,
+    config: C,
+    vb: ShardedVarBuilder,
+) -> Result<BatchNorm> {
+    let cfg: BatchNormConfig = config.into();
+    if cfg.eps < 0. {
+        candle_core::bail!("batch-norm eps cannot be negative {}", cfg.eps)
+    }
+
+    let mean = vb.get(num_features, "running_mean")?;
+    let var = vb.get(num_features, "running_var")?;
+
+    // Without the affine transform only the running statistics are needed.
+    if !cfg.affine {
+        return BatchNorm::new_no_bias(num_features, mean, var, cfg.eps);
+    }
+
+    let gamma = vb.get(num_features, "weight")?;
+    let beta = vb.get(num_features, "bias")?;
+    BatchNorm::new(num_features, mean, var, gamma, beta, cfg.eps)
+}
+
 pub fn group_norm(
     num_groups: usize,
     num_channels: usize,
@@ -117,6 +146,35 @@ pub fn conv2d_no_bias(
     Ok(Conv2d::new(ws, None, cfg))
 }
 
+/// Builds a biased [`Conv1d`] layer from tensors stored in `vb`.
+///
+/// Loads `weight` with shape `(out_channels, in_channels / cfg.groups, kernel_size)`
+/// and `bias` with shape `out_channels`.
+///
+/// # Errors
+/// Fails if `cfg.groups` is zero, or if either tensor cannot be fetched from `vb`.
+pub fn conv1d(
+    in_channels: usize,
+    out_channels: usize,
+    kernel_size: usize,
+    cfg: Conv1dConfig,
+    vb: ShardedVarBuilder,
+) -> Result<Conv1d> {
+    // Integer division by zero panics; report a bad group count as an error instead.
+    if cfg.groups == 0 {
+        candle_core::bail!("conv1d groups must be non-zero")
+    }
+    let ws = vb.get(
+        (out_channels, in_channels / cfg.groups, kernel_size),
+        "weight",
+    )?;
+    let bs = vb.get(out_channels, "bias")?;
+    Ok(Conv1d::new(ws, Some(bs), cfg))
+}
+
+/// Builds a bias-free [`Conv1d`] layer from the `weight` tensor stored in `vb`.
+///
+/// Loads `weight` with shape `(out_channels, in_channels / cfg.groups, kernel_size)`.
+///
+/// # Errors
+/// Fails if `cfg.groups` is zero, or if the tensor cannot be fetched from `vb`.
+pub fn conv1d_no_bias(
+    in_channels: usize,
+    out_channels: usize,
+    kernel_size: usize,
+    cfg: Conv1dConfig,
+    vb: ShardedVarBuilder,
+) -> Result<Conv1d> {
+    // Integer division by zero panics; report a bad group count as an error instead.
+    if cfg.groups == 0 {
+        candle_core::bail!("conv1d groups must be non-zero")
+    }
+    let ws = vb.get(
+        (out_channels, in_channels / cfg.groups, kernel_size),
+        "weight",
+    )?;
+    Ok(Conv1d::new(ws, None, cfg))
+}
+
 pub fn linear(in_dim: usize, out_dim: usize, vb: ShardedVarBuilder) -> Result<Linear> {
     let ws = vb.get((out_dim, in_dim), "weight")?;
     let bs = vb.get(out_dim, "bias")?;

diff --git a/mistralrs-core/src/vision_models/conformer/config.rs b/mistralrs-core/src/vision_models/conformer/config.rs
@@ -0,0 +1,141 @@
+use serde::{Deserialize, Serialize};
+
+use crate::{layers::Activation, serde_default_fn};
+
+// Default-value providers referenced by the `#[serde(default = "...")]` attributes on
+// the config structs in this file. Each invocation is expected to generate a function
+// with the given name returning the given value (the macro is imported from the crate
+// root and is not defined in this file).
+serde_default_fn!(usize, default_attention_dim, 256);
+serde_default_fn!(usize, default_attention_heads, 4);
+serde_default_fn!(usize, default_linear_units, 2048);
+serde_default_fn!(usize, default_num_blocks, 6);
+serde_default_fn!(String, default_input_layer, "nemo_conv".to_string());
+serde_default_fn!(bool, default_causal, true);
+serde_default_fn!(bool, default_batch_norm, false);
+serde_default_fn!(usize, default_ext_pw_out_channel, 0);
+serde_default_fn!(usize, default_ext_pw_kernel_size, 1);
+serde_default_fn!(usize, default_depthwise_seperable_out_channel, 256);
+serde_default_fn!(usize, default_depthwise_multiplier, 1);
+serde_default_fn!(usize, default_chunk_se, 0);
+serde_default_fn!(usize, default_kernel_size, 3);
+serde_default_fn!(Activation, default_activation, Activation::Relu);
+serde_default_fn!(Activation, default_conv_activation, Activation::Relu);
+serde_default_fn!(Activation, default_conv_glu_type, Activation::Sigmoid);
+serde_default_fn!(bool, default_bias_in_glu, true);
+serde_default_fn!(bool, default_linear_glu_in_convm, false);
+serde_default_fn!(String, default_attention_glu_type, "swish".to_string());
+serde_default_fn!(bool, default_export, false);
+serde_default_fn!(i32, default_extra_layer_output_idx, -1);
+serde_default_fn!(usize, default_time_reduction, 4);
+serde_default_fn!(bool, default_replication_pad_for_subsample_embedding, false);
+serde_default_fn!(usize, default_attention_group_size, 1);
+serde_default_fn!(String, default_subsampling, "dw_striding".to_string());
+serde_default_fn!(usize, default_conv_channels, 256);
+serde_default_fn!(usize, default_subsampling_conv_chunking_factor, 1);
+serde_default_fn!(Activation, default_nemo_activation, Activation::Relu);
+serde_default_fn!(bool, default_nemo_is_causal, false);
+// Placeholder meaning "not provided": `ConformerEncoderConfig::finish_nemo_config`
+// compares against `usize::MAX` and substitutes the matching encoder-level value.
+serde_default_fn!(usize, fake_default_sentinel, usize::MAX);
+
+/// Arguments for the relative attention bias; held by
+/// `ConformerEncoderConfig::relative_attention_bias_args`.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct RelativeAttentionBiasArgs {
+    /// Optional. Presumably the distance cap for a T5-style bias; confirm against the consumer.
+    pub t5_bias_max_distance: Option<usize>,
+    /// Optional. Presumably selects a symmetric T5-style bias; confirm against the consumer.
+    pub t5_bias_symmetric: Option<bool>,
+    /// Required. Serialized under the key `type`; renamed because `type` is a Rust keyword.
+    #[serde(rename = "type")]
+    pub tp: String,
+}
+
+/// Settings stored in `ConformerEncoderConfig::nemo_conv_settings`.
+///
+/// Every field has a serde default, so any subset may be given. Three fields default
+/// to the `usize::MAX` placeholder and are only meaningful after
+/// `ConformerEncoderConfig::finish_nemo_config` has replaced the placeholder.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct NemoConvConfig {
+    /// Defaults to `"dw_striding"`.
+    #[serde(default = "default_subsampling")]
+    pub subsampling: String,
+    /// Placeholder `usize::MAX` when absent; resolved to the encoder's `time_reduction`.
+    #[serde(default = "fake_default_sentinel")]
+    pub subsampling_factor: usize,
+    /// Placeholder `usize::MAX` when absent; resolved to the encoder's `input_size`.
+    #[serde(default = "fake_default_sentinel")]
+    pub feat_in: usize,
+    /// Placeholder `usize::MAX` when absent; resolved to the encoder's `attention_dim`.
+    #[serde(default = "fake_default_sentinel")]
+    pub feat_out: usize,
+    /// Defaults to `256`.
+    #[serde(default = "default_conv_channels")]
+    pub conv_channels: usize,
+    /// Defaults to `1`.
+    #[serde(default = "default_subsampling_conv_chunking_factor")]
+    pub subsampling_conv_chunking_factor: usize,
+    /// Defaults to `Activation::Relu`.
+    #[serde(default = "default_nemo_activation")]
+    pub activation: Activation,
+    /// Defaults to `false`.
+    #[serde(default = "default_nemo_is_causal")]
+    pub is_causal: bool,
+}
+
+/// Settings stored in the optional `ConformerEncoderConfig::encoder_embedding_config`.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct EncoderEmbeddingConfig {
+    /// Required; there is no serde default for this field.
+    pub input_size: usize,
+}
+
+/// Deserializable configuration for the conformer encoder.
+///
+/// Fields without a `#[serde(default = ...)]` attribute are required unless they are
+/// `Option`s. The defaults quoted below are the values passed to `serde_default_fn!`
+/// for the named provider. Call [`ConformerEncoderConfig::finish_nemo_config`] after
+/// deserializing so the placeholder fields in `nemo_conv_settings` get real values.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct ConformerEncoderConfig {
+    /// Required. Also used as the fallback for `nemo_conv_settings.feat_in`.
+    pub input_size: usize,
+    /// Required. Signed, so negative values can be represented.
+    pub chunk_size: i32,
+    /// Required.
+    pub left_chunk: usize,
+    /// Optional.
+    pub num_lang: Option<usize>,
+    /// Defaults to `256`. Also used as the fallback for `nemo_conv_settings.feat_out`.
+    #[serde(default = "default_attention_dim")]
+    pub attention_dim: usize,
+    /// Defaults to `4`.
+    #[serde(default = "default_attention_heads")]
+    pub attention_heads: usize,
+    /// Defaults to `2048`.
+    #[serde(default = "default_linear_units")]
+    pub linear_units: usize,
+    /// Defaults to `6`.
+    #[serde(default = "default_num_blocks")]
+    pub num_blocks: usize,
+    /// Defaults to `"nemo_conv"`.
+    #[serde(default = "default_input_layer")]
+    pub input_layer: String,
+    /// Defaults to `true`.
+    #[serde(default = "default_causal")]
+    pub causal: bool,
+    /// Defaults to `false`.
+    #[serde(default = "default_batch_norm")]
+    pub batch_norm: bool,
+    /// Defaults to `0`.
+    #[serde(default = "default_ext_pw_out_channel")]
+    pub ext_pw_out_channel: usize,
+    /// Defaults to `1`.
+    #[serde(default = "default_ext_pw_kernel_size")]
+    pub ext_pw_kernel_size: usize,
+    /// Defaults to `256`. The spelling "seperable" is the serialized key, so it must
+    /// not be corrected without also adding a serde rename.
+    #[serde(default = "default_depthwise_seperable_out_channel")]
+    pub depthwise_seperable_out_channel: usize,
+    /// Defaults to `1`.
+    #[serde(default = "default_depthwise_multiplier")]
+    pub depthwise_multiplier: usize,
+    /// Defaults to `0`.
+    #[serde(default = "default_chunk_se")]
+    pub chunk_se: usize,
+    /// Defaults to `3`.
+    #[serde(default = "default_kernel_size")]
+    pub kernel_size: usize,
+    /// Defaults to `Activation::Relu`.
+    #[serde(default = "default_activation")]
+    pub activation: Activation,
+    /// Defaults to `Activation::Relu`.
+    #[serde(default = "default_conv_activation")]
+    pub conv_activation: Activation,
+    /// Defaults to `Activation::Sigmoid`.
+    #[serde(default = "default_conv_glu_type")]
+    pub conv_glu_type: Activation,
+    /// Defaults to `true`.
+    #[serde(default = "default_bias_in_glu")]
+    pub bias_in_glu: bool,
+    /// Defaults to `false`.
+    #[serde(default = "default_linear_glu_in_convm")]
+    pub linear_glu_in_convm: bool,
+    /// Defaults to `"swish"`. Kept as a free-form string rather than an `Activation`.
+    #[serde(default = "default_attention_glu_type")]
+    pub attention_glu_type: String,
+    /// Defaults to `false`.
+    #[serde(default = "default_export")]
+    pub export: bool,
+    /// Defaults to `-1`.
+    #[serde(default = "default_extra_layer_output_idx")]
+    pub extra_layer_output_idx: i32,
+    /// Optional.
+    pub relative_attention_bias_args: Option<RelativeAttentionBiasArgs>,
+    /// Defaults to `4`. Also used as the fallback for
+    /// `nemo_conv_settings.subsampling_factor`.
+    #[serde(default = "default_time_reduction")]
+    pub time_reduction: usize,
+    /// Required (the key must be present), although every field inside has a default.
+    pub nemo_conv_settings: NemoConvConfig,
+    /// Defaults to `false`.
+    #[serde(default = "default_replication_pad_for_subsample_embedding")]
+    pub replication_pad_for_subsample_embedding: bool,
+    /// Defaults to `1`.
+    #[serde(default = "default_attention_group_size")]
+    pub attention_group_size: usize,
+    /// Optional.
+    pub encoder_embedding_config: Option<EncoderEmbeddingConfig>,
+}
+
+impl ConformerEncoderConfig {
+    /// Resolves the placeholder fields of `nemo_conv_settings`.
+    ///
+    /// A field still holding `usize::MAX` is treated as "not provided" and is replaced
+    /// by the corresponding encoder-level value:
+    /// `subsampling_factor` <- `time_reduction`, `feat_in` <- `input_size`,
+    /// `feat_out` <- `attention_dim`. Any other value is left untouched.
+    pub fn finish_nemo_config(&mut self) {
+        /// Writes `fallback` into `slot` only if `slot` still holds the placeholder.
+        fn fill_if_unset(slot: &mut usize, fallback: usize) {
+            if *slot == usize::MAX {
+                *slot = fallback;
+            }
+        }
+
+        // Copy the fallbacks out first so the mutable borrow below stays local to
+        // the nested settings struct.
+        let (factor, feat_in, feat_out) =
+            (self.time_reduction, self.input_size, self.attention_dim);
+
+        let nemo = &mut self.nemo_conv_settings;
+        fill_if_unset(&mut nemo.subsampling_factor, factor);
+        fill_if_unset(&mut nemo.feat_in, feat_in);
+        fill_if_unset(&mut nemo.feat_out, feat_out);
+    }
+}