From 3b9ce4d8a5a5aa5d32035af6ce539aaad44bd6ed Mon Sep 17 00:00:00 2001 From: Eric Buehler Date: Sun, 13 Apr 2025 10:15:01 +0000 Subject: [PATCH 1/2] Serialize sharded uqff files --- mistralrs-core/src/pipeline/isq.rs | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/mistralrs-core/src/pipeline/isq.rs b/mistralrs-core/src/pipeline/isq.rs index 229ca2a707..32cbefde30 100644 --- a/mistralrs-core/src/pipeline/isq.rs +++ b/mistralrs-core/src/pipeline/isq.rs @@ -27,6 +27,8 @@ use tracing::{info, warn}; use crate::{device_map::DeviceMapper, topology::LayerTopology, Topology}; pub(crate) const UQFF_RESIDUAL_SAFETENSORS: &str = "residual.safetensors"; +// 10 GB max per file +const MAX_UQFF_SIZE_BYTES: usize = 10 * 1024 * 1024 * 1024; /// Parse ISQ value. /// @@ -596,6 +598,7 @@ pub trait IsqModel { .collect::>>() } }); + let quantized_values = quantized_values?; let parent = serialized .parent() @@ -603,7 +606,31 @@ pub trait IsqModel { std::fs::create_dir_all(parent)?; - safetensors::serialize_to_file(quantized_values?, &None, serialized)?; + let file_stem = serialized + .file_stem() + .context("Target UQFF path must have a file stem!")? + .to_string_lossy() + .to_string(); + + let size_estimate_bytes = quantized_values + .iter() + .map(|(_, x)| x.elem_count() * x.dtype().size_in_bytes()) + .sum::(); + let n_files = size_estimate_bytes.div_ceil(MAX_UQFF_SIZE_BYTES); + + if n_files == 1 { + info!("Serializing to `{}`", serialized.display()); + safetensors::serialize_to_file(quantized_values, &None, serialized)?; + } else { + let chunksize = quantized_values.len() / n_files; + let quantized_values_chunks = quantized_values.into_iter().chunks(chunksize); + for (i, chunk) in quantized_values_chunks.into_iter().enumerate() { + let mut name = parent.to_path_buf(); + name.push(format!("{file_stem}-{i}.uqff")); + info!("Serializing shard {i} to `{}`", name.display()); + safetensors::serialize_to_file(chunk, &None, &name)?; + } + } let residual = match organization { IsqOrganization::Default => self.residual_tensors(), From 61d0d1f6fd509c2b0093c395bfe68d1eb3689836 Mon Sep 17 00:00:00 2001 From: Eric Buehler Date: Sun, 13 Apr 2025 10:59:31 +0000 Subject: [PATCH 2/2] Loading --- mistralrs-core/src/lib.rs | 2 +- mistralrs-core/src/model_loader.rs | 31 +++++++++++++++++--- mistralrs-core/src/model_selected.rs | 16 +++++------ mistralrs-core/src/pipeline/isq.rs | 9 +++--- mistralrs-core/src/pipeline/macros.rs | 8 ++++-- mistralrs-core/src/pipeline/mod.rs | 2 +- mistralrs-core/src/pipeline/normal.rs | 6 ++-- mistralrs-core/src/pipeline/vision.rs | 6 ++-- mistralrs-core/src/toml_selector.rs | 40 +++++++++++++++++++------- mistralrs-pyo3/API.md | 21 ++++++++++++-- mistralrs-pyo3/mistralrs.pyi | 4 +++ mistralrs-pyo3/src/lib.rs | 29 ++++++++++++++++--- mistralrs-pyo3/src/which.rs | 8 +++--- mistralrs/examples/uqff/main.rs | 2 +- mistralrs/examples/uqff_vision/main.rs | 2 +- mistralrs/src/text_model.rs | 6 ++-- mistralrs/src/vision_model.rs | 10 +++++-- 17 files changed, 147 insertions(+), 55 deletions(-) diff --git a/mistralrs-core/src/lib.rs b/mistralrs-core/src/lib.rs index 79f02a5382..2d5833e10c 100644 --- a/mistralrs-core/src/lib.rs +++ b/mistralrs-core/src/lib.rs @@ -92,7 +92,7 @@ pub use pipeline::{ NormalLoaderBuilder, NormalLoaderType, NormalSpecificConfig, Phi2Loader, Phi3Loader, Phi3VLoader, Qwen2Loader, SpeculativeConfig, SpeculativeLoader, SpeculativePipeline, Starcoder2Loader, TokenSource, VisionLoader, VisionLoaderBuilder, VisionLoaderType, - 
VisionPromptPrefixer, VisionSpecificConfig, + VisionPromptPrefixer, VisionSpecificConfig, UQFF_MULTI_FILE_DELIMITER, }; pub use request::{ ApproximateUserLocation, Constraint, DetokenizationRequest, ImageGenerationResponseFormat, diff --git a/mistralrs-core/src/model_loader.rs b/mistralrs-core/src/model_loader.rs index 2c259621e9..64faca43ce 100644 --- a/mistralrs-core/src/model_loader.rs +++ b/mistralrs-core/src/model_loader.rs @@ -1,6 +1,8 @@ use std::{ fs::{self, File}, num::NonZeroUsize, + path::PathBuf, + str::FromStr, }; use mistralrs_quant::MULTI_LORA_DELIMITER; @@ -12,6 +14,7 @@ use crate::{ AutoDeviceMapParams, DiffusionLoaderBuilder, DiffusionSpecificConfig, GGUFSpecificConfig, Loader, ModelDType, ModelSelected, NormalLoaderBuilder, TomlLoaderArgs, TomlSelector, Topology, VisionLoaderBuilder, VisionSpecificConfig, GGUF_MULTI_FILE_DELIMITER, + UQFF_MULTI_FILE_DELIMITER, }; /// A builder for a loader using the selected model. @@ -222,7 +225,12 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result>() + }), imatrix, calibration_file, hf_cache_path, @@ -255,7 +263,12 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result>() + }), imatrix: None, calibration_file: None, hf_cache_path, @@ -295,7 +308,12 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result>() + }), imatrix: None, calibration_file: None, hf_cache_path, @@ -512,7 +530,12 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result>() + }), max_edge, calibration_file, imatrix, diff --git a/mistralrs-core/src/model_selected.rs b/mistralrs-core/src/model_selected.rs index de1aa19fd8..02b9d7029a 100644 --- a/mistralrs-core/src/model_selected.rs +++ b/mistralrs-core/src/model_selected.rs @@ -63,9 +63,9 @@ pub enum ModelSelected { #[arg(short, long)] write_uqff: Option, - /// UQFF path to load from. If provided, this takes precedence over applying ISQ. + /// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;) #[arg(short, long)] - from_uqff: Option, + from_uqff: Option, /// .imatrix file to enhance GGUF quantizations with. /// Incompatible with `--calibration-file/-c` @@ -129,9 +129,9 @@ pub enum ModelSelected { #[arg(short, long)] write_uqff: Option, - /// UQFF path to load from. If provided, this takes precedence over applying ISQ. + /// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;). #[arg(short, long)] - from_uqff: Option, + from_uqff: Option, /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit. #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)] @@ -176,9 +176,9 @@ pub enum ModelSelected { #[arg(short, long)] write_uqff: Option, - /// UQFF path to load from. If provided, this takes precedence over applying ISQ. + /// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;). #[arg(short, long)] - from_uqff: Option, + from_uqff: Option, /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit. #[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)] @@ -486,9 +486,9 @@ pub enum ModelSelected { #[arg(short, long)] write_uqff: Option, - /// UQFF path to load from. If provided, this takes precedence over applying ISQ. + /// UQFF path to load from. 
If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;). #[arg(short, long)] - from_uqff: Option, + from_uqff: Option, /// Automatically resize and pad images to this maximum edge length. Aspect ratio is preserved. /// This is only supported on the Qwen2-VL and Idefics models. Others handle this internally. diff --git a/mistralrs-core/src/pipeline/isq.rs b/mistralrs-core/src/pipeline/isq.rs index 32cbefde30..33533431b8 100644 --- a/mistralrs-core/src/pipeline/isq.rs +++ b/mistralrs-core/src/pipeline/isq.rs @@ -29,6 +29,7 @@ use crate::{device_map::DeviceMapper, topology::LayerTopology, Topology}; pub(crate) const UQFF_RESIDUAL_SAFETENSORS: &str = "residual.safetensors"; // 10 GB max per file const MAX_UQFF_SIZE_BYTES: usize = 10 * 1024 * 1024 * 1024; +pub const UQFF_MULTI_FILE_DELIMITER: &str = ";"; /// Parse ISQ value. /// @@ -619,7 +620,7 @@ pub trait IsqModel { let n_files = size_estimate_bytes.div_ceil(MAX_UQFF_SIZE_BYTES); if n_files == 1 { - info!("Serializing to `{}`", serialized.display()); + info!("Writing to `{}`", serialized.display()); safetensors::serialize_to_file(quantized_values, &None, serialized)?; } else { let chunksize = quantized_values.len() / n_files; @@ -627,7 +628,7 @@ pub trait IsqModel { for (i, chunk) in quantized_values_chunks.into_iter().enumerate() { let mut name = parent.to_path_buf(); name.push(format!("{file_stem}-{i}.uqff")); - info!("Serializing shard {i} to `{}`", name.display()); + info!("Writing shard {i} to `{}`", name.display()); safetensors::serialize_to_file(chunk, &None, &name)?; } } @@ -727,7 +728,7 @@ pub trait IsqModel { device: Device, topology: Option<&Topology>, silent: bool, - artifacts: &PathBuf, + artifacts: &[PathBuf], ) -> candle_core::Result<()> { let (tensors, mapper) = self.get_layers(); let total_tensors = tensors.len(); @@ -764,7 +765,7 @@ pub trait IsqModel { comms.push(mapper.get_comm_for(layer_num.unwrap_or(0))?) } - let artifacts = unsafe { candle_core::safetensors::MmapedSafetensors::new(artifacts)? }; + let artifacts = unsafe { candle_core::safetensors::MmapedSafetensors::multi(artifacts)? }; let artifact_isqs = artifacts .tensors() diff --git a/mistralrs-core/src/pipeline/macros.rs b/mistralrs-core/src/pipeline/macros.rs index ed2e350208..c633b1d49c 100644 --- a/mistralrs-core/src/pipeline/macros.rs +++ b/mistralrs-core/src/pipeline/macros.rs @@ -217,9 +217,13 @@ macro_rules! 
get_uqff_paths { revision.clone(), )); - let file = $from_uqff.display().to_string(); + let mut files = Vec::new(); + for file in $from_uqff { + let file = file.display().to_string(); - api_get_file!(api, &file, Path::new(&$this.model_id)) + files.push(api_get_file!(api, &file, Path::new(&$this.model_id))); + } + files }}; } diff --git a/mistralrs-core/src/pipeline/mod.rs b/mistralrs-core/src/pipeline/mod.rs index b74504f4cc..2b088f3710 100644 --- a/mistralrs-core/src/pipeline/mod.rs +++ b/mistralrs-core/src/pipeline/mod.rs @@ -30,7 +30,7 @@ pub use gguf::{GGUFLoader, GGUFLoaderBuilder, GGUFSpecificConfig}; use image::DynamicImage; pub use inputs_processor::InputProcessorOutput; pub(crate) use isq::IsqModelLoader; -pub use isq::{parse_isq_value, IsqModel, IsqOrganization}; +pub use isq::{parse_isq_value, IsqModel, IsqOrganization, UQFF_MULTI_FILE_DELIMITER}; pub use loaders::{ AdapterKind, AutoDeviceMapParams, AutoLoader, DeepSeekV2Loader, DeepSeekV3Loader, DeviceMappedModelLoader, DiffusionLoaderType, DiffusionModel, DiffusionModelLoader, FluxLoader, diff --git a/mistralrs-core/src/pipeline/normal.rs b/mistralrs-core/src/pipeline/normal.rs index f255bb8780..f0f9ebf936 100644 --- a/mistralrs-core/src/pipeline/normal.rs +++ b/mistralrs-core/src/pipeline/normal.rs @@ -92,7 +92,7 @@ pub struct NormalLoader { tgt_non_granular_index: Option, token_source: RwLock>, revision: RwLock>, - from_uqff: RwLock>, + from_uqff: RwLock>>, jinja_explicit: Option, hf_cache_path: Option, } @@ -122,7 +122,7 @@ pub struct NormalSpecificConfig { pub topology: Option, pub organization: IsqOrganization, pub write_uqff: Option, - pub from_uqff: Option, + pub from_uqff: Option>, pub imatrix: Option, pub calibration_file: Option, pub hf_cache_path: Option, @@ -349,7 +349,7 @@ impl Loader for NormalLoader { if let Some(serialized) = &*self.from_uqff.read().unwrap() { let weight_pack_factor = { let ser_artifacts = unsafe { - candle_core::safetensors::MmapedSafetensors::new(serialized)? + candle_core::safetensors::MmapedSafetensors::multi(serialized)? }; let mut total_pack_factors = 0; let total_tensors = ser_artifacts.tensors().len(); diff --git a/mistralrs-core/src/pipeline/vision.rs b/mistralrs-core/src/pipeline/vision.rs index be6476faf2..df55a0a2c9 100644 --- a/mistralrs-core/src/pipeline/vision.rs +++ b/mistralrs-core/src/pipeline/vision.rs @@ -87,7 +87,7 @@ pub struct VisionLoader { xlora_order: Option, token_source: RwLock>, revision: RwLock>, - from_uqff: RwLock>, + from_uqff: RwLock>>, jinja_explicit: Option, hf_cache_path: Option, lora_adapter_ids: Option>, @@ -113,7 +113,7 @@ pub struct VisionSpecificConfig { pub prompt_chunksize: Option, pub topology: Option, pub write_uqff: Option, - pub from_uqff: Option, + pub from_uqff: Option>, pub max_edge: Option, pub imatrix: Option, pub calibration_file: Option, @@ -286,7 +286,7 @@ impl Loader for VisionLoader { if let Some(serialized) = &*self.from_uqff.read().unwrap() { let weight_pack_factor = { let ser_artifacts = unsafe { - candle_core::safetensors::MmapedSafetensors::new(serialized)? + candle_core::safetensors::MmapedSafetensors::multi(serialized)? 
}; let mut total_pack_factors = 0; let total_tensors = ser_artifacts.tensors().len(); diff --git a/mistralrs-core/src/toml_selector.rs b/mistralrs-core/src/toml_selector.rs index b9f6221310..15d53537ee 100644 --- a/mistralrs-core/src/toml_selector.rs +++ b/mistralrs-core/src/toml_selector.rs @@ -1,4 +1,4 @@ -use std::{fs::File, num::NonZeroUsize, path::PathBuf}; +use std::{fs::File, num::NonZeroUsize, path::PathBuf, str::FromStr}; use mistralrs_quant::MULTI_LORA_DELIMITER; use serde::Deserialize; @@ -8,7 +8,7 @@ use crate::{ GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoaderBuilder, GGUFSpecificConfig, Loader, ModelDType, NormalLoaderBuilder, NormalLoaderType, NormalSpecificConfig, SpeculativeConfig, SpeculativeLoader, Topology, VisionLoaderBuilder, VisionLoaderType, VisionSpecificConfig, - GGUF_MULTI_FILE_DELIMITER, + GGUF_MULTI_FILE_DELIMITER, UQFF_MULTI_FILE_DELIMITER, }; fn default_one() -> usize { @@ -64,7 +64,7 @@ pub enum TomlModelSelected { write_uqff: Option, /// UQFF path to load from. If provided, this takes precedence over applying ISQ. - from_uqff: Option, + from_uqff: Option, /// .imatrix file to enhance GGUF quantizations with. /// Incompatible with `--imatrix/-i` @@ -115,7 +115,7 @@ pub enum TomlModelSelected { write_uqff: Option, /// UQFF path to load from. If provided, this takes precedence over applying ISQ. - from_uqff: Option, + from_uqff: Option, /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit. #[serde(default = "default_max_seq_len")] @@ -151,7 +151,7 @@ pub enum TomlModelSelected { write_uqff: Option, /// UQFF path to load from. If provided, this takes precedence over applying ISQ. - from_uqff: Option, + from_uqff: Option, /// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit. #[serde(default = "default_max_seq_len")] @@ -407,7 +407,7 @@ pub enum TomlModelSelected { write_uqff: Option, /// UQFF path to load from. If provided, this takes precedence over applying ISQ. - from_uqff: Option, + from_uqff: Option, /// Automatically resize and pad images to this maximum edge length. Aspect ratio is preserved. /// This is only supported on the Qwen2-VL and Idefics 2 models. Others handle this internally. 
@@ -613,7 +613,12 @@ fn loader_from_selected( topology: Topology::from_option_path(topology)?, organization: organization.unwrap_or_default(), write_uqff, - from_uqff, + from_uqff: from_uqff.map(|x| { + x.split(UQFF_MULTI_FILE_DELIMITER) + .map(PathBuf::from_str) + .map(|x| x.unwrap()) + .collect::>() + }), imatrix, calibration_file, hf_cache_path, @@ -645,7 +650,12 @@ fn loader_from_selected( topology: Topology::from_option_path(topology)?, organization: Default::default(), write_uqff, - from_uqff, + from_uqff: from_uqff.map(|x| { + x.split(UQFF_MULTI_FILE_DELIMITER) + .map(PathBuf::from_str) + .map(|x| x.unwrap()) + .collect::>() + }), imatrix: None, calibration_file: None, hf_cache_path, @@ -684,7 +694,12 @@ fn loader_from_selected( topology: Topology::from_option_path(topology)?, organization: Default::default(), write_uqff, - from_uqff, + from_uqff: from_uqff.map(|x| { + x.split(UQFF_MULTI_FILE_DELIMITER) + .map(PathBuf::from_str) + .map(|x| x.unwrap()) + .collect::>() + }), imatrix: None, calibration_file: None, hf_cache_path, @@ -907,7 +922,12 @@ fn loader_from_selected( prompt_chunksize: args.prompt_chunksize, topology: Topology::from_option_path(topology)?, write_uqff, - from_uqff, + from_uqff: from_uqff.map(|x| { + x.split(UQFF_MULTI_FILE_DELIMITER) + .map(PathBuf::from_str) + .map(|x| x.unwrap()) + .collect::>() + }), max_edge, calibration_file, imatrix, diff --git a/mistralrs-pyo3/API.md b/mistralrs-pyo3/API.md index a26ad661d4..d3e6249fc9 100644 --- a/mistralrs-pyo3/API.md +++ b/mistralrs-pyo3/API.md @@ -54,6 +54,8 @@ If you do not specify the architecture, an attempt will be made to use the model - `Default` - `MoQE`: if applicable, only quantize MoE experts. https://arxiv.org/abs/2310.02410 +> Note: `from_uqff` specified a UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;). 
+ ```py class Which(Enum): @dataclass @@ -62,9 +64,11 @@ class Which(Enum): arch: Architecture | None = None tokenizer_json: str | None = None topology: str | None = None - organization: IsqOrganization | None = None + organization: str | None = None + from_uqff: str | list[str] | None = None write_uqff: str | None = None dtype: ModelDType = ModelDType.Auto + auto_map_params: TextAutoMapParams | None = (None,) calibration_file: str | None = None imatrix: str | None = None hf_cache_path: str | None = None @@ -78,20 +82,23 @@ class Which(Enum): tokenizer_json: str | None = None tgt_non_granular_index: int | None = None topology: str | None = None + from_uqff: str | list[str] | None = None write_uqff: str | None = None dtype: ModelDType = ModelDType.Auto + auto_map_params: TextAutoMapParams | None = (None,) hf_cache_path: str | None = None @dataclass class Lora: - adapters_model_id: str - order: str + adapter_model_id: str arch: Architecture | None = None model_id: str | None = None tokenizer_json: str | None = None topology: str | None = None + from_uqff: str | list[str] | None = None write_uqff: str | None = None dtype: ModelDType = ModelDType.Auto + auto_map_params: TextAutoMapParams | None = (None,) hf_cache_path: str | None = None @dataclass @@ -101,6 +108,7 @@ class Which(Enum): tok_model_id: str | None = None topology: str | None = None dtype: ModelDType = ModelDType.Auto + auto_map_params: TextAutoMapParams | None = (None,) @dataclass class XLoraGGUF: @@ -112,6 +120,7 @@ class Which(Enum): tgt_non_granular_index: int | None = None topology: str | None = None dtype: ModelDType = ModelDType.Auto + auto_map_params: TextAutoMapParams | None = (None,) @dataclass class LoraGGUF: @@ -122,6 +131,7 @@ class Which(Enum): tok_model_id: str | None = None topology: str | None = None dtype: ModelDType = ModelDType.Auto + auto_map_params: TextAutoMapParams | None = (None,) @dataclass class GGML: @@ -132,6 +142,7 @@ class Which(Enum): gqa: int | None = None topology: str | None = None dtype: ModelDType = ModelDType.Auto + auto_map_params: TextAutoMapParams | None = (None,) @dataclass class XLoraGGML: @@ -145,6 +156,7 @@ class Which(Enum): gqa: int | None = None topology: str | None = None dtype: ModelDType = ModelDType.Auto + auto_map_params: TextAutoMapParams | None = (None,) @dataclass class LoraGGML: @@ -156,6 +168,7 @@ class Which(Enum): tokenizer_json: str | None = None topology: str | None = None dtype: ModelDType = ModelDType.Auto + auto_map_params: TextAutoMapParams | None = (None,) @dataclass class VisionPlain: @@ -163,9 +176,11 @@ class Which(Enum): arch: VisionArchitecture tokenizer_json: str | None = None topology: str | None = None + from_uqff: str | list[str] | None = None write_uqff: str | None = None dtype: ModelDType = ModelDType.Auto max_edge: int | None = None + auto_map_params: VisionAutoMapParams | None = (None,) calibration_file: str | None = None imatrix: str | None = None hf_cache_path: str | None = None diff --git a/mistralrs-pyo3/mistralrs.pyi b/mistralrs-pyo3/mistralrs.pyi index d13d0e2b6c..61f0b97a38 100644 --- a/mistralrs-pyo3/mistralrs.pyi +++ b/mistralrs-pyo3/mistralrs.pyi @@ -178,6 +178,7 @@ class Which(Enum): tokenizer_json: str | None = None topology: str | None = None organization: str | None = None + from_uqff: str | list[str] | None = None write_uqff: str | None = None dtype: ModelDType = ModelDType.Auto auto_map_params: TextAutoMapParams | None = (None,) @@ -194,6 +195,7 @@ class Which(Enum): tokenizer_json: str | None = None tgt_non_granular_index: int | None = 
None topology: str | None = None + from_uqff: str | list[str] | None = None write_uqff: str | None = None dtype: ModelDType = ModelDType.Auto auto_map_params: TextAutoMapParams | None = (None,) @@ -206,6 +208,7 @@ class Which(Enum): model_id: str | None = None tokenizer_json: str | None = None topology: str | None = None + from_uqff: str | list[str] | None = None write_uqff: str | None = None dtype: ModelDType = ModelDType.Auto auto_map_params: TextAutoMapParams | None = (None,) @@ -286,6 +289,7 @@ class Which(Enum): arch: VisionArchitecture tokenizer_json: str | None = None topology: str | None = None + from_uqff: str | list[str] | None = None write_uqff: str | None = None dtype: ModelDType = ModelDType.Auto max_edge: int | None = None diff --git a/mistralrs-pyo3/src/lib.rs b/mistralrs-pyo3/src/lib.rs index b2e660ea47..0af4d9a569 100644 --- a/mistralrs-pyo3/src/lib.rs +++ b/mistralrs-pyo3/src/lib.rs @@ -10,6 +10,7 @@ use serde_json::Value; use std::{ cell::RefCell, num::NonZeroUsize, + path::PathBuf, str::FromStr, sync::{Arc, Mutex, OnceLock}, }; @@ -105,7 +106,12 @@ fn parse_which( topology: Topology::from_option_path(topology)?, organization: organization.map(Into::into).unwrap_or(Default::default()), write_uqff, - from_uqff, + from_uqff: from_uqff.map(|x| { + x.right_or_else(|l| vec![l]) + .iter() + .map(|x| PathBuf::from_str(x).unwrap()) + .collect::>() + }), imatrix, calibration_file, hf_cache_path, @@ -137,7 +143,12 @@ fn parse_which( topology: Topology::from_option_path(topology)?, organization: Default::default(), write_uqff, - from_uqff, + from_uqff: from_uqff.map(|x| { + x.right_or_else(|l| vec![l]) + .iter() + .map(|x| PathBuf::from_str(x).unwrap()) + .collect::>() + }), imatrix: None, calibration_file: None, hf_cache_path, @@ -176,7 +187,12 @@ fn parse_which( topology: Topology::from_option_path(topology)?, organization: Default::default(), write_uqff, - from_uqff, + from_uqff: from_uqff.map(|x| { + x.right_or_else(|l| vec![l]) + .iter() + .map(|x| PathBuf::from_str(x).unwrap()) + .collect::>() + }), imatrix: None, calibration_file: None, hf_cache_path, @@ -382,7 +398,12 @@ fn parse_which( prompt_chunksize, topology: Topology::from_option_path(topology)?, write_uqff, - from_uqff, + from_uqff: from_uqff.map(|x| { + x.right_or_else(|l| vec![l]) + .iter() + .map(|x| PathBuf::from_str(x).unwrap()) + .collect::>() + }), max_edge, calibration_file, imatrix, diff --git a/mistralrs-pyo3/src/which.rs b/mistralrs-pyo3/src/which.rs index d92e00d90d..6cf2517e76 100644 --- a/mistralrs-pyo3/src/which.rs +++ b/mistralrs-pyo3/src/which.rs @@ -193,7 +193,7 @@ pub enum Which { topology: Option, organization: Option, write_uqff: Option, - from_uqff: Option, + from_uqff: Option>>, dtype: ModelDType, imatrix: Option, calibration_file: Option, @@ -224,7 +224,7 @@ pub enum Which { tgt_non_granular_index: Option, topology: Option, write_uqff: Option, - from_uqff: Option, + from_uqff: Option>>, dtype: ModelDType, auto_map_params: Option, hf_cache_path: Option, @@ -249,7 +249,7 @@ pub enum Which { tokenizer_json: Option, topology: Option, write_uqff: Option, - from_uqff: Option, + from_uqff: Option>>, dtype: ModelDType, auto_map_params: Option, hf_cache_path: Option, @@ -411,7 +411,7 @@ pub enum Which { tokenizer_json: Option, topology: Option, write_uqff: Option, - from_uqff: Option, + from_uqff: Option>>, dtype: ModelDType, max_edge: Option, calibration_file: Option, diff --git a/mistralrs/examples/uqff/main.rs b/mistralrs/examples/uqff/main.rs index 0d517ad34e..bb120e4f20 100644 --- 
a/mistralrs/examples/uqff/main.rs
+++ b/mistralrs/examples/uqff/main.rs
@@ -8,7 +8,7 @@ use mistralrs::{
 async fn main() -> Result<()> {
     let model = UqffTextModelBuilder::new(
         "EricB/Phi-3.5-mini-instruct-UQFF",
-        "phi3.5-mini-instruct-q8_0.uqff".into(),
+        vec!["phi3.5-mini-instruct-q8_0.uqff".into()],
     )
     .into_inner()
     .with_isq(IsqType::Q8_0)
diff --git a/mistralrs/examples/uqff_vision/main.rs b/mistralrs/examples/uqff_vision/main.rs
index 9f985d8d59..4568a608ae 100644
--- a/mistralrs/examples/uqff_vision/main.rs
+++ b/mistralrs/examples/uqff_vision/main.rs
@@ -8,7 +8,7 @@ async fn main() -> Result<()> {
     let model = UqffVisionModelBuilder::new(
         "EricB/Phi-3.5-vision-instruct-UQFF",
         VisionLoaderType::Phi3V,
-        "phi3.5-vision-instruct-q8_0.uqff".into(),
+        vec!["phi3.5-vision-instruct-q8_0.uqff".into()],
     )
     .into_inner()
     .with_isq(IsqType::Q4K)
diff --git a/mistralrs/src/text_model.rs b/mistralrs/src/text_model.rs
index c9009ce945..0d772c8b97 100644
--- a/mistralrs/src/text_model.rs
+++ b/mistralrs/src/text_model.rs
@@ -15,7 +15,7 @@ pub struct TextModelBuilder {
     pub(crate) token_source: TokenSource,
     pub(crate) hf_revision: Option<String>,
     pub(crate) write_uqff: Option<PathBuf>,
-    pub(crate) from_uqff: Option<PathBuf>,
+    pub(crate) from_uqff: Option<Vec<PathBuf>>,
     pub(crate) imatrix: Option<PathBuf>,
     pub(crate) calibration_file: Option<PathBuf>,
     pub(crate) chat_template: Option<String>,
@@ -261,7 +261,7 @@ impl TextModelBuilder {
     }

     /// Path to read a UQFF file from.
-    pub fn from_uqff(mut self, path: PathBuf) -> Self {
+    pub fn from_uqff(mut self, path: Vec<PathBuf>) -> Self {
         self.from_uqff = Some(path);
         self
     }
@@ -375,7 +375,7 @@ impl UqffTextModelBuilder {
     /// - Maximum number of sequences running is 32
     /// - Number of sequences to hold in prefix cache is 16.
     /// - Automatic device mapping with model defaults according to `AutoDeviceMapParams`
-    pub fn new(model_id: impl ToString, uqff_file: PathBuf) -> Self {
+    pub fn new(model_id: impl ToString, uqff_file: Vec<PathBuf>) -> Self {
         let mut inner = TextModelBuilder::new(model_id);
         inner = inner.from_uqff(uqff_file);
         Self(inner)
diff --git a/mistralrs/src/vision_model.rs b/mistralrs/src/vision_model.rs
index b4476f759a..f827d60b66 100644
--- a/mistralrs/src/vision_model.rs
+++ b/mistralrs/src/vision_model.rs
@@ -15,7 +15,7 @@ pub struct VisionModelBuilder {
     pub(crate) token_source: TokenSource,
     pub(crate) hf_revision: Option<String>,
     pub(crate) write_uqff: Option<PathBuf>,
-    pub(crate) from_uqff: Option<PathBuf>,
+    pub(crate) from_uqff: Option<Vec<PathBuf>>,
     pub(crate) calibration_file: Option<PathBuf>,
     pub(crate) imatrix: Option<PathBuf>,
     pub(crate) chat_template: Option<String>,
@@ -191,7 +191,7 @@ impl VisionModelBuilder {
     }

     /// Path to read a UQFF file from.
-    pub fn from_uqff(mut self, path: PathBuf) -> Self {
+    pub fn from_uqff(mut self, path: Vec<PathBuf>) -> Self {
         self.from_uqff = Some(path);
         self
     }
@@ -305,7 +305,11 @@ impl UqffVisionModelBuilder {
     /// - Token source is from the cache (.cache/huggingface/token)
     /// - Maximum number of sequences running is 32
     /// - Automatic device mapping with model defaults according to `AutoDeviceMapParams`
-    pub fn new(model_id: impl ToString, loader_type: VisionLoaderType, uqff_file: PathBuf) -> Self {
+    pub fn new(
+        model_id: impl ToString,
+        loader_type: VisionLoaderType,
+        uqff_file: Vec<PathBuf>,
+    ) -> Self {
         let mut inner = VisionModelBuilder::new(model_id, loader_type);
         inner = inner.from_uqff(uqff_file);
         Self(inner)
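
A minimal usage sketch of the new multi-file loading path: every shard is passed to the builder, while the CLI, TOML selector, and Python API express the same thing by joining paths with the new `;` delimiter (e.g. `--from-uqff "a.uqff;b.uqff"`). The shard file names below are hypothetical, assuming the `<file-stem>-<index>.uqff` naming scheme introduced in patch 1 (the exact names depend on what `--write-uqff` produced); the chat-request portion uses the existing `mistralrs` high-level API.

```rust
use anyhow::Result;
use mistralrs::{TextMessageRole, TextMessages, UqffTextModelBuilder};

#[tokio::main]
async fn main() -> Result<()> {
    // Hypothetical shard names: the writer emits `<stem>-<i>.uqff` files once the
    // size estimate exceeds the 10 GB per-file limit (MAX_UQFF_SIZE_BYTES).
    let model = UqffTextModelBuilder::new(
        "EricB/Phi-3.5-mini-instruct-UQFF",
        vec![
            "phi3.5-mini-instruct-q8_0-0.uqff".into(),
            "phi3.5-mini-instruct-q8_0-1.uqff".into(),
        ],
    )
    .into_inner()
    .build()
    .await?;

    // Send a simple chat request against the sharded UQFF model.
    let messages = TextMessages::new().add_message(TextMessageRole::User, "What is UQFF?");
    let response = model.send_chat_request(messages).await?;
    println!("{}", response.choices[0].message.content.as_deref().unwrap_or(""));
    Ok(())
}
```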