Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion mistralrs-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ pub use pipeline::{
NormalLoaderBuilder, NormalLoaderType, NormalSpecificConfig, Phi2Loader, Phi3Loader,
Phi3VLoader, Qwen2Loader, SpeculativeConfig, SpeculativeLoader, SpeculativePipeline,
Starcoder2Loader, TokenSource, VisionLoader, VisionLoaderBuilder, VisionLoaderType,
VisionPromptPrefixer, VisionSpecificConfig,
VisionPromptPrefixer, VisionSpecificConfig, UQFF_MULTI_FILE_DELIMITER,
};
pub use request::{
ApproximateUserLocation, Constraint, DetokenizationRequest, ImageGenerationResponseFormat,
Expand Down
31 changes: 27 additions & 4 deletions mistralrs-core/src/model_loader.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use std::{
fs::{self, File},
num::NonZeroUsize,
path::PathBuf,
str::FromStr,
};

use mistralrs_quant::MULTI_LORA_DELIMITER;
Expand All @@ -12,6 +14,7 @@ use crate::{
AutoDeviceMapParams, DiffusionLoaderBuilder, DiffusionSpecificConfig, GGUFSpecificConfig,
Loader, ModelDType, ModelSelected, NormalLoaderBuilder, TomlLoaderArgs, TomlSelector, Topology,
VisionLoaderBuilder, VisionSpecificConfig, GGUF_MULTI_FILE_DELIMITER,
UQFF_MULTI_FILE_DELIMITER,
};

/// A builder for a loader using the selected model.
Expand Down Expand Up @@ -222,7 +225,12 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
topology: Topology::from_option_path(topology)?,
organization: organization.unwrap_or_default(),
write_uqff,
from_uqff,
from_uqff: from_uqff.map(|x| {
x.split(UQFF_MULTI_FILE_DELIMITER)
.map(PathBuf::from_str)
.map(|x| x.unwrap())
.collect::<Vec<_>>()
}),
imatrix,
calibration_file,
hf_cache_path,
Expand Down Expand Up @@ -255,7 +263,12 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
topology: Topology::from_option_path(topology)?,
organization: Default::default(),
write_uqff,
from_uqff,
from_uqff: from_uqff.map(|x| {
x.split(UQFF_MULTI_FILE_DELIMITER)
.map(PathBuf::from_str)
.map(|x| x.unwrap())
.collect::<Vec<_>>()
}),
imatrix: None,
calibration_file: None,
hf_cache_path,
Expand Down Expand Up @@ -295,7 +308,12 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
topology: Topology::from_option_path(topology)?,
organization: Default::default(),
write_uqff,
from_uqff,
from_uqff: from_uqff.map(|x| {
x.split(UQFF_MULTI_FILE_DELIMITER)
.map(PathBuf::from_str)
.map(|x| x.unwrap())
.collect::<Vec<_>>()
}),
imatrix: None,
calibration_file: None,
hf_cache_path,
Expand Down Expand Up @@ -512,7 +530,12 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
prompt_chunksize: args.prompt_chunksize,
topology: Topology::from_option_path(topology)?,
write_uqff,
from_uqff,
from_uqff: from_uqff.map(|x| {
x.split(UQFF_MULTI_FILE_DELIMITER)
.map(PathBuf::from_str)
.map(|x| x.unwrap())
.collect::<Vec<_>>()
}),
max_edge,
calibration_file,
imatrix,
Expand Down
16 changes: 8 additions & 8 deletions mistralrs-core/src/model_selected.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,9 @@ pub enum ModelSelected {
#[arg(short, long)]
write_uqff: Option<PathBuf>,

/// UQFF path to load from. If provided, this takes precedence over applying ISQ.
/// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;).
#[arg(short, long)]
from_uqff: Option<PathBuf>,
from_uqff: Option<String>,

/// .imatrix file to enhance GGUF quantizations with.
/// Incompatible with `--calibration-file/-c`
Expand Down Expand Up @@ -129,9 +129,9 @@ pub enum ModelSelected {
#[arg(short, long)]
write_uqff: Option<PathBuf>,

/// UQFF path to load from. If provided, this takes precedence over applying ISQ.
/// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;).
#[arg(short, long)]
from_uqff: Option<PathBuf>,
from_uqff: Option<String>,

/// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
#[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
Expand Down Expand Up @@ -176,9 +176,9 @@ pub enum ModelSelected {
#[arg(short, long)]
write_uqff: Option<PathBuf>,

/// UQFF path to load from. If provided, this takes precedence over applying ISQ.
/// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;).
#[arg(short, long)]
from_uqff: Option<PathBuf>,
from_uqff: Option<String>,

/// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
#[arg(long, default_value_t = AutoDeviceMapParams::DEFAULT_MAX_SEQ_LEN)]
Expand Down Expand Up @@ -486,9 +486,9 @@ pub enum ModelSelected {
#[arg(short, long)]
write_uqff: Option<PathBuf>,

/// UQFF path to load from. If provided, this takes precedence over applying ISQ.
/// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;).
#[arg(short, long)]
from_uqff: Option<PathBuf>,
from_uqff: Option<String>,

/// Automatically resize and pad images to this maximum edge length. Aspect ratio is preserved.
/// This is only supported on the Qwen2-VL and Idefics models. Others handle this internally.
Expand Down
34 changes: 31 additions & 3 deletions mistralrs-core/src/pipeline/isq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ use tracing::{info, warn};
use crate::{device_map::DeviceMapper, topology::LayerTopology, Topology};

pub(crate) const UQFF_RESIDUAL_SAFETENSORS: &str = "residual.safetensors";
/// Upper bound, in bytes, on the estimated size of a single UQFF file (10 GiB).
/// The serializer divides its total size estimate by this value (rounding up)
/// to decide how many shard files to write.
const MAX_UQFF_SIZE_BYTES: usize = 10 * 1024 * 1024 * 1024;
/// Delimiter separating multiple UQFF file paths inside a single `from_uqff` string.
pub const UQFF_MULTI_FILE_DELIMITER: &str = ";";

/// Parse ISQ value.
///
Expand Down Expand Up @@ -596,14 +599,39 @@ pub trait IsqModel {
.collect::<candle_core::Result<Vec<_>>>()
}
});
let quantized_values = quantized_values?;

let parent = serialized
.parent()
.context("Target UQFF path must have a filename!")?;

std::fs::create_dir_all(parent)?;

safetensors::serialize_to_file(quantized_values?, &None, serialized)?;
let file_stem = serialized
.file_stem()
.context("Target UQFF path must have a file stem!")?
.to_string_lossy()
.to_string();

let size_estimate_bytes = quantized_values
.iter()
.map(|(_, x)| x.elem_count() * x.dtype().size_in_bytes())
.sum::<usize>();
let n_files = size_estimate_bytes.div_ceil(MAX_UQFF_SIZE_BYTES);

if n_files == 1 {
info!("Writing to `{}`", serialized.display());
safetensors::serialize_to_file(quantized_values, &None, serialized)?;
} else {
let chunksize = quantized_values.len() / n_files;
let quantized_values_chunks = quantized_values.into_iter().chunks(chunksize);
for (i, chunk) in quantized_values_chunks.into_iter().enumerate() {
let mut name = parent.to_path_buf();
name.push(format!("{file_stem}-{i}.uqff"));
info!("Writing shard {i} to `{}`", name.display());
safetensors::serialize_to_file(chunk, &None, &name)?;
}
}

let residual = match organization {
IsqOrganization::Default => self.residual_tensors(),
Expand Down Expand Up @@ -700,7 +728,7 @@ pub trait IsqModel {
device: Device,
topology: Option<&Topology>,
silent: bool,
artifacts: &PathBuf,
artifacts: &[PathBuf],
) -> candle_core::Result<()> {
let (tensors, mapper) = self.get_layers();
let total_tensors = tensors.len();
Expand Down Expand Up @@ -737,7 +765,7 @@ pub trait IsqModel {
comms.push(mapper.get_comm_for(layer_num.unwrap_or(0))?)
}

let artifacts = unsafe { candle_core::safetensors::MmapedSafetensors::new(artifacts)? };
let artifacts = unsafe { candle_core::safetensors::MmapedSafetensors::multi(artifacts)? };

let artifact_isqs = artifacts
.tensors()
Expand Down
8 changes: 6 additions & 2 deletions mistralrs-core/src/pipeline/macros.rs
Original file line number Diff line number Diff line change
Expand Up @@ -217,9 +217,13 @@ macro_rules! get_uqff_paths {
revision.clone(),
));

let file = $from_uqff.display().to_string();
let mut files = Vec::new();
for file in $from_uqff {
let file = file.display().to_string();

api_get_file!(api, &file, Path::new(&$this.model_id))
files.push(api_get_file!(api, &file, Path::new(&$this.model_id)));
}
files
}};
}

Expand Down
2 changes: 1 addition & 1 deletion mistralrs-core/src/pipeline/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ pub use gguf::{GGUFLoader, GGUFLoaderBuilder, GGUFSpecificConfig};
use image::DynamicImage;
pub use inputs_processor::InputProcessorOutput;
pub(crate) use isq::IsqModelLoader;
pub use isq::{parse_isq_value, IsqModel, IsqOrganization};
pub use isq::{parse_isq_value, IsqModel, IsqOrganization, UQFF_MULTI_FILE_DELIMITER};
pub use loaders::{
AdapterKind, AutoDeviceMapParams, AutoLoader, DeepSeekV2Loader, DeepSeekV3Loader,
DeviceMappedModelLoader, DiffusionLoaderType, DiffusionModel, DiffusionModelLoader, FluxLoader,
Expand Down
6 changes: 3 additions & 3 deletions mistralrs-core/src/pipeline/normal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ pub struct NormalLoader {
tgt_non_granular_index: Option<usize>,
token_source: RwLock<Option<TokenSource>>,
revision: RwLock<Option<String>>,
from_uqff: RwLock<Option<PathBuf>>,
from_uqff: RwLock<Option<Vec<PathBuf>>>,
jinja_explicit: Option<String>,
hf_cache_path: Option<PathBuf>,
}
Expand Down Expand Up @@ -122,7 +122,7 @@ pub struct NormalSpecificConfig {
pub topology: Option<Topology>,
pub organization: IsqOrganization,
pub write_uqff: Option<PathBuf>,
pub from_uqff: Option<PathBuf>,
pub from_uqff: Option<Vec<PathBuf>>,
pub imatrix: Option<PathBuf>,
pub calibration_file: Option<PathBuf>,
pub hf_cache_path: Option<PathBuf>,
Expand Down Expand Up @@ -349,7 +349,7 @@ impl Loader for NormalLoader {
if let Some(serialized) = &*self.from_uqff.read().unwrap() {
let weight_pack_factor = {
let ser_artifacts = unsafe {
candle_core::safetensors::MmapedSafetensors::new(serialized)?
candle_core::safetensors::MmapedSafetensors::multi(serialized)?
};
let mut total_pack_factors = 0;
let total_tensors = ser_artifacts.tensors().len();
Expand Down
6 changes: 3 additions & 3 deletions mistralrs-core/src/pipeline/vision.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ pub struct VisionLoader {
xlora_order: Option<Ordering>,
token_source: RwLock<Option<TokenSource>>,
revision: RwLock<Option<String>>,
from_uqff: RwLock<Option<PathBuf>>,
from_uqff: RwLock<Option<Vec<PathBuf>>>,
jinja_explicit: Option<String>,
hf_cache_path: Option<PathBuf>,
lora_adapter_ids: Option<Vec<String>>,
Expand All @@ -113,7 +113,7 @@ pub struct VisionSpecificConfig {
pub prompt_chunksize: Option<NonZeroUsize>,
pub topology: Option<Topology>,
pub write_uqff: Option<PathBuf>,
pub from_uqff: Option<PathBuf>,
pub from_uqff: Option<Vec<PathBuf>>,
pub max_edge: Option<u32>,
pub imatrix: Option<PathBuf>,
pub calibration_file: Option<PathBuf>,
Expand Down Expand Up @@ -286,7 +286,7 @@ impl Loader for VisionLoader {
if let Some(serialized) = &*self.from_uqff.read().unwrap() {
let weight_pack_factor = {
let ser_artifacts = unsafe {
candle_core::safetensors::MmapedSafetensors::new(serialized)?
candle_core::safetensors::MmapedSafetensors::multi(serialized)?
};
let mut total_pack_factors = 0;
let total_tensors = ser_artifacts.tensors().len();
Expand Down
40 changes: 30 additions & 10 deletions mistralrs-core/src/toml_selector.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::{fs::File, num::NonZeroUsize, path::PathBuf};
use std::{fs::File, num::NonZeroUsize, path::PathBuf, str::FromStr};

use mistralrs_quant::MULTI_LORA_DELIMITER;
use serde::Deserialize;
Expand All @@ -8,7 +8,7 @@ use crate::{
GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoaderBuilder, GGUFSpecificConfig, Loader,
ModelDType, NormalLoaderBuilder, NormalLoaderType, NormalSpecificConfig, SpeculativeConfig,
SpeculativeLoader, Topology, VisionLoaderBuilder, VisionLoaderType, VisionSpecificConfig,
GGUF_MULTI_FILE_DELIMITER,
GGUF_MULTI_FILE_DELIMITER, UQFF_MULTI_FILE_DELIMITER,
};

fn default_one() -> usize {
Expand Down Expand Up @@ -64,7 +64,7 @@ pub enum TomlModelSelected {
write_uqff: Option<PathBuf>,

/// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;).
from_uqff: Option<PathBuf>,
from_uqff: Option<String>,

/// .imatrix file to enhance GGUF quantizations with.
/// Incompatible with `--calibration-file/-c`
Expand Down Expand Up @@ -115,7 +115,7 @@ pub enum TomlModelSelected {
write_uqff: Option<PathBuf>,

/// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;).
from_uqff: Option<PathBuf>,
from_uqff: Option<String>,

/// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
#[serde(default = "default_max_seq_len")]
Expand Down Expand Up @@ -151,7 +151,7 @@ pub enum TomlModelSelected {
write_uqff: Option<PathBuf>,

/// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;).
from_uqff: Option<PathBuf>,
from_uqff: Option<String>,

/// Maximum prompt sequence length to expect for this model. This affects automatic device mapping but is not a hard limit.
#[serde(default = "default_max_seq_len")]
Expand Down Expand Up @@ -407,7 +407,7 @@ pub enum TomlModelSelected {
write_uqff: Option<PathBuf>,

/// UQFF path to load from. If provided, this takes precedence over applying ISQ. Specify multiple files using a semicolon delimiter (;).
from_uqff: Option<PathBuf>,
from_uqff: Option<String>,

/// Automatically resize and pad images to this maximum edge length. Aspect ratio is preserved.
/// This is only supported on the Qwen2-VL and Idefics 2 models. Others handle this internally.
Expand Down Expand Up @@ -613,7 +613,12 @@ fn loader_from_selected(
topology: Topology::from_option_path(topology)?,
organization: organization.unwrap_or_default(),
write_uqff,
from_uqff,
from_uqff: from_uqff.map(|x| {
x.split(UQFF_MULTI_FILE_DELIMITER)
.map(PathBuf::from_str)
.map(|x| x.unwrap())
.collect::<Vec<_>>()
}),
imatrix,
calibration_file,
hf_cache_path,
Expand Down Expand Up @@ -645,7 +650,12 @@ fn loader_from_selected(
topology: Topology::from_option_path(topology)?,
organization: Default::default(),
write_uqff,
from_uqff,
from_uqff: from_uqff.map(|x| {
x.split(UQFF_MULTI_FILE_DELIMITER)
.map(PathBuf::from_str)
.map(|x| x.unwrap())
.collect::<Vec<_>>()
}),
imatrix: None,
calibration_file: None,
hf_cache_path,
Expand Down Expand Up @@ -684,7 +694,12 @@ fn loader_from_selected(
topology: Topology::from_option_path(topology)?,
organization: Default::default(),
write_uqff,
from_uqff,
from_uqff: from_uqff.map(|x| {
x.split(UQFF_MULTI_FILE_DELIMITER)
.map(PathBuf::from_str)
.map(|x| x.unwrap())
.collect::<Vec<_>>()
}),
imatrix: None,
calibration_file: None,
hf_cache_path,
Expand Down Expand Up @@ -907,7 +922,12 @@ fn loader_from_selected(
prompt_chunksize: args.prompt_chunksize,
topology: Topology::from_option_path(topology)?,
write_uqff,
from_uqff,
from_uqff: from_uqff.map(|x| {
x.split(UQFF_MULTI_FILE_DELIMITER)
.map(PathBuf::from_str)
.map(|x| x.unwrap())
.collect::<Vec<_>>()
}),
max_edge,
calibration_file,
imatrix,
Expand Down
Loading
Loading