182 changes: 88 additions & 94 deletions mistralrs-core/src/gguf/gguf_tokenizer.rs
@@ -2,26 +2,27 @@

use std::{collections::HashMap, sync::atomic::Ordering};

use crate::utils::gguf_metadata::ContentMetadata;
use crate::DEBUG;
use anyhow::Result;
use candle_core::quantized::gguf_file::Value;
use itertools::Itertools;
use tokenizers::pre_tokenizers::{
sequence::Sequence,
split::{Split, SplitPattern},
PreTokenizerWrapper,
};
use tokenizers::tokenizer::normalizer::SplitDelimiterBehavior;
use tokenizers::{
decoders::{
self, byte_fallback::ByteFallback, byte_level::ByteLevel, fuse::Fuse, strip::Strip,
},
models::{bpe::BpeBuilder, unigram::Unigram},
normalizers::{self, Prepend, Replace},
pre_tokenizers,
processors::{
self,
template::{self, TemplateProcessing},
},
AddedToken, DecoderWrapper, ModelWrapper, NormalizerWrapper, Tokenizer,
processors, AddedToken, DecoderWrapper, ModelWrapper, NormalizerWrapper, Tokenizer,
};
use tracing::info;

use crate::utils::gguf_metadata::ContentMetadata;
use crate::DEBUG;

use super::Content;

pub(crate) struct GgufTokenizerConversion {
@@ -40,7 +41,6 @@
unk: Option<u32>,
eos: u32,
bos: u32,
add_bos_token: Option<bool>,
}

impl TryFrom<ContentMetadata<'_>> for PropsGGUF {
@@ -59,39 +59,61 @@
unk: c.get_value("unknown_token_id").ok(),
eos: c.get_value("eos_token_id")?,
bos: c.get_value("bos_token_id")?,
add_bos_token: c.get_value("add_bos_token").ok(),
};

Ok(props)
}
}

struct AddedTokensCollection {
bos: String,
eos: String,
unk: Option<String>,
}

pub fn convert_gguf_to_hf_tokenizer<R: std::io::Seek + std::io::Read>(
content: &Content<'_, R>,
) -> Result<GgufTokenizerConversion> {
let metadata = ContentMetadata {
path_prefix: "tokenizer.ggml",
metadata: content.get_metadata(),
};

let md_get = |s: &str| match metadata.metadata.get(s) {
None => candle_core::bail!("cannot find {s} in metadata"),
Some(v) => Ok(v),
};

let mut token_types = Vec::<i32>::new();
if metadata.metadata.contains_key("tokenizer.ggml.token_type") {
let vtypes: &Vec<Value> = md_get("tokenizer.ggml.token_type")
.unwrap()
.to_vec()
.unwrap();
let v: Vec<i32> = vtypes.iter().map(|v| v.to_i32().unwrap()).collect();
token_types.extend(v);
Comment on lines +83 to +88
⚠️ Potential issue

Potential panic from chained unwraps in token type extraction.

The code uses multiple unwrap() calls that could panic if the metadata format is unexpected. Consider using proper error handling:

-        let vtypes: &Vec<Value> = md_get("tokenizer.ggml.token_type")
-            .unwrap()
-            .to_vec()
-            .unwrap();
-        let v: Vec<i32> = vtypes.iter().map(|v| v.to_i32().unwrap()).collect();
+        let vtypes = md_get("tokenizer.ggml.token_type")?
+            .to_vec()?;
+        let v: Vec<i32> = vtypes
+            .iter()
+            .map(|v| v.to_i32())
+            .collect::<Result<Vec<_>, _>>()?;
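A rough, self-contained sketch of the `?`-based extraction this suggestion describes, relying on the same fallible `Value::to_vec`/`Value::to_i32` accessors already used in the diff; the `read_token_types` helper and its `md_get` closure parameter are illustrative only, not code from this PR.

use anyhow::Result;
use candle_core::quantized::gguf_file::Value;

// Sketch: propagate metadata errors instead of panicking on malformed entries.
fn read_token_types<'a>(
    md_get: impl Fn(&str) -> candle_core::Result<&'a Value>,
) -> Result<Vec<i32>> {
    let vtypes = md_get("tokenizer.ggml.token_type")?.to_vec()?;
    let token_types = vtypes
        .iter()
        .map(|v| v.to_i32())
        .collect::<candle_core::Result<Vec<i32>>>()?;
    Ok(token_types)
}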

}

let props = PropsGGUF::try_from(metadata)?;

let (tokenizer, kind, special_tokens) = match props.model.as_str() {
let (mut tokenizer, kind) = match props.model.as_str() {
"llama" | "replit" => unigram_tokenizer(&props)?,
"gpt2" => bpe_tokenizer(&props)?,
other => {
anyhow::bail!("Tokenizer model `{other}` not supported.");
}
};

// Token types other than 1 are treated as special tokens
let mut num_special_tokens = 0;
if token_types.len() == props.tokens.len() {
for i in 0..props.tokens.len() {

Check failure on line 104 in mistralrs-core/src/gguf/gguf_tokenizer.rs (GitHub Actions / Clippy): the loop variable `i` is used to index `token_types`
if token_types[i] != 1i32 {
let tk = props.tokens[i].clone();
tokenizer.add_special_tokens(&[AddedToken::from(tk.to_string(), true)]);
num_special_tokens += 1;
}
}
}
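As a sketch of one way to satisfy that Clippy lint, the counter-indexed loop can iterate over `zip` instead; this assumes `token_types: Vec<i32>` and `props.tokens: Vec<String>` as in the diff above and is illustrative, not the committed fix.

// Sketch: zip token types with tokens to avoid indexing with a loop counter.
let mut num_special_tokens = 0;
if token_types.len() == props.tokens.len() {
    for (ty, tk) in token_types.iter().zip(props.tokens.iter()) {
        // Token types other than 1 are treated as special tokens.
        if *ty != 1i32 {
            tokenizer.add_special_tokens(&[AddedToken::from(tk.clone(), true)]);
            num_special_tokens += 1;
        }
    }
}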

info!(
"GGUF tokenizer model is `{model}`, kind: `{kind:?}`, num tokens: {}, num added tokens: {}, num merges: {}, num scores: {}",
"GGUF tokenizer model is `{model}`, kind: `{kind:?}`, num tokens: {}, num special tokens {}, num added tokens: {}, num merges: {}, num scores: {}",
tokenizer.get_vocab_size(true),
num_special_tokens,
props.added_tokens.as_ref().map(|x| x.len()).unwrap_or(0),
props.merges.as_ref().map(|x| x.len()).unwrap_or(0),
props.scores.as_ref().map(|x| x.len()).unwrap_or(0),
@@ -101,12 +123,15 @@
info!("Tokenizer: {tokenizer:?}");
}

let AddedTokensCollection { bos, eos, unk } = special_tokens;
let unk = match props.unk {
Some(u) => Some(props.tokens[u as usize].clone()),
_ => None,
};

Ok(GgufTokenizerConversion {
tokenizer,
bos: Some(bos),
eos: Some(eos),
bos: Some(props.tokens[props.bos as usize].clone()),
eos: Some(props.tokens[props.eos as usize].clone()),
unk,
})
}
@@ -119,37 +144,7 @@
Bpe,
}

/// Add the special tokens and return their string representations
fn add_special_tokens(
p: &PropsGGUF,
tokenizer: &mut Tokenizer,
bos: u32,
eos: u32,
unk: Option<u32>,
) -> AddedTokensCollection {
// Add special tokens (bos, eos, unk):
let mut special_tokens: [Option<String>; 3] = Default::default();

// A little bit awkward here since eos/bos are assumed not options so we need to handle an Option
for (i, token_id) in [Some(bos), Some(eos), unk].into_iter().enumerate() {
if let Some(token_id) = token_id {
let token = p.tokens[token_id as usize].as_str();
special_tokens[i] = Some(token.to_string());
tokenizer.add_special_tokens(&[AddedToken::from(token.to_string(), true)]);
}
}

// Destructure array of options:
let [bos_str, eos_str, unk_str] = special_tokens;
// Would need to unwrap bos/eos here, or change the struct types
AddedTokensCollection {
bos: bos_str.unwrap(),
eos: eos_str.unwrap(),
unk: unk_str,
}
}

fn unigram_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, AddedTokensCollection)> {
fn unigram_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind)> {
let PropsGGUF { unk, eos, bos, .. } = *p;
// Unigram (SentencePiece) default UNK is 0
let unk = unk.unwrap_or(0);
@@ -191,12 +186,14 @@
)?;

// Add special tokens (bos, eos, unk):
let special_tokens = add_special_tokens(p, &mut tokenizer, bos, eos, Some(unk));

Ok((tokenizer, TokenizerKind::Unigram, special_tokens))
for i in [bos, eos, unk] {
let tk = p.tokens[i as usize].clone();
tokenizer.add_special_tokens(&[AddedToken::from(tk.to_string(), true)]);
}
Ok((tokenizer, TokenizerKind::Unigram))
}

fn bpe_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, AddedTokensCollection)> {
fn bpe_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind)> {
// BPE merges have each string item as a space-delimited pair:
// https://github.com/EricLBuehler/mistral.rs/pull/397#discussion_r1631988370
let merges = p
@@ -219,13 +216,7 @@
vocab.insert(token.clone(), i as u32);
}

let PropsGGUF {
eos,
bos,
unk,
add_bos_token,
..
} = *p;
let PropsGGUF { eos, bos, unk, .. } = *p;

let mut bpe = BpeBuilder::new().vocab_and_merges(vocab, merges);
if let Some(unk) = unk {
@@ -239,39 +230,42 @@
Some(Decoder::ByteLevel(true, true, true)),
None,
)?;
tokenizer.with_pre_tokenizer(Some(pre_tokenizers::byte_level::ByteLevel::new(
false, true, true,

let split = Split::new(
SplitPattern::Regex("(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+".to_string()),
SplitDelimiterBehavior::Isolated,
false,
).unwrap();

// example:
// "type": "ByteLevel",
// "add_prefix_space": false,
// "trim_offsets": false,
// "use_regex": false
let pre_tokenizer = Sequence::new(vec![
PreTokenizerWrapper::Split(split),
PreTokenizerWrapper::ByteLevel(ByteLevel::new(false, false, false)),
]);

tokenizer.with_pre_tokenizer(Some(pre_tokenizer));

tokenizer.with_decoder(Some(decoders::byte_level::ByteLevel::new(
false, false, false,
)));
tokenizer.with_post_processor(Some(processors::byte_level::ByteLevel::new(
false, false, false,
)));
Comment on lines +241 to 258
💡 Verification agent

🧩 Analysis chain

Verify ByteLevel configuration parameters.

All ByteLevel components (pre-tokenizer, decoder, post-processor) are configured with (false, false, false) parameters. Please verify these settings match the expected behavior for GGUF BPE tokenizers:

  • add_prefix_space: false
  • trim_offsets: false
  • use_regex: false

🌐 Web query:

What are the correct ByteLevel parameters for GPT-2 style BPE tokenizers in the tokenizers library?

💡 Result:

For GPT-2 style BPE tokenizers in the tokenizers library (from Hugging Face), the typical ByteLevel parameters used are as follows:

  • add_prefix_space=True
  • use_regex=True
  • trim_offsets=True

When you instantiate the ByteLevel pre-tokenizer, it looks like this:

from tokenizers.pre_tokenizers import ByteLevel

pre_tokenizer = ByteLevel(add_prefix_space=True, use_regex=True, trim_offsets=True)

Explanation:

  • add_prefix_space=True adds a leading space before tokenizing, which helps with consistent tokenization especially for the first word.
  • use_regex=True enables using regex-based splitting for tokens.
  • trim_offsets=True trims offsets for cleaner span alignment, which is useful when mapping tokens back to original text.

This setup mirrors the original GPT-2 tokenizer's behavior using ByteLevel BPE.



Update ByteLevel parameters to GPT-2 defaults

The ByteLevel pre-tokenizer, decoder, and post-processor are currently all configured with (false, false, false), but GPT-2–style BPE in Hugging Face’s tokenizers library uses:

  • add_prefix_space = true
  • use_regex = true
  • trim_offsets = true

Please update in mistralrs-core/src/gguf/gguf_tokenizer.rs (around lines 240–257):

- PreTokenizerWrapper::ByteLevel(ByteLevel::new(false, false, false)),
+ PreTokenizerWrapper::ByteLevel(ByteLevel::new(true, true, true)),
- decoder: ByteLevel::new(false, false, false),
+ decoder: ByteLevel::new(true, true, true),
- post_processor: ByteLevel::new(false, false, false),
+ post_processor: ByteLevel::new(true, true, true),

This ensures consistent GPT-2–style tokenization, decoding, and post-processing.

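For reference, a minimal sketch of the wiring this suggestion describes, using the same `with_*` setters already called in the diff. The argument order (add_prefix_space, trim_offsets, use_regex) follows the config comment in the diff above, and whether these GPT-2 defaults are appropriate for GGUF-exported BPE vocabularies is exactly the question this thread asks to verify.

use tokenizers::{decoders, pre_tokenizers, processors, Tokenizer};

// Sketch: apply GPT-2-style ByteLevel settings to the pre-tokenizer, decoder,
// and post-processor. Each ByteLevel::new call takes
// (add_prefix_space, trim_offsets, use_regex).
fn apply_gpt2_byte_level(tokenizer: &mut Tokenizer) {
    tokenizer.with_pre_tokenizer(Some(pre_tokenizers::byte_level::ByteLevel::new(
        true, true, true,
    )));
    tokenizer.with_decoder(Some(decoders::byte_level::ByteLevel::new(true, true, true)));
    tokenizer.with_post_processor(Some(processors::byte_level::ByteLevel::new(
        true, true, true,
    )));
}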

if add_bos_token.is_some_and(|x| x) {
let mut special_toks = HashMap::new();
special_toks.insert(
p.tokens[bos as usize].clone(),
template::SpecialToken::new(
p.tokens[bos as usize].clone(),
vec![bos],
vec![p.tokens[bos as usize].clone()],
)
.unwrap(),
);
tokenizer.with_post_processor(Some(
TemplateProcessing::builder()
.try_single(format!("{}:0 $A:0", p.tokens[bos as usize]))
.unwrap()
.try_pair(format!("{}:0 $A:0 $B:1", p.tokens[bos as usize]))
.unwrap()
.special_tokens(special_toks)
.build()
.unwrap(),
));
} else {
tokenizer.with_post_processor(Some(processors::byte_level::ByteLevel::new(
true, false, true,
)));
}

let special_tokens = add_special_tokens(p, &mut tokenizer, bos, eos, unk);
for i in [bos, eos] {
let tk = p.tokens[i as usize].clone();
tokenizer.add_special_tokens(&[AddedToken::from(tk.to_string(), true)]);
}
if unk.is_some() {
let tk = p.tokens[unk.unwrap() as usize].clone();
tokenizer.add_special_tokens(&[AddedToken::from(tk.to_string(), true)]);
}

Ok((tokenizer, TokenizerKind::Bpe, special_tokens))
Ok((tokenizer, TokenizerKind::Bpe))
}

// This is a workaround to have a better builder API.
30 changes: 1 addition & 29 deletions mistralrs-core/src/pipeline/gguf.rs
@@ -379,7 +379,7 @@ impl Loader for GGUFLoader
}

let GgufTokenizerConversion {
mut tokenizer,
tokenizer,
bos,
eos,
unk,
@@ -394,34 +394,6 @@
}
};

let archs = model.get_metadata()[&"general.architecture".to_string()].to_string()?;
let base_url = "general.base_model.0.repo_url".to_string();
//temp fix, tokenizer built from gguf file may cause problems in qwen3
if archs == "qwen3" && model.get_metadata().contains_key(&base_url) {
let base_repo = model.get_metadata()[&base_url].to_string()?;
let base_repo = base_repo.replace("https://huggingface.co/", "");
warn!("Loading `tokenizer.json` at `{}` because built-in tokenizer metadata in gguf file may not be usable.", base_repo);
let api = {
use crate::GLOBAL_HF_CACHE;
let cache = GLOBAL_HF_CACHE.get().cloned().unwrap_or_default();
let mut api = ApiBuilder::from_cache(cache)
.with_progress(true)
.with_token(get_token(&TokenSource::CacheToken)?);
if let Ok(x) = std::env::var("HF_HUB_CACHE") {
api = api.with_cache_dir(x.into());
}
api.build()?
};
let api = api.repo(Repo::with_revision(
base_repo.clone(),
RepoType::Model,
"main".to_string(),
));
let tokenizer_file =
crate::api_get_file!(api, "tokenizer.json", std::path::Path::new(&base_repo));
tokenizer = get_tokenizer(tokenizer_file, None)?;
}

// Only load gguf chat template if there is nothing else
let gguf_chat_template =
if paths.get_template_filename().is_none() && self.chat_template.is_none() {