
Commit 9108af2

Fix bug in tokenizer created with gguf metadata
1 parent: cfd1e89

2 files changed: +89 -123 lines

mistralrs-core/src/gguf/gguf_tokenizer.rs

Lines changed: 88 additions & 94 deletions

@@ -2,26 +2,27 @@
 
 use std::{collections::HashMap, sync::atomic::Ordering};
 
+use crate::utils::gguf_metadata::ContentMetadata;
+use crate::DEBUG;
 use anyhow::Result;
+use candle_core::quantized::gguf_file::Value;
 use itertools::Itertools;
+use tokenizers::pre_tokenizers::{
+    sequence::Sequence,
+    split::{Split, SplitPattern},
+    PreTokenizerWrapper,
+};
+use tokenizers::tokenizer::normalizer::SplitDelimiterBehavior;
 use tokenizers::{
     decoders::{
         self, byte_fallback::ByteFallback, byte_level::ByteLevel, fuse::Fuse, strip::Strip,
     },
     models::{bpe::BpeBuilder, unigram::Unigram},
     normalizers::{self, Prepend, Replace},
-    pre_tokenizers,
-    processors::{
-        self,
-        template::{self, TemplateProcessing},
-    },
-    AddedToken, DecoderWrapper, ModelWrapper, NormalizerWrapper, Tokenizer,
+    processors, AddedToken, DecoderWrapper, ModelWrapper, NormalizerWrapper, Tokenizer,
 };
 use tracing::info;
 
-use crate::utils::gguf_metadata::ContentMetadata;
-use crate::DEBUG;
-
 use super::Content;
 
 pub(crate) struct GgufTokenizerConversion {
@@ -40,7 +41,6 @@ struct PropsGGUF {
     unk: Option<u32>,
     eos: u32,
     bos: u32,
-    add_bos_token: Option<bool>,
 }
 
 impl TryFrom<ContentMetadata<'_>> for PropsGGUF {
@@ -59,39 +59,61 @@ impl TryFrom<ContentMetadata<'_>> for PropsGGUF {
             unk: c.get_value("unknown_token_id").ok(),
             eos: c.get_value("eos_token_id")?,
             bos: c.get_value("bos_token_id")?,
-            add_bos_token: c.get_value("add_bos_token").ok(),
         };
 
         Ok(props)
     }
 }
 
-struct AddedTokensCollection {
-    bos: String,
-    eos: String,
-    unk: Option<String>,
-}
-
 pub fn convert_gguf_to_hf_tokenizer<R: std::io::Seek + std::io::Read>(
     content: &Content<'_, R>,
 ) -> Result<GgufTokenizerConversion> {
     let metadata = ContentMetadata {
         path_prefix: "tokenizer.ggml",
         metadata: content.get_metadata(),
     };
+
+    let md_get = |s: &str| match metadata.metadata.get(s) {
+        None => candle_core::bail!("cannot find {s} in metadata"),
+        Some(v) => Ok(v),
+    };
+
+    let mut token_types = Vec::<i32>::new();
+    if metadata.metadata.contains_key("tokenizer.ggml.token_type") {
+        let vtypes: &Vec<Value> = md_get("tokenizer.ggml.token_type")
+            .unwrap()
+            .to_vec()
+            .unwrap();
+        let v: Vec<i32> = vtypes.iter().map(|v| v.to_i32().unwrap()).collect();
+        token_types.extend(v);
+    }
+
     let props = PropsGGUF::try_from(metadata)?;
 
-    let (tokenizer, kind, special_tokens) = match props.model.as_str() {
+    let (mut tokenizer, kind) = match props.model.as_str() {
         "llama" | "replit" => unigram_tokenizer(&props)?,
         "gpt2" => bpe_tokenizer(&props)?,
         other => {
            anyhow::bail!("Tokenizer model `{other}` not supported.");
         }
     };
 
+    //token type other than 1 treated as special token
+    let mut num_special_tokens = 0;
+    if token_types.len() == props.tokens.len() {
+        for i in 0..props.tokens.len() {
+            if token_types[i] != 1i32 {
+                let tk = props.tokens[i].clone();
+                tokenizer.add_special_tokens(&[AddedToken::from(tk.to_string(), true)]);
+                num_special_tokens += 1;
+            }
+        }
+    }
+
     info!(
-        "GGUF tokenizer model is `{model}`, kind: `{kind:?}`, num tokens: {}, num added tokens: {}, num merges: {}, num scores: {}",
+        "GGUF tokenizer model is `{model}`, kind: `{kind:?}`, num tokens: {}, num special tokens {}, num added tokens: {}, num merges: {}, num scores: {}",
         tokenizer.get_vocab_size(true),
+        num_special_tokens,
         props.added_tokens.as_ref().map(|x| x.len()).unwrap_or(0),
         props.merges.as_ref().map(|x| x.len()).unwrap_or(0),
         props.scores.as_ref().map(|x| x.len()).unwrap_or(0),
@@ -101,12 +123,15 @@ pub fn convert_gguf_to_hf_tokenizer<R: std::io::Seek + std::io::Read>(
         info!("Tokenizer: {tokenizer:?}");
     }
 
-    let AddedTokensCollection { bos, eos, unk } = special_tokens;
+    let unk = match props.unk {
+        Some(u) => Some(props.tokens[u as usize].clone()),
+        _ => None,
+    };
 
     Ok(GgufTokenizerConversion {
         tokenizer,
-        bos: Some(bos),
-        eos: Some(eos),
+        bos: Some(props.tokens[props.bos as usize].clone()),
+        eos: Some(props.tokens[props.eos as usize].clone()),
         unk,
     })
 }
@@ -119,37 +144,7 @@ enum TokenizerKind {
     Bpe,
 }
 
-/// Add the special tokens and return their string representations
-fn add_special_tokens(
-    p: &PropsGGUF,
-    tokenizer: &mut Tokenizer,
-    bos: u32,
-    eos: u32,
-    unk: Option<u32>,
-) -> AddedTokensCollection {
-    // Add special tokens (bos, eos, unk):
-    let mut special_tokens: [Option<String>; 3] = Default::default();
-
-    // A little bit awkward here since eos/bos are assumed not options so we need to handle an Option
-    for (i, token_id) in [Some(bos), Some(eos), unk].into_iter().enumerate() {
-        if let Some(token_id) = token_id {
-            let token = p.tokens[token_id as usize].as_str();
-            special_tokens[i] = Some(token.to_string());
-            tokenizer.add_special_tokens(&[AddedToken::from(token.to_string(), true)]);
-        }
-    }
-
-    // Destructure array of options:
-    let [bos_str, eos_str, unk_str] = special_tokens;
-    // Would need to unwrap bos/eos here, or change the struct types
-    AddedTokensCollection {
-        bos: bos_str.unwrap(),
-        eos: eos_str.unwrap(),
-        unk: unk_str,
-    }
-}
-
-fn unigram_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, AddedTokensCollection)> {
+fn unigram_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind)> {
     let PropsGGUF { unk, eos, bos, .. } = *p;
     // Unigram (SentencePiece) default UNK is 0
     let unk = unk.unwrap_or(0);
@@ -191,12 +186,14 @@ fn unigram_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, AddedTo
     )?;
 
     // Add special tokens (bos, eos, unk):
-    let special_tokens = add_special_tokens(p, &mut tokenizer, bos, eos, Some(unk));
-
-    Ok((tokenizer, TokenizerKind::Unigram, special_tokens))
+    for i in [bos, eos, unk] {
+        let tk = p.tokens[i as usize].clone();
+        tokenizer.add_special_tokens(&[AddedToken::from(tk.to_string(), true)]);
+    }
+    Ok((tokenizer, TokenizerKind::Unigram))
 }
 
-fn bpe_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, AddedTokensCollection)> {
+fn bpe_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind)> {
     // BPE merges have each string item as a space-delimited pair:
     // https://github.com/EricLBuehler/mistral.rs/pull/397#discussion_r1631988370
     let merges = p
@@ -219,13 +216,7 @@ fn bpe_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, AddedTokens
         vocab.insert(token.clone(), i as u32);
     }
 
-    let PropsGGUF {
-        eos,
-        bos,
-        unk,
-        add_bos_token,
-        ..
-    } = *p;
+    let PropsGGUF { eos, bos, unk, .. } = *p;
 
     let mut bpe = BpeBuilder::new().vocab_and_merges(vocab, merges);
     if let Some(unk) = unk {
@@ -239,39 +230,42 @@ fn bpe_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, AddedTokens
         Some(Decoder::ByteLevel(true, true, true)),
         None,
     )?;
-    tokenizer.with_pre_tokenizer(Some(pre_tokenizers::byte_level::ByteLevel::new(
-        false, true, true,
+
+    let split = Split::new(
+        SplitPattern::Regex("(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+".to_string()),
+        SplitDelimiterBehavior::Isolated,
+        false,
+    ).unwrap();
+
+    // example:
+    // "type": "ByteLevel",
+    // "add_prefix_space": false,
+    // "trim_offsets": false,
+    // "use_regex": false
+    let pre_tokenizer = Sequence::new(vec![
+        PreTokenizerWrapper::Split(split),
+        PreTokenizerWrapper::ByteLevel(ByteLevel::new(false, false, false)),
+    ]);
+
+    tokenizer.with_pre_tokenizer(Some(pre_tokenizer));
+
+    tokenizer.with_decoder(Some(decoders::byte_level::ByteLevel::new(
+        false, false, false,
+    )));
+    tokenizer.with_post_processor(Some(processors::byte_level::ByteLevel::new(
+        false, false, false,
     )));
-    if add_bos_token.is_some_and(|x| x) {
-        let mut special_toks = HashMap::new();
-        special_toks.insert(
-            p.tokens[bos as usize].clone(),
-            template::SpecialToken::new(
-                p.tokens[bos as usize].clone(),
-                vec![bos],
-                vec![p.tokens[bos as usize].clone()],
-            )
-            .unwrap(),
-        );
-        tokenizer.with_post_processor(Some(
-            TemplateProcessing::builder()
-                .try_single(format!("{}:0 $A:0", p.tokens[bos as usize]))
-                .unwrap()
-                .try_pair(format!("{}:0 $A:0 $B:1", p.tokens[bos as usize]))
-                .unwrap()
-                .special_tokens(special_toks)
-                .build()
-                .unwrap(),
-        ));
-    } else {
-        tokenizer.with_post_processor(Some(processors::byte_level::ByteLevel::new(
-            true, false, true,
-        )));
-    }
 
-    let special_tokens = add_special_tokens(p, &mut tokenizer, bos, eos, unk);
+    for i in [bos, eos] {
+        let tk = p.tokens[i as usize].clone();
+        tokenizer.add_special_tokens(&[AddedToken::from(tk.to_string(), true)]);
+    }
+    if unk.is_some() {
+        let tk = p.tokens[unk.unwrap() as usize].clone();
+        tokenizer.add_special_tokens(&[AddedToken::from(tk.to_string(), true)]);
+    }
 
-    Ok((tokenizer, TokenizerKind::Bpe, special_tokens))
+    Ok((tokenizer, TokenizerKind::Bpe))
 }
 
 // This is a workaround to have a better builder API.
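
For context, the per-token rule the new code relies on can be sketched in isolation: GGUF metadata carries a `tokenizer.ggml.token_type` array parallel to `tokenizer.ggml.tokens`, and the commit marks every token whose type is not 1 ("normal") as a special token. Below is a minimal standalone sketch assuming the `tokenizers` crate; the helper name `mark_special_tokens` is illustrative only and not part of the commit.

use tokenizers::{AddedToken, Tokenizer};

/// Mark every token whose GGUF token_type is not 1 ("normal") as a special
/// token, mirroring the loop added in `convert_gguf_to_hf_tokenizer` above.
/// Returns how many tokens were marked.
fn mark_special_tokens(tokenizer: &mut Tokenizer, tokens: &[String], token_types: &[i32]) -> usize {
    // Only apply the rule when the type array lines up with the vocabulary.
    if token_types.len() != tokens.len() {
        return 0;
    }
    let mut num_special = 0;
    for (token, &ty) in tokens.iter().zip(token_types) {
        if ty != 1 {
            tokenizer.add_special_tokens(&[AddedToken::from(token.clone(), true)]);
            num_special += 1;
        }
    }
    num_special
}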

mistralrs-core/src/pipeline/gguf.rs

Lines changed: 1 addition & 29 deletions

@@ -379,7 +379,7 @@ impl Loader for GGUFLoader {
         }
 
         let GgufTokenizerConversion {
-            mut tokenizer,
+            tokenizer,
             bos,
             eos,
             unk,
@@ -394,34 +394,6 @@ impl Loader for GGUFLoader {
             }
         };
 
-        let archs = model.get_metadata()[&"general.architecture".to_string()].to_string()?;
-        let base_url = "general.base_model.0.repo_url".to_string();
-        //temp fix, tokenizer built from gguf file may cause problems in qwen3
-        if archs == "qwen3" && model.get_metadata().contains_key(&base_url) {
-            let base_repo = model.get_metadata()[&base_url].to_string()?;
-            let base_repo = base_repo.replace("https://huggingface.co/", "");
-            warn!("Loading `tokenizer.json` at `{}` because built-in tokenizer metadata in gguf file may not be usable.", base_repo);
-            let api = {
-                use crate::GLOBAL_HF_CACHE;
-                let cache = GLOBAL_HF_CACHE.get().cloned().unwrap_or_default();
-                let mut api = ApiBuilder::from_cache(cache)
-                    .with_progress(true)
-                    .with_token(get_token(&TokenSource::CacheToken)?);
-                if let Ok(x) = std::env::var("HF_HUB_CACHE") {
-                    api = api.with_cache_dir(x.into());
-                }
-                api.build()?
-            };
-            let api = api.repo(Repo::with_revision(
-                base_repo.clone(),
-                RepoType::Model,
-                "main".to_string(),
-            ));
-            let tokenizer_file =
-                crate::api_get_file!(api, "tokenizer.json", std::path::Path::new(&base_repo));
-            tokenizer = get_tokenizer(tokenizer_file, None)?;
-        }
-
         // Only load gguf chat template if there is nothing else
         let gguf_chat_template =
             if paths.get_template_filename().is_none() && self.chat_template.is_none() {
