 
 use std::{collections::HashMap, sync::atomic::Ordering};
 
+use crate::utils::gguf_metadata::ContentMetadata;
+use crate::DEBUG;
 use anyhow::Result;
+use candle_core::quantized::gguf_file::Value;
 use itertools::Itertools;
+use tokenizers::pre_tokenizers::{
+    sequence::Sequence,
+    split::{Split, SplitPattern},
+    PreTokenizerWrapper,
+};
+use tokenizers::tokenizer::normalizer::SplitDelimiterBehavior;
 use tokenizers::{
     decoders::{
         self, byte_fallback::ByteFallback, byte_level::ByteLevel, fuse::Fuse, strip::Strip,
     },
     models::{bpe::BpeBuilder, unigram::Unigram},
     normalizers::{self, Prepend, Replace},
-    pre_tokenizers,
-    processors::{
-        self,
-        template::{self, TemplateProcessing},
-    },
-    AddedToken, DecoderWrapper, ModelWrapper, NormalizerWrapper, Tokenizer,
+    processors, AddedToken, DecoderWrapper, ModelWrapper, NormalizerWrapper, Tokenizer,
 };
 use tracing::info;
 
-use crate::utils::gguf_metadata::ContentMetadata;
-use crate::DEBUG;
-
 use super::Content;
 
 pub(crate) struct GgufTokenizerConversion {
@@ -40,7 +41,6 @@ struct PropsGGUF {
     unk: Option<u32>,
     eos: u32,
     bos: u32,
-    add_bos_token: Option<bool>,
 }
 
 impl TryFrom<ContentMetadata<'_>> for PropsGGUF {
@@ -59,39 +59,61 @@ impl TryFrom<ContentMetadata<'_>> for PropsGGUF {
             unk: c.get_value("unknown_token_id").ok(),
             eos: c.get_value("eos_token_id")?,
             bos: c.get_value("bos_token_id")?,
-            add_bos_token: c.get_value("add_bos_token").ok(),
         };
 
         Ok(props)
     }
 }
 
-struct AddedTokensCollection {
-    bos: String,
-    eos: String,
-    unk: Option<String>,
-}
-
 pub fn convert_gguf_to_hf_tokenizer<R: std::io::Seek + std::io::Read>(
     content: &Content<'_, R>,
 ) -> Result<GgufTokenizerConversion> {
     let metadata = ContentMetadata {
         path_prefix: "tokenizer.ggml",
         metadata: content.get_metadata(),
     };
+
+    let md_get = |s: &str| match metadata.metadata.get(s) {
+        None => candle_core::bail!("cannot find {s} in metadata"),
+        Some(v) => Ok(v),
+    };
+
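+    // `tokenizer.ggml.token_type` follows llama.cpp's token-type enum:
+    // 0 = UNDEFINED, 1 = NORMAL, 2 = UNKNOWN, 3 = CONTROL, 4 = USER_DEFINED, 5 = UNUSED, 6 = BYTE.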
+    let mut token_types = Vec::<i32>::new();
+    if metadata.metadata.contains_key("tokenizer.ggml.token_type") {
+        let vtypes: &Vec<Value> = md_get("tokenizer.ggml.token_type")
+            .unwrap()
+            .to_vec()
+            .unwrap();
+        let v: Vec<i32> = vtypes.iter().map(|v| v.to_i32().unwrap()).collect();
+        token_types.extend(v);
+    }
+
     let props = PropsGGUF::try_from(metadata)?;
 
-    let (tokenizer, kind, special_tokens) = match props.model.as_str() {
+    let (mut tokenizer, kind) = match props.model.as_str() {
         "llama" | "replit" => unigram_tokenizer(&props)?,
         "gpt2" => bpe_tokenizer(&props)?,
         other => {
             anyhow::bail!("Tokenizer model `{other}` not supported.");
         }
     };
 
+    // Any token whose type is not 1 (NORMAL) is treated as a special token.
+    let mut num_special_tokens = 0;
+    if token_types.len() == props.tokens.len() {
+        for i in 0..props.tokens.len() {
+            if token_types[i] != 1i32 {
+                let tk = props.tokens[i].clone();
+                tokenizer.add_special_tokens(&[AddedToken::from(tk.to_string(), true)]);
+                num_special_tokens += 1;
+            }
+        }
+    }
+
     info!(
-        "GGUF tokenizer model is `{model}`, kind: `{kind:?}`, num tokens: {}, num added tokens: {}, num merges: {}, num scores: {}",
+        "GGUF tokenizer model is `{model}`, kind: `{kind:?}`, num tokens: {}, num special tokens: {}, num added tokens: {}, num merges: {}, num scores: {}",
         tokenizer.get_vocab_size(true),
+        num_special_tokens,
         props.added_tokens.as_ref().map(|x| x.len()).unwrap_or(0),
         props.merges.as_ref().map(|x| x.len()).unwrap_or(0),
         props.scores.as_ref().map(|x| x.len()).unwrap_or(0),
@@ -101,12 +123,15 @@ pub fn convert_gguf_to_hf_tokenizer<R: std::io::Seek + std::io::Read>(
         info!("Tokenizer: {tokenizer:?}");
     }
 
-    let AddedTokensCollection { bos, eos, unk } = special_tokens;
+    let unk = props.unk.map(|u| props.tokens[u as usize].clone());
 
     Ok(GgufTokenizerConversion {
         tokenizer,
-        bos: Some(bos),
-        eos: Some(eos),
+        bos: Some(props.tokens[props.bos as usize].clone()),
+        eos: Some(props.tokens[props.eos as usize].clone()),
         unk,
     })
 }
@@ -119,37 +144,7 @@ enum TokenizerKind {
     Bpe,
 }
 
-/// Add the special tokens and return their string representations
-fn add_special_tokens(
-    p: &PropsGGUF,
-    tokenizer: &mut Tokenizer,
-    bos: u32,
-    eos: u32,
-    unk: Option<u32>,
-) -> AddedTokensCollection {
-    // Add special tokens (bos, eos, unk):
-    let mut special_tokens: [Option<String>; 3] = Default::default();
-
-    // A little bit awkward here since eos/bos are assumed not options so we need to handle an Option
-    for (i, token_id) in [Some(bos), Some(eos), unk].into_iter().enumerate() {
-        if let Some(token_id) = token_id {
-            let token = p.tokens[token_id as usize].as_str();
-            special_tokens[i] = Some(token.to_string());
-            tokenizer.add_special_tokens(&[AddedToken::from(token.to_string(), true)]);
-        }
-    }
-
-    // Destructure array of options:
-    let [bos_str, eos_str, unk_str] = special_tokens;
-    // Would need to unwrap bos/eos here, or change the struct types
-    AddedTokensCollection {
-        bos: bos_str.unwrap(),
-        eos: eos_str.unwrap(),
-        unk: unk_str,
-    }
-}
-
-fn unigram_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, AddedTokensCollection)> {
+fn unigram_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind)> {
     let PropsGGUF { unk, eos, bos, .. } = *p;
     // Unigram (SentencePiece) default UNK is 0
     let unk = unk.unwrap_or(0);
@@ -191,12 +186,14 @@ fn unigram_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, AddedTo
     )?;
 
     // Add special tokens (bos, eos, unk):
-    let special_tokens = add_special_tokens(p, &mut tokenizer, bos, eos, Some(unk));
-
-    Ok((tokenizer, TokenizerKind::Unigram, special_tokens))
+    for i in [bos, eos, unk] {
+        let tk = p.tokens[i as usize].clone();
+        tokenizer.add_special_tokens(&[AddedToken::from(tk.to_string(), true)]);
+    }
+    Ok((tokenizer, TokenizerKind::Unigram))
 }
 
-fn bpe_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, AddedTokensCollection)> {
+fn bpe_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind)> {
     // BPE merges have each string item as a space-delimited pair:
     // https://github.com/EricLBuehler/mistral.rs/pull/397#discussion_r1631988370
     let merges = p
@@ -219,13 +216,7 @@ fn bpe_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, AddedTokens
         vocab.insert(token.clone(), i as u32);
     }
 
-    let PropsGGUF {
-        eos,
-        bos,
-        unk,
-        add_bos_token,
-        ..
-    } = *p;
+    let PropsGGUF { eos, bos, unk, .. } = *p;
 
     let mut bpe = BpeBuilder::new().vocab_and_merges(vocab, merges);
     if let Some(unk) = unk {
@@ -239,39 +230,42 @@ fn bpe_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, AddedTokens
         Some(Decoder::ByteLevel(true, true, true)),
         None,
     )?;
-    tokenizer.with_pre_tokenizer(Some(pre_tokenizers::byte_level::ByteLevel::new(
-        false, true, true,
+
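+    // Regex split that isolates contractions, letter runs, single digits, punctuation,
+    // and whitespace before the byte-level mapping is applied.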
+    let split = Split::new(
+        SplitPattern::Regex("(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+".to_string()),
+        SplitDelimiterBehavior::Isolated,
+        false,
+    ).unwrap();
+
+    // Mirrors the byte-level pre-tokenizer settings of an HF tokenizer.json:
+    //   "type": "ByteLevel",
+    //   "add_prefix_space": false,
+    //   "trim_offsets": false,
+    //   "use_regex": false
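+    // `use_regex` is disabled because the Split pre-tokenizer above already applies
+    // the regex; ByteLevel then only performs the byte-to-unicode mapping.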
+    let pre_tokenizer = Sequence::new(vec![
+        PreTokenizerWrapper::Split(split),
+        PreTokenizerWrapper::ByteLevel(ByteLevel::new(false, false, false)),
+    ]);
+
+    tokenizer.with_pre_tokenizer(Some(pre_tokenizer));
+
+    tokenizer.with_decoder(Some(decoders::byte_level::ByteLevel::new(
+        false, false, false,
+    )));
+    tokenizer.with_post_processor(Some(processors::byte_level::ByteLevel::new(
+        false, false, false,
     )));
-    if add_bos_token.is_some_and(|x| x) {
-        let mut special_toks = HashMap::new();
-        special_toks.insert(
-            p.tokens[bos as usize].clone(),
-            template::SpecialToken::new(
-                p.tokens[bos as usize].clone(),
-                vec![bos],
-                vec![p.tokens[bos as usize].clone()],
-            )
-            .unwrap(),
-        );
-        tokenizer.with_post_processor(Some(
-            TemplateProcessing::builder()
-                .try_single(format!("{}:0 $A:0", p.tokens[bos as usize]))
-                .unwrap()
-                .try_pair(format!("{}:0 $A:0 $B:1", p.tokens[bos as usize]))
-                .unwrap()
-                .special_tokens(special_toks)
-                .build()
-                .unwrap(),
-        ));
-    } else {
-        tokenizer.with_post_processor(Some(processors::byte_level::ByteLevel::new(
-            true, false, true,
-        )));
-    }
 
-    let special_tokens = add_special_tokens(p, &mut tokenizer, bos, eos, unk);
+    for i in [bos, eos] {
+        let tk = p.tokens[i as usize].clone();
+        tokenizer.add_special_tokens(&[AddedToken::from(tk.to_string(), true)]);
+    }
+    if let Some(unk) = unk {
+        let tk = p.tokens[unk as usize].clone();
+        tokenizer.add_special_tokens(&[AddedToken::from(tk.to_string(), true)]);
+    }
 
-    Ok((tokenizer, TokenizerKind::Bpe, special_tokens))
+    Ok((tokenizer, TokenizerKind::Bpe))
 }
 
 // This is a workaround to have a better builder API.