@@ -362,7 +362,7 @@ void bpe_encode(char *text, char **vocab, float *vocab_scores, int vocab_size, u
362362 qsort (sorted_vocab , vocab_size , sizeof (TokenIndex ), compare_tokens );
363363
364364 // create a temporary buffer that will store merge candidates of always two consecutive tokens
365- char * str_buffer = malloc ((max_token_length * 2 + 1 ) * sizeof (char )); // *2 for concat, +1 for null terminator
365+ char * str_buffer = malloc ((max_token_length * 2 + 1 + 2 ) * sizeof (char )); // *2 for concat, +1 for null terminator, +2 for UTF8 (in case max_token_length is 1)
366366 size_t str_len = 0 ;
367367
368368 // add_dummy_prefix is true by default
@@ -396,7 +396,8 @@ void bpe_encode(char *text, char **vocab, float *vocab_scores, int vocab_size, u
396396 str_buffer [str_len ] = '\0' ;
397397
398398 // while the next character is a continuation byte, continue appending
399- if ((* (c + 1 ) & 0xC0 ) == 0x80 ) {
399+ // but if there are too many of them, just stop to avoid overrunning the str_buffer.
400+ if ((* (c + 1 ) & 0xC0 ) == 0x80 && str_len < 4 ) {
400401 continue ;
401402 }
402403
@@ -414,6 +415,7 @@ void bpe_encode(char *text, char **vocab, float *vocab_scores, int vocab_size, u
414415 tokens [(* n_tokens )++ ] = (unsigned char )str_buffer [i ] + 3 ;
415416 }
416417 }
418+ str_len = 0 ; // protect against a sequence of stray UTF8 continuation bytes
417419 }
418420
419421 // merge the best consecutive pair each iteration, according the scores in vocab_scores
0 commit comments