Skip to content

Commit 57bf0e9

Browse files
authored
Merge pull request #306 from rdentato/patch-utf8-no-validation
minimal protection against invalid UTF8 encoding.
2 parents df6557a + 55e6074 commit 57bf0e9

1 file changed

Lines changed: 4 additions & 2 deletions

File tree

run.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -362,7 +362,7 @@ void bpe_encode(char *text, char **vocab, float *vocab_scores, int vocab_size, u
362362
qsort(sorted_vocab, vocab_size, sizeof(TokenIndex), compare_tokens);
363363

364364
// create a temporary buffer that will store merge candidates of always two consecutive tokens
365-
char* str_buffer = malloc((max_token_length*2+1) * sizeof(char)); // *2 for concat, +1 for null terminator
365+
char* str_buffer = malloc((max_token_length*2 +1 +2) * sizeof(char)); // *2 for concat, +1 for null terminator, +2 for UTF8 (in case max_token_length is 1)
366366
size_t str_len = 0;
367367

368368
// add_dummy_prefix is true by default
@@ -396,7 +396,8 @@ void bpe_encode(char *text, char **vocab, float *vocab_scores, int vocab_size, u
396396
str_buffer[str_len] = '\0';
397397

398398
// while the next character is a continuation byte, continue appending
399-
if ((*(c+1) & 0xC0) == 0x80) {
399+
// but if there are too many of them, just stop to avoid overrunning str_buffer.
400+
if ((*(c+1) & 0xC0) == 0x80 && str_len < 4) {
400401
continue;
401402
}
402403

@@ -414,6 +415,7 @@ void bpe_encode(char *text, char **vocab, float *vocab_scores, int vocab_size, u
414415
tokens[(*n_tokens)++] = (unsigned char)str_buffer[i] + 3;
415416
}
416417
}
418+
str_len = 0; // protect against a sequence of stray UTF8 continuation bytes
417419
}
418420

419421
// merge the best consecutive pair each iteration, according to the scores in vocab_scores

0 commit comments

Comments
 (0)