Merge pull request #306 from rdentato/patch-utf8-no-validation

karpathy · web-flow · commit 57bf0e9ee4bb · 2023-08-16T09:51:11.000-07:00
minimal protection against invalid UTF8 encoding.
diff --git a/run.c b/run.c
@@ -362,7 +362,7 @@ void bpe_encode(char *text, char **vocab, float *vocab_scores, int vocab_size, u
     qsort(sorted_vocab, vocab_size, sizeof(TokenIndex), compare_tokens);
 
     // create a temporary buffer that will store merge candidates of always two consecutive tokens
-    char* str_buffer = malloc((max_token_length*2+1) * sizeof(char)); // *2 for concat, +1 for null terminator
+    char* str_buffer = malloc((max_token_length*2 +1 +2) * sizeof(char)); // *2 for concat, +1 for null terminator +2 for UTF8 (in case max_token_lenght is 1)
     size_t str_len = 0;
 
     // add_dummy_prefix is true by default
@@ -396,7 +396,8 @@ void bpe_encode(char *text, char **vocab, float *vocab_scores, int vocab_size, u
         str_buffer[str_len] = '\0';
 
         // while the next character is a continuation byte, continue appending
-        if ((*(c+1) & 0xC0) == 0x80) {
+        // but if there are too many of them, just stop to avoid overruning str_buffer size.
+        if ((*(c+1) & 0xC0) == 0x80 && str_len < 4) {
             continue;
         }
 
@@ -414,6 +415,7 @@ void bpe_encode(char *text, char **vocab, float *vocab_scores, int vocab_size, u
                 tokens[(*n_tokens)++] = (unsigned char)str_buffer[i] + 3;
             }
         }
+        str_len = 0; // protect against a sequence of stray UTF8 continuation bytes
     }
 
     // merge the best consecutive pair each iteration, according the scores in vocab_scores

Original file line number	Diff line number	Diff line change
`@@ -362,7 +362,7 @@ void bpe_encode(char *text, char **vocab, float *vocab_scores, int vocab_size, u`
`362`	`362`	`qsort(sorted_vocab, vocab_size, sizeof(TokenIndex), compare_tokens);`
`363`	`363`
`364`	`364`	`// create a temporary buffer that will store merge candidates of always two consecutive tokens`
`365`		`- char* str_buffer = malloc((max_token_length*2+1) * sizeof(char)); // *2 for concat, +1 for null terminator`
	`365`	`+ char* str_buffer = malloc((max_token_length*2 +1 +2) * sizeof(char)); // *2 for concat, +1 for null terminator +2 for UTF8 (in case max_token_lenght is 1)`
`366`	`366`	`size_t str_len = 0;`
`367`	`367`
`368`	`368`	`// add_dummy_prefix is true by default`
`@@ -396,7 +396,8 @@ void bpe_encode(char *text, char **vocab, float *vocab_scores, int vocab_size, u`
`396`	`396`	`str_buffer[str_len] = '\0';`
`397`	`397`
`398`	`398`	`// while the next character is a continuation byte, continue appending`
`399`		`- if ((*(c+1) & 0xC0) == 0x80) {`
	`399`	`+ // but if there are too many of them, just stop to avoid overruning str_buffer size.`
	`400`	`+ if ((*(c+1) & 0xC0) == 0x80 && str_len < 4) {`
`400`	`401`	`continue;`
`401`	`402`	`}`
`402`	`403`
`@@ -414,6 +415,7 @@ void bpe_encode(char *text, char **vocab, float *vocab_scores, int vocab_size, u`
`414`	`415`	`tokens[(*n_tokens)++] = (unsigned char)str_buffer[i] + 3;`
`415`	`416`	`}`
`416`	`417`	`}`
	`418`	`+ str_len = 0; // protect against a sequence of stray UTF8 continuation bytes`
`417`	`419`	`}`
`418`	`420`
`419`	`421`	`// merge the best consecutive pair each iteration, according the scores in vocab_scores`