@@ -115,6 +115,45 @@ func TestTokenize(t *testing.T) {
 	}
 }
 
+func TestTokenizerLatin1AsUtf8(t *testing.T) {
+	content := []byte("th\xe5 filling") // `th� filling`
+	t.Logf("%v - %q", content, string(content))
+	tokens := Tokenize(content)
+	for i, token := range tokens {
+		t.Logf("token %d, %s", i+1, token)
+	}
+	require.Equal(t, 3, len(tokens))
+}
+
+func TestRegexpOnInvalidUtf8(t *testing.T) {
+	origContent := []struct {
+		text   string
+		tokens []string
+	}{
+		{"th\xe0 filling", []string{"th", "filling"}},   // `th� filling`
+		{"th\u0100 filling", []string{"th", "filling"}}, // `thĀ filling`
+		{"привет, как дела?", []string{}},               // empty, no ASCII tokens
+	}
+	re := reRegularToken
+
+	for _, content := range origContent {
+		t.Run("", func(t *testing.T) {
+			t.Logf("%v - %q", content, content.text)
+			input := []byte(content.text)
+			tokens := re.FindAll(input, -1)
+			require.Equal(t, len(content.tokens), len(tokens))
+
+			newContent := re.ReplaceAll(input, []byte(` `))
+			t.Logf("content:%q, tokens:[", newContent)
+			for i, token := range tokens {
+				t.Logf("\t %q,", string(token))
+				require.Equal(t, content.tokens[i], string(token))
+			}
+			t.Logf(" ]\n ")
+		})
+	}
+}
+
 func BenchmarkTokenizer_BaselineCopy(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
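
Why the expected splits hold: Go's regexp package decodes its input as UTF-8 and treats each invalid byte, such as the \xe0 above, as the replacement rune U+FFFD, which no ASCII character class matches. The sketch below reproduces that behavior with a stand-in pattern; the real reRegularToken is defined elsewhere in this package, so the [0-9A-Za-z_]+ pattern here is only an assumption for illustration.

package main

import (
	"fmt"
	"regexp"
	"unicode/utf8"
)

func main() {
	// \xe0 is not valid UTF-8: DecodeRune yields U+FFFD with size 1,
	// which is why %q renders the byte as `�` in the test comments.
	r, size := utf8.DecodeRune([]byte("\xe0"))
	fmt.Printf("%q %d\n", r, size) // '\ufffd' 1

	// Stand-in for reRegularToken (assumed ASCII-word pattern).
	re := regexp.MustCompile(`[0-9A-Za-z_]+`)

	// U+FFFD does not match the ASCII class, so matching splits
	// around the bad byte, yielding "th" and "filling" — the two
	// tokens the test expects. The all-Cyrillic input yields none.
	for _, tok := range re.FindAll([]byte("th\xe0 filling"), -1) {
		fmt.Printf("%q\n", tok)
	}
}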