
Commit cc8bf62

* Fix Issue #360: Tokenizer failed when the infix regex matched the start of the string while trying to tokenize multi-infix tokens.
1 parent eab2376 commit cc8bf62
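
For context, the failure mode is easy to reproduce with plain `re`. The three-dot pattern below is only a stand-in for spaCy's real infix rules: on a long run of periods, `finditer` returns back-to-back matches, so from the second match onward `match.start()` equals the offset where the previous match ended, and the slice of text before the infix is empty.

    import re

    # Back-to-back matches on the regression string from Issue #360:
    # each match starts exactly where the previous one ended.
    for m in re.finditer(r'\.\.\.', u'$45...............Asking'):
        print(m.start(), m.end(), m.group())
    # 3 6 ...
    # 6 9 ...
    # 9 12 ...
    # 12 15 ...
    # 15 18 ...

When the tokenizer's infix loop slices the text between matches, that empty slice is what used to be pushed as a token; the two-line guard added below skips it.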


2 files changed: +8 -0 lines changed


spacy/tests/tokenizer/test_infix.py

Lines changed: 6 additions & 0 deletions
@@ -24,6 +24,12 @@ def test_ellipsis(en_tokenizer):
     tokens = en_tokenizer('best...known')
     assert len(tokens) == 3
 
+def test_big_ellipsis(en_tokenizer):
+    '''Test regression identified in Issue #360'''
+    tokens = en_tokenizer(u'$45...............Asking')
+    assert len(tokens) > 2
+
+
 
 def test_email(en_tokenizer):
     tokens = en_tokenizer('[email protected]')
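
As a rough usage sketch, the new test corresponds to the following stand-alone check. The `spacy.en.English` entry point reflects the library's loading API around the time of this commit and is an assumption about the surrounding version:

    from spacy.en import English  # 2015-era loading API; assumed here

    nlp = English()
    tokens = nlp.tokenizer(u'$45...............Asking')
    # Before this commit the tokenizer failed on this input; with the fix
    # it returns several tokens, so the weak `> 2` bound is enough to
    # catch a regression.
    assert len(tokens) > 2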

spacy/tokenizer.pyx

Lines changed: 2 additions & 0 deletions
@@ -227,6 +227,8 @@ cdef class Tokenizer:
             for match in matches:
                 infix_start = match.start()
                 infix_end = match.end()
+                if infix_start == start:
+                    continue
                 span = string[start:infix_start]
                 tokens.push_back(self.vocab.get(tokens.mem, span), False)
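
The guard is easiest to see in a pure-Python mimic of the infix loop. This sketch covers only the infix stage (no prefix or suffix handling, so '$45' stays one token here) and uses an illustrative three-dot regex rather than spaCy's real rules:

    import re

    INFIX_RE = re.compile(r'\.\.\.')  # stand-in for the real infix pattern

    def split_on_infixes(string):
        '''Mimic the patched loop from tokenizer.pyx.'''
        tokens = []
        start = 0
        for match in INFIX_RE.finditer(string):
            infix_start, infix_end = match.start(), match.end()
            if infix_start == start:
                # An infix match flush against the current offset would
                # make string[start:infix_start] empty; skipping it keeps
                # empty tokens out of the stream.
                continue
            tokens.append(string[start:infix_start])      # text before the infix
            tokens.append(string[infix_start:infix_end])  # the infix itself
            start = infix_end
        if start < len(string):
            tokens.append(string[start:])                 # trailing text
        return tokens

    print(split_on_infixes(u'$45...............Asking'))
    # ['$45', '...', '...', '...', '...', '...', 'Asking']

Without the two added lines, the second of the adjacent '...' matches produced an empty span, which is what made the tokenizer fail on multi-infix tokens.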
232234
