diff --git a/bindings/python/py_src/tokenizers/processors/__init__.pyi b/bindings/python/py_src/tokenizers/processors/__init__.pyi index 5136d02bb..f04806cd6 100644 --- a/bindings/python/py_src/tokenizers/processors/__init__.pyi +++ b/bindings/python/py_src/tokenizers/processors/__init__.pyi @@ -98,8 +98,11 @@ class ByteLevel(PostProcessor): Args: trim_offsets (:obj:`bool`): Whether to trim the whitespaces from the produced offsets. + add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether the add_prefix_space option was enabled during pre-tokenization. This + is relevant because it defines the way the offsets are trimmed out. """ - def __init__(self, trim_offsets=True): + def __init__(self, trim_offsets=True, add_prefix_space=True): pass def num_special_tokens_to_add(self, is_pair): diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs index 03fa6bdf7..60d4c8ece 100644 --- a/bindings/python/src/processors.rs +++ b/bindings/python/src/processors.rs @@ -484,12 +484,16 @@ impl PyRobertaProcessing { /// Args: /// trim_offsets (:obj:`bool`): /// Whether to trim the whitespaces from the produced offsets. +/// +/// add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`): +/// Whether the add_prefix_space option was enabled during pre-tokenization. This +/// is relevant because it defines the way the offsets are trimmed out. 
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "ByteLevel")] pub struct PyByteLevel {} #[pymethods] impl PyByteLevel { #[new] - #[pyo3(signature = (add_prefix_space = None, trim_offsets = None, use_regex = None, **_kwargs), text_signature = "(self, trim_offsets=True)")] + #[pyo3(signature = (add_prefix_space = None, trim_offsets = None, use_regex = None, **_kwargs), text_signature = "(self, trim_offsets=True, add_prefix_space=True)")] fn new( add_prefix_space: Option<bool>, trim_offsets: Option<bool>, diff --git a/bindings/python/tests/bindings/test_processors.py b/bindings/python/tests/bindings/test_processors.py index 3038d8694..c72eb3f6e 100644 --- a/bindings/python/tests/bindings/test_processors.py +++ b/bindings/python/tests/bindings/test_processors.py @@ -66,6 +66,7 @@ class TestByteLevelProcessing: def test_instantiate(self): assert ByteLevel() is not None assert ByteLevel(trim_offsets=True) is not None + assert ByteLevel(add_prefix_space=True) is not None assert isinstance(ByteLevel(), PostProcessor) assert isinstance(ByteLevel(), ByteLevel) assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)