From 7637cb980dbfa49f1892a1472e150c981a6a52b7 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Tue, 21 Oct 2025 19:28:46 +0200 Subject: [PATCH 1/3] docs: add prefix space to pyi and rust --- bindings/python/tests/bindings/test_processors.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bindings/python/tests/bindings/test_processors.py b/bindings/python/tests/bindings/test_processors.py index 3038d8694..c72eb3f6e 100644 --- a/bindings/python/tests/bindings/test_processors.py +++ b/bindings/python/tests/bindings/test_processors.py @@ -66,6 +66,7 @@ class TestByteLevelProcessing: def test_instantiate(self): assert ByteLevel() is not None assert ByteLevel(trim_offsets=True) is not None + assert ByteLevel(add_prefix_space=True) is not None assert isinstance(ByteLevel(), PostProcessor) assert isinstance(ByteLevel(), ByteLevel) assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel) From 9ec05c94c27b18603e65d13670326570488c4ecb Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Tue, 21 Oct 2025 19:31:12 +0200 Subject: [PATCH 2/3] add prefix space to pyi --- bindings/python/py_src/tokenizers/processors/__init__.pyi | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bindings/python/py_src/tokenizers/processors/__init__.pyi b/bindings/python/py_src/tokenizers/processors/__init__.pyi index 5136d02bb..f04806cd6 100644 --- a/bindings/python/py_src/tokenizers/processors/__init__.pyi +++ b/bindings/python/py_src/tokenizers/processors/__init__.pyi @@ -98,8 +98,11 @@ class ByteLevel(PostProcessor): Args: trim_offsets (:obj:`bool`): Whether to trim the whitespaces from the produced offsets. + add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether the add_prefix_space option was enabled during pre-tokenization. This + is relevant because it defines the way the offsets are trimmed out. 
""" - def __init__(self, trim_offsets=True): + def __init__(self, trim_offsets=True, add_prefix_space=True): pass def num_special_tokens_to_add(self, is_pair): From c9296a331f6b07de9c01e8a1021da6891096977b Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Wed, 22 Oct 2025 10:05:09 +0200 Subject: [PATCH 3/3] update docs for processors.rs --- bindings/python/src/processors.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs index 03fa6bdf7..60d4c8ece 100644 --- a/bindings/python/src/processors.rs +++ b/bindings/python/src/processors.rs @@ -484,12 +484,16 @@ impl PyRobertaProcessing { /// Args: /// trim_offsets (:obj:`bool`): /// Whether to trim the whitespaces from the produced offsets. +/// +/// add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`): +/// Whether the add_prefix_space option was enabled during pre-tokenization. This +/// is relevant because it defines the way the offsets are trimmed out. #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "ByteLevel")] pub struct PyByteLevel {} #[pymethods] impl PyByteLevel { #[new] - #[pyo3(signature = (add_prefix_space = None, trim_offsets = None, use_regex = None, **_kwargs), text_signature = "(self, trim_offsets=True)")] + #[pyo3(signature = (add_prefix_space = None, trim_offsets = None, use_regex = None, **_kwargs), text_signature = "(self, trim_offsets=True, add_prefix_space=True)")] fn new( add_prefix_space: Option<bool>, trim_offsets: Option<bool>,