From 452978c126433dfe21f42a3ed2b30bf4c3403424 Mon Sep 17 00:00:00 2001
From: JqzChandler
Date: Thu, 30 Oct 2025 19:16:25 +0800
Subject: [PATCH 1/2] enable_getting_encoding_offsets_at_diff_lvl

---
 .../implementations/base_tokenizer.py | 18 +++++-
 bindings/python/src/tokenizer.rs      | 62 ++++++++++++++-----
 2 files changed, 62 insertions(+), 18 deletions(-)

diff --git a/bindings/python/py_src/tokenizers/implementations/base_tokenizer.py b/bindings/python/py_src/tokenizers/implementations/base_tokenizer.py
index 432a4754c..45cf7ec56 100644
--- a/bindings/python/py_src/tokenizers/implementations/base_tokenizer.py
+++ b/bindings/python/py_src/tokenizers/implementations/base_tokenizer.py
@@ -195,6 +195,7 @@ def encode(
         pair: Optional[InputSequence] = None,
         is_pretokenized: bool = False,
         add_special_tokens: bool = True,
+        offset_type: str = "char",
     ) -> Encoding:
         """Encode the given sequence and pair. This method can process raw text sequences
         as well as already pre-tokenized sequences.
@@ -214,19 +215,26 @@ def encode(
             add_special_tokens: bool:
                 Whether to add the special tokens while encoding.

+            offset_type: str:
+                The type of offsets to return. Can be one of:
+                - "char": Character-based offsets (default, for backward compatibility)
+                - "byte": Byte-based offsets
+                - "none": No offsets (all zeros, faster)
+
         Returns:
             An Encoding
         """
         if sequence is None:
             raise ValueError("encode: `sequence` can't be `None`")

-        return self._tokenizer.encode(sequence, pair, is_pretokenized, add_special_tokens)
+        return self._tokenizer.encode(sequence, pair, is_pretokenized, add_special_tokens, offset_type)

     def encode_batch(
         self,
         inputs: List[EncodeInput],
         is_pretokenized: bool = False,
         add_special_tokens: bool = True,
+        offset_type: str = "char",
     ) -> List[Encoding]:
         """Encode the given inputs. This method accept both raw text sequences as well
         as already pre-tokenized sequences.
@@ -250,6 +258,12 @@ def encode_batch(
             add_special_tokens: bool:
                 Whether to add the special tokens while encoding.

+            offset_type: str:
+                The type of offsets to return. Can be one of:
+                - "char": Character-based offsets (default, for backward compatibility)
+                - "byte": Byte-based offsets
+                - "none": No offsets (all zeros, faster)
+
         Returns:
             A list of Encoding
         """
@@ -257,7 +271,7 @@ def encode_batch(
         if inputs is None:
             raise ValueError("encode_batch: `inputs` can't be `None`")

-        return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens)
+        return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens, offset_type)

     async def async_encode_batch(
         self,
diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs
index 0e27f594f..bcac23115 100644
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -1060,12 +1060,18 @@ impl PyTokenizer {
     ///     add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
     ///         Whether to add the special tokens
     ///
+    ///     offset_type (:obj:`str`, `optional`, defaults to :obj:`"char"`):
+    ///         The type of offsets to return. Can be one of:
+    ///         - ``"char"``: Character-based offsets (default)
+    ///         - ``"byte"``: Byte-based offsets
+    ///         - ``"none"``: No offsets (all zeros, faster)
+    ///
     /// Returns:
     ///     :class:`~tokenizers.Encoding`: The encoded result
     ///
-    #[pyo3(signature = (sequence, pair = None, is_pretokenized = false, add_special_tokens = true))]
+    #[pyo3(signature = (sequence, pair = None, is_pretokenized = false, add_special_tokens = true, offset_type = "char"))]
     #[pyo3(
-        text_signature = "(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)"
+        text_signature = "(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True, offset_type='char')"
     )]
     fn encode(
         &self,
@@ -1073,6 +1079,7 @@ impl PyTokenizer {
         pair: Option<&Bound<'_, PyAny>>,
         is_pretokenized: bool,
         add_special_tokens: bool,
+        offset_type: &str,
     ) -> PyResult<PyEncoding> {
         let sequence: tk::InputSequence = if is_pretokenized {
             sequence.extract::<PreTokenizedInputSequence>()?.into()
@@ -1091,12 +1098,20 @@ impl PyTokenizer {
             None => tk::EncodeInput::Single(sequence),
         };

-        ToPyResult(
-            self.tokenizer
-                .encode_char_offsets(input, add_special_tokens)
-                .map(|e| e.into()),
-        )
-        .into()
+        let result = match offset_type {
+            "char" => self.tokenizer.encode_char_offsets(input, add_special_tokens),
+            "byte" => self.tokenizer.encode(input, add_special_tokens),
+            "none" => self.tokenizer.encode_fast(input, add_special_tokens),
+            _ => {
+                return Err(PyError(format!(
+                    "Invalid offset_type: '{}'. Must be one of 'char', 'byte', or 'none'",
+                    offset_type
+                ))
+                .into_pyerr::<exceptions::PyValueError>())
+            }
+        };
+
+        ToPyResult(result.map(|e| e.into())).into()
     }

     /// Asynchronously encode the given input with character offsets.
@@ -1199,17 +1214,24 @@ impl PyTokenizer {
     ///     add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
     ///         Whether to add the special tokens
     ///
+    ///     offset_type (:obj:`str`, `optional`, defaults to :obj:`"char"`):
+    ///         The type of offsets to return. Can be one of:
+    ///         - ``"char"``: Character-based offsets (default)
+    ///         - ``"byte"``: Byte-based offsets
+    ///         - ``"none"``: No offsets (all zeros, faster)
+    ///
     /// Returns:
     ///     A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
     ///
-    #[pyo3(signature = (input, is_pretokenized = false, add_special_tokens = true))]
-    #[pyo3(text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True)")]
+    #[pyo3(signature = (input, is_pretokenized = false, add_special_tokens = true, offset_type = "char"))]
+    #[pyo3(text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True, offset_type='char')")]
     fn encode_batch(
         &self,
         py: Python<'_>,
         input: Vec<Bound<'_, PyAny>>,
         is_pretokenized: bool,
         add_special_tokens: bool,
+        offset_type: &str,
     ) -> PyResult<Vec<PyEncoding>> {
         let mut items = Vec::<tk::EncodeInput>::with_capacity(input.len());
         for item in &input {
@@ -1221,12 +1243,20 @@ impl PyTokenizer {
             items.push(item);
         }
         py.allow_threads(|| {
-            ToPyResult(
-                self.tokenizer
-                    .encode_batch_char_offsets(items, add_special_tokens)
-                    .map(|encodings| encodings.into_iter().map(|e| e.into()).collect()),
-            )
-            .into()
+            let result = match offset_type {
+                "char" => self.tokenizer.encode_batch_char_offsets(items, add_special_tokens),
+                "byte" => self.tokenizer.encode_batch(items, add_special_tokens),
+                "none" => self.tokenizer.encode_batch_fast(items, add_special_tokens),
+                _ => {
+                    return Err(PyError(format!(
+                        "Invalid offset_type: '{}'. Must be one of 'char', 'byte', or 'none'",
+                        offset_type
+                    ))
+                    .into_pyerr::<exceptions::PyValueError>())
+                }
+            };
+
+            ToPyResult(result.map(|encodings| encodings.into_iter().map(|e| e.into()).collect())).into()
         })
     }

     /// Asynchronously encode the given batch of inputs with character offsets.

From fd89fafabbea47222facac9c642e4423a2d7a5b3 Mon Sep 17 00:00:00 2001
From: Kashif Rasul
Date: Wed, 12 Nov 2025 12:52:53 +0100
Subject: [PATCH 2/2] Update bindings/python/src/tokenizer.rs

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 bindings/python/src/tokenizer.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs
index bcac23115..d5c950e72 100644
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -1255,7 +1255,6 @@ impl PyTokenizer {
                     .into_pyerr::<exceptions::PyValueError>())
                 }
             };
-
            ToPyResult(result.map(|encodings| encodings.into_iter().map(|e| e.into()).collect())).into()
         })
     }
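
Reviewer note, not part of the patch: a minimal Python sketch of what the new
parameter changes at the API surface, assuming a trained tokenizer file on
disk. The file path is a placeholder and the offset values in the comments are
illustrative (they assume the whole input maps to a single token).

    from tokenizers import Tokenizer

    # Placeholder: any trained tokenizer file works here.
    tokenizer = Tokenizer.from_file("tokenizer.json")

    text = "café"  # 4 characters, but 5 UTF-8 bytes ('é' encodes as 2 bytes)

    # Default, backward-compatible behaviour: offsets index characters.
    enc_char = tokenizer.encode(text, offset_type="char")
    print(enc_char.offsets)  # e.g. [(0, 4)]

    # Byte offsets: spans index into text.encode("utf-8").
    enc_byte = tokenizer.encode(text, offset_type="byte")
    print(enc_byte.offsets)  # e.g. [(0, 5)]

    # No offset tracking: every span is (0, 0), in exchange for speed.
    enc_none = tokenizer.encode(text, offset_type="none")
    print(enc_none.offsets)  # e.g. [(0, 0)]

    # encode_batch takes the same keyword.
    batch = tokenizer.encode_batch(["café", "naïve"], offset_type="byte")

Dispatching on a plain string keeps the Python signature backward compatible:
callers that pass only the existing arguments still get "char" offsets via
encode_char_offsets, while "byte" and "none" route to the pre-existing
encode/encode_fast paths in the Rust core.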