@@ -195,6 +195,7 @@ def encode(
pair: Optional[InputSequence] = None,
is_pretokenized: bool = False,
add_special_tokens: bool = True,
offset_type: str = "char",
) -> Encoding:
"""Encode the given sequence and pair. This method can process raw text sequences as well
as already pre-tokenized sequences.
@@ -214,19 +215,26 @@ def encode(
add_special_tokens: bool:
Whether to add the special tokens while encoding.

offset_type: str:
The type of offsets to return. Can be one of:
- "char": Character-based offsets (default, for backward compatibility)
- "byte": Byte-based offsets
- "none": No offsets (all zeros, faster)

Returns:
An Encoding
"""
if sequence is None:
raise ValueError("encode: `sequence` can't be `None`")

return self._tokenizer.encode(sequence, pair, is_pretokenized, add_special_tokens)
return self._tokenizer.encode(sequence, pair, is_pretokenized, add_special_tokens, offset_type)
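
A minimal sketch of the new parameter in use (not part of the diff), assuming this branch is installed and a trained tokenizer file exists at the hypothetical path "tokenizer.json":

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path

text = "héllo world"
enc_char = tokenizer.encode(text)                      # default: char offsets
enc_byte = tokenizer.encode(text, offset_type="byte")  # byte offsets
enc_none = tokenizer.encode(text, offset_type="none")  # no offsets, faster

print(enc_char.offsets)  # character-based spans
print(enc_byte.offsets)  # byte-based spans; "é" counts as two units here
print(enc_none.offsets)  # all (0, 0), per the docstring above
```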

def encode_batch(
self,
inputs: List[EncodeInput],
is_pretokenized: bool = False,
add_special_tokens: bool = True,
offset_type: str = "char",
) -> List[Encoding]:
"""Encode the given inputs. This method accept both raw text sequences as well as already
pre-tokenized sequences.
@@ -250,14 +258,20 @@ def encode_batch
add_special_tokens: bool:
Whether to add the special tokens while encoding.

offset_type: str:
The type of offsets to return. Can be one of:
- "char": Character-based offsets (default, for backward compatibility)
- "byte": Byte-based offsets
- "none": No offsets (all zeros, faster)

Returns:
A list of Encoding
"""

if inputs is None:
raise ValueError("encode_batch: `inputs` can't be `None`")

return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens)
return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens, offset_type)
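
A batch sketch under the same assumptions; `offset_type="none"` is the fast path when only token ids are needed:

```python
texts = ["first sequence", "second sequence"]
encodings = tokenizer.encode_batch(texts, offset_type="none")
ids = [e.ids for e in encodings]  # offsets are all (0, 0) on this path
```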

async def async_encode_batch(
self,
61 changes: 45 additions & 16 deletions bindings/python/src/tokenizer.rs
@@ -1060,19 +1060,26 @@ impl PyTokenizer {
/// add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to add the special tokens
///
/// offset_type (:obj:`str`, `optional`, defaults to :obj:`"char"`):
/// The type of offsets to return. Can be one of:
/// - ``"char"``: Character-based offsets (default)
/// - ``"byte"``: Byte-based offsets
/// - ``"none"``: No offsets (all zeros, faster)
///
/// Returns:
/// :class:`~tokenizers.Encoding`: The encoded result
///
#[pyo3(signature = (sequence, pair = None, is_pretokenized = false, add_special_tokens = true))]
#[pyo3(signature = (sequence, pair = None, is_pretokenized = false, add_special_tokens = true, offset_type = "char"))]
#[pyo3(
text_signature = "(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)"
text_signature = "(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True, offset_type='char')"
)]
fn encode(
&self,
sequence: &Bound<'_, PyAny>,
pair: Option<&Bound<'_, PyAny>>,
is_pretokenized: bool,
add_special_tokens: bool,
offset_type: &str,
) -> PyResult<PyEncoding> {
let sequence: tk::InputSequence = if is_pretokenized {
sequence.extract::<PreTokenizedInputSequence>()?.into()
@@ -1091,12 +1098,20 @@ impl PyTokenizer {
None => tk::EncodeInput::Single(sequence),
};

ToPyResult(
self.tokenizer
.encode_char_offsets(input, add_special_tokens)
.map(|e| e.into()),
)
.into()
let result = match offset_type {
"char" => self.tokenizer.encode_char_offsets(input, add_special_tokens),
"byte" => self.tokenizer.encode(input, add_special_tokens),
"none" => self.tokenizer.encode_fast(input, add_special_tokens),
_ => {
return Err(PyError(format!(
"Invalid offset_type: '{}'. Must be one of 'char', 'byte', or 'none'",
offset_type
))
.into_pyerr::<exceptions::PyValueError>())
}
};

ToPyResult(result.map(|e| e.into())).into()
}
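
The match above routes each `offset_type` to an existing encoder entry point (`encode_char_offsets`, `encode`, `encode_fast`). A sketch of the observable difference from Python, assuming the tokenizer keeps "café" intact and applies no normalization:

```python
text = "café"
enc_char = tokenizer.encode(text, add_special_tokens=False)
enc_byte = tokenizer.encode(text, add_special_tokens=False, offset_type="byte")
# "é" is one character but two UTF-8 bytes, so any span covering it ends
# one unit later in enc_byte.offsets than in enc_char.offsets,
# e.g. (0, 4) vs (0, 5) if "café" survives as a single token.
print(enc_char.offsets, enc_byte.offsets)
```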

/// Asynchronously encode the given input with character offsets.
@@ -1199,17 +1214,24 @@ impl PyTokenizer {
/// add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to add the special tokens
///
/// offset_type (:obj:`str`, `optional`, defaults to :obj:`"char"`):
/// The type of offsets to return. Can be one of:
/// - ``"char"``: Character-based offsets (default)
/// - ``"byte"``: Byte-based offsets
/// - ``"none"``: No offsets (all zeros, faster)
///
/// Returns:
/// A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
///
#[pyo3(signature = (input, is_pretokenized = false, add_special_tokens = true))]
#[pyo3(text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True)")]
#[pyo3(signature = (input, is_pretokenized = false, add_special_tokens = true, offset_type = "char"))]
#[pyo3(text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True, offset_type='char')")]
fn encode_batch(
&self,
py: Python<'_>,
input: Vec<Bound<'_, PyAny>>,
is_pretokenized: bool,
add_special_tokens: bool,
offset_type: &str,
) -> PyResult<Vec<PyEncoding>> {
let mut items = Vec::<tk::EncodeInput>::with_capacity(input.len());
for item in &input {
@@ -1221,12 +1243,19 @@ impl PyTokenizer {
items.push(item);
}
py.allow_threads(|| {
ToPyResult(
self.tokenizer
.encode_batch_char_offsets(items, add_special_tokens)
.map(|encodings| encodings.into_iter().map(|e| e.into()).collect()),
)
.into()
let result = match offset_type {
"char" => self.tokenizer.encode_batch_char_offsets(items, add_special_tokens),
"byte" => self.tokenizer.encode_batch(items, add_special_tokens),
"none" => self.tokenizer.encode_batch_fast(items, add_special_tokens),
_ => {
return Err(PyError(format!(
"Invalid offset_type: '{}'. Must be one of 'char', 'byte', or 'none'",
offset_type
))
.into_pyerr::<exceptions::PyValueError>())
}
};
ToPyResult(result.map(|encodings| encodings.into_iter().map(|e| e.into()).collect())).into()
})
}
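
The fallthrough arm surfaces in Python as a `ValueError`; a quick sketch of the error path, with "word" as a deliberately unsupported value:

```python
try:
    tokenizer.encode_batch(["some text"], offset_type="word")
except ValueError as err:
    print(err)  # Invalid offset_type: 'word'. Must be one of 'char', 'byte', or 'none'
```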
/// Asynchronously encode the given batch of inputs with character offsets.