From 331dd0a975a3660afb9925e48d063180d9a3f905 Mon Sep 17 00:00:00 2001
From: krataratha
Date: Tue, 21 Apr 2026 20:10:56 +0530
Subject: [PATCH] Enhance MythosTokenizer with new methods

Added methods for encoding, decoding, token counting, batch encoding,
retrieving special tokens, and checking token limits.

The change is purely additive: the existing class, __init__, and
vocab_size docstrings are preserved, and the new methods are appended
after vocab_size with their own docstrings.
---
 open_mythos/tokenizer.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/open_mythos/tokenizer.py b/open_mythos/tokenizer.py
index fadb3a5..ffdec6f 100644
--- a/open_mythos/tokenizer.py
+++ b/open_mythos/tokenizer.py
@@ -6,35 +6,64 @@
 class MythosTokenizer:
     """
     HuggingFace tokenizer wrapper for OpenMythos.
 
     Args:
         model_id (str): The HuggingFace model ID or path to use with AutoTokenizer.
             Defaults to "openai/gpt-oss-20b".
 
     Attributes:
         tokenizer: An instance of HuggingFace's AutoTokenizer.
 
     Example:
         >>> tok = MythosTokenizer()
         >>> ids = tok.encode("Hello world")
         >>> s = tok.decode(ids)
     """
 
     def __init__(self, model_id: str = DEFAULT_MODEL_ID):
         """
         Initialize the MythosTokenizer.
 
         Args:
             model_id (str): HuggingFace model identifier or path to tokenizer files.
         """
         self.tokenizer = AutoTokenizer.from_pretrained(model_id)
 
     @property
     def vocab_size(self) -> int:
         """
         Return the size of the tokenizer vocabulary.
 
         Returns:
             int: The number of unique tokens in the tokenizer vocabulary.
         """
         return self.tokenizer.vocab_size
+
+    def encode(self, text: str) -> list[int]:
+        """Return the token ids for *text*."""
+        return self.tokenizer.encode(text)
+
+    def decode(self, token_ids) -> str:
+        """Return the text decoded from *token_ids*."""
+        return self.tokenizer.decode(token_ids)
+
+    def token_count(self, text: str) -> int:
+        """Return the number of tokens in *text*."""
+        return len(self.tokenizer.encode(text))
+
+    def batch_encode(self, texts: list[str], padding: bool = True, truncation: bool = True):
+        """Encode multiple texts at once, returning PyTorch tensors."""
+        return self.tokenizer(
+            texts,
+            padding=padding,
+            truncation=truncation,
+            return_tensors="pt",
+        )
+
+    def get_special_tokens(self):
+        """Return the tokenizer's special-token map."""
+        return self.tokenizer.special_tokens_map
+
+    def is_within_limit(self, text: str, max_tokens: int) -> bool:
+        """Check whether *text* fits within *max_tokens* tokens."""
+        return self.token_count(text) <= max_tokens