From 331dd0a975a3660afb9925e48d063180d9a3f905 Mon Sep 17 00:00:00 2001
From: krataratha
Date: Tue, 21 Apr 2026 20:10:56 +0530
Subject: [PATCH] Enhance MythosTokenizer with new methods

Added methods for encoding, decoding, token counting, batch encoding,
retrieving special tokens, and checking token limits.

The change is purely additive: the existing class, __init__, and
vocab_size docstrings are preserved, and the new methods are appended
after vocab_size with their own docstrings.
---
 open_mythos/tokenizer.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/open_mythos/tokenizer.py b/open_mythos/tokenizer.py
index fadb3a5..ffdec6f 100644
--- a/open_mythos/tokenizer.py
+++ b/open_mythos/tokenizer.py
@@ -6,35 +6,64 @@
 class MythosTokenizer:
     """
     HuggingFace tokenizer wrapper for OpenMythos.
 
     Args:
         model_id (str): The HuggingFace model ID or path to use with AutoTokenizer.
             Defaults to "openai/gpt-oss-20b".
 
     Attributes:
         tokenizer: An instance of HuggingFace's AutoTokenizer.
 
     Example:
         >>> tok = MythosTokenizer()
         >>> ids = tok.encode("Hello world")
         >>> s = tok.decode(ids)
     """
 
     def __init__(self, model_id: str = DEFAULT_MODEL_ID):
         """
         Initialize the MythosTokenizer.
 
         Args:
             model_id (str): HuggingFace model identifier or path to tokenizer files.
         """
         self.tokenizer = AutoTokenizer.from_pretrained(model_id)
 
     @property
     def vocab_size(self) -> int:
         """
         Return the size of the tokenizer vocabulary.
 
         Returns:
             int: The number of unique tokens in the tokenizer vocabulary.
         """
         return self.tokenizer.vocab_size
+
+    def encode(self, text: str) -> list[int]:
+        """Return the token ids for *text*."""
+        return self.tokenizer.encode(text)
+
+    def decode(self, token_ids) -> str:
+        """Return the text decoded from *token_ids*."""
+        return self.tokenizer.decode(token_ids)
+
+    def token_count(self, text: str) -> int:
+        """Return the number of tokens in *text*."""
+        return len(self.tokenizer.encode(text))
+
+    def batch_encode(self, texts: list[str], padding: bool = True, truncation: bool = True):
+        """Encode multiple texts at once, returning PyTorch tensors."""
+        return self.tokenizer(
+            texts,
+            padding=padding,
+            truncation=truncation,
+            return_tensors="pt",
+        )
+
+    def get_special_tokens(self):
+        """Return the tokenizer's special-token map."""
+        return self.tokenizer.special_tokens_map
+
+    def is_within_limit(self, text: str, max_tokens: int) -> bool:
+        """Check whether *text* fits within *max_tokens* tokens."""
+        return self.token_count(text) <= max_tokens