From 2f71909a422295a2d0cb26062985a3c946dae1bc Mon Sep 17 00:00:00 2001 From: Arpit Sharma Date: Wed, 11 Mar 2026 03:17:35 +0530 Subject: [PATCH 1/8] feature:completed bpe and base --- .gitignore | 3 +- openverifiablellm/tokenizer/base.py | 21 +- openverifiablellm/tokenizer/bpe_tokenizer.py | 144 ++++++++++- tests/test_bpebase.py | 250 +++++++++++++++++++ 4 files changed, 413 insertions(+), 5 deletions(-) create mode 100644 tests/test_bpebase.py diff --git a/.gitignore b/.gitignore index c55e460..705653c 100644 --- a/.gitignore +++ b/.gitignore @@ -333,4 +333,5 @@ __pycache__/ *.pyd *.bz2 -*.venv \ No newline at end of file +*.venv +venv/ \ No newline at end of file diff --git a/openverifiablellm/tokenizer/base.py b/openverifiablellm/tokenizer/base.py index 5a8d3fd..2eb0176 100644 --- a/openverifiablellm/tokenizer/base.py +++ b/openverifiablellm/tokenizer/base.py @@ -19,13 +19,30 @@ def __init__(self, vocab_size: int, min_frequency: int): @abstractmethod def train(self, text_file: Path, save_path: Path): - """Train tokenizer and save model.""" + """Train tokenizer on a text corpus and save artifacts to save_path.""" + pass + + @abstractmethod + def encode(self, text: str) -> list: + """Encode text into a list of integer token ids.""" + pass + + @abstractmethod + def decode(self, ids: list) -> str: + """Decode a list of integer token ids back into text.""" + pass + + @abstractmethod + def load(self, tokenizer_dir: Path): + """Load a previously trained tokenizer from disk.""" pass @abstractmethod def get_vocab_path(self, tokenizer_dir: Path) -> Path: + """Return path to the vocabulary file.""" pass @abstractmethod def get_merges_path(self, tokenizer_dir: Path): - pass + """Return path to the merges file, or None if not applicable.""" + pass \ No newline at end of file diff --git a/openverifiablellm/tokenizer/bpe_tokenizer.py b/openverifiablellm/tokenizer/bpe_tokenizer.py index 2b4a145..411207a 100644 --- a/openverifiablellm/tokenizer/bpe_tokenizer.py +++ 
b/openverifiablellm/tokenizer/bpe_tokenizer.py @@ -8,8 +8,44 @@ class BPETokenizer(BaseTokenizer): + """ + Byte-level BPE tokenizer implementation. + + Wraps HuggingFace's ByteLevelBPETokenizer and implements + the full BaseTokenizer interface including train, encode, + decode, and load. + """ + + def __init__(self, vocab_size: int, min_frequency: int): + super().__init__(vocab_size, min_frequency) + self._tokenizer = None + + # ------------------------------------------------------------------ + # Training + # ------------------------------------------------------------------ def train(self, text_file: Path, save_path: Path): + """ + Train BPE tokenizer on text corpus and save artifacts. + + Args: + text_file: Path to training text corpus. + save_path: Directory to save vocab.json and merges.txt. + + Raises: + FileNotFoundError: If text_file does not exist or is not a file. + """ + + text_file = Path(text_file) + save_path = Path(save_path) + + if not text_file.is_file(): + raise FileNotFoundError( + f"Training file not found at {text_file}. " + f"Please provide a valid text corpus file." + ) + + save_path.mkdir(parents=True, exist_ok=True) tokenizer = ByteLevelBPETokenizer() @@ -20,10 +56,114 @@ def train(self, text_file: Path, save_path: Path): special_tokens=SPECIAL_TOKENS, ) + # Must create directory BEFORE save_model() is called + save_path.mkdir(parents=True, exist_ok=True) + tokenizer.save_model(str(save_path)) + self._tokenizer = tokenizer + + # ------------------------------------------------------------------ + # Encode / Decode + # ------------------------------------------------------------------ + + def encode(self, text: str) -> list: + """ + Encode text into a list of token ids. + + Args: + text: Input string to tokenize. + + Returns: + List of integer token ids. + + Raises: + RuntimeError: If tokenizer has not been trained or loaded. 
+ """ + + self._check_loaded() + return self._tokenizer.encode(text).ids + + def decode(self, ids: list) -> str: + """ + Decode a list of token ids back into text. + + Args: + ids: List of integer token ids. + + Returns: + Decoded string. + + Raises: + RuntimeError: If tokenizer has not been trained or loaded. + """ + + self._check_loaded() + return self._tokenizer.decode(ids) + + # ------------------------------------------------------------------ + # Load + # ------------------------------------------------------------------ + + def load(self, tokenizer_dir: Path): + """ + Load a previously trained BPE tokenizer from disk. + + Args: + tokenizer_dir: Directory containing vocab.json and merges.txt. + + Raises: + FileNotFoundError: If vocab.json or merges.txt are not found. + """ + + tokenizer_dir = Path(tokenizer_dir) + + vocab_path = tokenizer_dir / "vocab.json" + merges_path = tokenizer_dir / "merges.txt" + + if not vocab_path.is_file(): + raise FileNotFoundError( + f"vocab.json not found at {vocab_path}. " + f"Please train the tokenizer first." + ) + + if not merges_path.is_file(): + raise FileNotFoundError( + f"merges.txt not found at {merges_path}. " + f"Please train the tokenizer first." 
+ ) + + self._tokenizer = ByteLevelBPETokenizer( + vocab=str(vocab_path), + merges=str(merges_path), + ) + + # ------------------------------------------------------------------ + # Artifact paths + # ------------------------------------------------------------------ + def get_vocab_path(self, tokenizer_dir: Path) -> Path: - return tokenizer_dir / "vocab.json" + """Return path to vocab.json file.""" + return Path(tokenizer_dir) / "vocab.json" def get_merges_path(self, tokenizer_dir: Path) -> Path: - return tokenizer_dir / "merges.txt" + """Return path to merges.txt file.""" + return Path(tokenizer_dir) / "merges.txt" + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _check_loaded(self): + """ + Check that tokenizer is loaded before encode/decode. + + Raises: + RuntimeError: If tokenizer has not been trained or loaded. + """ + + if self._tokenizer is None: + raise RuntimeError( + "BPE tokenizer is not loaded. " + "Call train() or load() before encode/decode." 
+ ) \ No newline at end of file diff --git a/tests/test_bpebase.py b/tests/test_bpebase.py new file mode 100644 index 0000000..d4d5844 --- /dev/null +++ b/tests/test_bpebase.py @@ -0,0 +1,250 @@ +import pytest +from pathlib import Path + +from openverifiablellm.tokenizer.bpe_tokenizer import BPETokenizer + + +# ------------------------------------------------------------------ +# Fixtures +# ------------------------------------------------------------------ + +@pytest.fixture +def sample_text_file(tmp_path): + """Create a sample text file for training.""" + text = ( + "Wikipedia is a free online encyclopedia.\n" + "It is written collaboratively by volunteers.\n" + "Anyone can edit Wikipedia articles.\n" + "Wikipedia was launched on January 15 2001.\n" + "It is one of the most popular websites in the world.\n" + ) * 500 + + text_file = tmp_path / "sample.txt" + text_file.write_text(text, encoding="utf-8") + return text_file + + +@pytest.fixture +def trained_tokenizer(tmp_path, sample_text_file): + """Train and return path to trained BPETokenizer.""" + tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) + tokenizer.train(sample_text_file, tmp_path / "tokenizer") + return tmp_path / "tokenizer" + + +# ------------------------------------------------------------------ +# Training tests +# ------------------------------------------------------------------ + +def test_bpe_train_creates_artifacts(tmp_path, sample_text_file): + """Training should produce vocab.json and merges.txt.""" + tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) + save_path = tmp_path / "tokenizer" + + tokenizer.train(sample_text_file, save_path) + + assert (save_path / "vocab.json").is_file() + assert (save_path / "merges.txt").is_file() + + +def test_bpe_train_creates_save_directory(tmp_path, sample_text_file): + """train() should create save_path directory if it does not exist.""" + tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) + save_path = tmp_path / "nested" / 
"tokenizer" / "dir" + + assert not save_path.exists() + + tokenizer.train(sample_text_file, save_path) + + assert save_path.exists() + + +def test_bpe_train_raises_file_not_found(tmp_path): + """train() should raise FileNotFoundError for missing text file.""" + tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) + + with pytest.raises(FileNotFoundError, match="Training file not found"): + tokenizer.train( + tmp_path / "nonexistent.txt", + tmp_path / "tokenizer" + ) + + +def test_bpe_train_raises_if_directory_passed(tmp_path, sample_text_file): + """train() should raise FileNotFoundError if directory passed as text_file.""" + tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) + + with pytest.raises(FileNotFoundError, match="Training file not found"): + tokenizer.train(tmp_path, tmp_path / "tokenizer") + + +# ------------------------------------------------------------------ +# Encode / Decode tests +# ------------------------------------------------------------------ + +def test_bpe_encode_returns_list_of_ints(trained_tokenizer): + """encode() should return a list of integers.""" + tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) + tokenizer.load(trained_tokenizer) + + ids = tokenizer.encode("hello world") + + assert isinstance(ids, list) + assert all(isinstance(i, int) for i in ids) + + +def test_bpe_encode_decode_roundtrip(trained_tokenizer): + """encode then decode should return original text.""" + tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) + tokenizer.load(trained_tokenizer) + + text = "Wikipedia is a free online encyclopedia" + ids = tokenizer.encode(text) + decoded = tokenizer.decode(ids) + + assert decoded.strip() == text.strip() + + +def test_bpe_encode_works_after_train(tmp_path, sample_text_file): + """encode() should work immediately after train() without calling load().""" + tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) + tokenizer.train(sample_text_file, tmp_path / "tokenizer") + + ids = 
tokenizer.encode("hello world") + + assert isinstance(ids, list) + assert len(ids) > 0 + + +def test_bpe_encode_raises_if_not_loaded(): + """encode() should raise RuntimeError if model not loaded.""" + tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) + + with pytest.raises(RuntimeError, match="not loaded"): + tokenizer.encode("hello world") + + +def test_bpe_decode_raises_if_not_loaded(): + """decode() should raise RuntimeError if model not loaded.""" + tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) + + with pytest.raises(RuntimeError, match="not loaded"): + tokenizer.decode([1, 2, 3]) + + +# ------------------------------------------------------------------ +# Load tests +# ------------------------------------------------------------------ + +def test_bpe_load_from_disk(trained_tokenizer): + """load() should successfully restore tokenizer from disk.""" + tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) + tokenizer.load(trained_tokenizer) + + assert tokenizer._tokenizer is not None + + +def test_bpe_encode_works_after_load(trained_tokenizer): + """encode() should work correctly after load().""" + tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) + tokenizer.load(trained_tokenizer) + + ids = tokenizer.encode("hello world") + + assert isinstance(ids, list) + assert len(ids) > 0 + + +def test_bpe_load_raises_if_vocab_missing(tmp_path): + """load() should raise FileNotFoundError if vocab.json not found.""" + tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) + + with pytest.raises(FileNotFoundError, match="vocab.json not found"): + tokenizer.load(tmp_path) + + +def test_bpe_load_raises_if_merges_missing(tmp_path): + """load() should raise FileNotFoundError if merges.txt not found.""" + tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) + + # Create vocab.json but not merges.txt + (tmp_path / "vocab.json").write_text("{}", encoding="utf-8") + + with pytest.raises(FileNotFoundError, match="merges.txt not found"): + 
tokenizer.load(tmp_path) + + +# ------------------------------------------------------------------ +# Artifact path tests +# ------------------------------------------------------------------ + +def test_bpe_get_vocab_path(tmp_path): + """get_vocab_path() should return path to vocab.json.""" + tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) + vocab_path = tokenizer.get_vocab_path(tmp_path) + + assert vocab_path == tmp_path / "vocab.json" + + +def test_bpe_get_merges_path(tmp_path): + """get_merges_path() should return path to merges.txt.""" + tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) + merges_path = tokenizer.get_merges_path(tmp_path) + + assert merges_path == tmp_path / "merges.txt" + + +# ------------------------------------------------------------------ +# Special tokens tests +# ------------------------------------------------------------------ + +def test_bpe_special_tokens_in_vocabulary(trained_tokenizer): + """Special tokens should be present in trained vocabulary.""" + tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) + tokenizer.load(trained_tokenizer) + + vocab_path = trained_tokenizer / "vocab.json" + vocab_content = vocab_path.read_text(encoding="utf-8") + + assert "<s>" in vocab_content + assert "<pad>" in vocab_content + assert "</s>" in vocab_content + assert "<unk>" in vocab_content + assert "<mask>" in vocab_content + + +# ------------------------------------------------------------------ +# Determinism tests +# ------------------------------------------------------------------ + +def test_bpe_training_is_deterministic(tmp_path, sample_text_file): + """Training twice on same data should produce same vocab.""" + save_path_1 = tmp_path / "tokenizer_1" + save_path_2 = tmp_path / "tokenizer_2" + + tokenizer_1 = BPETokenizer(vocab_size=1000, min_frequency=2) + tokenizer_1.train(sample_text_file, save_path_1) + + tokenizer_2 = BPETokenizer(vocab_size=1000, min_frequency=2) + tokenizer_2.train(sample_text_file, save_path_2) + + vocab_1 =
(save_path_1 / "vocab.json").read_text(encoding="utf-8") + vocab_2 = (save_path_2 / "vocab.json").read_text(encoding="utf-8") + + assert vocab_1 == vocab_2 + + +# ------------------------------------------------------------------ +# Constructor validation tests +# ------------------------------------------------------------------ + +def test_bpe_raises_if_vocab_size_zero(): + """BPETokenizer should raise ValueError if vocab_size <= 0.""" + with pytest.raises(ValueError, match="vocab_size must be > 0"): + BPETokenizer(vocab_size=0, min_frequency=2) + + +def test_bpe_raises_if_min_frequency_zero(): + """BPETokenizer should raise ValueError if min_frequency <= 0.""" + with pytest.raises(ValueError, match="min_frequency must be > 0"): + BPETokenizer(vocab_size=1000, min_frequency=0) \ No newline at end of file From 71ea8902938739d31c289695580f63d6ff886285 Mon Sep 17 00:00:00 2001 From: Arpit Sharma Date: Wed, 11 Mar 2026 11:12:12 +0530 Subject: [PATCH 2/8] fix: remove unused Path import in test_bpe.py --- tests/test_bpebase.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_bpebase.py b/tests/test_bpebase.py index d4d5844..ab1f858 100644 --- a/tests/test_bpebase.py +++ b/tests/test_bpebase.py @@ -1,5 +1,4 @@ import pytest -from pathlib import Path from openverifiablellm.tokenizer.bpe_tokenizer import BPETokenizer From 3ff1c96ede928198d44302d6508d90925c7103de Mon Sep 17 00:00:00 2001 From: Arpit Sharma Date: Wed, 11 Mar 2026 11:19:08 +0530 Subject: [PATCH 3/8] fix: implied changes with ruff --- tests/test_bpebase.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_bpebase.py b/tests/test_bpebase.py index ab1f858..6886b45 100644 --- a/tests/test_bpebase.py +++ b/tests/test_bpebase.py @@ -2,7 +2,6 @@ from openverifiablellm.tokenizer.bpe_tokenizer import BPETokenizer - # ------------------------------------------------------------------ # Fixtures # ------------------------------------------------------------------ From 
9d743a262bcaae2173548ef22e9c56429575651a Mon Sep 17 00:00:00 2001 From: Arpit Sharma Date: Wed, 11 Mar 2026 11:20:19 +0530 Subject: [PATCH 4/8] style: format with ruff --- openverifiablellm/tokenizer/base.py | 2 +- openverifiablellm/tokenizer/bpe_tokenizer.py | 14 +++++--------- openverifiablellm/verify.py | 16 +++++++++------- tests/test_bpebase.py | 15 ++++++++++----- 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/openverifiablellm/tokenizer/base.py b/openverifiablellm/tokenizer/base.py index 2eb0176..9ae2250 100644 --- a/openverifiablellm/tokenizer/base.py +++ b/openverifiablellm/tokenizer/base.py @@ -45,4 +45,4 @@ def get_vocab_path(self, tokenizer_dir: Path) -> Path: @abstractmethod def get_merges_path(self, tokenizer_dir: Path): """Return path to the merges file, or None if not applicable.""" - pass \ No newline at end of file + pass diff --git a/openverifiablellm/tokenizer/bpe_tokenizer.py b/openverifiablellm/tokenizer/bpe_tokenizer.py index 42ee465..584e08a 100644 --- a/openverifiablellm/tokenizer/bpe_tokenizer.py +++ b/openverifiablellm/tokenizer/bpe_tokenizer.py @@ -41,8 +41,7 @@ def train(self, text_file: Path, save_path: Path): if not text_file.is_file(): raise FileNotFoundError( - f"Training file not found at {text_file}. " - f"Please provide a valid text corpus file." + f"Training file not found at {text_file}. Please provide a valid text corpus file." ) save_path.mkdir(parents=True, exist_ok=True) @@ -120,14 +119,12 @@ def load(self, tokenizer_dir: Path): if not vocab_path.is_file(): raise FileNotFoundError( - f"vocab.json not found at {vocab_path}. " - f"Please train the tokenizer first." + f"vocab.json not found at {vocab_path}. Please train the tokenizer first." ) if not merges_path.is_file(): raise FileNotFoundError( - f"merges.txt not found at {merges_path}. " - f"Please train the tokenizer first." + f"merges.txt not found at {merges_path}. Please train the tokenizer first." 
) self._tokenizer = ByteLevelBPETokenizer( @@ -161,6 +158,5 @@ def _check_loaded(self): if self._tokenizer is None: raise RuntimeError( - "BPE tokenizer is not loaded. " - "Call train() or load() before encode/decode." - ) \ No newline at end of file + "BPE tokenizer is not loaded. Call train() or load() before encode/decode." + ) diff --git a/openverifiablellm/verify.py b/openverifiablellm/verify.py index ad7f789..cc0c558 100644 --- a/openverifiablellm/verify.py +++ b/openverifiablellm/verify.py @@ -386,15 +386,17 @@ def verify_preprocessing( "environment_hash", expected=manifest.get("environment_hash"), actual=current_env["environment_hash"], - detail="Environment fingerprint comparison" + detail="Environment fingerprint comparison", ) else: - report.add(CheckResult( - name="environment_hash", - status=CheckStatus.SKIP, - detail="Field absent from manifest (older version)" - )) - + report.add( + CheckResult( + name="environment_hash", + status=CheckStatus.SKIP, + detail="Field absent from manifest (older version)", + ) + ) + # 4. 
Re-run preprocessing in an isolated temp directory tmp_dir = Path(tempfile.mkdtemp(prefix="ovllm_verify_")) try: diff --git a/tests/test_bpebase.py b/tests/test_bpebase.py index 6886b45..50e80c5 100644 --- a/tests/test_bpebase.py +++ b/tests/test_bpebase.py @@ -6,6 +6,7 @@ # Fixtures # ------------------------------------------------------------------ + @pytest.fixture def sample_text_file(tmp_path): """Create a sample text file for training.""" @@ -34,6 +35,7 @@ def trained_tokenizer(tmp_path, sample_text_file): # Training tests # ------------------------------------------------------------------ + def test_bpe_train_creates_artifacts(tmp_path, sample_text_file): """Training should produce vocab.json and merges.txt.""" tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) @@ -62,10 +64,7 @@ def test_bpe_train_raises_file_not_found(tmp_path): tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) with pytest.raises(FileNotFoundError, match="Training file not found"): - tokenizer.train( - tmp_path / "nonexistent.txt", - tmp_path / "tokenizer" - ) + tokenizer.train(tmp_path / "nonexistent.txt", tmp_path / "tokenizer") def test_bpe_train_raises_if_directory_passed(tmp_path, sample_text_file): @@ -80,6 +79,7 @@ def test_bpe_train_raises_if_directory_passed(tmp_path, sample_text_file): # Encode / Decode tests # ------------------------------------------------------------------ + def test_bpe_encode_returns_list_of_ints(trained_tokenizer): """encode() should return a list of integers.""" tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) @@ -134,6 +134,7 @@ def test_bpe_decode_raises_if_not_loaded(): # Load tests # ------------------------------------------------------------------ + def test_bpe_load_from_disk(trained_tokenizer): """load() should successfully restore tokenizer from disk.""" tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) @@ -176,6 +177,7 @@ def test_bpe_load_raises_if_merges_missing(tmp_path): # Artifact path tests # 
------------------------------------------------------------------ + def test_bpe_get_vocab_path(tmp_path): """get_vocab_path() should return path to vocab.json.""" tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) @@ -196,6 +198,7 @@ def test_bpe_get_merges_path(tmp_path): # Special tokens tests # ------------------------------------------------------------------ + def test_bpe_special_tokens_in_vocabulary(trained_tokenizer): """Special tokens should be present in trained vocabulary.""" tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) @@ -215,6 +218,7 @@ def test_bpe_special_tokens_in_vocabulary(trained_tokenizer): # Determinism tests # ------------------------------------------------------------------ + def test_bpe_training_is_deterministic(tmp_path, sample_text_file): """Training twice on same data should produce same vocab.""" save_path_1 = tmp_path / "tokenizer_1" @@ -236,6 +240,7 @@ def test_bpe_training_is_deterministic(tmp_path, sample_text_file): # Constructor validation tests # ------------------------------------------------------------------ + def test_bpe_raises_if_vocab_size_zero(): """BPETokenizer should raise ValueError if vocab_size <= 0.""" with pytest.raises(ValueError, match="vocab_size must be > 0"): @@ -245,4 +250,4 @@ def test_bpe_raises_if_vocab_size_zero(): def test_bpe_raises_if_min_frequency_zero(): """BPETokenizer should raise ValueError if min_frequency <= 0.""" with pytest.raises(ValueError, match="min_frequency must be > 0"): - BPETokenizer(vocab_size=1000, min_frequency=0) \ No newline at end of file + BPETokenizer(vocab_size=1000, min_frequency=0) From dcdb8c4a75719d3caf22db6850f56bde15be4990 Mon Sep 17 00:00:00 2001 From: Arpit Sharma Date: Fri, 13 Mar 2026 10:37:41 +0530 Subject: [PATCH 5/8] fix:applied coderabbit changes --- openverifiablellm/tokenizer/base.py | 8 ++++---- openverifiablellm/tokenizer/bpe_tokenizer.py | 8 ++++---- tests/test_bpebase.py | 8 ++++---- 3 files changed, 12 insertions(+), 12 
deletions(-) diff --git a/openverifiablellm/tokenizer/base.py b/openverifiablellm/tokenizer/base.py index 9ae2250..49d6a1c 100644 --- a/openverifiablellm/tokenizer/base.py +++ b/openverifiablellm/tokenizer/base.py @@ -23,12 +23,12 @@ def train(self, text_file: Path, save_path: Path): pass @abstractmethod - def encode(self, text: str) -> list: + def encode(self, text: str) -> list[int]: """Encode text into a list of integer token ids.""" pass @abstractmethod - def decode(self, ids: list) -> str: + def decode(self, ids: list[int]) -> str: """Decode a list of integer token ids back into text.""" pass @@ -43,6 +43,6 @@ def get_vocab_path(self, tokenizer_dir: Path) -> Path: pass @abstractmethod - def get_merges_path(self, tokenizer_dir: Path): + def get_merges_path(self, tokenizer_dir: Path) -> Path | None: """Return path to the merges file, or None if not applicable.""" - pass + pass \ No newline at end of file diff --git a/openverifiablellm/tokenizer/bpe_tokenizer.py b/openverifiablellm/tokenizer/bpe_tokenizer.py index 584e08a..0b9613f 100644 --- a/openverifiablellm/tokenizer/bpe_tokenizer.py +++ b/openverifiablellm/tokenizer/bpe_tokenizer.py @@ -63,7 +63,7 @@ def train(self, text_file: Path, save_path: Path): # Encode / Decode # ------------------------------------------------------------------ - def encode(self, text: str) -> list: + def encode(self, text: str) -> list[int]: """ Encode text into a list of token ids. @@ -80,7 +80,7 @@ def encode(self, text: str) -> list: self._check_loaded() return self._tokenizer.encode(text).ids - def decode(self, ids: list) -> str: + def decode(self, ids: list[int]) -> str: """ Decode a list of token ids back into text. 
@@ -95,7 +95,7 @@ def decode(self, ids: list) -> str: """ self._check_loaded() - return self._tokenizer.decode(ids) + return self._tokenizer.decode(ids, skip_special_tokens=False) # ------------------------------------------------------------------ # Load @@ -159,4 +159,4 @@ def _check_loaded(self): if self._tokenizer is None: raise RuntimeError( "BPE tokenizer is not loaded. Call train() or load() before encode/decode." - ) + ) \ No newline at end of file diff --git a/tests/test_bpebase.py b/tests/test_bpebase.py index 50e80c5..657770b 100644 --- a/tests/test_bpebase.py +++ b/tests/test_bpebase.py @@ -67,7 +67,7 @@ def test_bpe_train_raises_file_not_found(tmp_path): tokenizer.train(tmp_path / "nonexistent.txt", tmp_path / "tokenizer") -def test_bpe_train_raises_if_directory_passed(tmp_path, sample_text_file): +def test_bpe_train_raises_if_directory_passed(tmp_path): """train() should raise FileNotFoundError if directory passed as text_file.""" tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) @@ -158,7 +158,7 @@ def test_bpe_load_raises_if_vocab_missing(tmp_path): """load() should raise FileNotFoundError if vocab.json not found.""" tokenizer = BPETokenizer(vocab_size=1000, min_frequency=2) - with pytest.raises(FileNotFoundError, match="vocab.json not found"): + with pytest.raises(FileNotFoundError, match=r"vocab\.json not found"): tokenizer.load(tmp_path) @@ -169,7 +169,7 @@ def test_bpe_load_raises_if_merges_missing(tmp_path): # Create vocab.json but not merges.txt (tmp_path / "vocab.json").write_text("{}", encoding="utf-8") - with pytest.raises(FileNotFoundError, match="merges.txt not found"): + with pytest.raises(FileNotFoundError, match=r"merges\.txt not found"): tokenizer.load(tmp_path) @@ -250,4 +250,4 @@ def test_bpe_raises_if_vocab_size_zero(): def test_bpe_raises_if_min_frequency_zero(): """BPETokenizer should raise ValueError if min_frequency <= 0.""" with pytest.raises(ValueError, match="min_frequency must be > 0"): - 
BPETokenizer(vocab_size=1000, min_frequency=0) + BPETokenizer(vocab_size=1000, min_frequency=0) \ No newline at end of file From 1d049ff4904ad3c57d6d29587a72a335e0a3c0a6 Mon Sep 17 00:00:00 2001 From: Arpit Sharma Date: Fri, 13 Mar 2026 11:54:56 +0530 Subject: [PATCH 6/8] fix:updated to comply with ruff --- openverifiablellm/tokenizer/base.py | 2 +- openverifiablellm/tokenizer/bpe_tokenizer.py | 2 +- tests/test_bpebase.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/openverifiablellm/tokenizer/base.py b/openverifiablellm/tokenizer/base.py index 49d6a1c..8751dcb 100644 --- a/openverifiablellm/tokenizer/base.py +++ b/openverifiablellm/tokenizer/base.py @@ -45,4 +45,4 @@ def get_vocab_path(self, tokenizer_dir: Path) -> Path: @abstractmethod def get_merges_path(self, tokenizer_dir: Path) -> Path | None: """Return path to the merges file, or None if not applicable.""" - pass \ No newline at end of file + pass diff --git a/openverifiablellm/tokenizer/bpe_tokenizer.py b/openverifiablellm/tokenizer/bpe_tokenizer.py index 0b9613f..331f9c3 100644 --- a/openverifiablellm/tokenizer/bpe_tokenizer.py +++ b/openverifiablellm/tokenizer/bpe_tokenizer.py @@ -159,4 +159,4 @@ def _check_loaded(self): if self._tokenizer is None: raise RuntimeError( "BPE tokenizer is not loaded. Call train() or load() before encode/decode." 
- ) \ No newline at end of file + ) diff --git a/tests/test_bpebase.py b/tests/test_bpebase.py index 657770b..d990e9e 100644 --- a/tests/test_bpebase.py +++ b/tests/test_bpebase.py @@ -250,4 +250,4 @@ def test_bpe_raises_if_vocab_size_zero(): def test_bpe_raises_if_min_frequency_zero(): """BPETokenizer should raise ValueError if min_frequency <= 0.""" with pytest.raises(ValueError, match="min_frequency must be > 0"): - BPETokenizer(vocab_size=1000, min_frequency=0) \ No newline at end of file + BPETokenizer(vocab_size=1000, min_frequency=0) From 088ec84dc4e3b4416224f46e0a26808c6592ee33 Mon Sep 17 00:00:00 2001 From: Arpit Sharma Date: Fri, 13 Mar 2026 13:14:35 +0530 Subject: [PATCH 7/8] fix:changed the load function --- openverifiablellm/tokenizer/bpe_tokenizer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/openverifiablellm/tokenizer/bpe_tokenizer.py b/openverifiablellm/tokenizer/bpe_tokenizer.py index 331f9c3..722266d 100644 --- a/openverifiablellm/tokenizer/bpe_tokenizer.py +++ b/openverifiablellm/tokenizer/bpe_tokenizer.py @@ -132,6 +132,9 @@ def load(self, tokenizer_dir: Path): merges=str(merges_path), ) + # 🔧 Fix: re-register special tokens after loading + self._tokenizer.add_special_tokens(SPECIAL_TOKENS) + # ------------------------------------------------------------------ # Artifact paths # ------------------------------------------------------------------ @@ -159,4 +162,4 @@ def _check_loaded(self): if self._tokenizer is None: raise RuntimeError( "BPE tokenizer is not loaded. Call train() or load() before encode/decode." 
- ) + ) \ No newline at end of file From 49c7ba678ee9081672b0c17039a92f0da59a4ab4 Mon Sep 17 00:00:00 2001 From: Arpit Sharma Date: Fri, 13 Mar 2026 15:29:31 +0530 Subject: [PATCH 8/8] fix:done ruff changes --- openverifiablellm/tokenizer/bpe_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openverifiablellm/tokenizer/bpe_tokenizer.py b/openverifiablellm/tokenizer/bpe_tokenizer.py index 722266d..2b665b4 100644 --- a/openverifiablellm/tokenizer/bpe_tokenizer.py +++ b/openverifiablellm/tokenizer/bpe_tokenizer.py @@ -162,4 +162,4 @@ def _check_loaded(self): if self._tokenizer is None: raise RuntimeError( "BPE tokenizer is not loaded. Call train() or load() before encode/decode." - ) \ No newline at end of file + )