From c096dbbd75f21c4d9633a5c7820de2d2c6410cc4 Mon Sep 17 00:00:00 2001 From: Arpit Sharma Date: Wed, 11 Mar 2026 03:12:54 +0530 Subject: [PATCH 1/7] feature:complete sentence_piecetokenizer --- .gitignore | 3 +- .../tokenizer/sentencepiece_tokenizer.py | 163 +++++++++++++- tests/test_sentencepiece.py | 211 ++++++++++++++++++ 3 files changed, 373 insertions(+), 4 deletions(-) create mode 100644 tests/test_sentencepiece.py diff --git a/.gitignore b/.gitignore index c55e460..e68b378 100644 --- a/.gitignore +++ b/.gitignore @@ -333,4 +333,5 @@ __pycache__/ *.pyd *.bz2 -*.venv \ No newline at end of file +*.venv +venv/ diff --git a/openverifiablellm/tokenizer/sentencepiece_tokenizer.py b/openverifiablellm/tokenizer/sentencepiece_tokenizer.py index aa03893..b2a12ef 100644 --- a/openverifiablellm/tokenizer/sentencepiece_tokenizer.py +++ b/openverifiablellm/tokenizer/sentencepiece_tokenizer.py @@ -1,15 +1,59 @@ from pathlib import Path + import sentencepiece as spm from .base import BaseTokenizer +SPM_MODEL_FILE = "spm.model" +SPM_VOCAB_FILE = "spm.vocab" + + class SentencePieceTokenizer(BaseTokenizer): """ SentencePiece tokenizer implementation. + + Supports training a BPE SentencePiece tokenizer, + encoding text to token ids, decoding token ids back to text, + and loading a previously trained model from disk. + + Reproducibility depends on: + - Stable input data + - Pinned sentencepiece library version + - Consistent execution environment """ + def __init__(self, vocab_size: int, min_frequency: int): + super().__init__(vocab_size, min_frequency) + self._model = None + + # ------------------------------------------------------------------ + # Training + # ------------------------------------------------------------------ + def train(self, text_file: Path, save_path: Path): + """ + Train SentencePiece model on text corpus and save artifacts. + + Args: + text_file: Path to training text corpus. + save_path: Directory to save spm.model and spm.vocab. + + Raises: + FileNotFoundError: If text_file does not exist + or is not a file. + """ + + text_file = Path(text_file) + save_path = Path(save_path) + + if not text_file.is_file(): + raise FileNotFoundError( + f"Training file not found at {text_file}. " + f"Please provide a valid text corpus file." + ) + + save_path.mkdir(parents=True, exist_ok=True) model_prefix = save_path / "spm" @@ -17,11 +61,124 @@ def train(self, text_file: Path, save_path: Path): input=str(text_file), model_prefix=str(model_prefix), vocab_size=self.vocab_size, + pad_id=0, + unk_id=1, + bos_id=2, + eos_id=3, + pad_piece="", + unk_piece="", + bos_piece="", + eos_piece="", + character_coverage=1.0, + model_type="bpe", ) - def get_vocab_path(self, tokenizer_dir: Path): - return tokenizer_dir / "spm.vocab" + self._load_model(save_path) + + # ------------------------------------------------------------------ + # Encode / Decode + # ------------------------------------------------------------------ + + def encode(self, text: str) -> list: + """ + Encode text into list of token ids. + + Args: + text: Input string to tokenize. + + Returns: + List of integer token ids. + + Raises: + RuntimeError: If tokenizer has not been trained or loaded. + """ + + self._check_loaded() + return self._model.encode(text, out_type=int) + + def decode(self, ids: list) -> str: + """ + Decode list of token ids back into text. + + Args: + ids: List of integer token ids. + + Returns: + Decoded string. + + Raises: + RuntimeError: If tokenizer has not been trained or loaded. + """ + + self._check_loaded() + return self._model.decode(ids) + + # ------------------------------------------------------------------ + # Load + # ------------------------------------------------------------------ + + def load(self, tokenizer_dir: Path): + """ + Load a previously trained SentencePiece model from disk. + + Args: + tokenizer_dir: Directory containing spm.model + + Raises: + FileNotFoundError: If spm.model is not found. + """ + + tokenizer_dir = Path(tokenizer_dir) + self._load_model(tokenizer_dir) + + # ------------------------------------------------------------------ + # Artifact paths + # ------------------------------------------------------------------ + + def get_vocab_path(self, tokenizer_dir: Path) -> Path: + """Return path to spm.vocab file.""" + return Path(tokenizer_dir) / SPM_VOCAB_FILE def get_merges_path(self, tokenizer_dir: Path): - # SentencePiece does not use merges + """ + SentencePiece does not use a merges file. + Returns None for compatibility with BaseTokenizer interface. + """ return None + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _load_model(self, tokenizer_dir: Path): + """ + Internal helper to load spm.model from directory. + + Raises: + FileNotFoundError: If spm.model not found. + """ + + model_path = Path(tokenizer_dir) / SPM_MODEL_FILE + + if not model_path.is_file(): + raise FileNotFoundError( + f"SentencePiece model not found at {model_path}. " + f"Please train the tokenizer first." + ) + + self._model = spm.SentencePieceProcessor() + self._model.load(str(model_path)) + + def _check_loaded(self): + """ + Check that model is loaded before encode/decode. + + Raises: + RuntimeError: If model has not been loaded or trained. + """ + + if self._model is None: + raise RuntimeError( + "SentencePiece model is not loaded. " + "Call train() or load() before encode/decode." + ) \ No newline at end of file diff --git a/tests/test_sentencepiece.py b/tests/test_sentencepiece.py new file mode 100644 index 0000000..2280bbf --- /dev/null +++ b/tests/test_sentencepiece.py @@ -0,0 +1,211 @@ +import pytest +from pathlib import Path + +from openverifiablellm.tokenizer.sentencepiece_tokenizer import SentencePieceTokenizer + + +# ------------------------------------------------------------------ +# Fixtures +# ------------------------------------------------------------------ + +@pytest.fixture +def sample_text_file(tmp_path): + """Create a sample text file for training.""" + text = ( + "Wikipedia is a free online encyclopedia.\n" + "It is written collaboratively by volunteers.\n" + "Anyone can edit Wikipedia articles.\n" + "Wikipedia was launched on January 15 2001.\n" + "It is one of the most popular websites in the world.\n" + ) * 500 + + text_file = tmp_path / "sample.txt" + text_file.write_text(text) + return text_file + + +@pytest.fixture +def trained_tokenizer(tmp_path, sample_text_file): + """Train and return path to trained SentencePieceTokenizer.""" + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer.train(sample_text_file, tmp_path / "tokenizer") + return tmp_path / "tokenizer" + + +# ------------------------------------------------------------------ +# Training tests +# ------------------------------------------------------------------ + +def test_spm_train_creates_artifacts(tmp_path, sample_text_file): + """Training should produce spm.model and spm.vocab.""" + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + save_path = tmp_path / "tokenizer" + + tokenizer.train(sample_text_file, save_path) + + assert (save_path / "spm.model").is_file() + assert (save_path / "spm.vocab").is_file() + + +def test_spm_train_creates_save_directory(tmp_path, sample_text_file): + """train() should create save_path directory if it does not exist.""" + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + save_path = tmp_path / "nested" / "tokenizer" / "dir" + + assert not save_path.exists() + + tokenizer.train(sample_text_file, save_path) + + assert save_path.exists() + + +def test_spm_train_raises_file_not_found(tmp_path): + """train() should raise FileNotFoundError for missing text file.""" + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + + with pytest.raises(FileNotFoundError, match="Training file not found"): + tokenizer.train( + tmp_path / "nonexistent.txt", + tmp_path / "tokenizer" + ) + + +def test_spm_train_raises_if_directory_passed(tmp_path, sample_text_file): + """train() should raise FileNotFoundError if directory passed as text_file.""" + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + + with pytest.raises(FileNotFoundError, match="Training file not found"): + tokenizer.train(tmp_path, tmp_path / "tokenizer") + + +# ------------------------------------------------------------------ +# Encode / Decode tests +# ------------------------------------------------------------------ + +def test_spm_encode_returns_list_of_ints(trained_tokenizer): + """encode() should return a list of integers.""" + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer.load(trained_tokenizer) + + ids = tokenizer.encode("hello world") + + assert isinstance(ids, list) + assert all(isinstance(i, int) for i in ids) + + +def test_spm_encode_decode_roundtrip(trained_tokenizer): + """encode then decode should return original text.""" + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer.load(trained_tokenizer) + + text = "Wikipedia is a free online encyclopedia" + ids = tokenizer.encode(text) + decoded = tokenizer.decode(ids) + + assert decoded.strip() == text.strip() + + +def test_spm_encode_raises_if_not_loaded(): + """encode() should raise RuntimeError if model not loaded.""" + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + + with pytest.raises(RuntimeError, match="not loaded"): + tokenizer.encode("hello world") + + +def test_spm_decode_raises_if_not_loaded(): + """decode() should raise RuntimeError if model not loaded.""" + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + + with pytest.raises(RuntimeError, match="not loaded"): + tokenizer.decode([1, 2, 3]) + + +# ------------------------------------------------------------------ +# Load tests +# ------------------------------------------------------------------ + +def test_spm_load_from_disk(trained_tokenizer): + """load() should successfully restore tokenizer from disk.""" + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer.load(trained_tokenizer) + + assert tokenizer._model is not None + + +def test_spm_encode_works_after_load(trained_tokenizer): + """encode() should work correctly after load().""" + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer.load(trained_tokenizer) + + ids = tokenizer.encode("hello world") + + assert isinstance(ids, list) + assert len(ids) > 0 + + +def test_spm_load_raises_if_model_missing(tmp_path): + """load() should raise FileNotFoundError if spm.model not found.""" + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + + with pytest.raises(FileNotFoundError, match="SentencePiece model not found"): + tokenizer.load(tmp_path) + + +# ------------------------------------------------------------------ +# Artifact path tests +# ------------------------------------------------------------------ + +def test_spm_get_vocab_path(tmp_path): + """get_vocab_path() should return path to spm.vocab.""" + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + vocab_path = tokenizer.get_vocab_path(tmp_path) + + assert vocab_path == tmp_path / "spm.vocab" + + +def test_spm_get_merges_path_returns_none(tmp_path): + """get_merges_path() should return None for SentencePiece.""" + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + merges_path = tokenizer.get_merges_path(tmp_path) + + assert merges_path is None + + +# ------------------------------------------------------------------ +# Special tokens tests +# ------------------------------------------------------------------ + +def test_spm_special_tokens_in_vocabulary(trained_tokenizer): + """Special tokens should be present in trained vocabulary.""" + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer.load(trained_tokenizer) + + vocab_path = trained_tokenizer / "spm.vocab" + vocab_content = vocab_path.read_text(encoding="utf-8") # fix: explicit UTF-8 + + assert "" in vocab_content + assert "" in vocab_content + assert "" in vocab_content + assert "" in vocab_content + + +# ------------------------------------------------------------------ +# Determinism tests +# ------------------------------------------------------------------ + +def test_spm_training_is_deterministic(tmp_path, sample_text_file): + """Training twice on same data should produce same vocab.""" + save_path_1 = tmp_path / "tokenizer_1" + save_path_2 = tmp_path / "tokenizer_2" + + tokenizer_1 = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer_1.train(sample_text_file, save_path_1) + + tokenizer_2 = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer_2.train(sample_text_file, save_path_2) + + vocab_1 = (save_path_1 / "spm.vocab").read_text(encoding="utf-8") # fix: explicit UTF-8 + vocab_2 = (save_path_2 / "spm.vocab").read_text(encoding="utf-8") # fix: explicit UTF-8 + + assert vocab_1 == vocab_2 \ No newline at end of file From 0b54e19998912d2c9daee4ec2ba86439a349b336 Mon Sep 17 00:00:00 2001 From: Arpit Sharma Date: Wed, 11 Mar 2026 11:37:13 +0530 Subject: [PATCH 2/7] fix: ruff lint and formatting for sentencepiece tokenizer --- .../tokenizer/sentencepiece_tokenizer.py | 12 ++++-------- openverifiablellm/verify.py | 16 +++++++++------- tests/test_sentencepiece.py | 16 +++++++++------- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/openverifiablellm/tokenizer/sentencepiece_tokenizer.py b/openverifiablellm/tokenizer/sentencepiece_tokenizer.py index b2a12ef..6fe522b 100644 --- a/openverifiablellm/tokenizer/sentencepiece_tokenizer.py +++ b/openverifiablellm/tokenizer/sentencepiece_tokenizer.py @@ -4,7 +4,6 @@ from .base import BaseTokenizer - SPM_MODEL_FILE = "spm.model" SPM_VOCAB_FILE = "spm.vocab" @@ -49,8 +48,7 @@ def train(self, text_file: Path, save_path: Path): if not text_file.is_file(): raise FileNotFoundError( - f"Training file not found at {text_file}. " - f"Please provide a valid text corpus file." + f"Training file not found at {text_file}. Please provide a valid text corpus file." ) save_path.mkdir(parents=True, exist_ok=True) @@ -162,8 +160,7 @@ def _load_model(self, tokenizer_dir: Path): if not model_path.is_file(): raise FileNotFoundError( - f"SentencePiece model not found at {model_path}. " - f"Please train the tokenizer first." + f"SentencePiece model not found at {model_path}. Please train the tokenizer first." ) self._model = spm.SentencePieceProcessor() @@ -179,6 +176,5 @@ def _check_loaded(self): if self._model is None: raise RuntimeError( - "SentencePiece model is not loaded. " - "Call train() or load() before encode/decode." - ) \ No newline at end of file + "SentencePiece model is not loaded. Call train() or load() before encode/decode." + ) diff --git a/openverifiablellm/verify.py b/openverifiablellm/verify.py index ad7f789..cc0c558 100644 --- a/openverifiablellm/verify.py +++ b/openverifiablellm/verify.py @@ -386,15 +386,17 @@ def verify_preprocessing( "environment_hash", expected=manifest.get("environment_hash"), actual=current_env["environment_hash"], - detail="Environment fingerprint comparison" + detail="Environment fingerprint comparison", ) else: - report.add(CheckResult( - name="environment_hash", - status=CheckStatus.SKIP, - detail="Field absent from manifest (older version)" - )) - + report.add( + CheckResult( + name="environment_hash", + status=CheckStatus.SKIP, + detail="Field absent from manifest (older version)", + ) + ) + # 4. Re-run preprocessing in an isolated temp directory tmp_dir = Path(tempfile.mkdtemp(prefix="ovllm_verify_")) try: diff --git a/tests/test_sentencepiece.py b/tests/test_sentencepiece.py index 2280bbf..8aeb4eb 100644 --- a/tests/test_sentencepiece.py +++ b/tests/test_sentencepiece.py @@ -1,13 +1,12 @@ import pytest -from pathlib import Path from openverifiablellm.tokenizer.sentencepiece_tokenizer import SentencePieceTokenizer - # ------------------------------------------------------------------ # Fixtures # ------------------------------------------------------------------ + @pytest.fixture def sample_text_file(tmp_path): """Create a sample text file for training.""" @@ -36,6 +35,7 @@ def trained_tokenizer(tmp_path, sample_text_file): # Training tests # ------------------------------------------------------------------ + def test_spm_train_creates_artifacts(tmp_path, sample_text_file): """Training should produce spm.model and spm.vocab.""" tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) @@ -64,10 +64,7 @@ def test_spm_train_raises_file_not_found(tmp_path): tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) with pytest.raises(FileNotFoundError, match="Training file not found"): - tokenizer.train( - tmp_path / "nonexistent.txt", - tmp_path / "tokenizer" - ) + tokenizer.train(tmp_path / "nonexistent.txt", tmp_path / "tokenizer") def test_spm_train_raises_if_directory_passed(tmp_path, sample_text_file): @@ -82,6 +79,7 @@ def test_spm_train_raises_if_directory_passed(tmp_path, sample_text_file): # Encode / Decode tests # ------------------------------------------------------------------ + def test_spm_encode_returns_list_of_ints(trained_tokenizer): """encode() should return a list of integers.""" tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) @@ -125,6 +123,7 @@ def test_spm_decode_raises_if_not_loaded(): # Load tests # ------------------------------------------------------------------ + def test_spm_load_from_disk(trained_tokenizer): """load() should successfully restore tokenizer from disk.""" tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) @@ -156,6 +155,7 @@ def test_spm_load_raises_if_model_missing(tmp_path): # Artifact path tests # ------------------------------------------------------------------ + def test_spm_get_vocab_path(tmp_path): """get_vocab_path() should return path to spm.vocab.""" tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) @@ -176,6 +176,7 @@ def test_spm_get_merges_path_returns_none(tmp_path): # Special tokens tests # ------------------------------------------------------------------ + def test_spm_special_tokens_in_vocabulary(trained_tokenizer): """Special tokens should be present in trained vocabulary.""" tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) @@ -194,6 +195,7 @@ def test_spm_special_tokens_in_vocabulary(trained_tokenizer): # Determinism tests # ------------------------------------------------------------------ + def test_spm_training_is_deterministic(tmp_path, sample_text_file): """Training twice on same data should produce same vocab.""" save_path_1 = tmp_path / "tokenizer_1" @@ -208,4 +210,4 @@ def test_spm_training_is_deterministic(tmp_path, sample_text_file): vocab_1 = (save_path_1 / "spm.vocab").read_text(encoding="utf-8") # fix: explicit UTF-8 vocab_2 = (save_path_2 / "spm.vocab").read_text(encoding="utf-8") # fix: explicit UTF-8 - assert vocab_1 == vocab_2 \ No newline at end of file + assert vocab_1 == vocab_2 From 1bdeb40bbd1297e2da7d0fcb84adbffca96f1fca Mon Sep 17 00:00:00 2001 From: Arpit Sharma Date: Fri, 13 Mar 2026 17:52:27 +0530 Subject: [PATCH 3/7] trigger review From 323983c83a56194a701b21e1824e899f618232f2 Mon Sep 17 00:00:00 2001 From: Arpit Sharma Date: Fri, 13 Mar 2026 18:18:28 +0530 Subject: [PATCH 4/7] fix:changes acc to rabit --- .../tokenizer/sentencepiece_tokenizer.py | 106 +----------------- tests/test_sentencepiece.py | 67 ++--------- 2 files changed, 15 insertions(+), 158 deletions(-) diff --git a/openverifiablellm/tokenizer/sentencepiece_tokenizer.py b/openverifiablellm/tokenizer/sentencepiece_tokenizer.py index 6fe522b..d0a976b 100644 --- a/openverifiablellm/tokenizer/sentencepiece_tokenizer.py +++ b/openverifiablellm/tokenizer/sentencepiece_tokenizer.py @@ -9,40 +9,12 @@ class SentencePieceTokenizer(BaseTokenizer): - """ - SentencePiece tokenizer implementation. - - Supports training a BPE SentencePiece tokenizer, - encoding text to token ids, decoding token ids back to text, - and loading a previously trained model from disk. - - Reproducibility depends on: - - Stable input data - - Pinned sentencepiece library version - - Consistent execution environment - """ def __init__(self, vocab_size: int, min_frequency: int): super().__init__(vocab_size, min_frequency) self._model = None - # ------------------------------------------------------------------ - # Training - # ------------------------------------------------------------------ - def train(self, text_file: Path, save_path: Path): - """ - Train SentencePiece model on text corpus and save artifacts. - - Args: - text_file: Path to training text corpus. - save_path: Directory to save spm.model and spm.vocab. - - Raises: - FileNotFoundError: If text_file does not exist - or is not a file. - """ - text_file = Path(text_file) save_path = Path(save_path) @@ -59,6 +31,7 @@ def train(self, text_file: Path, save_path: Path): input=str(text_file), model_prefix=str(model_prefix), vocab_size=self.vocab_size, + min_count=self.min_frequency, pad_id=0, unk_id=1, bos_id=2, @@ -73,89 +46,25 @@ def train(self, text_file: Path, save_path: Path): self._load_model(save_path) - # ------------------------------------------------------------------ - # Encode / Decode - # ------------------------------------------------------------------ - def encode(self, text: str) -> list: - """ - Encode text into list of token ids. - - Args: - text: Input string to tokenize. - - Returns: - List of integer token ids. - - Raises: - RuntimeError: If tokenizer has not been trained or loaded. - """ - self._check_loaded() return self._model.encode(text, out_type=int) def decode(self, ids: list) -> str: - """ - Decode list of token ids back into text. - - Args: - ids: List of integer token ids. - - Returns: - Decoded string. - - Raises: - RuntimeError: If tokenizer has not been trained or loaded. - """ - self._check_loaded() return self._model.decode(ids) - # ------------------------------------------------------------------ - # Load - # ------------------------------------------------------------------ - def load(self, tokenizer_dir: Path): - """ - Load a previously trained SentencePiece model from disk. - - Args: - tokenizer_dir: Directory containing spm.model - - Raises: - FileNotFoundError: If spm.model is not found. - """ - tokenizer_dir = Path(tokenizer_dir) self._load_model(tokenizer_dir) - # ------------------------------------------------------------------ - # Artifact paths - # ------------------------------------------------------------------ - def get_vocab_path(self, tokenizer_dir: Path) -> Path: - """Return path to spm.vocab file.""" return Path(tokenizer_dir) / SPM_VOCAB_FILE - def get_merges_path(self, tokenizer_dir: Path): - """ - SentencePiece does not use a merges file. - Returns None for compatibility with BaseTokenizer interface. - """ - return None - - # ------------------------------------------------------------------ - # Internal helpers - # ------------------------------------------------------------------ + def get_merges_path(self, tokenizer_dir: Path) -> Path: + return Path(tokenizer_dir) / SPM_MODEL_FILE def _load_model(self, tokenizer_dir: Path): - """ - Internal helper to load spm.model from directory. - - Raises: - FileNotFoundError: If spm.model not found. - """ - model_path = Path(tokenizer_dir) / SPM_MODEL_FILE if not model_path.is_file(): @@ -167,14 +76,7 @@ def _load_model(self, tokenizer_dir: Path): self._model.load(str(model_path)) def _check_loaded(self): - """ - Check that model is loaded before encode/decode. - - Raises: - RuntimeError: If model has not been loaded or trained. - """ - if self._model is None: raise RuntimeError( "SentencePiece model is not loaded. Call train() or load() before encode/decode." - ) + ) \ No newline at end of file diff --git a/tests/test_sentencepiece.py b/tests/test_sentencepiece.py index 8aeb4eb..8d5cbd1 100644 --- a/tests/test_sentencepiece.py +++ b/tests/test_sentencepiece.py @@ -2,14 +2,9 @@ from openverifiablellm.tokenizer.sentencepiece_tokenizer import SentencePieceTokenizer -# ------------------------------------------------------------------ -# Fixtures -# ------------------------------------------------------------------ - @pytest.fixture def sample_text_file(tmp_path): - """Create a sample text file for training.""" text = ( "Wikipedia is a free online encyclopedia.\n" "It is written collaboratively by volunteers.\n" @@ -25,19 +20,12 @@ def sample_text_file(tmp_path): @pytest.fixture def trained_tokenizer(tmp_path, sample_text_file): - """Train and return path to trained SentencePieceTokenizer.""" tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) tokenizer.train(sample_text_file, tmp_path / "tokenizer") return tmp_path / "tokenizer" -# ------------------------------------------------------------------ -# Training tests -# ------------------------------------------------------------------ - - def test_spm_train_creates_artifacts(tmp_path, sample_text_file): - """Training should produce spm.model and spm.vocab.""" tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) save_path = tmp_path / "tokenizer" @@ -48,7 +36,6 @@ def test_spm_train_creates_artifacts(tmp_path, sample_text_file): def test_spm_train_creates_save_directory(tmp_path, sample_text_file): - """train() should create save_path directory if it does not exist.""" tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) save_path = tmp_path / "nested" / "tokenizer" / "dir" @@ -60,7 +47,6 @@ def test_spm_train_creates_save_directory(tmp_path, sample_text_file): def test_spm_train_raises_file_not_found(tmp_path): - """train() should raise FileNotFoundError for missing text file.""" tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) with pytest.raises(FileNotFoundError, match="Training file not found"): @@ -68,20 +54,13 @@ def test_spm_train_raises_file_not_found(tmp_path): def test_spm_train_raises_if_directory_passed(tmp_path, sample_text_file): - """train() should raise FileNotFoundError if directory passed as text_file.""" tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) with pytest.raises(FileNotFoundError, match="Training file not found"): tokenizer.train(tmp_path, tmp_path / "tokenizer") -# ------------------------------------------------------------------ -# Encode / Decode tests -# ------------------------------------------------------------------ - - def test_spm_encode_returns_list_of_ints(trained_tokenizer): - """encode() should return a list of integers.""" tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) tokenizer.load(trained_tokenizer) @@ -92,7 +71,6 @@ def test_spm_encode_returns_list_of_ints(trained_tokenizer): def test_spm_encode_decode_roundtrip(trained_tokenizer): - """encode then decode should return original text.""" tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) tokenizer.load(trained_tokenizer) @@ -104,7 +82,6 @@ def test_spm_encode_decode_roundtrip(trained_tokenizer): def test_spm_encode_raises_if_not_loaded(): - """encode() should raise RuntimeError if model not loaded.""" tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) with pytest.raises(RuntimeError, match="not loaded"): @@ -112,20 +89,13 @@ def test_spm_encode_raises_if_not_loaded(): def test_spm_decode_raises_if_not_loaded(): - """decode() should raise RuntimeError if model not loaded.""" tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) with pytest.raises(RuntimeError, match="not loaded"): tokenizer.decode([1, 2, 3]) -# ------------------------------------------------------------------ -# Load tests -# ------------------------------------------------------------------ - - def test_spm_load_from_disk(trained_tokenizer): - """load() should successfully restore tokenizer from disk.""" tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) tokenizer.load(trained_tokenizer) @@ -133,7 +103,6 @@ def test_spm_load_from_disk(trained_tokenizer): def test_spm_encode_works_after_load(trained_tokenizer): - """encode() should work correctly after load().""" tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) tokenizer.load(trained_tokenizer) @@ -144,46 +113,33 @@ def test_spm_encode_works_after_load(trained_tokenizer): def test_spm_load_raises_if_model_missing(tmp_path): - """load() should raise FileNotFoundError if spm.model not found.""" tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) with pytest.raises(FileNotFoundError, match="SentencePiece model not found"): tokenizer.load(tmp_path) -# ------------------------------------------------------------------ -# Artifact path tests -# ------------------------------------------------------------------ - - def test_spm_get_vocab_path(tmp_path): - """get_vocab_path() should return path to spm.vocab.""" tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) vocab_path = tokenizer.get_vocab_path(tmp_path) assert vocab_path == tmp_path / "spm.vocab" -def test_spm_get_merges_path_returns_none(tmp_path): - """get_merges_path() should return None for SentencePiece.""" +def test_spm_get_merges_path_returns_model_path(tmp_path): tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) merges_path = tokenizer.get_merges_path(tmp_path) - assert merges_path is None - - -# ------------------------------------------------------------------ -# Special tokens tests -# ------------------------------------------------------------------ + assert merges_path == tmp_path / "spm.model" + assert merges_path is not None def test_spm_special_tokens_in_vocabulary(trained_tokenizer): - """Special tokens should be present in trained vocabulary.""" tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) tokenizer.load(trained_tokenizer) vocab_path = trained_tokenizer / "spm.vocab" - vocab_content = vocab_path.read_text(encoding="utf-8") # fix: explicit UTF-8 + vocab_content = vocab_path.read_text(encoding="utf-8") assert "" in vocab_content assert "" in vocab_content @@ -191,13 +147,7 @@ def test_spm_special_tokens_in_vocabulary(trained_tokenizer): assert "" in vocab_content -# ------------------------------------------------------------------ -# Determinism tests -# ------------------------------------------------------------------ - - def test_spm_training_is_deterministic(tmp_path, sample_text_file): - """Training twice on same data should produce same vocab.""" save_path_1 = tmp_path / "tokenizer_1" save_path_2 = tmp_path / "tokenizer_2" @@ -207,7 +157,12 @@ def test_spm_training_is_deterministic(tmp_path, sample_text_file): tokenizer_2 = SentencePieceTokenizer(vocab_size=200, min_frequency=2) tokenizer_2.train(sample_text_file, save_path_2) - vocab_1 = (save_path_1 / "spm.vocab").read_text(encoding="utf-8") # fix: explicit UTF-8 - vocab_2 = (save_path_2 / "spm.vocab").read_text(encoding="utf-8") # fix: explicit UTF-8 + vocab_1 = (save_path_1 / "spm.vocab").read_text(encoding="utf-8") + vocab_2 = (save_path_2 / "spm.vocab").read_text(encoding="utf-8") assert vocab_1 == vocab_2 + + model_1 = (save_path_1 / "spm.model").read_bytes() + model_2 = (save_path_2 / "spm.model").read_bytes() + + assert model_1 == model_2 \ No newline at end of file From c73d04641423e809a05cd57067416786e1f236a9 Mon Sep 17 00:00:00 2001 From: Arpit Sharma Date: Fri, 13 Mar 2026 18:55:41 +0530 Subject: [PATCH 5/7] fix:complied with the changes by rabbit --- .../tokenizer/sentencepiece_tokenizer.py | 11 +++-- tests/test_sentencepiece.py | 48 +++++++++++-------- 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/openverifiablellm/tokenizer/sentencepiece_tokenizer.py b/openverifiablellm/tokenizer/sentencepiece_tokenizer.py index d0a976b..f7b5347 100644 --- a/openverifiablellm/tokenizer/sentencepiece_tokenizer.py +++ b/openverifiablellm/tokenizer/sentencepiece_tokenizer.py @@ -9,7 +9,6 @@ class SentencePieceTokenizer(BaseTokenizer): - def __init__(self, vocab_size: int, min_frequency: int): super().__init__(vocab_size, min_frequency) self._model = None @@ -23,6 +22,13 @@ def train(self, text_file: Path, save_path: Path): f"Training file not found at {text_file}. Please provide a valid text corpus file." ) + if self.min_frequency != 1: + raise NotImplementedError( + f"min_frequency={self.min_frequency} is not supported. " + "SentencePiece does not expose a confirmed min_count option via the Python wrapper. " + "Set min_frequency=1 to use the default behaviour, or confirm the upstream option before enabling filtering." + ) + save_path.mkdir(parents=True, exist_ok=True) model_prefix = save_path / "spm" @@ -31,7 +37,6 @@ def train(self, text_file: Path, save_path: Path): input=str(text_file), model_prefix=str(model_prefix), vocab_size=self.vocab_size, - min_count=self.min_frequency, pad_id=0, unk_id=1, bos_id=2, @@ -79,4 +84,4 @@ def _check_loaded(self): if self._model is None: raise RuntimeError( "SentencePiece model is not loaded. Call train() or load() before encode/decode." - ) \ No newline at end of file + ) diff --git a/tests/test_sentencepiece.py b/tests/test_sentencepiece.py index 8d5cbd1..c1dffee 100644 --- a/tests/test_sentencepiece.py +++ b/tests/test_sentencepiece.py @@ -20,13 +20,13 @@ def sample_text_file(tmp_path): @pytest.fixture def trained_tokenizer(tmp_path, sample_text_file): - tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1) tokenizer.train(sample_text_file, tmp_path / "tokenizer") return tmp_path / "tokenizer" def test_spm_train_creates_artifacts(tmp_path, sample_text_file): - tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1) save_path = tmp_path / "tokenizer" tokenizer.train(sample_text_file, save_path) @@ -36,7 +36,7 @@ def test_spm_train_creates_artifacts(tmp_path, sample_text_file): def test_spm_train_creates_save_directory(tmp_path, sample_text_file): - tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1) save_path = tmp_path / "nested" / "tokenizer" / "dir" assert not save_path.exists() @@ -47,21 +47,28 @@ def test_spm_train_creates_save_directory(tmp_path, sample_text_file): def test_spm_train_raises_file_not_found(tmp_path): - tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1) with pytest.raises(FileNotFoundError, match="Training file not found"): tokenizer.train(tmp_path / "nonexistent.txt", tmp_path / "tokenizer") def test_spm_train_raises_if_directory_passed(tmp_path, sample_text_file): - tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1) with pytest.raises(FileNotFoundError, match="Training file not found"): tokenizer.train(tmp_path, tmp_path / "tokenizer") -def test_spm_encode_returns_list_of_ints(trained_tokenizer): +def test_spm_train_raises_if_min_frequency_not_one(tmp_path, sample_text_file): tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + + with pytest.raises(NotImplementedError, match="min_frequency=2 is not supported"): + tokenizer.train(sample_text_file, tmp_path / "tokenizer") + + +def test_spm_encode_returns_list_of_ints(trained_tokenizer): + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1) tokenizer.load(trained_tokenizer) ids = tokenizer.encode("hello world") @@ -71,7 +78,7 @@ def test_spm_encode_returns_list_of_ints(trained_tokenizer): def test_spm_encode_decode_roundtrip(trained_tokenizer): - tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1) tokenizer.load(trained_tokenizer) text = "Wikipedia is a free online encyclopedia" @@ -82,28 +89,28 @@ def test_spm_encode_decode_roundtrip(trained_tokenizer): def test_spm_encode_raises_if_not_loaded(): - tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1) with pytest.raises(RuntimeError, match="not loaded"): tokenizer.encode("hello world") def test_spm_decode_raises_if_not_loaded(): - tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1) with pytest.raises(RuntimeError, match="not loaded"): tokenizer.decode([1, 2, 3]) def test_spm_load_from_disk(trained_tokenizer): - tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1) tokenizer.load(trained_tokenizer) assert tokenizer._model is not None def test_spm_encode_works_after_load(trained_tokenizer): - tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1) tokenizer.load(trained_tokenizer) ids = tokenizer.encode("hello world") @@ -113,29 +120,28 @@ def test_spm_encode_works_after_load(trained_tokenizer): def test_spm_load_raises_if_model_missing(tmp_path): - tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1) with pytest.raises(FileNotFoundError, match="SentencePiece model not found"): tokenizer.load(tmp_path) def test_spm_get_vocab_path(tmp_path): - tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1) vocab_path = tokenizer.get_vocab_path(tmp_path) assert vocab_path == tmp_path / "spm.vocab" -def test_spm_get_merges_path_returns_model_path(tmp_path): - tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) +def test_spm_get_merges_path_returns_none(tmp_path): + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1) merges_path = tokenizer.get_merges_path(tmp_path) - assert merges_path == tmp_path / "spm.model" - assert merges_path is not None + assert merges_path is None def test_spm_special_tokens_in_vocabulary(trained_tokenizer): - tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1) tokenizer.load(trained_tokenizer) vocab_path = trained_tokenizer / "spm.vocab" @@ -151,10 +157,10 @@ def test_spm_training_is_deterministic(tmp_path, sample_text_file): save_path_1 = tmp_path / "tokenizer_1" save_path_2 = tmp_path / "tokenizer_2" - tokenizer_1 = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer_1 = SentencePieceTokenizer(vocab_size=200, min_frequency=1) tokenizer_1.train(sample_text_file, save_path_1) - tokenizer_2 = SentencePieceTokenizer(vocab_size=200, min_frequency=2) + tokenizer_2 = SentencePieceTokenizer(vocab_size=200, min_frequency=1) tokenizer_2.train(sample_text_file, save_path_2) vocab_1 = (save_path_1 / "spm.vocab").read_text(encoding="utf-8") @@ -165,4 +171,4 @@ def test_spm_training_is_deterministic(tmp_path, sample_text_file): model_1 = (save_path_1 / "spm.model").read_bytes() model_2 = (save_path_2 / "spm.model").read_bytes() - assert model_1 == model_2 \ No newline at end of file + assert model_1 == model_2 From 9406c0b1125433b3b1d133aa040a696843543718 Mon Sep 17 00:00:00 2001 From: Arpit Sharma Date: Fri, 13 Mar 2026 19:06:15 +0530 Subject: [PATCH 6/7] fix:test --- tests/test_sentencepiece.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_sentencepiece.py b/tests/test_sentencepiece.py index c1dffee..281c7c4 100644 --- a/tests/test_sentencepiece.py +++ b/tests/test_sentencepiece.py @@ -133,11 +133,11 @@ def test_spm_get_vocab_path(tmp_path): assert vocab_path == tmp_path / "spm.vocab" -def test_spm_get_merges_path_returns_none(tmp_path): +def test_spm_get_merges_path_returns_model_path(tmp_path): tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1) merges_path = tokenizer.get_merges_path(tmp_path) - assert merges_path is None + assert merges_path == tmp_path / "spm.model" def test_spm_special_tokens_in_vocabulary(trained_tokenizer): @@ -171,4 +171,4 @@ def test_spm_training_is_deterministic(tmp_path, sample_text_file): model_1 = (save_path_1 / "spm.model").read_bytes() model_2 = (save_path_2 / "spm.model").read_bytes() - assert model_1 == model_2 + assert model_1 == model_2 \ No newline at end of file From becd3c164d2ba5ab862011f46318542f3382bf53 Mon Sep 17 00:00:00 2001 From: Arpit Sharma Date: Fri, 13 Mar 2026 19:07:36 +0530 Subject: [PATCH 7/7] fix:lint --- tests/test_sentencepiece.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_sentencepiece.py b/tests/test_sentencepiece.py index 281c7c4..4d782c9 100644 --- a/tests/test_sentencepiece.py +++ b/tests/test_sentencepiece.py @@ -171,4 +171,4 @@ def test_spm_training_is_deterministic(tmp_path, sample_text_file): model_1 = (save_path_1 / "spm.model").read_bytes() model_2 = (save_path_2 / "spm.model").read_bytes() - assert model_1 == model_2 \ No newline at end of file + assert model_1 == model_2