From c096dbbd75f21c4d9633a5c7820de2d2c6410cc4 Mon Sep 17 00:00:00 2001
From: Arpit Sharma <officialarpit7@gmail.com>
Date: Wed, 11 Mar 2026 03:12:54 +0530
Subject: [PATCH 1/7] feature: complete SentencePiece tokenizer

---
 .gitignore                                    |   3 +-
 .../tokenizer/sentencepiece_tokenizer.py      | 163 +++++++++++++-
 tests/test_sentencepiece.py                   | 211 ++++++++++++++++++
 3 files changed, 373 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_sentencepiece.py
diff --git a/.gitignore b/.gitignore
index c55e460..e68b378 100644
--- a/.gitignore
+++ b/.gitignore
@@ -333,4 +333,5 @@ __pycache__/
 *.pyd
 *.bz2
 
-*.venv
\ No newline at end of file
+*.venv
+venv/
diff --git a/openverifiablellm/tokenizer/sentencepiece_tokenizer.py b/openverifiablellm/tokenizer/sentencepiece_tokenizer.py
index aa03893..b2a12ef 100644
--- a/openverifiablellm/tokenizer/sentencepiece_tokenizer.py
+++ b/openverifiablellm/tokenizer/sentencepiece_tokenizer.py
@@ -1,15 +1,59 @@
 from pathlib import Path
+
 import sentencepiece as spm
 
 from .base import BaseTokenizer
 
 
+SPM_MODEL_FILE = "spm.model"
+SPM_VOCAB_FILE = "spm.vocab"
+
+
 class SentencePieceTokenizer(BaseTokenizer):
     """
     SentencePiece tokenizer implementation.
+
+    Supports training a BPE SentencePiece tokenizer,
+    encoding text to token ids, decoding token ids back to text,
+    and loading a previously trained model from disk.
+
+    Reproducibility depends on:
+    - Stable input data
+    - Pinned sentencepiece library version
+    - Consistent execution environment
     """
 
+    def __init__(self, vocab_size: int, min_frequency: int):
+        super().__init__(vocab_size, min_frequency)
+        self._model = None
+
+    # ------------------------------------------------------------------
+    # Training
+    # ------------------------------------------------------------------
+
     def train(self, text_file: Path, save_path: Path):
+        """
+        Train SentencePiece model on text corpus and save artifacts.
+
+        Args:
+            text_file: Path to training text corpus.
+            save_path: Directory to save spm.model and spm.vocab.
+
+        Raises:
+            FileNotFoundError: If text_file does not exist
+                               or is not a file.
+        """
+
+        text_file = Path(text_file)
+        save_path = Path(save_path)
+
+        if not text_file.is_file():
+            raise FileNotFoundError(
+                f"Training file not found at {text_file}. "
+                f"Please provide a valid text corpus file."
+            )
+
+        save_path.mkdir(parents=True, exist_ok=True)
 
         model_prefix = save_path / "spm"
 
@@ -17,11 +61,124 @@ def train(self, text_file: Path, save_path: Path):
             input=str(text_file),
             model_prefix=str(model_prefix),
             vocab_size=self.vocab_size,
+            pad_id=0,
+            unk_id=1,
+            bos_id=2,
+            eos_id=3,
+            pad_piece="<pad>",
+            unk_piece="<unk>",
+            bos_piece="<s>",
+            eos_piece="</s>",
+            character_coverage=1.0,
+            model_type="bpe",
         )
 
-    def get_vocab_path(self, tokenizer_dir: Path):
-        return tokenizer_dir / "spm.vocab"
+        self._load_model(save_path)
+
+    # ------------------------------------------------------------------
+    # Encode / Decode
+    # ------------------------------------------------------------------
+
+    def encode(self, text: str) -> list:
+        """
+        Encode text into list of token ids.
+
+        Args:
+            text: Input string to tokenize.
+
+        Returns:
+            List of integer token ids.
+
+        Raises:
+            RuntimeError: If tokenizer has not been trained or loaded.
+        """
+
+        self._check_loaded()
+        return self._model.encode(text, out_type=int)
+
+    def decode(self, ids: list) -> str:
+        """
+        Decode list of token ids back into text.
+
+        Args:
+            ids: List of integer token ids.
+
+        Returns:
+            Decoded string.
+
+        Raises:
+            RuntimeError: If tokenizer has not been trained or loaded.
+        """
+
+        self._check_loaded()
+        return self._model.decode(ids)
+
+    # ------------------------------------------------------------------
+    # Load
+    # ------------------------------------------------------------------
+
+    def load(self, tokenizer_dir: Path):
+        """
+        Load a previously trained SentencePiece model from disk.
+
+        Args:
+            tokenizer_dir: Directory containing spm.model
+
+        Raises:
+            FileNotFoundError: If spm.model is not found.
+        """
+
+        tokenizer_dir = Path(tokenizer_dir)
+        self._load_model(tokenizer_dir)
+
+    # ------------------------------------------------------------------
+    # Artifact paths
+    # ------------------------------------------------------------------
+
+    def get_vocab_path(self, tokenizer_dir: Path) -> Path:
+        """Return path to spm.vocab file."""
+        return Path(tokenizer_dir) / SPM_VOCAB_FILE
 
     def get_merges_path(self, tokenizer_dir: Path):
-        # SentencePiece does not use merges
+        """
+        SentencePiece does not use a merges file.
+        Returns None for compatibility with BaseTokenizer interface.
+        """
         return None
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+
+    def _load_model(self, tokenizer_dir: Path):
+        """
+        Internal helper to load spm.model from directory.
+
+        Raises:
+            FileNotFoundError: If spm.model not found.
+        """
+
+        model_path = Path(tokenizer_dir) / SPM_MODEL_FILE
+
+        if not model_path.is_file():
+            raise FileNotFoundError(
+                f"SentencePiece model not found at {model_path}. "
+                f"Please train the tokenizer first."
+            )
+
+        self._model = spm.SentencePieceProcessor()
+        self._model.load(str(model_path))
+
+    def _check_loaded(self):
+        """
+        Check that model is loaded before encode/decode.
+
+        Raises:
+            RuntimeError: If model has not been loaded or trained.
+        """
+
+        if self._model is None:
+            raise RuntimeError(
+                "SentencePiece model is not loaded. "
+                "Call train() or load() before encode/decode."
+            )
\ No newline at end of file
diff --git a/tests/test_sentencepiece.py b/tests/test_sentencepiece.py
new file mode 100644
index 0000000..2280bbf
--- /dev/null
+++ b/tests/test_sentencepiece.py
@@ -0,0 +1,211 @@
+import pytest
+from pathlib import Path
+
+from openverifiablellm.tokenizer.sentencepiece_tokenizer import SentencePieceTokenizer
+
+
+# ------------------------------------------------------------------
+# Fixtures
+# ------------------------------------------------------------------
+
+@pytest.fixture
+def sample_text_file(tmp_path):
+    """Create a sample text file for training."""
+    text = (
+        "Wikipedia is a free online encyclopedia.\n"
+        "It is written collaboratively by volunteers.\n"
+        "Anyone can edit Wikipedia articles.\n"
+        "Wikipedia was launched on January 15 2001.\n"
+        "It is one of the most popular websites in the world.\n"
+    ) * 500
+
+    text_file = tmp_path / "sample.txt"
+    text_file.write_text(text)
+    return text_file
+
+
+@pytest.fixture
+def trained_tokenizer(tmp_path, sample_text_file):
+    """Train and return path to trained SentencePieceTokenizer."""
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer.train(sample_text_file, tmp_path / "tokenizer")
+    return tmp_path / "tokenizer"
+
+
+# ------------------------------------------------------------------
+# Training tests
+# ------------------------------------------------------------------
+
+def test_spm_train_creates_artifacts(tmp_path, sample_text_file):
+    """Training should produce spm.model and spm.vocab."""
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    save_path = tmp_path / "tokenizer"
+
+    tokenizer.train(sample_text_file, save_path)
+
+    assert (save_path / "spm.model").is_file()
+    assert (save_path / "spm.vocab").is_file()
+
+
+def test_spm_train_creates_save_directory(tmp_path, sample_text_file):
+    """train() should create save_path directory if it does not exist."""
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    save_path = tmp_path / "nested" / "tokenizer" / "dir"
+
+    assert not save_path.exists()
+
+    tokenizer.train(sample_text_file, save_path)
+
+    assert save_path.exists()
+
+
+def test_spm_train_raises_file_not_found(tmp_path):
+    """train() should raise FileNotFoundError for missing text file."""
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+
+    with pytest.raises(FileNotFoundError, match="Training file not found"):
+        tokenizer.train(
+            tmp_path / "nonexistent.txt",
+            tmp_path / "tokenizer"
+        )
+
+
+def test_spm_train_raises_if_directory_passed(tmp_path, sample_text_file):
+    """train() should raise FileNotFoundError if directory passed as text_file."""
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+
+    with pytest.raises(FileNotFoundError, match="Training file not found"):
+        tokenizer.train(tmp_path, tmp_path / "tokenizer")
+
+
+# ------------------------------------------------------------------
+# Encode / Decode tests
+# ------------------------------------------------------------------
+
+def test_spm_encode_returns_list_of_ints(trained_tokenizer):
+    """encode() should return a list of integers."""
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer.load(trained_tokenizer)
+
+    ids = tokenizer.encode("hello world")
+
+    assert isinstance(ids, list)
+    assert all(isinstance(i, int) for i in ids)
+
+
+def test_spm_encode_decode_roundtrip(trained_tokenizer):
+    """encode then decode should return original text."""
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer.load(trained_tokenizer)
+
+    text = "Wikipedia is a free online encyclopedia"
+    ids = tokenizer.encode(text)
+    decoded = tokenizer.decode(ids)
+
+    assert decoded.strip() == text.strip()
+
+
+def test_spm_encode_raises_if_not_loaded():
+    """encode() should raise RuntimeError if model not loaded."""
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+
+    with pytest.raises(RuntimeError, match="not loaded"):
+        tokenizer.encode("hello world")
+
+
+def test_spm_decode_raises_if_not_loaded():
+    """decode() should raise RuntimeError if model not loaded."""
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+
+    with pytest.raises(RuntimeError, match="not loaded"):
+        tokenizer.decode([1, 2, 3])
+
+
+# ------------------------------------------------------------------
+# Load tests
+# ------------------------------------------------------------------
+
+def test_spm_load_from_disk(trained_tokenizer):
+    """load() should successfully restore tokenizer from disk."""
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer.load(trained_tokenizer)
+
+    assert tokenizer._model is not None
+
+
+def test_spm_encode_works_after_load(trained_tokenizer):
+    """encode() should work correctly after load()."""
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer.load(trained_tokenizer)
+
+    ids = tokenizer.encode("hello world")
+
+    assert isinstance(ids, list)
+    assert len(ids) > 0
+
+
+def test_spm_load_raises_if_model_missing(tmp_path):
+    """load() should raise FileNotFoundError if spm.model not found."""
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+
+    with pytest.raises(FileNotFoundError, match="SentencePiece model not found"):
+        tokenizer.load(tmp_path)
+
+
+# ------------------------------------------------------------------
+# Artifact path tests
+# ------------------------------------------------------------------
+
+def test_spm_get_vocab_path(tmp_path):
+    """get_vocab_path() should return path to spm.vocab."""
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    vocab_path = tokenizer.get_vocab_path(tmp_path)
+
+    assert vocab_path == tmp_path / "spm.vocab"
+
+
+def test_spm_get_merges_path_returns_none(tmp_path):
+    """get_merges_path() should return None for SentencePiece."""
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    merges_path = tokenizer.get_merges_path(tmp_path)
+
+    assert merges_path is None
+
+
+# ------------------------------------------------------------------
+# Special tokens tests
+# ------------------------------------------------------------------
+
+def test_spm_special_tokens_in_vocabulary(trained_tokenizer):
+    """Special tokens should be present in trained vocabulary."""
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer.load(trained_tokenizer)
+
+    vocab_path = trained_tokenizer / "spm.vocab"
+    vocab_content = vocab_path.read_text(encoding="utf-8")  # fix: explicit UTF-8
+
+    assert "<pad>" in vocab_content
+    assert "<unk>" in vocab_content
+    assert "<s>" in vocab_content
+    assert "</s>" in vocab_content
+
+
+# ------------------------------------------------------------------
+# Determinism tests
+# ------------------------------------------------------------------
+
+def test_spm_training_is_deterministic(tmp_path, sample_text_file):
+    """Training twice on same data should produce same vocab."""
+    save_path_1 = tmp_path / "tokenizer_1"
+    save_path_2 = tmp_path / "tokenizer_2"
+
+    tokenizer_1 = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer_1.train(sample_text_file, save_path_1)
+
+    tokenizer_2 = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer_2.train(sample_text_file, save_path_2)
+
+    vocab_1 = (save_path_1 / "spm.vocab").read_text(encoding="utf-8")  # fix: explicit UTF-8
+    vocab_2 = (save_path_2 / "spm.vocab").read_text(encoding="utf-8")  # fix: explicit UTF-8
+
+    assert vocab_1 == vocab_2
\ No newline at end of file

From 0b54e19998912d2c9daee4ec2ba86439a349b336 Mon Sep 17 00:00:00 2001
From: Arpit Sharma <officialarpit7@gmail.com>
Date: Wed, 11 Mar 2026 11:37:13 +0530
Subject: [PATCH 2/7] fix: ruff lint and formatting for sentencepiece tokenizer

---
 .../tokenizer/sentencepiece_tokenizer.py         | 12 ++++--------
 openverifiablellm/verify.py                      | 16 +++++++++-------
 tests/test_sentencepiece.py                      | 16 +++++++++-------
 3 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/openverifiablellm/tokenizer/sentencepiece_tokenizer.py b/openverifiablellm/tokenizer/sentencepiece_tokenizer.py
index b2a12ef..6fe522b 100644
--- a/openverifiablellm/tokenizer/sentencepiece_tokenizer.py
+++ b/openverifiablellm/tokenizer/sentencepiece_tokenizer.py
@@ -4,7 +4,6 @@
 
 from .base import BaseTokenizer
 
-
 SPM_MODEL_FILE = "spm.model"
 SPM_VOCAB_FILE = "spm.vocab"
 
@@ -49,8 +48,7 @@ def train(self, text_file: Path, save_path: Path):
 
         if not text_file.is_file():
             raise FileNotFoundError(
-                f"Training file not found at {text_file}. "
-                f"Please provide a valid text corpus file."
+                f"Training file not found at {text_file}. Please provide a valid text corpus file."
             )
 
         save_path.mkdir(parents=True, exist_ok=True)
@@ -162,8 +160,7 @@ def _load_model(self, tokenizer_dir: Path):
 
         if not model_path.is_file():
             raise FileNotFoundError(
-                f"SentencePiece model not found at {model_path}. "
-                f"Please train the tokenizer first."
+                f"SentencePiece model not found at {model_path}. Please train the tokenizer first."
             )
 
         self._model = spm.SentencePieceProcessor()
@@ -179,6 +176,5 @@ def _check_loaded(self):
 
         if self._model is None:
             raise RuntimeError(
-                "SentencePiece model is not loaded. "
-                "Call train() or load() before encode/decode."
-            )
\ No newline at end of file
+                "SentencePiece model is not loaded. Call train() or load() before encode/decode."
+            )
diff --git a/openverifiablellm/verify.py b/openverifiablellm/verify.py
index ad7f789..cc0c558 100644
--- a/openverifiablellm/verify.py
+++ b/openverifiablellm/verify.py
@@ -386,15 +386,17 @@ def verify_preprocessing(
             "environment_hash",
             expected=manifest.get("environment_hash"),
             actual=current_env["environment_hash"],
-            detail="Environment fingerprint comparison"
+            detail="Environment fingerprint comparison",
         )
     else:
-        report.add(CheckResult(
-            name="environment_hash",
-            status=CheckStatus.SKIP,
-            detail="Field absent from manifest (older version)"
-        ))
-    
+        report.add(
+            CheckResult(
+                name="environment_hash",
+                status=CheckStatus.SKIP,
+                detail="Field absent from manifest (older version)",
+            )
+        )
+
     # 4. Re-run preprocessing in an isolated temp directory
     tmp_dir = Path(tempfile.mkdtemp(prefix="ovllm_verify_"))
     try:
diff --git a/tests/test_sentencepiece.py b/tests/test_sentencepiece.py
index 2280bbf..8aeb4eb 100644
--- a/tests/test_sentencepiece.py
+++ b/tests/test_sentencepiece.py
@@ -1,13 +1,12 @@
 import pytest
-from pathlib import Path
 
 from openverifiablellm.tokenizer.sentencepiece_tokenizer import SentencePieceTokenizer
 
-
 # ------------------------------------------------------------------
 # Fixtures
 # ------------------------------------------------------------------
 
+
 @pytest.fixture
 def sample_text_file(tmp_path):
     """Create a sample text file for training."""
@@ -36,6 +35,7 @@ def trained_tokenizer(tmp_path, sample_text_file):
 # Training tests
 # ------------------------------------------------------------------
 
+
 def test_spm_train_creates_artifacts(tmp_path, sample_text_file):
     """Training should produce spm.model and spm.vocab."""
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
@@ -64,10 +64,7 @@ def test_spm_train_raises_file_not_found(tmp_path):
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
 
     with pytest.raises(FileNotFoundError, match="Training file not found"):
-        tokenizer.train(
-            tmp_path / "nonexistent.txt",
-            tmp_path / "tokenizer"
-        )
+        tokenizer.train(tmp_path / "nonexistent.txt", tmp_path / "tokenizer")
 
 
 def test_spm_train_raises_if_directory_passed(tmp_path, sample_text_file):
@@ -82,6 +79,7 @@ def test_spm_train_raises_if_directory_passed(tmp_path, sample_text_file):
 # Encode / Decode tests
 # ------------------------------------------------------------------
 
+
 def test_spm_encode_returns_list_of_ints(trained_tokenizer):
     """encode() should return a list of integers."""
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
@@ -125,6 +123,7 @@ def test_spm_decode_raises_if_not_loaded():
 # Load tests
 # ------------------------------------------------------------------
 
+
 def test_spm_load_from_disk(trained_tokenizer):
     """load() should successfully restore tokenizer from disk."""
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
@@ -156,6 +155,7 @@ def test_spm_load_raises_if_model_missing(tmp_path):
 # Artifact path tests
 # ------------------------------------------------------------------
 
+
 def test_spm_get_vocab_path(tmp_path):
     """get_vocab_path() should return path to spm.vocab."""
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
@@ -176,6 +176,7 @@ def test_spm_get_merges_path_returns_none(tmp_path):
 # Special tokens tests
 # ------------------------------------------------------------------
 
+
 def test_spm_special_tokens_in_vocabulary(trained_tokenizer):
     """Special tokens should be present in trained vocabulary."""
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
@@ -194,6 +195,7 @@ def test_spm_special_tokens_in_vocabulary(trained_tokenizer):
 # Determinism tests
 # ------------------------------------------------------------------
 
+
 def test_spm_training_is_deterministic(tmp_path, sample_text_file):
     """Training twice on same data should produce same vocab."""
     save_path_1 = tmp_path / "tokenizer_1"
@@ -208,4 +210,4 @@ def test_spm_training_is_deterministic(tmp_path, sample_text_file):
     vocab_1 = (save_path_1 / "spm.vocab").read_text(encoding="utf-8")  # fix: explicit UTF-8
     vocab_2 = (save_path_2 / "spm.vocab").read_text(encoding="utf-8")  # fix: explicit UTF-8
 
-    assert vocab_1 == vocab_2
\ No newline at end of file
+    assert vocab_1 == vocab_2

From 1bdeb40bbd1297e2da7d0fcb84adbffca96f1fca Mon Sep 17 00:00:00 2001
From: Arpit Sharma <officialarpit7@gmail.com>
Date: Fri, 13 Mar 2026 17:52:27 +0530
Subject: [PATCH 3/7] trigger review


From 323983c83a56194a701b21e1824e899f618232f2 Mon Sep 17 00:00:00 2001
From: Arpit Sharma <officialarpit7@gmail.com>
Date: Fri, 13 Mar 2026 18:18:28 +0530
Subject: [PATCH 4/7] fix: changes according to rabbit

---
 .../tokenizer/sentencepiece_tokenizer.py      | 106 +-----------------
 tests/test_sentencepiece.py                   |  67 ++---------
 2 files changed, 15 insertions(+), 158 deletions(-)

diff --git a/openverifiablellm/tokenizer/sentencepiece_tokenizer.py b/openverifiablellm/tokenizer/sentencepiece_tokenizer.py
index 6fe522b..d0a976b 100644
--- a/openverifiablellm/tokenizer/sentencepiece_tokenizer.py
+++ b/openverifiablellm/tokenizer/sentencepiece_tokenizer.py
@@ -9,40 +9,12 @@
 
 
 class SentencePieceTokenizer(BaseTokenizer):
-    """
-    SentencePiece tokenizer implementation.
-
-    Supports training a BPE SentencePiece tokenizer,
-    encoding text to token ids, decoding token ids back to text,
-    and loading a previously trained model from disk.
-
-    Reproducibility depends on:
-    - Stable input data
-    - Pinned sentencepiece library version
-    - Consistent execution environment
-    """
 
     def __init__(self, vocab_size: int, min_frequency: int):
         super().__init__(vocab_size, min_frequency)
         self._model = None
 
-    # ------------------------------------------------------------------
-    # Training
-    # ------------------------------------------------------------------
-
     def train(self, text_file: Path, save_path: Path):
-        """
-        Train SentencePiece model on text corpus and save artifacts.
-
-        Args:
-            text_file: Path to training text corpus.
-            save_path: Directory to save spm.model and spm.vocab.
-
-        Raises:
-            FileNotFoundError: If text_file does not exist
-                               or is not a file.
-        """
-
         text_file = Path(text_file)
         save_path = Path(save_path)
 
@@ -59,6 +31,7 @@ def train(self, text_file: Path, save_path: Path):
             input=str(text_file),
             model_prefix=str(model_prefix),
             vocab_size=self.vocab_size,
+            min_count=self.min_frequency,
             pad_id=0,
             unk_id=1,
             bos_id=2,
@@ -73,89 +46,25 @@ def train(self, text_file: Path, save_path: Path):
 
         self._load_model(save_path)
 
-    # ------------------------------------------------------------------
-    # Encode / Decode
-    # ------------------------------------------------------------------
-
     def encode(self, text: str) -> list:
-        """
-        Encode text into list of token ids.
-
-        Args:
-            text: Input string to tokenize.
-
-        Returns:
-            List of integer token ids.
-
-        Raises:
-            RuntimeError: If tokenizer has not been trained or loaded.
-        """
-
         self._check_loaded()
         return self._model.encode(text, out_type=int)
 
     def decode(self, ids: list) -> str:
-        """
-        Decode list of token ids back into text.
-
-        Args:
-            ids: List of integer token ids.
-
-        Returns:
-            Decoded string.
-
-        Raises:
-            RuntimeError: If tokenizer has not been trained or loaded.
-        """
-
         self._check_loaded()
         return self._model.decode(ids)
 
-    # ------------------------------------------------------------------
-    # Load
-    # ------------------------------------------------------------------
-
     def load(self, tokenizer_dir: Path):
-        """
-        Load a previously trained SentencePiece model from disk.
-
-        Args:
-            tokenizer_dir: Directory containing spm.model
-
-        Raises:
-            FileNotFoundError: If spm.model is not found.
-        """
-
         tokenizer_dir = Path(tokenizer_dir)
         self._load_model(tokenizer_dir)
 
-    # ------------------------------------------------------------------
-    # Artifact paths
-    # ------------------------------------------------------------------
-
     def get_vocab_path(self, tokenizer_dir: Path) -> Path:
-        """Return path to spm.vocab file."""
         return Path(tokenizer_dir) / SPM_VOCAB_FILE
 
-    def get_merges_path(self, tokenizer_dir: Path):
-        """
-        SentencePiece does not use a merges file.
-        Returns None for compatibility with BaseTokenizer interface.
-        """
-        return None
-
-    # ------------------------------------------------------------------
-    # Internal helpers
-    # ------------------------------------------------------------------
+    def get_merges_path(self, tokenizer_dir: Path) -> Path:
+        return Path(tokenizer_dir) / SPM_MODEL_FILE
 
     def _load_model(self, tokenizer_dir: Path):
-        """
-        Internal helper to load spm.model from directory.
-
-        Raises:
-            FileNotFoundError: If spm.model not found.
-        """
-
         model_path = Path(tokenizer_dir) / SPM_MODEL_FILE
 
         if not model_path.is_file():
@@ -167,14 +76,7 @@ def _load_model(self, tokenizer_dir: Path):
         self._model.load(str(model_path))
 
     def _check_loaded(self):
-        """
-        Check that model is loaded before encode/decode.
-
-        Raises:
-            RuntimeError: If model has not been loaded or trained.
-        """
-
         if self._model is None:
             raise RuntimeError(
                 "SentencePiece model is not loaded. Call train() or load() before encode/decode."
-            )
+            )
\ No newline at end of file
diff --git a/tests/test_sentencepiece.py b/tests/test_sentencepiece.py
index 8aeb4eb..8d5cbd1 100644
--- a/tests/test_sentencepiece.py
+++ b/tests/test_sentencepiece.py
@@ -2,14 +2,9 @@
 
 from openverifiablellm.tokenizer.sentencepiece_tokenizer import SentencePieceTokenizer
 
-# ------------------------------------------------------------------
-# Fixtures
-# ------------------------------------------------------------------
-
 
 @pytest.fixture
 def sample_text_file(tmp_path):
-    """Create a sample text file for training."""
     text = (
         "Wikipedia is a free online encyclopedia.\n"
         "It is written collaboratively by volunteers.\n"
@@ -25,19 +20,12 @@ def sample_text_file(tmp_path):
 
 @pytest.fixture
 def trained_tokenizer(tmp_path, sample_text_file):
-    """Train and return path to trained SentencePieceTokenizer."""
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
     tokenizer.train(sample_text_file, tmp_path / "tokenizer")
     return tmp_path / "tokenizer"
 
 
-# ------------------------------------------------------------------
-# Training tests
-# ------------------------------------------------------------------
-
-
 def test_spm_train_creates_artifacts(tmp_path, sample_text_file):
-    """Training should produce spm.model and spm.vocab."""
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
     save_path = tmp_path / "tokenizer"
 
@@ -48,7 +36,6 @@ def test_spm_train_creates_artifacts(tmp_path, sample_text_file):
 
 
 def test_spm_train_creates_save_directory(tmp_path, sample_text_file):
-    """train() should create save_path directory if it does not exist."""
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
     save_path = tmp_path / "nested" / "tokenizer" / "dir"
 
@@ -60,7 +47,6 @@ def test_spm_train_creates_save_directory(tmp_path, sample_text_file):
 
 
 def test_spm_train_raises_file_not_found(tmp_path):
-    """train() should raise FileNotFoundError for missing text file."""
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
 
     with pytest.raises(FileNotFoundError, match="Training file not found"):
@@ -68,20 +54,13 @@ def test_spm_train_raises_file_not_found(tmp_path):
 
 
 def test_spm_train_raises_if_directory_passed(tmp_path, sample_text_file):
-    """train() should raise FileNotFoundError if directory passed as text_file."""
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
 
     with pytest.raises(FileNotFoundError, match="Training file not found"):
         tokenizer.train(tmp_path, tmp_path / "tokenizer")
 
 
-# ------------------------------------------------------------------
-# Encode / Decode tests
-# ------------------------------------------------------------------
-
-
 def test_spm_encode_returns_list_of_ints(trained_tokenizer):
-    """encode() should return a list of integers."""
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
     tokenizer.load(trained_tokenizer)
 
@@ -92,7 +71,6 @@ def test_spm_encode_returns_list_of_ints(trained_tokenizer):
 
 
 def test_spm_encode_decode_roundtrip(trained_tokenizer):
-    """encode then decode should return original text."""
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
     tokenizer.load(trained_tokenizer)
 
@@ -104,7 +82,6 @@ def test_spm_encode_decode_roundtrip(trained_tokenizer):
 
 
 def test_spm_encode_raises_if_not_loaded():
-    """encode() should raise RuntimeError if model not loaded."""
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
 
     with pytest.raises(RuntimeError, match="not loaded"):
@@ -112,20 +89,13 @@ def test_spm_encode_raises_if_not_loaded():
 
 
 def test_spm_decode_raises_if_not_loaded():
-    """decode() should raise RuntimeError if model not loaded."""
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
 
     with pytest.raises(RuntimeError, match="not loaded"):
         tokenizer.decode([1, 2, 3])
 
 
-# ------------------------------------------------------------------
-# Load tests
-# ------------------------------------------------------------------
-
-
 def test_spm_load_from_disk(trained_tokenizer):
-    """load() should successfully restore tokenizer from disk."""
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
     tokenizer.load(trained_tokenizer)
 
@@ -133,7 +103,6 @@ def test_spm_load_from_disk(trained_tokenizer):
 
 
 def test_spm_encode_works_after_load(trained_tokenizer):
-    """encode() should work correctly after load()."""
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
     tokenizer.load(trained_tokenizer)
 
@@ -144,46 +113,33 @@ def test_spm_encode_works_after_load(trained_tokenizer):
 
 
 def test_spm_load_raises_if_model_missing(tmp_path):
-    """load() should raise FileNotFoundError if spm.model not found."""
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
 
     with pytest.raises(FileNotFoundError, match="SentencePiece model not found"):
         tokenizer.load(tmp_path)
 
 
-# ------------------------------------------------------------------
-# Artifact path tests
-# ------------------------------------------------------------------
-
-
 def test_spm_get_vocab_path(tmp_path):
-    """get_vocab_path() should return path to spm.vocab."""
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
     vocab_path = tokenizer.get_vocab_path(tmp_path)
 
     assert vocab_path == tmp_path / "spm.vocab"
 
 
-def test_spm_get_merges_path_returns_none(tmp_path):
-    """get_merges_path() should return None for SentencePiece."""
+def test_spm_get_merges_path_returns_model_path(tmp_path):
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
     merges_path = tokenizer.get_merges_path(tmp_path)
 
-    assert merges_path is None
-
-
-# ------------------------------------------------------------------
-# Special tokens tests
-# ------------------------------------------------------------------
+    assert merges_path == tmp_path / "spm.model"
+    assert merges_path is not None
 
 
 def test_spm_special_tokens_in_vocabulary(trained_tokenizer):
-    """Special tokens should be present in trained vocabulary."""
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
     tokenizer.load(trained_tokenizer)
 
     vocab_path = trained_tokenizer / "spm.vocab"
-    vocab_content = vocab_path.read_text(encoding="utf-8")  # fix: explicit UTF-8
+    vocab_content = vocab_path.read_text(encoding="utf-8")
 
     assert "<pad>" in vocab_content
     assert "<unk>" in vocab_content
@@ -191,13 +147,7 @@ def test_spm_special_tokens_in_vocabulary(trained_tokenizer):
     assert "</s>" in vocab_content
 
 
-# ------------------------------------------------------------------
-# Determinism tests
-# ------------------------------------------------------------------
-
-
 def test_spm_training_is_deterministic(tmp_path, sample_text_file):
-    """Training twice on same data should produce same vocab."""
     save_path_1 = tmp_path / "tokenizer_1"
     save_path_2 = tmp_path / "tokenizer_2"
 
@@ -207,7 +157,12 @@ def test_spm_training_is_deterministic(tmp_path, sample_text_file):
     tokenizer_2 = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
     tokenizer_2.train(sample_text_file, save_path_2)
 
-    vocab_1 = (save_path_1 / "spm.vocab").read_text(encoding="utf-8")  # fix: explicit UTF-8
-    vocab_2 = (save_path_2 / "spm.vocab").read_text(encoding="utf-8")  # fix: explicit UTF-8
+    vocab_1 = (save_path_1 / "spm.vocab").read_text(encoding="utf-8")
+    vocab_2 = (save_path_2 / "spm.vocab").read_text(encoding="utf-8")
 
     assert vocab_1 == vocab_2
+
+    model_1 = (save_path_1 / "spm.model").read_bytes()
+    model_2 = (save_path_2 / "spm.model").read_bytes()
+
+    assert model_1 == model_2
\ No newline at end of file

From c73d04641423e809a05cd57067416786e1f236a9 Mon Sep 17 00:00:00 2001
From: Arpit Sharma <officialarpit7@gmail.com>
Date: Fri, 13 Mar 2026 18:55:41 +0530
Subject: [PATCH 5/7] fix:comply with the review changes requested by rabbit

---
 .../tokenizer/sentencepiece_tokenizer.py      | 11 +++--
 tests/test_sentencepiece.py                   | 48 +++++++++++--------
 2 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/openverifiablellm/tokenizer/sentencepiece_tokenizer.py b/openverifiablellm/tokenizer/sentencepiece_tokenizer.py
index d0a976b..f7b5347 100644
--- a/openverifiablellm/tokenizer/sentencepiece_tokenizer.py
+++ b/openverifiablellm/tokenizer/sentencepiece_tokenizer.py
@@ -9,7 +9,6 @@
 
 
 class SentencePieceTokenizer(BaseTokenizer):
-
     def __init__(self, vocab_size: int, min_frequency: int):
         super().__init__(vocab_size, min_frequency)
         self._model = None
@@ -23,6 +22,13 @@ def train(self, text_file: Path, save_path: Path):
                 f"Training file not found at {text_file}. Please provide a valid text corpus file."
             )
 
+        if self.min_frequency != 1:
+            raise NotImplementedError(
+                f"min_frequency={self.min_frequency} is not supported. "
+                "SentencePiece does not expose a confirmed min_count option via the Python wrapper. "
+                "Set min_frequency=1 to use the default behaviour, or confirm the upstream option before enabling filtering."
+            )
+
         save_path.mkdir(parents=True, exist_ok=True)
 
         model_prefix = save_path / "spm"
@@ -31,7 +37,6 @@ def train(self, text_file: Path, save_path: Path):
             input=str(text_file),
             model_prefix=str(model_prefix),
             vocab_size=self.vocab_size,
-            min_count=self.min_frequency,
             pad_id=0,
             unk_id=1,
             bos_id=2,
@@ -79,4 +84,4 @@ def _check_loaded(self):
         if self._model is None:
             raise RuntimeError(
                 "SentencePiece model is not loaded. Call train() or load() before encode/decode."
-            )
\ No newline at end of file
+            )
diff --git a/tests/test_sentencepiece.py b/tests/test_sentencepiece.py
index 8d5cbd1..c1dffee 100644
--- a/tests/test_sentencepiece.py
+++ b/tests/test_sentencepiece.py
@@ -20,13 +20,13 @@ def sample_text_file(tmp_path):
 
 @pytest.fixture
 def trained_tokenizer(tmp_path, sample_text_file):
-    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1)
     tokenizer.train(sample_text_file, tmp_path / "tokenizer")
     return tmp_path / "tokenizer"
 
 
 def test_spm_train_creates_artifacts(tmp_path, sample_text_file):
-    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1)
     save_path = tmp_path / "tokenizer"
 
     tokenizer.train(sample_text_file, save_path)
@@ -36,7 +36,7 @@ def test_spm_train_creates_artifacts(tmp_path, sample_text_file):
 
 
 def test_spm_train_creates_save_directory(tmp_path, sample_text_file):
-    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1)
     save_path = tmp_path / "nested" / "tokenizer" / "dir"
 
     assert not save_path.exists()
@@ -47,21 +47,28 @@ def test_spm_train_creates_save_directory(tmp_path, sample_text_file):
 
 
 def test_spm_train_raises_file_not_found(tmp_path):
-    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1)
 
     with pytest.raises(FileNotFoundError, match="Training file not found"):
         tokenizer.train(tmp_path / "nonexistent.txt", tmp_path / "tokenizer")
 
 
 def test_spm_train_raises_if_directory_passed(tmp_path, sample_text_file):
-    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1)
 
     with pytest.raises(FileNotFoundError, match="Training file not found"):
         tokenizer.train(tmp_path, tmp_path / "tokenizer")
 
 
-def test_spm_encode_returns_list_of_ints(trained_tokenizer):
+def test_spm_train_raises_if_min_frequency_not_one(tmp_path, sample_text_file):
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+
+    with pytest.raises(NotImplementedError, match="min_frequency=2 is not supported"):
+        tokenizer.train(sample_text_file, tmp_path / "tokenizer")
+
+
+def test_spm_encode_returns_list_of_ints(trained_tokenizer):
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1)
     tokenizer.load(trained_tokenizer)
 
     ids = tokenizer.encode("hello world")
@@ -71,7 +78,7 @@ def test_spm_encode_returns_list_of_ints(trained_tokenizer):
 
 
 def test_spm_encode_decode_roundtrip(trained_tokenizer):
-    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1)
     tokenizer.load(trained_tokenizer)
 
     text = "Wikipedia is a free online encyclopedia"
@@ -82,28 +89,28 @@ def test_spm_encode_decode_roundtrip(trained_tokenizer):
 
 
 def test_spm_encode_raises_if_not_loaded():
-    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1)
 
     with pytest.raises(RuntimeError, match="not loaded"):
         tokenizer.encode("hello world")
 
 
 def test_spm_decode_raises_if_not_loaded():
-    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1)
 
     with pytest.raises(RuntimeError, match="not loaded"):
         tokenizer.decode([1, 2, 3])
 
 
 def test_spm_load_from_disk(trained_tokenizer):
-    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1)
     tokenizer.load(trained_tokenizer)
 
     assert tokenizer._model is not None
 
 
 def test_spm_encode_works_after_load(trained_tokenizer):
-    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1)
     tokenizer.load(trained_tokenizer)
 
     ids = tokenizer.encode("hello world")
@@ -113,29 +120,28 @@ def test_spm_encode_works_after_load(trained_tokenizer):
 
 
 def test_spm_load_raises_if_model_missing(tmp_path):
-    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1)
 
     with pytest.raises(FileNotFoundError, match="SentencePiece model not found"):
         tokenizer.load(tmp_path)
 
 
 def test_spm_get_vocab_path(tmp_path):
-    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1)
     vocab_path = tokenizer.get_vocab_path(tmp_path)
 
     assert vocab_path == tmp_path / "spm.vocab"
 
 
-def test_spm_get_merges_path_returns_model_path(tmp_path):
-    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+def test_spm_get_merges_path_returns_none(tmp_path):
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1)
     merges_path = tokenizer.get_merges_path(tmp_path)
 
-    assert merges_path == tmp_path / "spm.model"
-    assert merges_path is not None
+    assert merges_path is None
 
 
 def test_spm_special_tokens_in_vocabulary(trained_tokenizer):
-    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1)
     tokenizer.load(trained_tokenizer)
 
     vocab_path = trained_tokenizer / "spm.vocab"
@@ -151,10 +157,10 @@ def test_spm_training_is_deterministic(tmp_path, sample_text_file):
     save_path_1 = tmp_path / "tokenizer_1"
     save_path_2 = tmp_path / "tokenizer_2"
 
-    tokenizer_1 = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer_1 = SentencePieceTokenizer(vocab_size=200, min_frequency=1)
     tokenizer_1.train(sample_text_file, save_path_1)
 
-    tokenizer_2 = SentencePieceTokenizer(vocab_size=200, min_frequency=2)
+    tokenizer_2 = SentencePieceTokenizer(vocab_size=200, min_frequency=1)
     tokenizer_2.train(sample_text_file, save_path_2)
 
     vocab_1 = (save_path_1 / "spm.vocab").read_text(encoding="utf-8")
@@ -165,4 +171,4 @@ def test_spm_training_is_deterministic(tmp_path, sample_text_file):
     model_1 = (save_path_1 / "spm.model").read_bytes()
     model_2 = (save_path_2 / "spm.model").read_bytes()
 
-    assert model_1 == model_2
\ No newline at end of file
+    assert model_1 == model_2

From 9406c0b1125433b3b1d133aa040a696843543718 Mon Sep 17 00:00:00 2001
From: Arpit Sharma <officialarpit7@gmail.com>
Date: Fri, 13 Mar 2026 19:06:15 +0530
Subject: [PATCH 6/7] fix:test

---
 tests/test_sentencepiece.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_sentencepiece.py b/tests/test_sentencepiece.py
index c1dffee..281c7c4 100644
--- a/tests/test_sentencepiece.py
+++ b/tests/test_sentencepiece.py
@@ -133,11 +133,11 @@ def test_spm_get_vocab_path(tmp_path):
     assert vocab_path == tmp_path / "spm.vocab"
 
 
-def test_spm_get_merges_path_returns_none(tmp_path):
+def test_spm_get_merges_path_returns_model_path(tmp_path):
     tokenizer = SentencePieceTokenizer(vocab_size=200, min_frequency=1)
     merges_path = tokenizer.get_merges_path(tmp_path)
 
-    assert merges_path is None
+    assert merges_path == tmp_path / "spm.model"
 
 
 def test_spm_special_tokens_in_vocabulary(trained_tokenizer):
@@ -171,4 +171,4 @@ def test_spm_training_is_deterministic(tmp_path, sample_text_file):
     model_1 = (save_path_1 / "spm.model").read_bytes()
     model_2 = (save_path_2 / "spm.model").read_bytes()
 
-    assert model_1 == model_2
+    assert model_1 == model_2
\ No newline at end of file

From becd3c164d2ba5ab862011f46318542f3382bf53 Mon Sep 17 00:00:00 2001
From: Arpit Sharma <officialarpit7@gmail.com>
Date: Fri, 13 Mar 2026 19:07:36 +0530
Subject: [PATCH 7/7] fix:lint

---
 tests/test_sentencepiece.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_sentencepiece.py b/tests/test_sentencepiece.py
index 281c7c4..4d782c9 100644
--- a/tests/test_sentencepiece.py
+++ b/tests/test_sentencepiece.py
@@ -171,4 +171,4 @@ def test_spm_training_is_deterministic(tmp_path, sample_text_file):
     model_1 = (save_path_1 / "spm.model").read_bytes()
     model_2 = (save_path_2 / "spm.model").read_bytes()
 
-    assert model_1 == model_2
\ No newline at end of file
+    assert model_1 == model_2