Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,14 @@ For a simpler, direct demonstration of an LLM-based agent, the `VICTOR_AGI_LLM.p
```
You can enable text-to-speech output by adding the `--voice` argument if you have the `pyttsx3` package installed and configured.

### Train the Tokenizer Vocabulary

To build a vocabulary from your own text corpus, run the tokenizer trainer. Provide either a single text file or a directory of `.txt` files:

```bash
python -m victor_core.train_tokenizer --input path/to/corpus --output models/tokenizer_vocab.json
```

## Development

- The core AGI framework logic resides primarily within the `victor_core` package.
Expand Down
22 changes: 22 additions & 0 deletions victor_core/nlp/fractal_tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import re
import math
import json
import hashlib
import asyncio # For pulse publishing
from pathlib import Path

from victor_core.logger import VictorLoggerStub
# Assuming BrainFractalPulseExchange will be passed during instantiation
Expand Down Expand Up @@ -43,6 +45,26 @@ def train(self, text_corpus: list[str]):
if self.pulse:
asyncio.create_task(self.pulse.publish("tokenizer_train_complete", {"vocab_size": len(self.vocabulary)}))

def save_vocabulary(self, path: str | Path) -> None:
    """Persist the tokenizer's learned state to a JSON file.

    The file holds the word->id vocabulary, the per-word token counts,
    and the next free token id, which is everything `load_vocabulary`
    needs to fully restore this tokenizer. Parent directories are
    created on demand so callers can pass a not-yet-existing path.
    """
    vocab_path = Path(path)
    # Ensure the destination directory exists before writing.
    vocab_path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(
        {
            "vocabulary": self.vocabulary,
            "token_counts": self.token_counts,
            "next_token_id": self.next_token_id,
        },
        indent=2,
        sort_keys=True,  # deterministic output for clean diffs
    )
    vocab_path.write_text(payload, encoding="utf-8")
    logger.info(f"Saved vocabulary to {vocab_path}")

def load_vocabulary(self, path: str | Path) -> None:
    """Restore tokenizer state previously written by `save_vocabulary`.

    Args:
        path: JSON file with "vocabulary", "token_counts" and
            "next_token_id" keys; missing keys fall back to safe defaults.

    Raises:
        FileNotFoundError: if the file does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    vocab_path = Path(path)
    data = json.loads(vocab_path.read_text(encoding="utf-8"))
    # JSON object keys are strings; coerce ids/counts back to int.
    self.vocabulary = {word: int(token_id) for word, token_id in data.get("vocabulary", {}).items()}
    self.reverse_vocabulary = {token_id: word for word, token_id in self.vocabulary.items()}
    self.token_counts = {word: int(count) for word, count in data.get("token_counts", {}).items()}
    # Fallback must never reuse an existing id: len(vocabulary) can collide
    # when ids are non-contiguous, so derive it from the largest known id.
    default_next_id = max(self.vocabulary.values(), default=-1) + 1
    self.next_token_id = int(data.get("next_token_id", default_next_id))
    logger.info(f"Loaded vocabulary from {vocab_path}. Size: {len(self.vocabulary)}")


def tokenize(self, text: str) -> list[int]:
normalized_text = self._normalize_text(text)
Expand Down
56 changes: 56 additions & 0 deletions victor_core/train_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import argparse
from pathlib import Path

from victor_core.logger import VictorLoggerStub
from victor_core.nlp.fractal_tokenizer import FractalTokenKernel_v1_1_0


# Module-level logger tagged with this trainer's component name.
logger = VictorLoggerStub(component="VictorTokenizerTrainer")


def _load_corpus(path: Path) -> list[str]:
if path.is_dir():
files = sorted(p for p in path.rglob("*.txt") if p.is_file())
corpus = []
for file_path in files:
lines = file_path.read_text(encoding="utf-8", errors="ignore").splitlines()
corpus.extend(line for line in lines if line.strip())
if not corpus:
logger.warn(f"No non-empty lines found in directory: {path}")
return corpus

if path.is_file():
lines = path.read_text(encoding="utf-8", errors="ignore").splitlines()
return [line for line in lines if line.strip()]

raise FileNotFoundError(f"Training input path does not exist: {path}")


def main() -> None:
    """Command-line entry point: build a vocabulary from text data.

    Loads a corpus from --input (file or directory of .txt files),
    trains the tokenizer on it, and writes the vocabulary JSON to
    --output. Aborts with a warning when no training data is found.
    """
    parser = argparse.ArgumentParser(
        description="Train Victor tokenizer vocabulary from text data."
    )
    parser.add_argument(
        "--input", type=Path, required=True,
        help="Path to a text file or directory of .txt files to train on.",
    )
    parser.add_argument(
        "--output", type=Path, default=Path("models/tokenizer_vocab.json"),
        help="Path to save the trained vocabulary JSON.",
    )
    args = parser.parse_args()

    corpus = _load_corpus(args.input)
    # Guard: nothing to train on -> warn and bail out early.
    if not corpus:
        logger.warn("Training aborted: no training data found.")
        return

    trainer = FractalTokenKernel_v1_1_0()
    trainer.train(corpus)
    trainer.save_vocabulary(args.output)


# Allow running as a script or via `python -m victor_core.train_tokenizer`.
if __name__ == "__main__":
    main()