diff --git a/README.md b/README.md
index 0255d69..e9fb24d 100644
--- a/README.md
+++ b/README.md
@@ -72,6 +72,14 @@ For a simpler, direct demonstration of an LLM-based agent, the `VICTOR_AGI_LLM.p
 ```
 You can enable text-to-speech output by adding the `--voice` argument if you have the `pyttsx3` package installed and configured.
 
+### Train the Tokenizer Vocabulary
+
+To build a vocabulary from your own text corpus, run the tokenizer trainer. Provide either a single text file or a directory of `.txt` files:
+
+```bash
+python -m victor_core.train_tokenizer --input path/to/corpus --output models/tokenizer_vocab.json
+```
+
 ## Development
 
 - The core AGI framework logic resides primarily within the `victor_core` package.
diff --git a/victor_core/nlp/fractal_tokenizer.py b/victor_core/nlp/fractal_tokenizer.py
index 628ed0f..7ec13d2 100644
--- a/victor_core/nlp/fractal_tokenizer.py
+++ b/victor_core/nlp/fractal_tokenizer.py
@@ -1,7 +1,9 @@
 import re
 import math
+import json
 import hashlib
 import asyncio # For pulse publishing
+from pathlib import Path
 
 from victor_core.logger import VictorLoggerStub
 # Assuming BrainFractalPulseExchange will be passed during instantiation
@@ -43,6 +45,29 @@ def train(self, text_corpus: list[str]):
         if self.pulse:
             asyncio.create_task(self.pulse.publish("tokenizer_train_complete", {"vocab_size": len(self.vocabulary)}))
 
+    def save_vocabulary(self, path: str | Path) -> None:
+        """Persist vocabulary, token counts, and next token id as JSON at *path*."""
+        vocab_path = Path(path)
+        vocab_path.parent.mkdir(parents=True, exist_ok=True)
+        data = {
+            "vocabulary": self.vocabulary,
+            "token_counts": self.token_counts,
+            "next_token_id": self.next_token_id,
+        }
+        vocab_path.write_text(json.dumps(data, indent=2, sort_keys=True), encoding="utf-8")
+        logger.info(f"Saved vocabulary to {vocab_path}")
+
+    def load_vocabulary(self, path: str | Path) -> None:
+        """Restore tokenizer state from a JSON file written by save_vocabulary."""
+        vocab_path = Path(path)
+        data = json.loads(vocab_path.read_text(encoding="utf-8"))
+        self.vocabulary = {word: int(token_id) for word, token_id in data.get("vocabulary", {}).items()}
+        self.reverse_vocabulary = {token_id: word for word, token_id in self.vocabulary.items()}
+        self.token_counts = {word: int(count) for word, count in data.get("token_counts", {}).items()}
+        # Fallback uses max existing id + 1 (not len) so ids never collide when non-contiguous.
+        self.next_token_id = int(data.get("next_token_id", (max(self.vocabulary.values()) + 1) if self.vocabulary else 0))
+        logger.info(f"Loaded vocabulary from {vocab_path}. Size: {len(self.vocabulary)}")
+
     def tokenize(self, text: str) -> list[int]:
         normalized_text = self._normalize_text(text)
 
diff --git a/victor_core/train_tokenizer.py b/victor_core/train_tokenizer.py
new file mode 100644
index 0000000..fecb45a
--- /dev/null
+++ b/victor_core/train_tokenizer.py
@@ -0,0 +1,59 @@
+"""CLI tool: train the Victor tokenizer vocabulary from a text corpus."""
+import argparse
+from pathlib import Path
+
+from victor_core.logger import VictorLoggerStub
+from victor_core.nlp.fractal_tokenizer import FractalTokenKernel_v1_1_0
+
+
+logger = VictorLoggerStub(component="VictorTokenizerTrainer")
+
+
+def _load_corpus(path: Path) -> list[str]:
+    """Collect non-empty lines from a text file or a directory of .txt files."""
+    if path.is_dir():
+        files = sorted(p for p in path.rglob("*.txt") if p.is_file())
+        corpus = []
+        for file_path in files:
+            lines = file_path.read_text(encoding="utf-8", errors="ignore").splitlines()
+            corpus.extend(line for line in lines if line.strip())
+        if not corpus:
+            logger.warn(f"No non-empty lines found in directory: {path}")
+        return corpus
+
+    if path.is_file():
+        lines = path.read_text(encoding="utf-8", errors="ignore").splitlines()
+        return [line for line in lines if line.strip()]
+
+    raise FileNotFoundError(f"Training input path does not exist: {path}")
+
+
+def main() -> None:
+    """Parse CLI arguments, build the corpus, train the tokenizer, save the vocabulary."""
+    parser = argparse.ArgumentParser(description="Train Victor tokenizer vocabulary from text data.")
+    parser.add_argument(
+        "--input",
+        type=Path,
+        required=True,
+        help="Path to a text file or directory of .txt files to train on.",
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        default=Path("models/tokenizer_vocab.json"),
+        help="Path to save the trained vocabulary JSON.",
+    )
+    args = parser.parse_args()
+
+    corpus = _load_corpus(args.input)
+    if not corpus:
+        logger.warn("Training aborted: no training data found.")
+        return
+
+    tokenizer = FractalTokenKernel_v1_1_0()
+    tokenizer.train(corpus)
+    tokenizer.save_vocabulary(args.output)
+
+
+if __name__ == "__main__":
+    main()