Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,14 @@ For a simpler, direct demonstration of an LLM-based agent, the `VICTOR_AGI_LLM.p
```
You can enable text-to-speech output by adding the `--voice` argument if you have the `pyttsx3` package installed and configured.

### Train the Tokenizer Vocabulary

To build a vocabulary from your own text corpus, run the tokenizer trainer. Provide either a single text file or a directory of `.txt` files:

```bash
python -m victor_core.train_tokenizer --input path/to/corpus --output models/tokenizer_vocab.json
```

## Development

- The core AGI framework logic resides primarily within the `victor_core` package.
Expand Down
22 changes: 22 additions & 0 deletions victor_core/nlp/fractal_tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import re
import math
import json
import hashlib
import asyncio # For pulse publishing
from pathlib import Path

from victor_core.logger import VictorLoggerStub
# Assuming BrainFractalPulseExchange will be passed during instantiation
Expand Down Expand Up @@ -43,6 +45,26 @@ def train(self, text_corpus: list[str]):
if self.pulse:
asyncio.create_task(self.pulse.publish("tokenizer_train_complete", {"vocab_size": len(self.vocabulary)}))

def save_vocabulary(self, path: str | Path) -> None:
    """Persist the tokenizer's learned state to a JSON file.

    The file holds the word->id vocabulary, the per-word token counts,
    and the next free token id, which is everything `load_vocabulary`
    needs to fully restore this tokenizer. Parent directories are
    created on demand so callers can pass a not-yet-existing path.
    """
    vocab_path = Path(path)
    # Ensure the destination directory exists before writing.
    vocab_path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(
        {
            "vocabulary": self.vocabulary,
            "token_counts": self.token_counts,
            "next_token_id": self.next_token_id,
        },
        indent=2,
        sort_keys=True,  # deterministic output for clean diffs
    )
    vocab_path.write_text(payload, encoding="utf-8")
    logger.info(f"Saved vocabulary to {vocab_path}")

def load_vocabulary(self, path: str | Path) -> None:
    """Restore tokenizer state previously written by `save_vocabulary`.

    Args:
        path: JSON file with "vocabulary", "token_counts" and
            "next_token_id" keys; missing keys fall back to safe defaults.

    Raises:
        FileNotFoundError: if the file does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    vocab_path = Path(path)
    data = json.loads(vocab_path.read_text(encoding="utf-8"))
    # JSON object keys are strings; coerce ids/counts back to int.
    self.vocabulary = {word: int(token_id) for word, token_id in data.get("vocabulary", {}).items()}
    self.reverse_vocabulary = {token_id: word for word, token_id in self.vocabulary.items()}
    self.token_counts = {word: int(count) for word, count in data.get("token_counts", {}).items()}
    # Fallback must never reuse an existing id: len(vocabulary) can collide
    # when ids are non-contiguous, so derive it from the largest known id.
    default_next_id = max(self.vocabulary.values(), default=-1) + 1
    self.next_token_id = int(data.get("next_token_id", default_next_id))
    logger.info(f"Loaded vocabulary from {vocab_path}. Size: {len(self.vocabulary)}")


def tokenize(self, text: str) -> list[int]:
normalized_text = self._normalize_text(text)
Expand Down
56 changes: 56 additions & 0 deletions victor_core/train_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import argparse
from pathlib import Path

from victor_core.logger import VictorLoggerStub
from victor_core.nlp.fractal_tokenizer import FractalTokenKernel_v1_1_0


# Module-level logger tagged with this trainer's component name.
logger = VictorLoggerStub(component="VictorTokenizerTrainer")


def _load_corpus(path: Path) -> list[str]:
if path.is_dir():
files = sorted(p for p in path.rglob("*.txt") if p.is_file())
corpus = []
for file_path in files:
lines = file_path.read_text(encoding="utf-8", errors="ignore").splitlines()
corpus.extend(line for line in lines if line.strip())
if not corpus:
logger.warn(f"No non-empty lines found in directory: {path}")
return corpus

if path.is_file():
lines = path.read_text(encoding="utf-8", errors="ignore").splitlines()
return [line for line in lines if line.strip()]

raise FileNotFoundError(f"Training input path does not exist: {path}")


def main() -> None:
    """Command-line entry point: build a vocabulary from text data.

    Loads a corpus from --input (file or directory of .txt files),
    trains the tokenizer on it, and writes the vocabulary JSON to
    --output. Aborts with a warning when no training data is found.
    """
    parser = argparse.ArgumentParser(
        description="Train Victor tokenizer vocabulary from text data."
    )
    parser.add_argument(
        "--input", type=Path, required=True,
        help="Path to a text file or directory of .txt files to train on.",
    )
    parser.add_argument(
        "--output", type=Path, default=Path("models/tokenizer_vocab.json"),
        help="Path to save the trained vocabulary JSON.",
    )
    args = parser.parse_args()

    corpus = _load_corpus(args.input)
    # Guard: nothing to train on -> warn and bail out early.
    if not corpus:
        logger.warn("Training aborted: no training data found.")
        return

    trainer = FractalTokenKernel_v1_1_0()
    trainer.train(corpus)
    trainer.save_vocabulary(args.output)


# Allow running as a script or via `python -m victor_core.train_tokenizer`.
if __name__ == "__main__":
    main()