math-reasoning/main.py at main · raphael-ph/math-reasoning · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# internal imports
from src.utils.logger import get_logger
from src.preprocessing.hf_tokenizer import train_fast_tokenizer
from src.preprocessing.loader import RepoLoader, HuggingFaceLoader, CorpusBlender
from src.utils.runner import train_tokenizer_remote

logger = get_logger("main", level="INFO")

def generate_corpus():
    # Configuration
    LEAN_OUTPUT = "data/raw/corpus_lean_raw.txt"
    NL_OUTPUT = "data/raw/corpus_english_raw.txt"
    FINAL_OUTPUT = "data/corpus/final_training_corpus.txt"

    # Extract Lean (Code)
    # Using the RepoLoader you wrote
    lean_loader = RepoLoader(
        repo_url="https://github.com/leanprover-community/mathlib4.git",
        clone_dir="temp_mathlib_clone",
        output_file=LEAN_OUTPUT,
        delete_after=True
    )
    lean_loader.run()

    # Extract Proof-Pile (English/Latex)
    # Using 'hoskinson-center/proof-pile' (a standard large math corpus)
    nl_loader = HuggingFaceLoader(
        dataset_name="hoskinson-center/proof-pile",
        split="train",
        output_file=NL_OUTPUT,
        max_samples=30_000 # Adjust this to match size of Lean corpus approx
    )
    nl_loader.run()

    # Mix them (Bilingual Data)
    blender = CorpusBlender(
        file_a=LEAN_OUTPUT,
        file_b=NL_OUTPUT,
        output_file=FINAL_OUTPUT
    )
    blender.run()

    logger.info("\nREADY FOR TOKENIZER TRAINING!")

if __name__ == "__main__":
    train_fast_tokenizer()