-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
46 lines (39 loc) · 1.42 KB
/
main.py
File metadata and controls
46 lines (39 loc) · 1.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# internal imports
from src.utils.logger import get_logger
from src.preprocessing.hf_tokenizer import train_fast_tokenizer
from src.preprocessing.loader import RepoLoader, HuggingFaceLoader, CorpusBlender
from src.utils.runner import train_tokenizer_remote
logger = get_logger("main", level="INFO")
def generate_corpus():
# Configuration
LEAN_OUTPUT = "data/raw/corpus_lean_raw.txt"
NL_OUTPUT = "data/raw/corpus_english_raw.txt"
FINAL_OUTPUT = "data/corpus/final_training_corpus.txt"
# Extract Lean (Code)
# Using the RepoLoader you wrote
lean_loader = RepoLoader(
repo_url="https://github.com/leanprover-community/mathlib4.git",
clone_dir="temp_mathlib_clone",
output_file=LEAN_OUTPUT,
delete_after=True
)
lean_loader.run()
# Extract Proof-Pile (English/Latex)
# Using 'hoskinson-center/proof-pile' (a standard large math corpus)
nl_loader = HuggingFaceLoader(
dataset_name="hoskinson-center/proof-pile",
split="train",
output_file=NL_OUTPUT,
max_samples=30_000 # Adjust this to match size of Lean corpus approx
)
nl_loader.run()
# Mix them (Bilingual Data)
blender = CorpusBlender(
file_a=LEAN_OUTPUT,
file_b=NL_OUTPUT,
output_file=FINAL_OUTPUT
)
blender.run()
logger.info("\nREADY FOR TOKENIZER TRAINING!")
if __name__ == "__main__":
train_fast_tokenizer()