# evaluate_results.py
import json
import random
import argparse
from multiprocessing import Pool
from tqdm import tqdm


def check_correct(answers, sample_size):
# Randomly sample a subset of answers and check if any of them has 'answer_bool' set to True
sampled_answers = random.sample(answers, sample_size)
return any(answer['answer_bool'] for answer in sampled_answers)
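
# Note: check_correct is a single Monte Carlo draw. For example, with
# answers = [{"answer_bool": False}, {"answer_bool": True}] and sample_size=1,
# it returns True with probability 1/2, so averaging over many simulations
# estimates the probability that a random sample of answers contains at least
# one verified proof (a pass@k-style success rate).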


def simulate_single(args):
    # Reseed from OS entropy: workers forked by multiprocessing would otherwise
    # inherit identical PRNG state and produce correlated simulations
    random.seed()
    data, sample_sizes = args
all_theorems = list(data.keys())
correct_counts = {size: 0 for size in sample_sizes}
applicable_counts = {size: 0 for size in sample_sizes}
for theorem in all_theorems:
answers = data[theorem]
num_answers = len(answers)
for size in sample_sizes:
# Skip sample sizes larger than the number of available answers
if size > num_answers:
continue
applicable_counts[size] += 1
if check_correct(answers, size):
correct_counts[size] += 1
# Calculate the success rate for each sample size
aggregate_rates = {}
for size in sample_sizes:
rate = correct_counts[size] / applicable_counts[size] if applicable_counts[size] > 0 else 0
aggregate_rates[str(size)] = rate
print(f"size,{correct_counts[size]}")
return aggregate_rates
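
# The input file is assumed to map each theorem name to a list of candidate
# answers, each carrying an 'answer_bool' verdict. This sketch of the schema is
# inferred from the code above; field names other than 'answer_bool' are
# illustrative:
#
# {
#     "theorem_1": [
#         {"proof": "...", "answer_bool": true},
#         {"proof": "...", "answer_bool": false}
#     ]
# }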
def monte_carlo_evaluate(
input_filepath,
output_filepath,
sample_sizes=None,
n_simulations=50,
n_processes=50
):
"""
Evaluate the verification results using Monte Carlo simulation.
Args:
input_filepath (str): Path to the verification results file
output_filepath (str): Path to save the evaluation results
sample_sizes (list, optional): List of sample sizes to evaluate. Defaults to None.
n_simulations (int, optional): Number of Monte Carlo simulations. Defaults to 50.
n_processes (int, optional): Number of processes for parallel computation. Defaults to 50.
"""
# Default sample sizes if not provided
if sample_sizes is None:
sample_sizes = sorted(list(range(1, 3200, 5)) + [32, 64, 128, 328, 648, 1024, 2048, 3200])
# Load input data file
with open(input_filepath, 'r', encoding='utf-8') as f:
data = json.load(f)
aggregate_results = {}
# Perform Monte Carlo simulation using multiprocessing
with Pool(processes=n_processes) as pool:
tasks = [(data, sample_sizes) for _ in range(n_simulations)]
        results = list(tqdm(pool.imap(simulate_single, tasks), total=n_simulations, desc="Monte Carlo simulations"))
# Aggregate results from each simulation run
for sim, result in enumerate(results, start=1):
aggregate_key = f"Aggregate_{sim}"
aggregate_results[aggregate_key] = result
# Save results to the output file
with open(output_filepath, 'w', encoding='utf-8') as f:
json.dump(aggregate_results, f, ensure_ascii=False, indent=4)
print(f"\nMonte Carlo simulation finished, results saved to {output_filepath}")
return aggregate_results
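
# A minimal programmatic usage sketch (file names and sizes below are
# placeholders, not part of this repository):
#
#     rates = monte_carlo_evaluate(
#         input_filepath="verified.json",
#         output_filepath="success_rates.json",
#         sample_sizes=[1, 8, 32],
#         n_simulations=10,
#         n_processes=4,
#     )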


def parse_args():
parser = argparse.ArgumentParser(description="Evaluate theorem proof verification results")
# File paths
parser.add_argument("--input_file", default="/workspace/ky_ding/math/verify/0411/verified_stp_3200.json",
help="Path to verification results file")
parser.add_argument("--output_file", default="/workspace/ky_ding/math/verify/0411/verified_stp_3200_success_rate_0414.json",
help="Path to evaluation results file")
# Evaluation parameters
parser.add_argument("--n_simulations", default=50, type=int,
help="Number of Monte Carlo simulations")
parser.add_argument("--n_processes", default=50, type=int,
help="Number of processes for Monte Carlo simulation")
parser.add_argument("--custom_sample_sizes", default=None, type=str,
help="Comma-separated list of custom sample sizes (e.g., '1,5,10,50,100')")
    return parser.parse_args()


def main():
args = parse_args()
# Parse custom sample sizes if provided
sample_sizes = None
if args.custom_sample_sizes:
sample_sizes = [int(size) for size in args.custom_sample_sizes.split(',')]
monte_carlo_evaluate(
input_filepath=args.input_file,
output_filepath=args.output_file,
sample_sizes=sample_sizes,
n_simulations=args.n_simulations,
n_processes=args.n_processes
    )


if __name__ == "__main__":
main()
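
# Example invocation (paths are placeholders):
#   python evaluate_results.py \
#       --input_file verified.json \
#       --output_file success_rates.json \
#       --n_simulations 50 --n_processes 8 \
#       --custom_sample_sizes '1,5,10,50,100'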