-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathquickstart.py
More file actions
111 lines (88 loc) · 4.07 KB
/
quickstart.py
File metadata and controls
111 lines (88 loc) · 4.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env python3
"""
RLM Evaluation - Quick Start Example

Run a minimal evaluation to verify everything works.

Usage:
    uv run python eval/quickstart.py
"""
import os
import sys
import time
from pathlib import Path

# Make the repository root importable so `eval.*` and `minrlm.*` resolve
# when this file is executed directly (it lives one level below the root).
sys.path.insert(0, str(Path(__file__).parent.parent))

# Model identifier passed to every runner; override via the MINRLM_MODEL
# environment variable.
MODEL = os.environ.get("MINRLM_MODEL", "gpt-5-mini")
def log(msg: str, *, indent: int = 0) -> None:
    """Write *msg* to stdout and flush immediately.

    Args:
        indent: Indentation level; each level prepends two spaces.
    """
    print("  " * indent + msg, flush=True)
def main() -> None:
    """Run a one-sample smoke test of the RLM evaluation pipeline.

    Probes Docker availability, loads a single `official_sniah` task
    sample, then runs each runner in `runners_to_test` against it,
    logging pass/fail status, elapsed time, and token usage. A failure
    in one runner is logged and skipped so the remaining runners still
    execute.
    """
    log("=" * 60)
    log("RLM Quick Evaluation")
    log("=" * 60)
    log(f"Model: {MODEL}")
    # ── Docker check ────────────────────────────────────────────
    log("\nChecking Docker...", indent=0)
    # Imported lazily so the banner appears before any heavy project imports.
    from minrlm.docker_repl import check_docker_available
    docker_available = check_docker_available()
    log(f"Docker: {'✓ Enabled (secure sandboxed execution)' if docker_available else '✗ Disabled (local execution)'}")
    # ── Load task ────────────────────────────────────────────────
    log("\nLoading task: official_sniah (this may download the dataset)...", indent=0)
    t0 = time.time()
    from eval.runners import get_runner, list_runners
    from eval.tasks import get_task
    # max_samples=1 keeps dataset download/preparation minimal for a smoke test.
    task = get_task("official_sniah", max_samples=1)
    log(f"  Dataset loaded in {time.time() - t0:.1f}s")
    log("Generating task instance...", indent=0)
    # Fixed seed makes the quickstart reproducible run-to-run.
    instance = task.generate(seed=42)
    log(f"  Task: {instance.task[:80]!r}")
    log(f"  Context: {len(instance.context):,} characters")
    log(f"  Expected: {instance.expected!r}")
    # ── Available runners ────────────────────────────────────────
    available = list_runners()
    log(f"\nRegistered runners: {', '.join(available)}")
    # Only a baseline and the reasoning runner — a quick sanity pass, not a benchmark.
    runners_to_test = ["vanilla", "minrlm-reasoning"]
    log(f"Testing: {', '.join(runners_to_test)}")
    log("\n" + "-" * 60)
    # ── Run each runner ──────────────────────────────────────────
    for runner_name in runners_to_test:
        log(f"\n[{runner_name}] Initializing...", indent=0)
        try:
            runner = get_runner(runner_name, MODEL)
        except Exception as e:
            # A broken runner must not abort the whole smoke test.
            log(f"[{runner_name}] INIT ERROR: {e}", indent=0)
            continue
        log(f"[{runner_name}] Calling model ({MODEL})...", indent=0)
        t_run = time.time()
        try:
            result = runner.run(instance.task, instance.context)
            elapsed = time.time() - t_run
        except Exception as e:
            log(f"[{runner_name}] RUN ERROR ({time.time() - t_run:.1f}s): {e}", indent=0)
            continue
        # Runner-reported (non-exception) failure path.
        if result.error:
            log(f"[{runner_name}] ERROR ({elapsed:.1f}s): {result.error}", indent=0)
            continue
        correct = task.check(result.response, instance.expected)
        status = "✓ PASS" if correct else "✗ FAIL"
        log(f"[{runner_name}] {status} ({elapsed:.1f}s)", indent=0)
        log(f"  Response: {result.response[:120]!r}", indent=0)
        log(
            f"  Tokens: {result.total_tokens:,} (in: {result.input_tokens:,}, out: {result.output_tokens:,})",
            indent=0,
        )
        log(f"  Iterations: {result.iterations}", indent=0)
    # ── Done ─────────────────────────────────────────────────────
    log("\n" + "=" * 60)
    log("Quick test complete!")
    # Print a copy-pasteable command for running the full benchmark suite.
    log("\nTo run a full benchmark:")
    log("  uv run python eval/run.py \\")
    log(f"    --model {MODEL} \\")
    log("    --tasks all \\")
    log("    --runners minrlm-reasoning \\")
    log("    --runs 50 \\")
    log("    --parallel 5 \\")
    log("    --task-parallel 5 \\")
    log("    --official-max-samples 500 \\")
    log("    --output-dir logs/my_eval")
    log("=" * 60)
# Import-safe entry point: run the smoke test only when executed as a script.
if __name__ == "__main__":
    main()