bench.py
#!/usr/bin/env python3
"""
Reproducible benchmark harness for the graphify-notes experiment.
Usage:
python3 bench.py PATH_TO_GRAPH_JSON
Prints a distribution summary (min/p25/median/p75/max/geomean) across 20
realistic engineering questions. Exits with the same benchmark_results.json
committed in this repo when run against the same graph.json.
Requires: pip install graphifyy
"""
from __future__ import annotations

import math
import statistics
import sys
from pathlib import Path
try:
    from graphify.benchmark import run_benchmark
except ImportError:
    sys.exit("error: graphifyy is not installed. run: pip install graphifyy")
QUESTIONS = [
    "how does authentication work",
    "what is the main entry point",
    "how are errors handled",
    "what connects the data layer to the api",
    "what are the core abstractions",
    "how is the audit chain constructed",
    "where are database queries defined",
    "how is tenant isolation enforced",
    "what validates incoming payloads",
    "how are LLM calls logged",
    "where is state mutation tracked",
    "which modules handle export",
    "how does the UI consume the api",
    "what is the background job scheduler",
    "how are secrets stored",
    "which tests cover the correlation engine",
    "how is the severity score computed",
    "where is the configuration loaded",
    "how does the incident workflow transition states",
    "what defines a playbook",
]

def stats_of(ratios: list[float]) -> dict:
    """Summarize a list of per-question reduction ratios."""
    return {
        "n": len(ratios),
        "min": round(min(ratios), 1),
        "p25": round(statistics.quantiles(ratios, n=4)[0], 1),
        "median": round(statistics.median(ratios), 1),
        "p75": round(statistics.quantiles(ratios, n=4)[2], 1),
        "max": round(max(ratios), 1),
        # Geometric mean: the appropriate central tendency for ratios.
        "geomean": round(math.exp(sum(math.log(r) for r in ratios) / len(ratios)), 1),
        "arith_mean": round(statistics.mean(ratios), 1),
    }
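
# Worked example of the headline choice (illustrative numbers, not from the
# benchmark): for ratios [10.0, 1000.0] the geometric mean is 100.0× while
# the arithmetic mean is 505.0×; a single outlier dominates the arithmetic
# mean, which is why the geomean is reported as the headline figure.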

def main(graph_path: str, corpus_words: int | None = None) -> int:
    if not Path(graph_path).exists():
        print(f"graph.json not found: {graph_path}", file=sys.stderr)
        return 1

    # A: graphify's default corpus heuristic; B: optional re-run against a
    # measured corpus word count, when one is supplied on the command line.
    a = run_benchmark(graph_path, questions=QUESTIONS)
    b = run_benchmark(graph_path, corpus_words=corpus_words, questions=QUESTIONS) if corpus_words else None

    def print_block(tag: str, r: dict, label: str) -> None:
        ratios = [p["reduction"] for p in r["per_question"]]
        s = stats_of(ratios)
        print(f"== {tag}: {label} ==")
        print(f"  corpus tokens (heuristic): {r['corpus_tokens']:,}")
        print(f"  ratios  min {s['min']:>6}× · p25 {s['p25']:>6}× · median {s['median']:>6}× · p75 {s['p75']:>6}× · max {s['max']:>6}×")
        print(f"  geomean {s['geomean']:>6}× (correct headline for ratios)")
        print(f"  arith   {s['arith_mean']:>6}× (inflated by outliers — do not lead with this)")

    print(f"graph: {a['nodes']} nodes, {a['edges']} edges")
    print(f"n matched: {len(a['per_question'])} of {len(QUESTIONS)} queries")
    print()
    print_block("A", a, "graphify default baseline (nodes × 50 words)")
    if b:
        print()
        print_block("B", b, f"measured corpus ({corpus_words:,} words)")
    return 0

if __name__ == "__main__":
    gpath = sys.argv[1] if len(sys.argv) > 1 else "graph.json"
    words = int(sys.argv[2]) if len(sys.argv) > 2 else None
    sys.exit(main(gpath, words))