Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
206 changes: 160 additions & 46 deletions benchmark/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,10 @@ def find_latest_benchmark_dir():
return latest_dir


def show_stats(dirnames, graphs, stats_languages=None):
def show_stats(dirnames, graphs, verbose, stats_languages=None):
raw_rows = []
for dirname in dirnames:
row = summarize_results(dirname, stats_languages)
row = summarize_results(dirname, verbose, stats_languages)
raw_rows.append(row)

# return
Expand Down Expand Up @@ -241,7 +241,7 @@ def main(
updated_dirnames.append(dirname)

if stats_only:
return show_stats(updated_dirnames, graphs, stats_languages)
return show_stats(updated_dirnames, graphs, verbose, stats_languages)

if diffs_only:
return show_diffs(updated_dirnames)
Expand Down Expand Up @@ -373,7 +373,7 @@ def get_exercise_dirs(base_dir, languages=None):
)

all_results.append(results)
summarize_results(dirname)
summarize_results(dirname, verbose)
if sleep:
time.sleep(sleep)
else:
Expand Down Expand Up @@ -402,7 +402,7 @@ def get_exercise_dirs(base_dir, languages=None):
print()
print()
print()
summarize_results(dirname)
summarize_results(dirname, verbose)

return 0

Expand Down Expand Up @@ -446,7 +446,7 @@ def show_diffs(dirnames):

def load_results(dirname, stats_languages=None):
dirname = Path(dirname)
all_results = []
lang_to_results = {}

if stats_languages:
languages = [lang.strip().lower() for lang in stats_languages.split(",")]
Expand All @@ -458,21 +458,23 @@ def load_results(dirname, stats_languages=None):
for fname in dirname.glob(pattern):
try:
results = json.loads(fname.read_text())
all_results.append(results)
# json / test / prac / exer / lang
lang = fname.parent.parent.parent.parent.name
lang_to_results.setdefault(lang, []).append(results)
except json.JSONDecodeError:
print("json.JSONDecodeError", fname)
continue
return all_results
return lang_to_results


def summarize_results(dirname, stats_languages=None):
all_results = load_results(dirname, stats_languages)
def summarize_results(dirname, verbose, stats_languages=None):
lang_to_results = load_results(dirname, stats_languages)

res = SimpleNamespace()
res.total_tests = len(list(Path(dirname).glob("*/exercises/practice/*")))

try:
tries = max(len(results.get("tests_outcomes", [])) for results in all_results if results)
tries = max(len(results.get("tests_outcomes", [])) for results_list in lang_to_results.values() for results in results_list if results)
except ValueError:
tries = 0

Expand All @@ -499,42 +501,77 @@ def summarize_results(dirname, stats_languages=None):
res.thinking_tokens = None
variants = defaultdict(set)

for results in all_results:
if not results:
continue

res.completed_tests += 1
tests_outcomes = results.get("tests_outcomes", [])
passed = tests_outcomes and tests_outcomes[-1]
if passed:
for i in range(len(tests_outcomes) - 1, tries):
passed_tests[i] += 1

res.cost += results.get("cost", 0)
res.duration += results.get("duration", 0)
res.test_timeouts += results.get("test_timeouts", 0)

res.error_outputs += results.get("num_error_outputs", 0)
res.user_asks += results.get("num_user_asks", 0)
res.exhausted_context_windows += results.get("num_exhausted_context_windows", 0)
res.num_malformed_responses += results.get("num_malformed_responses", 0)
if results.get("num_malformed_responses"):
res.num_with_malformed_responses += 1
res.lazy_comments += results.get("lazy_comments", 0)

res.syntax_errors += results.get("syntax_errors", 0)
res.indentation_errors += results.get("indentation_errors", 0)

res.prompt_tokens += results.get("prompt_tokens", 0)
res.completion_tokens += results.get("completion_tokens", 0)

res.reasoning_effort = results.get("reasoning_effort")
res.thinking_tokens = results.get("thinking_tokens")
def add(attr_name, increment, global_stats, lang_stats):
    """Bump the counter `attr_name` by `increment` on both stats objects.

    The overall totals (`global_stats`) are updated first, then the
    per-language totals (`lang_stats`). Both objects must already carry the
    attribute; a missing attribute raises AttributeError.
    """
    for stats in (global_stats, lang_stats):
        setattr(stats, attr_name, getattr(stats, attr_name) + increment)

lang_to_stats = {}
lang_to_passed_tests = {}
for lang, results_list in lang_to_results.items():
lang_stats = SimpleNamespace()
lang_stats.completed_tests = 0
lang_stats.duration = 0
lang_stats.avg_duration_per_test = 0
lang_stats.cost = 0
for i in range(tries):
setattr(lang_stats, f"pass_rate_{i}", 0)
for i in range(tries):
setattr(lang_stats, f"pass_num_{i}", 0)
lang_stats.error_outputs = 0
lang_stats.user_asks = 0
lang_stats.test_timeouts = 0
lang_stats.exhausted_context_windows = 0
lang_stats.num_malformed_responses = 0
lang_stats.num_with_malformed_responses = 0
lang_stats.syntax_errors = 0
lang_stats.indentation_errors = 0
lang_stats.lazy_comments = 0
lang_stats.prompt_tokens = 0
lang_stats.completion_tokens = 0
lang_to_stats[lang] = lang_stats
lang_to_passed_tests[lang] = [0] * tries

for results in results_list:
if not results:
continue

for key in "model edit_format commit_hash editor_model editor_edit_format".split():
val = results.get(key)
if val:
variants[key].add(val)
add("completed_tests", 1, res, lang_stats)
tests_outcomes = results.get("tests_outcomes", [])
passed = tests_outcomes and tests_outcomes[-1]
if passed:
for i in range(len(tests_outcomes) - 1, tries):
passed_tests[i] += 1
lang_to_passed_tests[lang][i] += 1

add("cost", results.get("cost", 0), res, lang_stats)
add("duration", results.get("duration", 0), res, lang_stats)
add("test_timeouts", results.get("test_timeouts", 0), res, lang_stats)

add("error_outputs", results.get("num_error_outputs", 0), res, lang_stats)
add("user_asks", results.get("num_user_asks", 0), res, lang_stats)
add("exhausted_context_windows", results.get("num_exhausted_context_windows", 0), res, lang_stats)
add("num_malformed_responses", results.get("num_malformed_responses", 0), res, lang_stats)
if results.get("num_malformed_responses"):
add("num_with_malformed_responses", 1, res, lang_stats)
add("lazy_comments", results.get("lazy_comments", 0), res, lang_stats)

add("syntax_errors", results.get("syntax_errors", 0), res, lang_stats)
add("indentation_errors", results.get("indentation_errors", 0), res, lang_stats)

add("prompt_tokens", results.get("prompt_tokens", 0), res, lang_stats)
add("completion_tokens", results.get("completion_tokens", 0), res, lang_stats)

res.reasoning_effort = results.get("reasoning_effort")
res.thinking_tokens = results.get("thinking_tokens")
res.map_tokens = results.get("map_tokens")

for key in "model edit_format commit_hash editor_model editor_edit_format".split():
val = results.get(key)
if val:
variants[key].add(val)

if not res.completed_tests:
return
Expand Down Expand Up @@ -623,6 +660,83 @@ def show(stat, red="red"):
f" ${projected_cost:.2f} projected"
)

if verbose and len(lang_to_stats) > 0:
def format_lang_stats(lang_stats, lang=None):
    """Finalize one language's stats and turn every attribute into a display string.

    Reads `tries`, `lang_to_passed_tests` and (when `lang` is omitted)
    `lang_to_stats` from the enclosing scope.

    Args:
        lang_stats: the per-language namespace to finalize; mutated in place.
        lang: the language key that `lang_stats` belongs to. When omitted, it
            is recovered by finding `lang_stats` in `lang_to_stats`.

    Raises:
        ValueError: if `lang` is omitted and `lang_stats` is not a value of
            `lang_to_stats`.
    """
    if lang is None:
        # Do NOT fall back to a free variable named `lang`: by the time this
        # helper runs, the aggregation loop has finished and such a variable
        # would name the *last* language for every call, so every column
        # would show the last language's pass counts.
        lang = next(
            (name for name, stats in lang_to_stats.items() if stats is lang_stats),
            None,
        )
        if lang is None:
            raise ValueError("lang_stats does not belong to any known language")

    # First, postprocess attributes for easier printing
    if lang_stats.completed_tests > 0:
        completed = float(lang_stats.completed_tests)
        lang_stats.avg_duration_per_test = lang_stats.duration / completed
        passed_per_try = lang_to_passed_tests[lang]
        for i in range(tries):
            num_passed = passed_per_try[i]
            setattr(lang_stats, f"pass_num_{i}", num_passed)
            setattr(lang_stats, f"pass_rate_{i}", 100 * num_passed / completed)

    # Then format attributes into ready-to-print strings
    for attr, val in list(vars(lang_stats).items()):
        if val == 0:
            # Zero counters are shown as a dash to keep the table readable.
            text = "-"
        elif isinstance(val, float):
            text = f"{val:,.2f}"
        else:
            text = f"{val:,}"
        setattr(lang_stats, attr, text)

def compute_lang_to_col_widths(lang_to_stats):
    """Return a dict mapping each language to the width of its table column.

    A column must fit both the language name (its header) and the longest
    attribute value of that language's stats. Every attribute value must
    support `len()`, i.e. the stats are expected to be formatted to strings
    before this is called.
    """
    widths = {}
    for name, stats in lang_to_stats.items():
        widest_cell = max(len(cell) for cell in vars(stats).values())
        widths[name] = max(len(name), widest_cell)
    return widths

print()
print("======== Stats by language ========")
print()

[format_lang_stats(lang_stats) for lang_stats in lang_to_stats.values()]
lang_to_col_widths = compute_lang_to_col_widths(lang_to_stats)

any_stats = list(lang_to_stats.values())[0]
attrs = list(any_stats.__dict__)
attr_col_width = len(max(["language"] + attrs, key=len))
langs = list(lang_to_stats.keys())

print("| " + ("-" * attr_col_width), end="")
for lang in langs:
col_width = lang_to_col_widths[lang]
print(" | " + ("-" * col_width), end="")
print(" |")

print(f"| {' '.center(attr_col_width)}", end="")
for lang in langs:
col_width = lang_to_col_widths[lang]
print(f" | {lang.center(col_width)}", end="")
print(" |")

print("| " + ("-" * attr_col_width), end="")
for lang in langs:
col_width = lang_to_col_widths[lang]
print(" | " + ("-" * col_width), end="")
print(" |")

for attr in attrs:
print(f"| {attr:<{attr_col_width}}", end="")
for lang in langs:
lang_stats = lang_to_stats[lang]
col_width = lang_to_col_widths[lang]
print(f" | {getattr(lang_stats, attr):>{col_width}}", end="")
print(" |")

print("| " + ("-" * attr_col_width), end="")
for lang in langs:
col_width = lang_to_col_widths[lang]
print(" | " + ("-" * col_width), end="")
print(" |")
print()

console.rule()

# print(json.dumps(vars(res), indent=4, sort_keys=True))
Expand Down
Loading