diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index cc694a9f70d..bfb09f3cb1b 100755 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -88,10 +88,10 @@ def find_latest_benchmark_dir(): return latest_dir -def show_stats(dirnames, graphs, stats_languages=None): +def show_stats(dirnames, graphs, verbose, stats_languages=None): raw_rows = [] for dirname in dirnames: - row = summarize_results(dirname, stats_languages) + row = summarize_results(dirname, verbose, stats_languages) raw_rows.append(row) # return @@ -241,7 +241,7 @@ def main( updated_dirnames.append(dirname) if stats_only: - return show_stats(updated_dirnames, graphs, stats_languages) + return show_stats(updated_dirnames, graphs, verbose, stats_languages) if diffs_only: return show_diffs(updated_dirnames) @@ -373,7 +373,7 @@ def get_exercise_dirs(base_dir, languages=None): ) all_results.append(results) - summarize_results(dirname) + summarize_results(dirname, verbose) if sleep: time.sleep(sleep) else: @@ -402,7 +402,7 @@ def get_exercise_dirs(base_dir, languages=None): print() print() print() - summarize_results(dirname) + summarize_results(dirname, verbose) return 0 @@ -446,7 +446,7 @@ def show_diffs(dirnames): def load_results(dirname, stats_languages=None): dirname = Path(dirname) - all_results = [] + lang_to_results = {} if stats_languages: languages = [lang.strip().lower() for lang in stats_languages.split(",")] @@ -458,21 +458,23 @@ def load_results(dirname, stats_languages=None): for fname in dirname.glob(pattern): try: results = json.loads(fname.read_text()) - all_results.append(results) + # json / test / prac / exer / lang + lang = fname.parent.parent.parent.parent.name + lang_to_results.setdefault(lang, []).append(results) except json.JSONDecodeError: print("json.JSONDecodeError", fname) continue - return all_results + return lang_to_results -def summarize_results(dirname, stats_languages=None): - all_results = load_results(dirname, stats_languages) +def 
summarize_results(dirname, verbose, stats_languages=None): + lang_to_results = load_results(dirname, stats_languages) res = SimpleNamespace() res.total_tests = len(list(Path(dirname).glob("*/exercises/practice/*"))) try: - tries = max(len(results.get("tests_outcomes", [])) for results in all_results if results) + tries = max(len(results.get("tests_outcomes", [])) for results_list in lang_to_results.values() for results in results_list if results) except ValueError: tries = 0 @@ -499,42 +501,77 @@ def summarize_results(dirname, stats_languages=None): res.thinking_tokens = None variants = defaultdict(set) - for results in all_results: - if not results: - continue - - res.completed_tests += 1 - tests_outcomes = results.get("tests_outcomes", []) - passed = tests_outcomes and tests_outcomes[-1] - if passed: - for i in range(len(tests_outcomes) - 1, tries): - passed_tests[i] += 1 - - res.cost += results.get("cost", 0) - res.duration += results.get("duration", 0) - res.test_timeouts += results.get("test_timeouts", 0) - - res.error_outputs += results.get("num_error_outputs", 0) - res.user_asks += results.get("num_user_asks", 0) - res.exhausted_context_windows += results.get("num_exhausted_context_windows", 0) - res.num_malformed_responses += results.get("num_malformed_responses", 0) - if results.get("num_malformed_responses"): - res.num_with_malformed_responses += 1 - res.lazy_comments += results.get("lazy_comments", 0) - - res.syntax_errors += results.get("syntax_errors", 0) - res.indentation_errors += results.get("indentation_errors", 0) - - res.prompt_tokens += results.get("prompt_tokens", 0) - res.completion_tokens += results.get("completion_tokens", 0) - - res.reasoning_effort = results.get("reasoning_effort") - res.thinking_tokens = results.get("thinking_tokens") + def add(attr_name, increment, global_stats, lang_stats): + global_prev = getattr(global_stats, attr_name) + setattr(global_stats, attr_name, global_prev + increment) + + lang_prev = getattr(lang_stats, 
attr_name) + setattr(lang_stats, attr_name, lang_prev + increment) + + lang_to_stats = {} + lang_to_passed_tests = {} + for lang, results_list in lang_to_results.items(): + lang_stats = SimpleNamespace() + lang_stats.completed_tests = 0 + lang_stats.duration = 0 + lang_stats.avg_duration_per_test = 0 + lang_stats.cost = 0 + for i in range(tries): + setattr(lang_stats, f"pass_rate_{i}", 0) + for i in range(tries): + setattr(lang_stats, f"pass_num_{i}", 0) + lang_stats.error_outputs = 0 + lang_stats.user_asks = 0 + lang_stats.test_timeouts = 0 + lang_stats.exhausted_context_windows = 0 + lang_stats.num_malformed_responses = 0 + lang_stats.num_with_malformed_responses = 0 + lang_stats.syntax_errors = 0 + lang_stats.indentation_errors = 0 + lang_stats.lazy_comments = 0 + lang_stats.prompt_tokens = 0 + lang_stats.completion_tokens = 0 + lang_to_stats[lang] = lang_stats + lang_to_passed_tests[lang] = [0] * tries + + for results in results_list: + if not results: + continue - for key in "model edit_format commit_hash editor_model editor_edit_format".split(): - val = results.get(key) - if val: - variants[key].add(val) + add("completed_tests", 1, res, lang_stats) + tests_outcomes = results.get("tests_outcomes", []) + passed = tests_outcomes and tests_outcomes[-1] + if passed: + for i in range(len(tests_outcomes) - 1, tries): + passed_tests[i] += 1 + lang_to_passed_tests[lang][i] += 1 + + add("cost", results.get("cost", 0), res, lang_stats) + add("duration", results.get("duration", 0), res, lang_stats) + add("test_timeouts", results.get("test_timeouts", 0), res, lang_stats) + + add("error_outputs", results.get("num_error_outputs", 0), res, lang_stats) + add("user_asks", results.get("num_user_asks", 0), res, lang_stats) + add("exhausted_context_windows", results.get("num_exhausted_context_windows", 0), res, lang_stats) + add("num_malformed_responses", results.get("num_malformed_responses", 0), res, lang_stats) + if results.get("num_malformed_responses"): + 
add("num_with_malformed_responses", 1, res, lang_stats) + add("lazy_comments", results.get("lazy_comments", 0), res, lang_stats) + + add("syntax_errors", results.get("syntax_errors", 0), res, lang_stats) + add("indentation_errors", results.get("indentation_errors", 0), res, lang_stats) + + add("prompt_tokens", results.get("prompt_tokens", 0), res, lang_stats) + add("completion_tokens", results.get("completion_tokens", 0), res, lang_stats) + + res.reasoning_effort = results.get("reasoning_effort") + res.thinking_tokens = results.get("thinking_tokens") + res.map_tokens = results.get("map_tokens") + + for key in "model edit_format commit_hash editor_model editor_edit_format".split(): + val = results.get(key) + if val: + variants[key].add(val) if not res.completed_tests: return @@ -623,6 +660,83 @@ def show(stat, red="red"): f" ${projected_cost:.2f} projected" ) + if verbose and len(lang_to_stats) > 0: + def format_lang_stats(lang, lang_stats): + # First, postprocess attributes for easier printing + if lang_stats.completed_tests > 0: + lang_stats.avg_duration_per_test = lang_stats.duration / float(lang_stats.completed_tests) + for i in range(tries): + num_passed = lang_to_passed_tests[lang][i] + setattr(lang_stats, f"pass_num_{i}", num_passed) + pass_rate = 100 * num_passed / float(lang_stats.completed_tests) + setattr(lang_stats, f"pass_rate_{i}", pass_rate) + + # Then format attributes into ready-to-print strings + for attr in lang_stats.__dict__: + val = getattr(lang_stats, attr) + if val == 0: + val = "-" + elif isinstance(val, float): + val = f"{val:,.2f}" + else: + val = f"{val:,}" + + setattr(lang_stats, attr, val) + + def compute_lang_to_col_widths(lang_to_stats): + lang_to_col_widths = {} + for lang, lang_stats in lang_to_stats.items(): + lang_stat_attrs = [getattr(lang_stats, attr) for attr in lang_stats.__dict__] + lang_col_width = max(len(lang), len(max(lang_stat_attrs, key=len))) + lang_to_col_widths[lang] = lang_col_width + + return lang_to_col_widths + + 
print() + print("======== Stats by language ========") + print() + + [format_lang_stats(lang, lang_stats) for lang, lang_stats in lang_to_stats.items()] + lang_to_col_widths = compute_lang_to_col_widths(lang_to_stats) + + any_stats = list(lang_to_stats.values())[0] + attrs = list(any_stats.__dict__) + attr_col_width = len(max(["language"] + attrs, key=len)) + langs = list(lang_to_stats.keys()) + + print("| " + ("-" * attr_col_width), end="") + for lang in langs: + col_width = lang_to_col_widths[lang] + print(" | " + ("-" * col_width), end="") + print(" |") + + print(f"| {' '.center(attr_col_width)}", end="") + for lang in langs: + col_width = lang_to_col_widths[lang] + print(f" | {lang.center(col_width)}", end="") + print(" |") + + print("| " + ("-" * attr_col_width), end="") + for lang in langs: + col_width = lang_to_col_widths[lang] + print(" | " + ("-" * col_width), end="") + print(" |") + + for attr in attrs: + print(f"| {attr:<{attr_col_width}}", end="") + for lang in langs: + lang_stats = lang_to_stats[lang] + col_width = lang_to_col_widths[lang] + print(f" | {getattr(lang_stats, attr):>{col_width}}", end="") + print(" |") + + print("| " + ("-" * attr_col_width), end="") + for lang in langs: + col_width = lang_to_col_widths[lang] + print(" | " + ("-" * col_width), end="") + print(" |") + print() + console.rule() # print(json.dumps(vars(res), indent=4, sort_keys=True))