diff --git a/README.md b/README.md index 4836ab1fc77..fce138c9085 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,8 @@ This project aims to be compatible with upstream Aider, but with priority commit * [Fix Deepseek model configurations](https://github.com/Aider-AI/aider/commit/c839a6dd8964d702172cae007375e299732d3823) * [Relax Version Pinning For Easier Distribution](https://github.com/dwash96/aider-ce/issues/18) * [Remove Confirm Responses from History](https://github.com/Aider-AI/aider/pull/3958) +* [Benchmark Results By Language](https://github.com/dwash96/aider-ce/pull/27) +* [Allow Benchmarks to Use Repo Map For Better Accuracy](https://github.com/dwash96/aider-ce/pull/25) ### Other Notes * [MCP Configuration](https://github.com/dwash96/aider-ce/blob/main/aider/website/docs/config/mcp.md) diff --git a/aider/__init__.py b/aider/__init__.py index bd9897115d8..ecc81a4b534 100644 --- a/aider/__init__.py +++ b/aider/__init__.py @@ -1,6 +1,6 @@ from packaging import version -__version__ = "0.87.7.dev" +__version__ = "0.87.8.dev" safe_version = __version__ try: diff --git a/aider/coders/base_coder.py b/aider/coders/base_coder.py index 45038cd14e2..4f536a80699 100755 --- a/aider/coders/base_coder.py +++ b/aider/coders/base_coder.py @@ -367,6 +367,7 @@ def __init__( context_compaction_max_tokens=None, context_compaction_summary_tokens=8192, map_cache_dir=".", + repomap_in_memory=False, ): # initialize from args.map_cache_dir self.map_cache_dir = map_cache_dir @@ -555,6 +556,8 @@ def __init__( map_mul_no_files=map_mul_no_files, refresh=map_refresh, max_code_line_length=map_max_line_length, + repo_root=self.root, + use_memory_cache=repomap_in_memory, ) self.summarizer = summarizer or ChatSummary( @@ -853,6 +856,19 @@ def get_repo_map(self, force_refresh=False): mentioned_fnames.update(self.get_ident_filename_matches(mentioned_idents)) all_abs_files = set(self.get_all_abs_files()) + + # Exclude metadata/docs from repo map inputs to reduce parsing overhead + def _include_in_map(abs_path): + try: + rel = self.get_rel_fname(abs_path) + except Exception: + rel = str(abs_path) + parts = Path(rel).parts + if ".meta" in parts or ".docs" in parts: + return False + return True + + all_abs_files = {p for p in all_abs_files if _include_in_map(p)} repo_abs_read_only_fnames = set(self.abs_read_only_fnames) & all_abs_files repo_abs_read_only_stubs_fnames = set(self.abs_read_only_stubs_fnames) & all_abs_files chat_files = ( diff --git a/aider/repomap.py b/aider/repomap.py index e96ed0446fe..3aee43bd333 100644 --- a/aider/repomap.py +++ b/aider/repomap.py @@ -146,15 +146,22 @@ def __init__( map_mul_no_files=8, refresh="auto", max_code_line_length=100, + repo_root=None, + use_memory_cache=False, ): self.io = io self.verbose = verbose self.refresh = refresh self.map_cache_dir = map_cache_dir - self.root = os.getcwd() + # Prefer an explicit repo root (eg per-test repo), fallback to CWD + self.root = repo_root or os.getcwd() - self.load_tags_cache() + # Allow opting into an in-memory tags cache to avoid disk/SQLite locks + if use_memory_cache: + self.TAGS_CACHE = dict() + else: + self.load_tags_cache() self.cache_threshold = 0.95 self.max_map_tokens = map_tokens diff --git a/benchmark/README.md b/benchmark/README.md index 7765c00b79c..988406de687 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -83,6 +83,7 @@ You can run `./benchmark/benchmark.py --help` for a list of all the arguments, b - `--num-tests` specifies how many of the tests to run before stopping. 
This is another way to start gently as you debug your benchmarking setup. - `--keywords` filters the tests to run to only the ones whose name match the supplied argument (similar to `pytest -k xxxx`). - `--read-model-settings=` specify model settings, see here: https://aider.chat/docs/config/adv-model-settings.html#model-settings +- `--map-tokens` sets a token budget for the repo map sent with each request. Set `0` to disable the repo map. This lets you enable repo map usage for any model (e.g., `--map-tokens 1024`). ### Benchmark report diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index cc694a9f70d..c3ad3daee3e 100755 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -15,20 +15,24 @@ from types import SimpleNamespace from typing import List, Optional -import git -import importlib_resources -import lox -import pandas as pd -import prompts +""" +Performance-oriented refactors: +- Avoid heavy imports unless needed for a given code path. +- Fast path for `--stats` to skip GitPython and benchmarking deps. +- Build DataFrame / import plotting only when `--graphs` is true. +- Use json.load for result file parsing to reduce memory churn. +- Cache git version lookups across a single invocation. +""" + +# Heavy modules are lazily imported within the code paths that need them. import typer from dotenv import load_dotenv -from plots import plot_refactoring from rich.console import Console -from aider import models, sendchat -from aider.coders import Coder, base_coder from aider.dump import dump # noqa: F401 -from aider.io import InputOutput + +# Cache for commit-hash -> version lookup +_VERSION_CACHE = {} BENCHMARK_DNAME = Path(os.environ.get("AIDER_BENCHMARK_DIR", "tmp.benchmarks")) @@ -88,10 +92,10 @@ def find_latest_benchmark_dir(): return latest_dir -def show_stats(dirnames, graphs, stats_languages=None): +def show_stats(dirnames, graphs, verbose, stats_languages=None): raw_rows = [] for dirname in dirnames: - row = summarize_results(dirname, stats_languages) + row = summarize_results(dirname, verbose, stats_languages) raw_rows.append(row) # return @@ -122,11 +126,12 @@ def show_stats(dirnames, graphs, stats_languages=None): repeat_hi = repeat_lo = repeat_avg = None # noqa: F841 - df = pd.DataFrame.from_records(rows) - # df.sort_values(by=["model", "edit_format"], inplace=True) - - # dump(df) + # Only build a DataFrame and import plotting libs when graphs are requested if graphs: + import pandas as pd # Lazy import + from plots import plot_refactoring # Lazy import + + df = pd.DataFrame.from_records(rows) # plot_timing(df) # plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg) # plot_outcomes_claude(df) @@ -212,15 +217,15 @@ def main( thinking_tokens: Optional[int] = typer.Option( None, "--thinking-tokens", help="Set thinking tokens for models that support it" ), + map_tokens: Optional[int] = typer.Option( + None, + "--map-tokens", + help="Suggested number of tokens for repo map (0 to disable)", + ), exercises_dir: str = typer.Option( EXERCISES_DIR_DEFAULT, "--exercises-dir", help="Directory with exercise files" ), ): - repo = git.Repo(search_parent_directories=True) - commit_hash = repo.head.object.hexsha[:7] - if repo.is_dirty(): - commit_hash += "-dirty" - if stats_only and not dirnames: latest_dir = find_latest_benchmark_dir() dirnames = [str(latest_dir)] @@ -241,7 +246,7 @@ def main( updated_dirnames.append(dirname) if stats_only: - return show_stats(updated_dirnames, graphs, stats_languages) + return show_stats(updated_dirnames, graphs, verbose, 
stats_languages) if diffs_only: return show_diffs(updated_dirnames) @@ -249,6 +254,19 @@ def main( assert len(updated_dirnames) == 1, updated_dirnames dirname = updated_dirnames[0] + # Lazy imports for the actual benchmark run + import git # Heavy; avoid for --stats/--diffs + import importlib_resources # Used for model metadata registration + import lox # Only needed for threaded runs + + from aider import models, sendchat + from aider.coders import base_coder + + repo = git.Repo(search_parent_directories=True) + commit_hash = repo.head.object.hexsha[:7] + if repo.is_dirty(): + commit_hash += "-dirty" + if "AIDER_DOCKER" not in os.environ: print("Warning: benchmarking runs unvetted code from GPT, run in a docker container") return @@ -350,6 +368,9 @@ def get_exercise_dirs(base_dir, languages=None): base_coder.RETRY_TIMEOUT = LONG_TIMEOUT models.RETRY_TIMEOUT = LONG_TIMEOUT + # Enable in-memory RepoMap cache when running multiple threads to avoid SQLite contention + repomap_in_memory = threads > 1 + if threads == 1: all_results = [] for test_path in test_dnames: @@ -370,10 +391,12 @@ def get_exercise_dirs(base_dir, languages=None): sleep, reasoning_effort, thinking_tokens, + map_tokens, + repomap_in_memory, ) all_results.append(results) - summarize_results(dirname) + summarize_results(dirname, verbose) if sleep: time.sleep(sleep) else: @@ -396,13 +419,15 @@ def get_exercise_dirs(base_dir, languages=None): sleep, reasoning_effort, thinking_tokens, + map_tokens, + repomap_in_memory, ) all_results = run_test_threaded.gather(tqdm=True) print() print() print() - summarize_results(dirname) + summarize_results(dirname, verbose) return 0 @@ -446,7 +471,7 @@ def show_diffs(dirnames): def load_results(dirname, stats_languages=None): dirname = Path(dirname) - all_results = [] + lang_to_results = {} if stats_languages: languages = [lang.strip().lower() for lang in stats_languages.split(",")] @@ -458,21 +483,28 @@ def load_results(dirname, stats_languages=None): for fname in dirname.glob(pattern): try: results = json.loads(fname.read_text()) - all_results.append(results) + # json / test / prac / exer / lang + lang = fname.parent.parent.parent.parent.name + lang_to_results.setdefault(lang, []).append(results) except json.JSONDecodeError: print("json.JSONDecodeError", fname) continue - return all_results + return lang_to_results -def summarize_results(dirname, stats_languages=None): - all_results = load_results(dirname, stats_languages) +def summarize_results(dirname, verbose, stats_languages=None): + lang_to_results = load_results(dirname, stats_languages) res = SimpleNamespace() res.total_tests = len(list(Path(dirname).glob("*/exercises/practice/*"))) try: - tries = max(len(results.get("tests_outcomes", [])) for results in all_results if results) + tries = max( + len(results.get("tests_outcomes", [])) + for results_list in lang_to_results.values() + for results in results_list + if results + ) except ValueError: tries = 0 @@ -497,44 +529,90 @@ def summarize_results(dirname, stats_languages=None): res.reasoning_effort = None res.thinking_tokens = None + res.map_tokens = None variants = defaultdict(set) - for results in all_results: - if not results: - continue - - res.completed_tests += 1 - tests_outcomes = results.get("tests_outcomes", []) - passed = tests_outcomes and tests_outcomes[-1] - if passed: - for i in range(len(tests_outcomes) - 1, tries): - passed_tests[i] += 1 - - res.cost += results.get("cost", 0) - res.duration += results.get("duration", 0) - res.test_timeouts += 
results.get("test_timeouts", 0) + def add(attr_name, increment, global_stats, lang_stats): + global_prev = getattr(global_stats, attr_name) + setattr(global_stats, attr_name, global_prev + increment) + + lang_prev = getattr(lang_stats, attr_name) + setattr(lang_stats, attr_name, lang_prev + increment) + + lang_to_stats = {} + lang_to_passed_tests = {} + for lang, results_list in lang_to_results.items(): + lang_stats = SimpleNamespace() + lang_stats.completed_tests = 0 + lang_stats.duration = 0 + lang_stats.avg_duration_per_test = 0 + lang_stats.cost = 0 + for i in range(tries): + setattr(lang_stats, f"pass_rate_{i}", 0) + for i in range(tries): + setattr(lang_stats, f"pass_num_{i}", 0) + lang_stats.error_outputs = 0 + lang_stats.user_asks = 0 + lang_stats.test_timeouts = 0 + lang_stats.exhausted_context_windows = 0 + lang_stats.num_malformed_responses = 0 + lang_stats.num_with_malformed_responses = 0 + lang_stats.syntax_errors = 0 + lang_stats.indentation_errors = 0 + lang_stats.lazy_comments = 0 + lang_stats.prompt_tokens = 0 + lang_stats.completion_tokens = 0 + lang_to_stats[lang] = lang_stats + lang_to_passed_tests[lang] = [0] * tries + + for results in results_list: + if not results: + continue - res.error_outputs += results.get("num_error_outputs", 0) - res.user_asks += results.get("num_user_asks", 0) - res.exhausted_context_windows += results.get("num_exhausted_context_windows", 0) - res.num_malformed_responses += results.get("num_malformed_responses", 0) - if results.get("num_malformed_responses"): - res.num_with_malformed_responses += 1 - res.lazy_comments += results.get("lazy_comments", 0) + add("completed_tests", 1, res, lang_stats) + tests_outcomes = results.get("tests_outcomes", []) + passed = tests_outcomes and tests_outcomes[-1] + if passed: + for i in range(len(tests_outcomes) - 1, tries): + passed_tests[i] += 1 + lang_to_passed_tests[lang][i] += 1 + + add("cost", results.get("cost", 0), res, lang_stats) + add("duration", results.get("duration", 0), res, lang_stats) + add("test_timeouts", results.get("test_timeouts", 0), res, lang_stats) + + add("error_outputs", results.get("num_error_outputs", 0), res, lang_stats) + add("user_asks", results.get("num_user_asks", 0), res, lang_stats) + add( + "exhausted_context_windows", + results.get("num_exhausted_context_windows", 0), + res, + lang_stats, + ) + add( + "num_malformed_responses", + results.get("num_malformed_responses", 0), + res, + lang_stats, + ) + if results.get("num_malformed_responses"): + add("num_with_malformed_responses", 1, res, lang_stats) + add("lazy_comments", results.get("lazy_comments", 0), res, lang_stats) - res.syntax_errors += results.get("syntax_errors", 0) - res.indentation_errors += results.get("indentation_errors", 0) + add("syntax_errors", results.get("syntax_errors", 0), res, lang_stats) + add("indentation_errors", results.get("indentation_errors", 0), res, lang_stats) - res.prompt_tokens += results.get("prompt_tokens", 0) - res.completion_tokens += results.get("completion_tokens", 0) + add("prompt_tokens", results.get("prompt_tokens", 0), res, lang_stats) + add("completion_tokens", results.get("completion_tokens", 0), res, lang_stats) - res.reasoning_effort = results.get("reasoning_effort") - res.thinking_tokens = results.get("thinking_tokens") + res.reasoning_effort = results.get("reasoning_effort") + res.thinking_tokens = results.get("thinking_tokens") + res.map_tokens = results.get("map_tokens") - for key in "model edit_format commit_hash editor_model editor_edit_format".split(): - val = 
results.get(key)
-            if val:
-                variants[key].add(val)
+            for key in "model edit_format commit_hash editor_model editor_edit_format".split():
+                val = results.get(key)
+                if val:
+                    variants[key].add(val)
 
     if not res.completed_tests:
         return
@@ -578,6 +656,8 @@ def show(stat, red="red"):
         print(f"  reasoning_effort: {res.reasoning_effort}")
     if res.thinking_tokens is not None:
         print(f"  thinking_tokens: {res.thinking_tokens}")
+    if res.map_tokens is not None:
+        print(f"  map_tokens: {res.map_tokens}")
 
     for i in range(tries):
         print(f"  pass_rate_{i + 1}: {percents[i]:.1f}")
@@ -602,7 +682,7 @@ def show(stat, red="red"):
 
     if variants["model"]:
         a_model = set(variants["model"]).pop()
-        command = f"aider --model {a_model}"
+        command = f"aider-ce --model {a_model}"
         print(f"  command: {command}")
 
     print(f"  date: {date}")
@@ -623,6 +703,86 @@ def show(stat, red="red"):
             f" ${projected_cost:.2f} projected"
         )
 
+    if verbose and len(lang_to_stats) > 0:
+
+        def format_lang_stats(lang, lang_stats):
+            # First, postprocess attributes for easier printing
+            if lang_stats.completed_tests > 0:
+                lang_stats.avg_duration_per_test = lang_stats.duration / float(
+                    lang_stats.completed_tests
+                )
+                for i in range(tries):
+                    num_passed = lang_to_passed_tests[lang][i]
+                    setattr(lang_stats, f"pass_num_{i}", num_passed)
+                    pass_rate = 100 * num_passed / float(lang_stats.completed_tests)
+                    setattr(lang_stats, f"pass_rate_{i}", pass_rate)
+
+            # Then format attributes into ready-to-print strings
+            for attr in lang_stats.__dict__:
+                val = getattr(lang_stats, attr)
+                if val == 0:
+                    val = "-"
+                elif isinstance(val, float):
+                    val = f"{val:,.2f}"
+                else:
+                    val = f"{val:,}"
+
+                setattr(lang_stats, attr, val)
+
+        def compute_lang_to_col_widths(lang_to_stats):
+            lang_to_col_widths = {}
+            for lang, lang_stats in lang_to_stats.items():
+                lang_stat_attrs = [getattr(lang_stats, attr) for attr in lang_stats.__dict__]
+                lang_col_width = max(len(lang), len(max(lang_stat_attrs, key=len)))
+                lang_to_col_widths[lang] = lang_col_width
+
+            return lang_to_col_widths
+
+        print()
+        print("======== Stats by language ========")
+        print()
+
+        [format_lang_stats(lang, lang_stats) for lang, lang_stats in lang_to_stats.items()]
+        lang_to_col_widths = compute_lang_to_col_widths(lang_to_stats)
+
+        any_stats = list(lang_to_stats.values())[0]
+        attrs = list(any_stats.__dict__)
+        attr_col_width = len(max(["language"] + attrs, key=len))
+        langs = list(lang_to_stats.keys())
+
+        print("| " + ("-" * attr_col_width), end="")
+        for lang in langs:
+            col_width = lang_to_col_widths[lang]
+            print(" | " + ("-" * col_width), end="")
+        print(" |")
+
+        print(f"| {' '.center(attr_col_width)}", end="")
+        for lang in langs:
+            col_width = lang_to_col_widths[lang]
+            print(f" | {lang.center(col_width)}", end="")
+        print(" |")
+
+        print("| " + ("-" * attr_col_width), end="")
+        for lang in langs:
+            col_width = lang_to_col_widths[lang]
+            print(" | " + ("-" * col_width), end="")
+        print(" |")
+
+        for attr in attrs:
+            print(f"| {attr:<{attr_col_width}}", end="")
+            for lang in langs:
+                lang_stats = lang_to_stats[lang]
+                col_width = lang_to_col_widths[lang]
+                print(f" | {getattr(lang_stats, attr):>{col_width}}", end="")
+            print(" |")
+
+        print("| " + ("-" * attr_col_width), end="")
+        for lang in langs:
+            col_width = lang_to_col_widths[lang]
+            print(" | " + ("-" * col_width), end="")
+        print(" |")
+        print()
+
     console.rule()
 
     # print(json.dumps(vars(res), indent=4, sort_keys=True))
@@ -634,14 +794,24 @@ def get_versions(commit_hashes):
     for hsh in commit_hashes:
         if not hsh:
             continue
-        hsh = hsh.split("-")[0]
+        short = 
hsh.split("-")[0]
+        if short in _VERSION_CACHE:
+            ver = _VERSION_CACHE.get(short)
+            if ver:
+                versions.add(ver)
+            continue
+
         try:
-            version = subprocess.check_output(
-                ["git", "show", f"{hsh}:aider/__init__.py"], universal_newlines=True
+            version_src = subprocess.check_output(
+                ["git", "show", f"{short}:aider/__init__.py"], universal_newlines=True
             )
-            version = re.search(r'__version__ = "(.*)"', version).group(1)
-            versions.add(version)
+            match = re.search(r'__version__ = "(.*)"', version_src)
+            ver = match.group(1) if match else None
+            _VERSION_CACHE[short] = ver
+            if ver:
+                versions.add(ver)
         except subprocess.CalledProcessError:
+            _VERSION_CACHE[short] = None
             pass
 
     return versions
@@ -693,8 +863,18 @@ def run_test_real(
     sleep=0,
     reasoning_effort: Optional[str] = None,
     thinking_tokens: Optional[int] = None,
+    map_tokens: Optional[int] = None,
     read_model_settings=None,
+    repomap_in_memory: bool = False,
 ):
+    # Lazy imports: only needed in the actual benchmark execution path
+    import git
+    import prompts
+
+    from aider import models
+    from aider.coders import Coder
+    from aider.io import InputOutput
+
     if not os.path.isdir(testdir):
         print("Not a dir:", testdir)
         return
@@ -818,20 +998,45 @@ def run_test_real(
     dump(edit_format)
     show_fnames = ",".join(map(str, fnames))
     print("fnames:", show_fnames)
-
-    coder = Coder.create(
-        main_model,
-        edit_format,
-        io,
+    # Ensure this test directory is a standalone git repo so RepoMap can be used
+    try:
+        git_dir = testdir / ".git"
+        if not git_dir.exists():
+            r = git.Repo.init(testdir)
+            # Set a local identity to avoid commit failures in clean containers
+            with r.config_writer() as cw:
+                cw.set_value("user", "name", "aider-benchmark")
+                cw.set_value("user", "email", "aider-benchmark@example.com")
+            paths = (p for p in testdir.rglob("*") if p.is_file() and ".git" not in p.parts)
+            r.index.add([str(p.relative_to(testdir)) for p in paths])  # skip .git internals
+            r.index.commit("Initial commit for aider benchmark")
+    except Exception as e:
+        if verbose:
+            print(f"Warning: failed to initialize git repo in {testdir}: {e}")
+
+    coder_kwargs = dict(
+        main_model=main_model,
+        edit_format=edit_format,
+        io=io,
         fnames=fnames,
-        use_git=False,
+        use_git=True,
+        auto_commits=False,
+        dirty_commits=False,
         stream=False,
         verbose=verbose,
         # auto_lint=False,  # disabled for code-in-json experiments
         cache_prompts=True,
         suggest_shell_commands=False,
         ignore_mentions=ignore_files,
+        # Reduce repo map contention and size for benchmarks
+        map_cache_dir=str(testdir),
+        repomap_in_memory=repomap_in_memory,
+        map_mul_no_files=4,
     )
+    if map_tokens is not None:
+        coder_kwargs["map_tokens"] = map_tokens
+
+    coder = Coder.create(**coder_kwargs)
 
     dump(coder.ignore_mentions)
     coder.show_announcements()
@@ -960,6 +1165,7 @@ def run_test_real(
         prompt_tokens=coder.total_tokens_sent,
         completion_tokens=coder.total_tokens_received,
         thinking_tokens=thinking_tokens,
+        map_tokens=map_tokens,
         chat_hashes=list(
             zip(
                 coder.chat_completion_call_hashes,