17 changes: 17 additions & 0 deletions aider/coders/base_coder.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python

import asyncio
import os
import base64
import hashlib
import json
@@ -367,6 +368,7 @@ def __init__(
context_compaction_max_tokens=None,
context_compaction_summary_tokens=8192,
map_cache_dir=".",
repomap_in_memory=False,
):
# initialize from args.map_cache_dir
self.map_cache_dir = map_cache_dir
@@ -555,6 +557,8 @@ def __init__(
map_mul_no_files=map_mul_no_files,
refresh=map_refresh,
max_code_line_length=map_max_line_length,
repo_root=self.root,
use_memory_cache=repomap_in_memory,
)

self.summarizer = summarizer or ChatSummary(
@@ -853,6 +857,19 @@ def get_repo_map(self, force_refresh=False):
mentioned_fnames.update(self.get_ident_filename_matches(mentioned_idents))

all_abs_files = set(self.get_all_abs_files())

# Exclude metadata/docs from repo map inputs to reduce parsing overhead
def _include_in_map(abs_path):
try:
rel = self.get_rel_fname(abs_path)
except Exception:
rel = str(abs_path)
parts = Path(rel).parts
if ".meta" in parts or ".docs" in parts:
return False
return True

all_abs_files = {p for p in all_abs_files if _include_in_map(p)}
repo_abs_read_only_fnames = set(self.abs_read_only_fnames) & all_abs_files
repo_abs_read_only_stubs_fnames = set(self.abs_read_only_stubs_fnames) & all_abs_files
chat_files = (
11 changes: 9 additions & 2 deletions aider/repomap.py
@@ -146,15 +146,22 @@ def __init__(
map_mul_no_files=8,
refresh="auto",
max_code_line_length=100,
repo_root=None,
use_memory_cache=False,
):
self.io = io
self.verbose = verbose
self.refresh = refresh

self.map_cache_dir = map_cache_dir
self.root = os.getcwd()
# Prefer an explicit repo root (e.g. a per-test repo); fall back to CWD
self.root = repo_root or os.getcwd()

self.load_tags_cache()
# Allow opting into an in-memory tags cache to avoid disk/SQLite locks
if use_memory_cache:
self.TAGS_CACHE = dict()
else:
self.load_tags_cache()
self.cache_threshold = 0.95

self.max_map_tokens = map_tokens
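A minimal sketch of how a caller could opt into the new in-memory cache, using only the keyword names visible in this diff (`io`, `repo_root`, `use_memory_cache`); the path and the `InputOutput` setup are placeholders:

from aider.io import InputOutput
from aider.repomap import RepoMap

io = InputOutput(yes=True)
repo_map = RepoMap(
    io=io,
    repo_root="/path/to/repo",  # explicit root instead of falling back to os.getcwd()
    use_memory_cache=True,      # plain dict cache; no on-disk SQLite cache to lock
)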
1 change: 1 addition & 0 deletions benchmark/README.md
@@ -83,6 +83,7 @@ You can run `./benchmark/benchmark.py --help` for a list of all the arguments, b
- `--num-tests` specifies how many of the tests to run before stopping. This is another way to start gently as you debug your benchmarking setup.
- `--keywords` filters the tests to run to only the ones whose name match the supplied argument (similar to `pytest -k xxxx`).
- `--read-model-settings=<filename.yml>` specify model settings, see here: https://aider.chat/docs/config/adv-model-settings.html#model-settings
- `--map-tokens` sets the token budget for the repo map sent with each request; set it to `0` to disable the repo map. This lets you enable the repo map for any model; see the example below.
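  For example, an illustrative invocation (the run name, model, and token budget are placeholders): `./benchmark/benchmark.py a-run-name --model gpt-4o --edit-format diff --map-tokens 1024`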

### Benchmark report

137 changes: 106 additions & 31 deletions benchmark/benchmark.py
@@ -15,20 +15,24 @@
from types import SimpleNamespace
from typing import List, Optional

import git
import importlib_resources
import lox
import pandas as pd
import prompts
"""
Performance-oriented refactors:
- Avoid heavy imports unless needed for a given code path.
- Fast path for `--stats` to skip GitPython and benchmarking deps.
- Build DataFrame / import plotting only when `--graphs` is true.
- Use json.load for result file parsing to reduce memory churn.
- Cache git version lookups across a single invocation.
"""

# Heavy modules are lazily imported within the code paths that need them.
import typer
from dotenv import load_dotenv
from plots import plot_refactoring
from rich.console import Console

from aider import models, sendchat
from aider.coders import Coder, base_coder
from aider.dump import dump # noqa: F401
from aider.io import InputOutput

# Cache for commit-hash -> version lookup
_VERSION_CACHE = {}

BENCHMARK_DNAME = Path(os.environ.get("AIDER_BENCHMARK_DIR", "tmp.benchmarks"))

@@ -122,11 +126,12 @@ def show_stats(dirnames, graphs, stats_languages=None):

repeat_hi = repeat_lo = repeat_avg = None # noqa: F841

df = pd.DataFrame.from_records(rows)
# df.sort_values(by=["model", "edit_format"], inplace=True)

# dump(df)
# Only build a DataFrame and import plotting libs when graphs are requested
if graphs:
import pandas as pd # Lazy import
from plots import plot_refactoring # Lazy import

df = pd.DataFrame.from_records(rows)
# plot_timing(df)
# plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg)
# plot_outcomes_claude(df)
@@ -212,15 +217,15 @@ def main(
thinking_tokens: Optional[int] = typer.Option(
None, "--thinking-tokens", help="Set thinking tokens for models that support it"
),
map_tokens: Optional[int] = typer.Option(
None,
"--map-tokens",
help="Suggested number of tokens for repo map (0 to disable)",
),
exercises_dir: str = typer.Option(
EXERCISES_DIR_DEFAULT, "--exercises-dir", help="Directory with exercise files"
),
):
repo = git.Repo(search_parent_directories=True)
commit_hash = repo.head.object.hexsha[:7]
if repo.is_dirty():
commit_hash += "-dirty"

if stats_only and not dirnames:
latest_dir = find_latest_benchmark_dir()
dirnames = [str(latest_dir)]
@@ -241,6 +246,7 @@ def main(
updated_dirnames.append(dirname)

if stats_only:
# Fast path: avoid importing/initializing benchmarking deps
return show_stats(updated_dirnames, graphs, stats_languages)

if diffs_only:
@@ -249,6 +255,18 @@ def main(
assert len(updated_dirnames) == 1, updated_dirnames
dirname = updated_dirnames[0]

# Lazy imports for the actual benchmark run
import git # Heavy; avoid for --stats/--diffs
import importlib_resources # Used for model metadata registration
import lox # Only needed for threaded runs
from aider import models, sendchat
from aider.coders import base_coder

repo = git.Repo(search_parent_directories=True)
commit_hash = repo.head.object.hexsha[:7]
if repo.is_dirty():
commit_hash += "-dirty"

if "AIDER_DOCKER" not in os.environ:
print("Warning: benchmarking runs unvetted code from GPT, run in a docker container")
return
@@ -350,6 +368,9 @@ def get_exercise_dirs(base_dir, languages=None):
base_coder.RETRY_TIMEOUT = LONG_TIMEOUT
models.RETRY_TIMEOUT = LONG_TIMEOUT

# Enable in-memory RepoMap cache when running multiple threads to avoid SQLite contention
repomap_in_memory = threads > 1

if threads == 1:
all_results = []
for test_path in test_dnames:
@@ -370,6 +391,8 @@ def get_exercise_dirs(base_dir, languages=None):
sleep,
reasoning_effort,
thinking_tokens,
map_tokens,
repomap_in_memory,
)

all_results.append(results)
@@ -396,6 +419,8 @@ def get_exercise_dirs(base_dir, languages=None):
sleep,
reasoning_effort,
thinking_tokens,
map_tokens,
repomap_in_memory,
)
all_results = run_test_threaded.gather(tqdm=True)

@@ -457,7 +482,8 @@ def load_results(dirname, stats_languages=None):
for pattern in glob_patterns:
for fname in dirname.glob(pattern):
try:
results = json.loads(fname.read_text())
with open(fname, "r", encoding="utf-8", errors="replace") as f:
results = json.load(f)
all_results.append(results)
except json.JSONDecodeError:
print("json.JSONDecodeError", fname)
@@ -497,6 +523,7 @@ def summarize_results(dirname, stats_languages=None):

res.reasoning_effort = None
res.thinking_tokens = None
res.map_tokens = None
variants = defaultdict(set)

for results in all_results:
@@ -530,6 +557,7 @@

res.reasoning_effort = results.get("reasoning_effort")
res.thinking_tokens = results.get("thinking_tokens")
res.map_tokens = results.get("map_tokens")

for key in "model edit_format commit_hash editor_model editor_edit_format".split():
val = results.get(key)
@@ -578,6 +606,8 @@ def show(stat, red="red"):
print(f" reasoning_effort: {res.reasoning_effort}")
if res.thinking_tokens is not None:
print(f" thinking_tokens: {res.thinking_tokens}")
if res.map_tokens is not None:
print(f" map_tokens: {res.map_tokens}")

for i in range(tries):
print(f" pass_rate_{i + 1}: {percents[i]:.1f}")
@@ -602,7 +632,7 @@ def show(stat, red="red"):

if variants["model"]:
a_model = set(variants["model"]).pop()
command = f"aider --model {a_model}"
command = f"aider-ce --model {a_model}"
print(f" command: {command}")

print(f" date: {date}")
@@ -634,14 +664,24 @@ def get_versions(commit_hashes):
for hsh in commit_hashes:
if not hsh:
continue
hsh = hsh.split("-")[0]
short = hsh.split("-")[0]
if short in _VERSION_CACHE:
ver = _VERSION_CACHE.get(short)
if ver:
versions.add(ver)
continue

try:
version = subprocess.check_output(
["git", "show", f"{hsh}:aider/__init__.py"], universal_newlines=True
version_src = subprocess.check_output(
["git", "show", f"{short}:aider/__init__.py"], universal_newlines=True
)
version = re.search(r'__version__ = "(.*)"', version).group(1)
versions.add(version)
match = re.search(r'__version__ = "(.*)"', version_src)
ver = match.group(1) if match else None
_VERSION_CACHE[short] = ver
if ver:
versions.add(ver)
except subprocess.CalledProcessError:
_VERSION_CACHE[short] = None
pass
return versions

@@ -693,8 +733,17 @@ def run_test_real(
sleep=0,
reasoning_effort: Optional[str] = None,
thinking_tokens: Optional[int] = None,
map_tokens: Optional[int] = None,
read_model_settings=None,
repomap_in_memory: bool = False,
):
# Lazy imports: only needed in the actual benchmark execution path
from aider.io import InputOutput
from aider.coders import Coder
from aider import models
import prompts
import git

if not os.path.isdir(testdir):
print("Not a dir:", testdir)
return
@@ -818,20 +867,45 @@ def run_test_real(
dump(edit_format)
show_fnames = ",".join(map(str, fnames))
print("fnames:", show_fnames)

coder = Coder.create(
main_model,
edit_format,
io,
# Ensure this test directory is a standalone git repo so RepoMap can be used
try:
git_dir = testdir / ".git"
if not git_dir.exists():
r = git.Repo.init(testdir)
# Set a local identity to avoid commit failures in clean containers
with r.config_writer() as cw:
cw.set_value("user", "name", "aider-benchmark")
cw.set_value("user", "email", "aider-benchmark@example.com")
# Add existing files (solution set and any current files), skipping .git internals
r.index.add([str(p.relative_to(testdir)) for p in testdir.rglob("*") if p.is_file() and ".git" not in p.relative_to(testdir).parts])
r.index.commit("Initial commit for aider benchmark")
except Exception as e:
if verbose:
print(f"Warning: failed to initialize git repo in {testdir}: {e}")

coder_kwargs = dict(
main_model=main_model,
edit_format=edit_format,
io=io,
fnames=fnames,
use_git=False,
use_git=True,
auto_commits=False,
dirty_commits=False,
stream=False,
verbose=verbose,
# auto_lint=False, # disabled for code-in-json experiments
cache_prompts=True,
suggest_shell_commands=False,
ignore_mentions=ignore_files,
# Reduce repo map contention and size for benchmarks
map_cache_dir=str(testdir),
repomap_in_memory=repomap_in_memory,
map_mul_no_files=4,
)
if map_tokens is not None:
coder_kwargs["map_tokens"] = map_tokens

coder = Coder.create(**coder_kwargs)
dump(coder.ignore_mentions)

coder.show_announcements()
@@ -960,6 +1034,7 @@ def run_test_real(
prompt_tokens=coder.total_tokens_sent,
completion_tokens=coder.total_tokens_received,
thinking_tokens=thinking_tokens,
map_tokens=map_tokens,
chat_hashes=list(
zip(
coder.chat_completion_call_hashes,