Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
7e78e53
kube
Nov 27, 2025
b448767
uuid as name
Nov 27, 2025
9ac9226
normalize pod name
Nov 27, 2025
cebbf37
load dataset outside
Nov 27, 2025
ffd87cf
remove tolerations
Nov 27, 2025
0d29967
incorporate dataset loading
Nov 27, 2025
c7afaa2
some type annotations
Nov 27, 2025
c506fe1
fixture first fix
Nov 27, 2025
aab04ff
fix
Nov 27, 2025
cc8f813
fix tests
Nov 27, 2025
77aeb78
simplify filtering
Nov 27, 2025
28caf41
remove deps on swesmith! also fix excluded_ids for swesmith
Nov 27, 2025
d9b76c7
remove swesmith
Nov 27, 2025
e13462b
Merge remote-tracking branch 'origin/main' into envs_for_images
MarcCote Nov 28, 2025
928c1d8
load dataset as class method / setup_task
Nov 28, 2025
b338e1c
fix tests
Nov 28, 2025
0858bea
change run.py
Nov 28, 2025
35a4f66
blacked
Nov 28, 2025
e9600ed
remove imports
Nov 28, 2025
81b2eda
task name / task data adaptation
Nov 28, 2025
3468a62
pre commit
Nov 28, 2025
c56579c
cls keyword
Nov 28, 2025
4b01ac8
remove load dataset
Nov 28, 2025
0dd0f4e
Working on tests + refactoring
MarcCote Nov 28, 2025
e6fcd58
Adding back swesmith
MarcCote Nov 28, 2025
c80e6d8
Fixing tests.
MarcCote Dec 1, 2025
7d8268e
Print disk space after installing library.
MarcCote Dec 1, 2025
866ac39
When creating fixture env, reset the env in master thread first
MarcCote Dec 1, 2025
1f4661a
Disabling async pytests
MarcCote Dec 1, 2025
424b3dd
Reenable async pytests + make sure to provide specific problem to loa…
MarcCote Dec 1, 2025
e0263a2
Fixing load_dataset
MarcCote Dec 1, 2025
e5cda5a
Limiting workers for async pytest
MarcCote Dec 1, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/actions/test-if-changes/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,13 @@ runs:
else
pip install "debug-gym[dev]==${{ inputs.version }}"
fi
df -h
- name: Run tests
env:
DEBUG_GYM_DEBUG: 1
shell: bash
run: |
pytest ${{ inputs.test-files }} -vv -n 16 --timeout=600 --cov=debug_gym --cov-report=term-missing
pytest ${{ inputs.test-files }} -vv -n 4 --timeout=600 --cov=debug_gym --cov-report=term-missing
- name: Store coverage report
uses: actions/upload-artifact@v4
with:
Expand Down
3 changes: 3 additions & 0 deletions debug_gym/gym/envs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from debug_gym.gym.envs.aider import AiderBenchmarkEnv
from debug_gym.gym.envs.env import RepoEnv, TooledEnv
from debug_gym.gym.envs.local import LocalEnv
from debug_gym.gym.envs.mini_nightmare import MiniNightmareEnv
from debug_gym.gym.envs.r2egym import R2EGymEnv
from debug_gym.gym.envs.swe_bench import SWEBenchEnv
Expand All @@ -11,6 +12,8 @@ def select_env(env_type: str = None) -> type[RepoEnv]:
match env_type:
case None:
return RepoEnv
case "local":
return LocalEnv
case "aider":
return AiderBenchmarkEnv
case "swebench":
Expand Down
43 changes: 30 additions & 13 deletions debug_gym/gym/envs/aider.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import os
import subprocess
import tempfile
Expand All @@ -7,16 +8,20 @@
from debug_gym.constants import DEBUG_GYM_CACHE_DIR
from debug_gym.gym.entities import EvalOutput
from debug_gym.gym.envs.env import RepoEnv
from debug_gym.gym.envs.local import LocalEnv
from debug_gym.gym.terminals.docker import DockerTerminal
from debug_gym.gym.terminals.terminal import Terminal
from debug_gym.logger import DebugGymLogger

DOCKER_AIDER_IMAGE_NAME = "debug-gym:aider"


def build_docker_image(logger):
def build_docker_image(logger: logging.Logger | None = None):
"""
Build a Docker image for the Mini Nightmare environment.
"""
logger = logger or DebugGymLogger("debug-gym")

# Check if Docker image is built.
import docker

Expand Down Expand Up @@ -62,6 +67,7 @@ class AiderBenchmarkEnv(RepoEnv):

def __init__(
self,
task_data: dict,
entrypoint: str = "python -m pytest --tb=no -s .",
terminal: Terminal | None = None,
**kwargs,
Expand All @@ -73,7 +79,13 @@ def __init__(
if hasattr(terminal, "base_image") and terminal.base_image is None:
terminal.base_image = DOCKER_AIDER_IMAGE_NAME

super().__init__(entrypoint=entrypoint, terminal=terminal, **kwargs)
super().__init__(
task_data=task_data, entrypoint=entrypoint, terminal=terminal, **kwargs
)

@property
def task_name(self) -> str:
return self.current_task["task_name"]

@property
def instructions(self) -> str:
Expand All @@ -91,10 +103,8 @@ def eval(self, **kwargs) -> EvalOutput:
self.last_eval = EvalOutput(success, output)
return self.last_eval

def setup_task(self, task_name: str, options: dict = None):
if task_name not in self.dataset:
raise ValueError(f"Task {task_name} not found in the dataset.")
self.current_task = self.dataset[task_name]
def setup_task(self):
self.current_task = self.task_data

def setup_workspace(self):
self.workspace.reset()
Expand Down Expand Up @@ -122,14 +132,20 @@ def setup_terminal(self):
) # Aider tasks come with those.
self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'")

def load_dataset(self, problems: str | list[str] | None = None):
if isinstance(self.terminal, DockerTerminal):
build_docker_image(self.logger)
@classmethod
def load_dataset(
cls,
problems: str | list[str] | None = None,
build_image: bool = True,
logger: object = None,
) -> dict:
if build_image:
build_docker_image(logger)

if not os.path.exists(self.REPO_PATH):
subprocess.run(["git", "clone", self.REPO_URL, self.REPO_PATH], check=True)
if not os.path.exists(cls.REPO_PATH):
subprocess.run(["git", "clone", cls.REPO_URL, cls.REPO_PATH], check=True)

practice_path = self.REPO_PATH / "exercises" / "practice"
practice_path = cls.REPO_PATH / "exercises" / "practice"
directories = [d for d in practice_path.iterdir() if d.is_dir()]

dataset = {}
Expand Down Expand Up @@ -160,11 +176,12 @@ def load_dataset(self, problems: str | list[str] | None = None):
)

dataset[task_name] = {
"task_name": task_name,
"codebase": directory,
"instructions": instructions,
"filename": task_name + ".py",
}

problems = utils.filter_problems(dataset, problems)
dataset = {id: i for id, i in dataset.items() if id in problems}
dataset = {id: data for id, data in dataset.items() if id in problems}
return dataset
63 changes: 21 additions & 42 deletions debug_gym/gym/envs/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,41 +201,29 @@ class RepoEnv(TooledEnv):

def __init__(
self,
path: str | None = None,
task_data: dict,
entrypoint: str = "python -m pytest -sq .",
debug_entrypoint: str | None = None,
max_score: int | None = None,
readonly_patterns: list[str] | None = None, # TODO: remove
run_timeout: int | None = None,
terminal: Terminal | None = None,
logger: DebugGymLogger | None = None,
problems: str | list[str] | None = None,
**kwargs,
):
super().__init__()

self.path = path
self.task_data = task_data
self.max_score = max_score
self.run_timeout = run_timeout
self.terminal = terminal or LocalTerminal() # TODO: default to DockerTerminal
self.terminal = terminal
self._entrypoint = entrypoint
self._debug_entrypoint = debug_entrypoint
self.logger = logger or DebugGymLogger("debug-gym")
self.infos: EnvInfo | None = None
self.rng = None
self.additional_kwargs = kwargs
self.task_name: str | None = None
self.options: dict = {}

if "auto_eval_on_rewrite" in kwargs:
raise ValueError(
"The 'auto_eval_on_rewrite' parameter is no longer supported. "
"Please remove it from your initialization arguments."
"Instead, set 'auto_eval_on_rewrite' in the EvalTool instance."
)

self.workspace = Workspace(self.terminal, logger=self.logger)
self.dataset = self.load_dataset(problems)
self.set_entrypoints(self._entrypoint, self._debug_entrypoint)

def _reset_env_state(self):
Expand Down Expand Up @@ -290,45 +278,39 @@ def working_dir(self) -> Path:
def instructions(self) -> str:
"""Instructions for the current task.
Override in subclasses for different behavior."""
return ""
raise NotImplementedError(
"Subclasses must implement the instructions property."
)

def setup_task(self, task_name: str, options: dict = None) -> None:
@property
def task_name(self) -> str:
raise NotImplementedError("Subclasses must implement the task_name property.")

def setup_task(self) -> None:
"""Setup the task information.
Override in subclasses for different behavior. Called once at reset."""
pass
raise NotImplementedError("Subclasses must implement setup_task method.")

def setup_workspace(self) -> None:
"""Setup the workspace.
Override in subclasses for different behavior. Called once at reset."""
self.workspace.reset()
self.workspace.copy_content(self.path)
self.workspace.setup_file_filters()
raise NotImplementedError("Subclasses must implement setup_workspace method.")

def setup_terminal(self) -> None:
"""Setup the terminal.
Override in subclasses for different behavior. Called once at reset."""

self.logger.debug(f"Configuring {self.terminal}...")

self.terminal.run("git init -b main")
self.terminal.run("git config user.name 'debug-gym'")
self.terminal.run("git config user.email '<>'")

self.terminal.run("git add *")
self.terminal.run("git commit -am 'Init'")

self.terminal.run("git add .debugignore .debugreadonly")
self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'")
raise NotImplementedError("Subclasses must implement setup_terminal method.")

def reset(self, *, options: dict = None):
"""Resets the environment and returns eval as the initial observation."""
self.options = options if options is not None else self.options
options = options if options is not None else {}
self.logger.debug("Resetting environment")
self.close() # Clean up previous workspace and terminal.
self.task_name = self.options.get("task_name")
self.setup_task(task_name=self.task_name, options=self.options)
self.setup_workspace()
self.setup_terminal()
if options.get("reset_runtime", True):
self.close() # Clean up previous workspace and terminal.
self.setup_task()
self.setup_workspace()
self.setup_terminal()

self._reset_env_state()

# Notify all tools that the environment is reset and get their observations
Expand Down Expand Up @@ -504,6 +486,3 @@ def close(self):

def __del__(self):
self.close()

def load_dataset(self, problems: str | list[str] | None = None):
return {"custom": None}
57 changes: 57 additions & 0 deletions debug_gym/gym/envs/local.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from debug_gym.gym.envs.env import RepoEnv
from debug_gym.gym.terminals.local import LocalTerminal
from debug_gym.gym.terminals.terminal import Terminal


class LocalEnv(RepoEnv):
    """Environment for debugging a codebase that lives at a local path.

    The task is described by a one-entry ``task_data`` dict of the form
    ``{"path": <path>}``, built from the ``path`` constructor argument and
    handed to ``RepoEnv``.
    """

    def __init__(
        self,
        path: str,
        terminal: Terminal | None = None,
        entrypoint: str = "python -m pytest -sq .",
        debug_entrypoint: str | None = None,
        **kwargs,
    ):
        """Create the environment.

        Args:
            path: Location of the codebase to debug.
            terminal: Terminal used to run commands; a ``LocalTerminal`` is
                created when none is given.
            entrypoint: Command used to evaluate the codebase.
            debug_entrypoint: Optional command used for debugging sessions.
            **kwargs: Forwarded unchanged to ``RepoEnv.__init__``.
        """
        task_data = {"path": path}
        terminal = terminal or LocalTerminal()
        super().__init__(
            task_data=task_data,
            terminal=terminal,
            entrypoint=entrypoint,
            debug_entrypoint=debug_entrypoint,
            **kwargs,
        )

    @property
    def instructions(self) -> str:
        """Instructions shown to the agent for the current task."""
        # Read from task_data rather than self.path: self.path is only
        # assigned by setup_task(), so it does not exist before reset().
        path = self.task_data["path"]
        return f"Debug the local codebase at {path}. Investigate the repository, figure out the root cause, then rewrite the code to fix the issue."

    @property
    def task_name(self) -> str:
        """Name of the task: the last component of the codebase path."""
        # Trailing slashes are stripped so "repo/" yields "repo", not "".
        return str(self.task_data["path"]).rstrip("/").split("/")[-1]

    @property
    def task(self) -> str:
        """Alias of ``task_name``, kept for backward compatibility."""
        return self.task_name

    def setup_task(self) -> None:
        """Setup the task information. Called once at reset."""
        self.path = self.task_data["path"]

    def setup_workspace(self) -> None:
        """Setup the workspace. Called once at reset."""
        self.workspace.reset()
        self.workspace.copy_content(self.path)
        self.workspace.setup_file_filters()

    def setup_terminal(self) -> None:
        """Setup the terminal. Called once at reset.

        Initializes a git repository in the terminal and records two
        commits: one for the codebase and one for the debug-gym filter files.
        """
        self.logger.debug(f"Configuring {self.terminal}...")

        self.terminal.run("git init -b main")
        self.terminal.run("git config user.name 'debug-gym'")
        self.terminal.run("git config user.email '<>'")

        self.terminal.run("git add *")
        self.terminal.run("git commit -am 'Init'")

        # NOTE(review): these files are presumably created by
        # workspace.setup_file_filters() -- confirm against Workspace.
        self.terminal.run("git add .debugignore .debugreadonly")
        self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'")
Loading
Loading