microsoft · sordonia · Nov 27, 2025 · Nov 27, 2025 · Nov 27, 2025 · Nov 27, 2025
diff --git a/.github/actions/test-if-changes/action.yml b/.github/actions/test-if-changes/action.yml
@@ -39,12 +39,13 @@ runs:
         else
           pip install "debug-gym[dev]==${{ inputs.version }}"
         fi
+        df -h
     - name: Run tests
       env:
         DEBUG_GYM_DEBUG: 1
       shell: bash
       run: |
-        pytest ${{ inputs.test-files }} -vv -n 16 --timeout=600 --cov=debug_gym --cov-report=term-missing
+        pytest ${{ inputs.test-files }} -vv -n 4 --timeout=600 --cov=debug_gym --cov-report=term-missing
     - name: Store coverage report
       uses: actions/upload-artifact@v4
       with:

diff --git a/debug_gym/gym/envs/__init__.py b/debug_gym/gym/envs/__init__.py
@@ -1,5 +1,6 @@
 from debug_gym.gym.envs.aider import AiderBenchmarkEnv
 from debug_gym.gym.envs.env import RepoEnv, TooledEnv
+from debug_gym.gym.envs.local import LocalEnv
 from debug_gym.gym.envs.mini_nightmare import MiniNightmareEnv
 from debug_gym.gym.envs.r2egym import R2EGymEnv
 from debug_gym.gym.envs.swe_bench import SWEBenchEnv
@@ -11,6 +12,8 @@ def select_env(env_type: str = None) -> type[RepoEnv]:
     match env_type:
         case None:
             return RepoEnv
+        case "local":
+            return LocalEnv
         case "aider":
             return AiderBenchmarkEnv
         case "swebench":

diff --git a/debug_gym/gym/envs/aider.py b/debug_gym/gym/envs/aider.py
@@ -1,3 +1,4 @@
+import logging
 import os
 import subprocess
 import tempfile
@@ -7,16 +8,20 @@
 from debug_gym.constants import DEBUG_GYM_CACHE_DIR
 from debug_gym.gym.entities import EvalOutput
 from debug_gym.gym.envs.env import RepoEnv
+from debug_gym.gym.envs.local import LocalEnv
 from debug_gym.gym.terminals.docker import DockerTerminal
 from debug_gym.gym.terminals.terminal import Terminal
+from debug_gym.logger import DebugGymLogger
 
 DOCKER_AIDER_IMAGE_NAME = "debug-gym:aider"
 
 
-def build_docker_image(logger):
+def build_docker_image(logger: logging.Logger | None = None):
     """
-    Build a Docker image for the Mini Nightmare environment.
+    Build a Docker image for the Aider benchmark environment.
     """
+    logger = logger or DebugGymLogger("debug-gym")
+
     # Check if Docker image is built.
     import docker
 
@@ -62,6 +67,7 @@ class AiderBenchmarkEnv(RepoEnv):
 
     def __init__(
         self,
+        task_data: dict,
         entrypoint: str = "python -m pytest --tb=no -s .",
         terminal: Terminal | None = None,
         **kwargs,
@@ -73,7 +79,13 @@ def __init__(
         if hasattr(terminal, "base_image") and terminal.base_image is None:
             terminal.base_image = DOCKER_AIDER_IMAGE_NAME
 
-        super().__init__(entrypoint=entrypoint, terminal=terminal, **kwargs)
+        super().__init__(
+            task_data=task_data, entrypoint=entrypoint, terminal=terminal, **kwargs
+        )
+
+    @property
+    def task_name(self) -> str:
+        return self.current_task["task_name"]
 
     @property
     def instructions(self) -> str:
@@ -91,10 +103,8 @@ def eval(self, **kwargs) -> EvalOutput:
         self.last_eval = EvalOutput(success, output)
         return self.last_eval
 
-    def setup_task(self, task_name: str, options: dict = None):
-        if task_name not in self.dataset:
-            raise ValueError(f"Task {task_name} not found in the dataset.")
-        self.current_task = self.dataset[task_name]
+    def setup_task(self):
+        self.current_task = self.task_data
 
     def setup_workspace(self):
         self.workspace.reset()
@@ -122,14 +132,20 @@ def setup_terminal(self):
         )  # Aider tasks come with those.
         self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'")
 
-    def load_dataset(self, problems: str | list[str] | None = None):
-        if isinstance(self.terminal, DockerTerminal):
-            build_docker_image(self.logger)
+    @classmethod
+    def load_dataset(
+        cls,
+        problems: str | list[str] | None = None,
+        build_image: bool = True,
+        logger: logging.Logger | None = None,
+    ) -> dict:
+        if build_image:
+            build_docker_image(logger)
 
-        if not os.path.exists(self.REPO_PATH):
-            subprocess.run(["git", "clone", self.REPO_URL, self.REPO_PATH], check=True)
+        if not os.path.exists(cls.REPO_PATH):
+            subprocess.run(["git", "clone", cls.REPO_URL, cls.REPO_PATH], check=True)
 
-        practice_path = self.REPO_PATH / "exercises" / "practice"
+        practice_path = cls.REPO_PATH / "exercises" / "practice"
         directories = [d for d in practice_path.iterdir() if d.is_dir()]
 
         dataset = {}
@@ -160,11 +176,12 @@ def load_dataset(self, problems: str | list[str] | None = None):
             )
 
             dataset[task_name] = {
+                "task_name": task_name,
                 "codebase": directory,
                 "instructions": instructions,
                 "filename": task_name + ".py",
             }
 
         problems = utils.filter_problems(dataset, problems)
-        dataset = {id: i for id, i in dataset.items() if id in problems}
+        dataset = {id: data for id, data in dataset.items() if id in problems}
         return dataset
diff --git a/debug_gym/gym/envs/env.py b/debug_gym/gym/envs/env.py
@@ -201,41 +201,29 @@ class RepoEnv(TooledEnv):
 
     def __init__(
         self,
-        path: str | None = None,
+        task_data: dict,
         entrypoint: str = "python -m pytest -sq .",
         debug_entrypoint: str | None = None,
         max_score: int | None = None,
-        readonly_patterns: list[str] | None = None,  # TODO: remove
         run_timeout: int | None = None,
         terminal: Terminal | None = None,
         logger: DebugGymLogger | None = None,
-        problems: str | list[str] | None = None,
         **kwargs,
     ):
         super().__init__()
 
-        self.path = path
+        self.task_data = task_data
         self.max_score = max_score
         self.run_timeout = run_timeout
-        self.terminal = terminal or LocalTerminal()  # TODO: default to DockerTerminal
+        self.terminal = terminal
         self._entrypoint = entrypoint
         self._debug_entrypoint = debug_entrypoint
         self.logger = logger or DebugGymLogger("debug-gym")
         self.infos: EnvInfo | None = None
         self.rng = None
         self.additional_kwargs = kwargs
-        self.task_name: str | None = None
-        self.options: dict = {}
-
-        if "auto_eval_on_rewrite" in kwargs:
-            raise ValueError(
-                "The 'auto_eval_on_rewrite' parameter is no longer supported. "
-                "Please remove it from your initialization arguments."
-                "Instead, set 'auto_eval_on_rewrite' in the EvalTool instance."
-            )
 
         self.workspace = Workspace(self.terminal, logger=self.logger)
-        self.dataset = self.load_dataset(problems)
         self.set_entrypoints(self._entrypoint, self._debug_entrypoint)
 
     def _reset_env_state(self):
@@ -290,45 +278,39 @@ def working_dir(self) -> Path:
     def instructions(self) -> str:
         """Instructions for the current task.
         Override in subclasses for different behavior."""
-        return ""
+        raise NotImplementedError(
+            "Subclasses must implement the instructions property."
+        )
 
-    def setup_task(self, task_name: str, options: dict = None) -> None:
+    @property
+    def task_name(self) -> str:
+        raise NotImplementedError("Subclasses must implement the task_name property.")
+
+    def setup_task(self) -> None:
         """Setup the task information.
         Override in subclasses for different behavior. Called once at reset."""
-        pass
+        raise NotImplementedError("Subclasses must implement setup_task method.")
 
     def setup_workspace(self) -> None:
         """Setup the workspace.
         Override in subclasses for different behavior. Called once at reset."""
-        self.workspace.reset()
-        self.workspace.copy_content(self.path)
-        self.workspace.setup_file_filters()
+        raise NotImplementedError("Subclasses must implement setup_workspace method.")
 
     def setup_terminal(self) -> None:
         """Setup the terminal.
         Override in subclasses for different behavior. Called once at reset."""
-
-        self.logger.debug(f"Configuring {self.terminal}...")
-
-        self.terminal.run("git init -b main")
-        self.terminal.run("git config user.name 'debug-gym'")
-        self.terminal.run("git config user.email '<>'")
-
-        self.terminal.run("git add *")
-        self.terminal.run("git commit -am 'Init'")
-
-        self.terminal.run("git add .debugignore .debugreadonly")
-        self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'")
+        raise NotImplementedError("Subclasses must implement setup_terminal method.")
 
     def reset(self, *, options: dict = None):
         """Resets the environment and returns eval as the initial observation."""
-        self.options = options if options is not None else self.options
+        options = options if options is not None else {}
         self.logger.debug("Resetting environment")
-        self.close()  # Clean up previous workspace and terminal.
-        self.task_name = self.options.get("task_name")
-        self.setup_task(task_name=self.task_name, options=self.options)
-        self.setup_workspace()
-        self.setup_terminal()
+        if options.get("reset_runtime", True):
+            self.close()  # Clean up previous workspace and terminal.
+            self.setup_task()
+            self.setup_workspace()
+            self.setup_terminal()
+
         self._reset_env_state()
 
         # Notify all tools that the environment is reset and get their observations
@@ -504,6 +486,3 @@ def close(self):
 
     def __del__(self):
         self.close()
-
-    def load_dataset(self, problems: str | list[str] | None = None):
-        return {"custom": None}
diff --git a/debug_gym/gym/envs/local.py b/debug_gym/gym/envs/local.py
@@ -0,0 +1,87 @@
+from pathlib import Path
+
+from debug_gym.gym.envs.env import RepoEnv
+from debug_gym.gym.terminals.local import LocalTerminal
+from debug_gym.gym.terminals.terminal import Terminal
+
+
+class LocalEnv(RepoEnv):
+    """Environment for debugging a codebase stored on the local filesystem.
+
+    The task is described by a single ``path``; `setup_workspace` copies the
+    content found at that path into the workspace.
+    """
+
+    def __init__(
+        self,
+        path: str,
+        terminal: Terminal | None = None,
+        entrypoint: str = "python -m pytest -sq .",
+        debug_entrypoint: str | None = None,
+        **kwargs,
+    ):
+        """Create the environment for the codebase located at `path`.
+
+        Args:
+            path: Location of the codebase; stored as `task_data["path"]`.
+            terminal: Terminal handed to `RepoEnv`; a `LocalTerminal` is
+                created when none is given.
+            entrypoint: Forwarded unchanged to `RepoEnv.__init__`.
+            debug_entrypoint: Forwarded unchanged to `RepoEnv.__init__`.
+            **kwargs: Forwarded unchanged to `RepoEnv.__init__`.
+        """
+        task_data = {"path": path}
+        terminal = terminal or LocalTerminal()
+        super().__init__(
+            task_data=task_data,
+            terminal=terminal,
+            entrypoint=entrypoint,
+            debug_entrypoint=debug_entrypoint,
+            **kwargs,
+        )
+
+    @property
+    def instructions(self) -> str:
+        """Instructions for the current task."""
+        # Read the path from task_data: self.path is only assigned by
+        # setup_task(), so relying on it would fail before the first reset.
+        path = self.task_data["path"]
+        return f"Debug the local codebase at {path}. Investigate the repository, figure out the root cause, then rewrite the code to fix the issue."
+
+    @property
+    def task_name(self) -> str:
+        """Name of the task: the last component of the codebase path."""
+        path = str(self.task_data["path"])
+        # Path(...).name ignores a trailing separator ("repo/" -> "repo"),
+        # unlike split("/")[-1] which would yield an empty name.
+        return Path(path).name or path
+
+    @property
+    def task(self) -> str:
+        """Alias of `task_name`, kept for backward compatibility."""
+        return self.task_name
+
+    def setup_task(self) -> None:
+        """Setup the task information. Called once at reset."""
+        self.path = self.task_data["path"]
+
+    def setup_workspace(self) -> None:
+        """Setup the workspace. Called once at reset."""
+        self.workspace.reset()
+        self.workspace.copy_content(self.path)
+        self.workspace.setup_file_filters()
+
+    def setup_terminal(self) -> None:
+        """Setup the terminal. Called once at reset."""
+
+        self.logger.debug(f"Configuring {self.terminal}...")
+
+        self.terminal.run("git init -b main")
+        self.terminal.run("git config user.name 'debug-gym'")
+        self.terminal.run("git config user.email '<>'")
+
+        self.terminal.run("git add *")
+        self.terminal.run("git commit -am 'Init'")
+
+        self.terminal.run("git add .debugignore .debugreadonly")
+        self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'")