From 7e78e53127c7f64d729a7d8c16d6bd6433b72640 Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Thu, 27 Nov 2025 10:56:48 -0800
Subject: [PATCH 01/31] kube: name container after pod, add default tolerations

---
 debug_gym/gym/terminals/kubernetes.py | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py
index 5c5f39bc..5d145eef 100644
--- a/debug_gym/gym/terminals/kubernetes.py
+++ b/debug_gym/gym/terminals/kubernetes.py
@@ -508,7 +508,7 @@ def setup_pod(self, max_retries: int = 3) -> None:
                     "restartPolicy": "Never",
                     "containers": [
                         {
-                            "name": "main",
+                            "name": pod_name,
                             "image": f"{self.registry}{self.base_image}",
                             "imagePullPolicy": "IfNotPresent",
                             "command": ["/bin/bash"],
@@ -518,8 +518,7 @@ def setup_pod(self, max_retries: int = 3) -> None:
                             "stdinOnce": False,
                             "tty": True,
                             "env": [
-                                {"name": k, "value": v}
-                                for k, v in self.env_vars.items()
+                                {"name": k, "value": v} for k, v in self.env_vars.items()
                             ],
                             "resources": {
                                 "requests": {"cpu": "0.5", "memory": "1Gi"},
@@ -527,6 +526,24 @@ def setup_pod(self, max_retries: int = 3) -> None:
                             },
                         }
                     ],
+                    "tolerations": [
+                        {
+                            "key": "node.kubernetes.io/disk-pressure",
+                            "operator": "Exists",
+                            "effect": "NoExecute",
+                            "tolerationSeconds": 10800
+                        },
+                        {
+                            "key": "kubernetes.azure.com/scalesetpriority",
+                            "operator": "Equal",
+                            "value": "spot",
+                            "effect": "NoSchedule"
+                        },
+                        {
+                            "key": "CriticalAddonsOnly",
+                            "operator": "Exists"
+                        },
+                    ],
                     **pod_spec_kwargs,  # e.g., nodeSelector, tolerations
                 },
             }

From b44876738c7c16eb75b09561b77eeaf1e534c94c Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Thu, 27 Nov 2025 11:40:34 -0800
Subject: [PATCH 02/31] use uuid as container name

---
 debug_gym/gym/terminals/kubernetes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py
index 5d145eef..af390473 100644
--- a/debug_gym/gym/terminals/kubernetes.py
+++ b/debug_gym/gym/terminals/kubernetes.py
@@ -508,7 +508,7 @@ def setup_pod(self, max_retries: int = 3) -> None:
                     "restartPolicy": "Never",
                     "containers": [
                         {
-                            "name": pod_name,
+                            "name": str(uuid.uuid4())[:8],
                             "image": f"{self.registry}{self.base_image}",
                             "imagePullPolicy": "IfNotPresent",
                             "command": ["/bin/bash"],

From 9ac9226983a14916c1fe3af22da77479023371af Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Thu, 27 Nov 2025 12:38:21 -0800
Subject: [PATCH 03/31] normalize pod name

---
 debug_gym/gym/terminals/kubernetes.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py
index af390473..fae333ae 100644
--- a/debug_gym/gym/terminals/kubernetes.py
+++ b/debug_gym/gym/terminals/kubernetes.py
@@ -3,6 +3,7 @@
 import os
 import random
 import subprocess
+import hashlib
 import time
 import uuid
 from pathlib import Path
@@ -37,6 +38,9 @@ def _clean_for_kubernetes(name: str) -> str:
     # replace any characters not in the regex with hyphens
     cleaned = "".join(c if c.isalnum() or c in "-." else "-" for c in name).lower()
     # ensure it starts and ends with alphanumeric character
+    cleaned = cleaned.replace("/", "-")
+    cleaned = cleaned.replace(":", "-")
+    cleaned = cleaned.replace(".", "-")
     cleaned = cleaned.strip("-").strip(".")
     # truncate to 253 characters
     return cleaned[:253]
@@ -487,7 +491,7 @@ def setup_pod(self, max_retries: int = 3) -> None:
         for attempt in range(max_retries):
             # Generate a new pod name for each attempt to avoid sandbox conflicts
             pod_name = _clean_for_kubernetes(
-                self._pod_name or f"dbg-gym.{self.task_name}.{str(uuid.uuid4())[:8]}"
+                self._pod_name or f"dbg-gym-{self.task_name}-{str(uuid.uuid4())[:8]}"
             )
             self.logger.debug(
                 f"Setting up pod {pod_name} (attempt {attempt + 1}/{max_retries}) "
@@ -508,7 +512,7 @@ def setup_pod(self, max_retries: int = 3) -> None:
                     "restartPolicy": "Never",
                     "containers": [
                         {
-                            "name": str(uuid.uuid4())[:8],
+                            "name": pod_name,
                             "image": f"{self.registry}{self.base_image}",
                             "imagePullPolicy": "IfNotPresent",
                             "command": ["/bin/bash"],

From cebbf374763088f265abfbf8df8eb29251cb4963 Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Thu, 27 Nov 2025 12:55:41 -0800
Subject: [PATCH 04/31] load dataset outside the env classes

---
 debug_gym/gym/envs/r2egym.py          | 138 +++++++++++++------------
 debug_gym/gym/envs/swe_bench.py       |  93 ++++++++---------
 debug_gym/gym/envs/swe_smith.py       | 141 +++++++++++++-------------
 debug_gym/gym/terminals/kubernetes.py |  12 +--
 4 files changed, 188 insertions(+), 196 deletions(-)

diff --git a/debug_gym/gym/envs/r2egym.py b/debug_gym/gym/envs/r2egym.py
index 47bc135c..de02d93d 100644
--- a/debug_gym/gym/envs/r2egym.py
+++ b/debug_gym/gym/envs/r2egym.py
@@ -58,15 +58,74 @@ def parse_log_pytest(log: str | None) -> dict[str, str]:
     return test_status_map
 
 
+def load_dataset(
+    dataset_id: str = "R2E-Gym/R2E-Gym-Lite",
+    dataset_revision: str = "8d3163011f01f9393bb3dc7700497a79a8686ae5",
+    split: str = "train",
+    problems=None,
+    prepull_images=False,
+    logger=None,
+):
+    data_path = Path(dataset_id)
+    if data_path.is_file():
+        # Loading from local file.
+        if data_path.suffix.lower() == ".json":
+            ds = load_dataset("json", data_files=dataset_id)
+        elif data_path.suffix.lower() == ".parquet":
+            ds = load_dataset("parquet", data_files=dataset_id)
+    elif data_path.is_dir():
+        # Loading from local folder.
+        ds = load_from_disk(dataset_id)
+    else:
+        # Loading from HuggingFace or a folder.
+        ds = load_dataset(dataset_id, revision=dataset_revision)
+
+    # Select the split.
+    ds = ds[split]
+
+    # Load custom dataset splits from config.
+    with open(R2EGymEnv.CONFIG) as f:
+        custom_splits = yaml.safe_load(f)
+        excluded_ids = custom_splits.get("excluded", [])
+
+    dataset = {id.split("/", 1)[-1]: i for i, id in enumerate(ds["docker_image"])}
+    problems = filter_problems(dataset, problems, custom_splits, excluded_ids)
+    dataset = {id: i for id, i in dataset.items() if id in problems}
+
+    image_names = set(ds[dataset[id]]["docker_image"] for id in dataset)
+    if logger is not None:
+        logger.debug(
+            f"Loaded {len(dataset)} tasks accross {len(image_names)} Docker images from {dataset_id}."
+        )
+
+    if prepull_images:
+        # Download all images needed for R2E-Gym.
+        client = docker.from_env()
+
+        existing_images = set(
+            tag for image in client.images.list() for tag in image.tags
+        )
+        missing_images = image_names - existing_images
+        if missing_images:
+            if logger is not None:
+                logger.warning(f"Found {len(missing_images)} missing Docker images.")
+                for i, image_name in enumerate(missing_images):
+                    if logger is not None:
+                        logger.warning(
+                            f"Pulling Docker image {i + 1}/{len(missing_images)} `{image_name}`."
+                        )
+                    client.images.pull(image_name)
+
+    return dataset
+
+
 class R2EGymEnv(RepoEnv):
     CACHE = DEBUG_GYM_CACHE_DIR / "r2e-gym"
     CONFIG = importlib_files("debug_gym") / "gym" / "envs" / "configs" / "r2egym.yaml"
 
     def __init__(
         self,
-        dataset_id: str = "R2E-Gym/R2E-Gym-Lite",
-        dataset_revision: str = "8d3163011f01f9393bb3dc7700497a79a8686ae5",
-        split: str = "train",
+        task_data: dict,
         terminal: Terminal | None = None,
         **kwargs,
     ):
@@ -76,11 +135,10 @@ def __init__(
                 "R2EGymEnv only supports DockerTerminal and KubernetesTerminal."
             )
 
-        self.dataset_id = dataset_id
-        self.dataset_revision = dataset_revision
-        self.split = split
-        self.session_commands = []
+        self.ds_row = task_data
+        self.setup_task(task_data=task_data)
 
+        self.session_commands = []
         super().__init__(terminal=terminal, **kwargs)
 
     @property
@@ -93,69 +151,9 @@ def instructions(self) -> str:
         except Exception as e:
             return self.ds_row["problem_statement"]
 
-    def load_dataset(self, problems: str | list[str] | None = None):
-        data_path = Path(self.dataset_id)
-        if data_path.is_file():
-            # Loading from local file.
-            if data_path.suffix.lower() == ".json":
-                self.ds = load_dataset("json", data_files=self.dataset_id)
-            elif data_path.suffix.lower() == ".parquet":
-                self.ds = load_dataset("parquet", data_files=self.dataset_id)
-        elif data_path.is_dir():
-            # Loading from local folder.
-            self.ds = load_from_disk(self.dataset_id)
-        else:
-            # Loading from HuggingFace or a folder.
-            self.ds = load_dataset(self.dataset_id, revision=self.dataset_revision)
-
-        # Select the split.
-        self.ds = self.ds[self.split]
-
-        # Load custom dataset splits from config.
-        with open(R2EGymEnv.CONFIG) as f:
-            custom_splits = yaml.safe_load(f)
-            excluded_ids = custom_splits.get("excluded", [])
-
-        dataset = {
-            id.split("/", 1)[-1]: i for i, id in enumerate(self.ds["docker_image"])
-        }
-        problems = filter_problems(dataset, problems, custom_splits, excluded_ids)
-        dataset = {id: i for id, i in dataset.items() if id in problems}
-
-        image_names = set(self.ds[dataset[id]]["docker_image"] for id in dataset)
-        self.logger.debug(
-            f"Loaded {len(dataset)} tasks accross {len(image_names)} Docker images from {self.dataset_id}."
-        )
-
-        if not isinstance(self.terminal, KubernetesTerminal):
-            # Download all images needed for R2E-Gym.
-            client = docker.from_env()
-
-            existing_images = set(
-                tag for image in client.images.list() for tag in image.tags
-            )
-            missing_images = image_names - existing_images
-            if missing_images:
-                self.logger.warning(
-                    f"Found {len(missing_images)} missing Docker images."
-                )
-                for i, image_name in enumerate(missing_images):
-                    self.logger.warning(
-                        f"Pulling Docker image {i + 1}/{len(missing_images)} `{image_name}`."
-                    )
-                    client.images.pull(image_name)
-
-        return dataset
-
-    def setup_task(self, task_name: str, options: dict = None):
-        if task_name not in self.dataset:
-            raise ValueError(
-                f"Task `{task_name}` was not found in dataset. The available tasks are: {self.dataset}.\n"
-                "Please provide a valid task or initialize the environment without problems to load all tasks."
-            )
-
-        self.task_name = task_name
-        self.ds_row = self.ds[self.dataset[self.task_name]]
+    def setup_task(self, task_data: dict, options: dict = None):
+        self.ds_row = task_data
+        self.task_name = task_data["instance_id"]
         self.base_image = self.ds_row["docker_image"]
         self.package_name = self.ds_row["repo_name"]
         self.expected_output = json.loads(self.ds_row["expected_output_json"])
diff --git a/debug_gym/gym/envs/swe_bench.py b/debug_gym/gym/envs/swe_bench.py
index b438dbe0..6a584fc5 100644
--- a/debug_gym/gym/envs/swe_bench.py
+++ b/debug_gym/gym/envs/swe_bench.py
@@ -16,14 +16,51 @@
 from debug_gym.gym.utils import filter_problems
 
 
+def load_swebench_dataset(
+    dataset_id: str = "SWE-bench/SWE-bench_Verified",
+    dataset_revision: str = "99450355ca8c611021187a57ffac304b66666738",
+    split="test",
+    problems=None,
+    prepull_images=False,
+    logger=None,
+):
+    ds = datasets.load_dataset(dataset_id, revision=dataset_revision)[split]
+    dataset = {id: i for i, id in enumerate(ds["instance_id"])}
+    problems = filter_problems(dataset, problems)
+    dataset = {id: i for id, i in dataset.items() if id in problems}
+
+    instance_ids = [ds[dataset[id]]["instance_id"] for id in dataset]
+    image_names = set(
+        f"sweb.eval.x86_64.{id.replace('__', '_1776_')}" for id in instance_ids
+    )
+
+    if prepull_images:
+        # Download all images needed for SWE-Bench.
+        client = docker.from_env()
+        tagged_image_names = set(f"swebench/{name}:latest" for name in image_names)
+
+        existing_images = set(
+            tag for image in client.images.list() for tag in image.tags
+        )
+        missing_images = tagged_image_names - existing_images
+        if missing_images:
+            if logger:
+                logger.info(f"Found {len(missing_images)} missing Docker images.")
+            for i, image_name in enumerate(missing_images):
+                if logger:
+                    logger.info(
+                        f"Pulling Docker images {i + 1}/{len(missing_images)}: `{image_name}`."
+                    )
+                client.images.pull(image_name)
+    return dataset
+
+
 class SWEBenchEnv(RepoEnv):
     CACHE = DEBUG_GYM_CACHE_DIR / "swe-bench"
 
     def __init__(
         self,
-        dataset_id: str = "SWE-bench/SWE-bench_Verified",
-        dataset_revision: str = "99450355ca8c611021187a57ffac304b66666738",
-        split: str = "test",
+        task_data: dict,
         terminal: Terminal | None = None,
         **kwargs,
     ):
@@ -33,58 +70,18 @@ def __init__(
                 f"{self.__class__.__name__} only supports DockerTerminal and KubernetesTerminal."
             )
 
-        self.dataset_id = dataset_id
-        self.dataset_revision = dataset_revision
-        self.split = split
+        self.ds_row = task_data
+        self.setup_task(self.ds_row)
         self.test_directives = []
-
         super().__init__(terminal=terminal, **kwargs)
 
     @property
     def instructions(self) -> str:
         return self.ds_row["problem_statement"]
 
-    def load_dataset(self, problems: str | list[str] | None = None):
-        self.ds = datasets.load_dataset(
-            self.dataset_id, revision=self.dataset_revision
-        )[self.split]
-        dataset = {id: i for i, id in enumerate(self.ds["instance_id"])}
-        problems = filter_problems(dataset, problems)
-        dataset = {id: i for id, i in dataset.items() if id in problems}
-
-        instance_ids = [self.ds[dataset[id]]["instance_id"] for id in dataset]
-        image_names = set(
-            f"sweb.eval.x86_64.{id.replace('__', '_1776_')}" for id in instance_ids
-        )
-
-        if not isinstance(self.terminal, KubernetesTerminal):
-            # Download all images needed for SWE-Bench.
-            client = docker.from_env()
-            tagged_image_names = set(f"swebench/{name}:latest" for name in image_names)
-
-            existing_images = set(
-                tag for image in client.images.list() for tag in image.tags
-            )
-            missing_images = tagged_image_names - existing_images
-            if missing_images:
-                self.logger.info(f"Found {len(missing_images)} missing Docker images.")
-                for i, image_name in enumerate(missing_images):
-                    self.logger.info(
-                        f"Pulling Docker images {i + 1}/{len(missing_images)}: `{image_name}`."
-                    )
-                    client.images.pull(image_name)
-
-        return dataset
-
-    def setup_task(self, task_name: str, options: dict = None):
-        if task_name not in self.dataset:
-            raise ValueError(
-                f"Task `{task_name}` was not found in dataset. The available tasks are: {sorted(self.dataset)}.\n"
-                "Please provide a valid task or initialize the environment without problems to load all tasks."
-            )
-
-        self.task_name = task_name
-        self.ds_row = self.ds[self.dataset[self.task_name]]
+    def setup_task(self, task_data: dict, options: dict = None):
+        self.ds_row = task_data
+        self.task_name = task_data["instance_id"]
         self.repo = self.ds_row["repo"]
         self.package_name = self.repo.split("/")[1]
         self.version = self.ds_row["version"]
diff --git a/debug_gym/gym/envs/swe_smith.py b/debug_gym/gym/envs/swe_smith.py
index ce7ef627..f2162fe3 100644
--- a/debug_gym/gym/envs/swe_smith.py
+++ b/debug_gym/gym/envs/swe_smith.py
@@ -19,6 +19,71 @@
 from debug_gym.gym.utils import filter_problems
 
 
+def load_swesmith_dataset(
+    dataset_id: str = "SWE-bench/SWE-smith",
+    dataset_revision: str = "699b53400d3855206a0fbf3ff4beaf1a52f4f232",
+    split: str = "train",
+    problems: str | list[str] | None = None,
+    prepull_images: bool = False,
+    logger=None,
+):
+    data_path = Path(dataset_id)
+    if data_path.is_file():
+        # Loading from local file.
+        if data_path.suffix.lower() == ".json":
+            ds = load_dataset("json", data_files=dataset_id)
+        elif data_path.suffix.lower() == ".parquet":
+            ds = load_dataset("parquet", data_files=dataset_id)
+    elif data_path.is_dir():
+        # Loading from local folder.
+        ds = load_from_disk(dataset_id)
+    else:
+        # Loading from HuggingFace or a folder.
+        ds = load_dataset(dataset_id, revision=dataset_revision)
+
+    # Select the split.
+    ds = ds[split]
+
+    # Load custom dataset splits from config.
+    with open(SWESmithEnv.CONFIG) as f:
+        custom_splits = yaml.safe_load(f)
+        excluded_ids = custom_splits.get("excluded", [])
+
+    dataset = {id: i for i, id in enumerate(ds["instance_id"])}
+    problems = filter_problems(dataset, problems, custom_splits, excluded_ids)
+    dataset = {id: i for id, i in dataset.items() if id in problems}
+
+    image_names = set(ds[dataset[id]]["image_name"] for id in dataset)
+    if logger is not None:
+        logger.debug(
+            f"Loaded {len(dataset)} tasks accross {len(image_names)} Docker images from {dataset_id}."
+        )
+
+    if prepull_images:
+        # Download all images needed for SWE-Smith.
+        client = docker.from_env()
+        tagged_image_names = set(f"{DOCKER_ORG}/{name}:{TAG}" for name in image_names)
+
+        existing_images = set(
+            tag for image in client.images.list() for tag in image.tags
+        )
+        missing_images = tagged_image_names - existing_images
+        if missing_images:
+            if logger is not None:
+                logger.info(f"Found {len(missing_images)} missing Docker images.")
+
+            for image_name in missing_images:
+                docker_hub_image = image_name.replace("__", "_1776_")
+                if logger is not None:
+                    logger.info(
+                        f"Pulling Docker image `{docker_hub_image}` to `{image_name}`."
+                    )
+                client.images.pull(docker_hub_image)
+                # Rename images via tagging
+                client.images.get(docker_hub_image).tag(image_name)
+    return dataset
+
+
 class SWESmithEnv(SWEBenchEnv):
     CACHE = DEBUG_GYM_CACHE_DIR / "swe-smith"
     CONFIG = (
@@ -27,85 +92,19 @@ class SWESmithEnv(SWEBenchEnv):
 
     def __init__(
         self,
-        dataset_id: str = "SWE-bench/SWE-smith",
-        dataset_revision: str = "699b53400d3855206a0fbf3ff4beaf1a52f4f232",
-        split: str = "train",
+        task_data: dict,
         terminal: Terminal | None = None,
         **kwargs,
     ):
         super().__init__(
-            dataset_id=dataset_id,
-            dataset_revision=dataset_revision,
-            split=split,
+            task_data=task_data,
             terminal=terminal,
             **kwargs,
         )
 
-    def load_dataset(self, problems: str | list[str] | None = None):
-        data_path = Path(self.dataset_id)
-        if data_path.is_file():
-            # Loading from local file.
-            if data_path.suffix.lower() == ".json":
-                self.ds = load_dataset("json", data_files=self.dataset_id)
-            elif data_path.suffix.lower() == ".parquet":
-                self.ds = load_dataset("parquet", data_files=self.dataset_id)
-        elif data_path.is_dir():
-            # Loading from local folder.
-            self.ds = load_from_disk(self.dataset_id)
-        else:
-            # Loading from HuggingFace or a folder.
-            self.ds = load_dataset(self.dataset_id, revision=self.dataset_revision)
-
-        # Select the split.
-        self.ds = self.ds[self.split]
-
-        # Load custom dataset splits from config.
-        with open(SWESmithEnv.CONFIG) as f:
-            custom_splits = yaml.safe_load(f)
-            excluded_ids = custom_splits.get("excluded", [])
-
-        dataset = {id: i for i, id in enumerate(self.ds["instance_id"])}
-        problems = filter_problems(dataset, problems, custom_splits, excluded_ids)
-        dataset = {id: i for id, i in dataset.items() if id in problems}
-
-        image_names = set(self.ds[dataset[id]]["image_name"] for id in dataset)
-        self.logger.debug(
-            f"Loaded {len(dataset)} tasks accross {len(image_names)} Docker images from {self.dataset_id}."
-        )
-
-        if not isinstance(self.terminal, KubernetesTerminal):
-            # Download all images needed for SWE-Smith.
-            client = docker.from_env()
-            tagged_image_names = set(
-                f"{DOCKER_ORG}/{name}:{TAG}" for name in image_names
-            )
-
-            existing_images = set(
-                tag for image in client.images.list() for tag in image.tags
-            )
-            missing_images = tagged_image_names - existing_images
-            if missing_images:
-                self.logger.info(f"Found {len(missing_images)} missing Docker images.")
-                for image_name in missing_images:
-                    docker_hub_image = image_name.replace("__", "_1776_")
-                    self.logger.info(
-                        f"Pulling Docker image `{docker_hub_image}` to `{image_name}`."
-                    )
-                    client.images.pull(docker_hub_image)
-                    # Rename images via tagging
-                    client.images.get(docker_hub_image).tag(image_name)
-
-        return dataset
-
-    def setup_task(self, task_name: str, options: dict = None):
-        if task_name not in self.dataset:
-            raise ValueError(
-                f"Task `{task_name}` was not found in dataset. The available tasks are: {sorted(self.dataset)}.\n"
-                "Please provide a valid task or initialize the environment without problems to load all tasks."
-            )
-
-        self.task_name = task_name
-        self.ds_row = self.ds[self.dataset[self.task_name]]
+    def setup_task(self, task_data: dict, options: dict = None):
+        self.task_name = task_data["instance_id"]
+        self.ds_row = task_data
         self.base_commit = (
             self.ds_row["base_commit"] if "base_commit" in self.ds_row else "main"
         )
diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py
index fae333ae..7d005f89 100644
--- a/debug_gym/gym/terminals/kubernetes.py
+++ b/debug_gym/gym/terminals/kubernetes.py
@@ -522,7 +522,8 @@ def setup_pod(self, max_retries: int = 3) -> None:
                             "stdinOnce": False,
                             "tty": True,
                             "env": [
-                                {"name": k, "value": v} for k, v in self.env_vars.items()
+                                {"name": k, "value": v}
+                                for k, v in self.env_vars.items()
                             ],
                             "resources": {
                                 "requests": {"cpu": "0.5", "memory": "1Gi"},
@@ -535,18 +536,15 @@ def setup_pod(self, max_retries: int = 3) -> None:
                             "key": "node.kubernetes.io/disk-pressure",
                             "operator": "Exists",
                             "effect": "NoExecute",
-                            "tolerationSeconds": 10800
+                            "tolerationSeconds": 10800,
                         },
                         {
                             "key": "kubernetes.azure.com/scalesetpriority",
                             "operator": "Equal",
                             "value": "spot",
-                            "effect": "NoSchedule"
-                        },
-                        {
-                            "key": "CriticalAddonsOnly",
-                            "operator": "Exists"
+                            "effect": "NoSchedule",
                         },
+                        {"key": "CriticalAddonsOnly", "operator": "Exists"},
                     ],
                     **pod_spec_kwargs,  # e.g., nodeSelector, tolerations
                 },

From ffd87cf37b1824048a29bc846c6ea13f5642d6e6 Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Thu, 27 Nov 2025 12:56:56 -0800
Subject: [PATCH 05/31] remove tolerations

---
 debug_gym/gym/terminals/kubernetes.py | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py
index 7d005f89..5fa90ea0 100644
--- a/debug_gym/gym/terminals/kubernetes.py
+++ b/debug_gym/gym/terminals/kubernetes.py
@@ -531,21 +531,6 @@ def setup_pod(self, max_retries: int = 3) -> None:
                             },
                         }
                     ],
-                    "tolerations": [
-                        {
-                            "key": "node.kubernetes.io/disk-pressure",
-                            "operator": "Exists",
-                            "effect": "NoExecute",
-                            "tolerationSeconds": 10800,
-                        },
-                        {
-                            "key": "kubernetes.azure.com/scalesetpriority",
-                            "operator": "Equal",
-                            "value": "spot",
-                            "effect": "NoSchedule",
-                        },
-                        {"key": "CriticalAddonsOnly", "operator": "Exists"},
-                    ],
                     **pod_spec_kwargs,  # e.g., nodeSelector, tolerations
                 },
             }

From 0d2996745da3523e0fafe8fa91e0603b0df90479 Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Thu, 27 Nov 2025 13:05:09 -0800
Subject: [PATCH 06/31] incorporate dataset loading into scripts/run.py

---
 scripts/run.py | 43 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 35 insertions(+), 8 deletions(-)

diff --git a/scripts/run.py b/scripts/run.py
index 5139afd1..4c1e458a 100644
--- a/scripts/run.py
+++ b/scripts/run.py
@@ -18,7 +18,11 @@
 from debug_gym.llms.human import Human
 from debug_gym.logger import DebugGymLogger, load_previous_run_status
 
-
+from debug_gym.gym.envs.swe_bench import load_swebench_dataset
+from debug_gym.gym.envs.swe_smith import load_swesmith_dataset
+from debug_gym.gym.envs.r2egym import load_r2egym_dataset
+        
+        
 class AgentTimeoutException(BaseException):
     """Custom exception to handle timeouts in agent
     execution. Inherits from BaseException to ensure
@@ -40,7 +44,11 @@ def timeout_handler(signum, frame):
         signal.alarm(timeout_seconds)
 
 
-def run_agent(args, problem, config):
+def run_agent(
+    args,
+    problem: dict,
+    config: dict
+):
     set_signal(args.timeout)
     success = True
     env = None
@@ -90,7 +98,7 @@ def run_agent(args, problem, config):
             status="running",
         )
 
-        env = create_env(config, task_logger)
+        env = create_env(config, problem, task_logger)
         add_tools(env, config, task_logger)
 
         llm = LLM.instantiate(
@@ -176,14 +184,14 @@ def run_agent(args, problem, config):
     return success
 
 
-def create_env(config: dict, logger: DebugGymLogger):
+def create_env(config: dict, problem: dict, logger: DebugGymLogger):
     terminal = select_terminal(config.get("terminal"), logger, uuid=config["uuid"])
     env_class = select_env(config.get("benchmark"))
     env = env_class(
-        **config["env_kwargs"],
-        problems=config.get("problems", ["custom"]),
+        task_data=problem,
         terminal=terminal,
         logger=logger,
+        **config["env_kwargs"],
     )
     return env
 
@@ -248,8 +256,27 @@ def main():
     dump_experiment_info(config, args)
 
     # Create the environment to get the list of problems to run.
-    env = create_env(config, logger=logger)
-    problems = sorted(env.dataset)
+    dataset_info = {
+        "dataset_id": config.env_kwargs.get("dataset_id"),
+        "dataset_revision": config.env_kwargs.get("dataset_revision"),
+        "problems": config.get("problems", "all"),
+        "prepull_images": config.env_kwargs.get("prepull_images", False)
+    }
+    load_dataset_fn = {
+        "swebench": load_swebench_dataset,
+        "swebench-debug": load_swebench_dataset,
+        "swesmith": load_swesmith_dataset,
+        "r2egym": load_r2egym_dataset,
+    }
+
+    if config['benchmark'] in load_dataset_fn:
+        dataset = load_dataset_fn[config['benchmark']](
+            **dataset_info,
+        )
+    else:
+        raise ValueError(f"Unsupported benchmark: {config['benchmark']}")
+
+    problems = sorted(dataset)
 
     if args.list:
         print(f"\n# Available problems in {config.get('benchmark', 'config')}:")

From c7afaa2c593caf87cf7b526b8ca33f231fbbe0ed Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Thu, 27 Nov 2025 13:08:53 -0800
Subject: [PATCH 07/31] some type annotations

---
 debug_gym/gym/envs/r2egym.py    |  9 +++++----
 debug_gym/gym/envs/swe_bench.py | 10 +++++-----
 debug_gym/gym/envs/swe_smith.py |  6 +++---
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/debug_gym/gym/envs/r2egym.py b/debug_gym/gym/envs/r2egym.py
index de02d93d..66745dfd 100644
--- a/debug_gym/gym/envs/r2egym.py
+++ b/debug_gym/gym/envs/r2egym.py
@@ -14,6 +14,7 @@
 from debug_gym.gym.terminals.kubernetes import KubernetesTerminal
 from debug_gym.gym.terminals.terminal import Terminal
 from debug_gym.gym.utils import filter_problems
+from debug_gym.logger import DebugGymLogger
 
 
 def decolor_dict_keys(key):
@@ -58,13 +59,13 @@ def parse_log_pytest(log: str | None) -> dict[str, str]:
     return test_status_map
 
 
-def load_dataset(
+def load_r2egym_dataset(
     dataset_id: str = "R2E-Gym/R2E-Gym-Lite",
     dataset_revision: str = "8d3163011f01f9393bb3dc7700497a79a8686ae5",
     split: str = "train",
-    problems=None,
-    prepull_images=False,
-    logger=None,
+    problems: list | None = None,
+    prepull_images: bool = False,
+    logger: DebugGymLogger | None = None,
 ):
     data_path = Path(dataset_id)
     if data_path.is_file():
diff --git a/debug_gym/gym/envs/swe_bench.py b/debug_gym/gym/envs/swe_bench.py
index 6a584fc5..9af6b8f6 100644
--- a/debug_gym/gym/envs/swe_bench.py
+++ b/debug_gym/gym/envs/swe_bench.py
@@ -12,17 +12,17 @@
 from debug_gym.gym.envs.env import RepoEnv
 from debug_gym.gym.terminals.docker import DockerTerminal
 from debug_gym.gym.terminals.kubernetes import KubernetesTerminal
-from debug_gym.gym.terminals.terminal import Terminal
+from debug_gym.gym.terminals.terminal import DebugGymLogger, Terminal
 from debug_gym.gym.utils import filter_problems
 
 
 def load_swebench_dataset(
     dataset_id: str = "SWE-bench/SWE-bench_Verified",
     dataset_revision: str = "99450355ca8c611021187a57ffac304b66666738",
-    split="test",
-    problems=None,
-    prepull_images=False,
-    logger=None,
+    split: str = "test",
+    problems: list | None = None,
+    prepull_images: bool = False,
+    logger: DebugGymLogger | None = None,
 ):
     ds = datasets.load_dataset(dataset_id, revision=dataset_revision)[split]
     dataset = {id: i for i, id in enumerate(ds["instance_id"])}
diff --git a/debug_gym/gym/envs/swe_smith.py b/debug_gym/gym/envs/swe_smith.py
index f2162fe3..9dc934fb 100644
--- a/debug_gym/gym/envs/swe_smith.py
+++ b/debug_gym/gym/envs/swe_smith.py
@@ -15,7 +15,7 @@
 from debug_gym.gym.entities import EvalOutput
 from debug_gym.gym.envs.swe_bench import SWEBenchEnv
 from debug_gym.gym.terminals.kubernetes import KubernetesTerminal
-from debug_gym.gym.terminals.terminal import Terminal
+from debug_gym.gym.terminals.terminal import DebugGymLogger, Terminal
 from debug_gym.gym.utils import filter_problems
 
 
@@ -23,9 +23,9 @@ def load_swesmith_dataset(
     dataset_id: str = "SWE-bench/SWE-smith",
     dataset_revision: str = "699b53400d3855206a0fbf3ff4beaf1a52f4f232",
     split: str = "train",
-    problems: str | list[str] | None = None,
+    problems: list | None = None,
     prepull_images: bool = False,
-    logger=None,
+    logger: DebugGymLogger | None = None,
 ):
     data_path = Path(dataset_id)
     if data_path.is_file():

From c506fe16a1557699469e3938c6ef8ec772c77614 Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Thu, 27 Nov 2025 13:13:06 -0800
Subject: [PATCH 08/31] fixture first fix

---
 scripts/run.py             |  2 +-
 tests/gym/envs/conftest.py | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/scripts/run.py b/scripts/run.py
index 4c1e458a..5b9bc0fc 100644
--- a/scripts/run.py
+++ b/scripts/run.py
@@ -315,7 +315,7 @@ def main():
                 try:
                     success = run_agent(args, problem, config)
                 except AgentTimeoutException:
-                    pass  # Handleled in run_agent, just continue
+                    pass  # Handled in run_agent, just continue
                 except (KeyboardInterrupt, Exception) as e:
                     raise e
         else:
diff --git a/tests/gym/envs/conftest.py b/tests/gym/envs/conftest.py
index 1d4056ce..3f7792a6 100644
--- a/tests/gym/envs/conftest.py
+++ b/tests/gym/envs/conftest.py
@@ -2,7 +2,10 @@
 from filelock import FileLock
 
 from debug_gym.gym.envs import R2EGymEnv, SWEBenchEnv, SWESmithEnv
+from debug_gym.gym.envs.r2egym import load_r2egym_dataset
+from debug_gym.gym.envs.swe_bench import load_swebench_dataset
 from debug_gym.gym.envs.swe_bench_debug import SWEBenchDebugEnv
+from debug_gym.gym.envs.swe_smith import load_swesmith_dataset
 
 BUILD_ENV_CONFIGS = {
     "swe_smith": {
@@ -31,7 +34,16 @@ def make_env_factory(env_name, worker_id, tmp_path_factory):
     env_class = kwargs.pop("env_class")
 
     def _make_env():
-        return env_class(**kwargs)
+        if type(env_class) in [SWEBenchEnv, SWEBenchDebugEnv]:
+            fn = load_swebench_dataset
+        elif type(env_class) == SWESmithEnv:
+            fn = load_swesmith_dataset
+        elif type(env_class) == R2EGymEnv:
+            fn = load_r2egym_dataset
+        else:
+            raise ValueError(f"Unknown env_class: {env_class}")
+        task_data = fn(problems=kwargs["problems"])[0]
+        return env_class(task_data=task_data)
 
     if worker_id == "master":
         # Not running with pytest-xdist or we are in the master process

From aab04ff99894f89f940b93504405608b36059f71 Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Thu, 27 Nov 2025 13:27:27 -0800
Subject: [PATCH 09/31] fix conftest env-class checks and r2egym dataset test

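---
Review note: in main(), dataset_info is built with
config.env_kwargs.get("dataset_id") / .get("dataset_revision") and then
splatted into the selected loader. A config that omits those keys therefore
passes None explicitly, which overrides the loaders' default
dataset_id/dataset_revision (e.g. Path(None) in load_r2egym_dataset).
Consider dropping keys whose value is None before the call, unless a later
patch in this series already handles this.
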
 scripts/run.py                | 16 ++++++----------
 tests/gym/envs/conftest.py    |  6 +++---
 tests/gym/envs/test_r2egym.py | 16 +++++++++-------
 3 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/scripts/run.py b/scripts/run.py
index 5b9bc0fc..d6a22c08 100644
--- a/scripts/run.py
+++ b/scripts/run.py
@@ -21,8 +21,8 @@
 from debug_gym.gym.envs.swe_bench import load_swebench_dataset
 from debug_gym.gym.envs.swe_smith import load_swesmith_dataset
 from debug_gym.gym.envs.r2egym import load_r2egym_dataset
-        
-        
+
+
 class AgentTimeoutException(BaseException):
     """Custom exception to handle timeouts in agent
     execution. Inherits from BaseException to ensure
@@ -44,11 +44,7 @@ def timeout_handler(signum, frame):
         signal.alarm(timeout_seconds)
 
 
-def run_agent(
-    args,
-    problem: dict,
-    config: dict
-):
+def run_agent(args, problem: dict, config: dict):
     set_signal(args.timeout)
     success = True
     env = None
@@ -260,7 +256,7 @@ def main():
         "dataset_id": config.env_kwargs.get("dataset_id"),
         "dataset_revision": config.env_kwargs.get("dataset_revision"),
         "problems": config.get("problems", "all"),
-        "prepull_images": config.env_kwargs.get("prepull_images", False)
+        "prepull_images": config.env_kwargs.get("prepull_images", False),
     }
     load_dataset_fn = {
         "swebench": load_swebench_dataset,
@@ -269,8 +265,8 @@ def main():
         "r2egym": load_r2egym_dataset,
     }
 
-    if config['benchmark'] in load_dataset_fn:
-        dataset = load_dataset_fn[config['benchmark']](
+    if config["benchmark"] in load_dataset_fn:
+        dataset = load_dataset_fn[config["benchmark"]](
             **dataset_info,
         )
     else:
diff --git a/tests/gym/envs/conftest.py b/tests/gym/envs/conftest.py
index 3f7792a6..4dd1c114 100644
--- a/tests/gym/envs/conftest.py
+++ b/tests/gym/envs/conftest.py
@@ -34,11 +34,11 @@ def make_env_factory(env_name, worker_id, tmp_path_factory):
     env_class = kwargs.pop("env_class")
 
     def _make_env():
-        if type(env_class) in [SWEBenchEnv, SWEBenchDebugEnv]:
+        if isinstance(env_class, (SWEBenchEnv, SWEBenchDebugEnv)):
             fn = load_swebench_dataset
-        elif type(env_class) == SWESmithEnv:
+        elif isinstance(env_class, SWESmithEnv):
             fn = load_swesmith_dataset
-        elif type(env_class) == R2EGymEnv:
+        elif isinstance(env_class, R2EGymEnv):
             fn = load_r2egym_dataset
         else:
             raise ValueError(f"Unknown env_class: {env_class}")
diff --git a/tests/gym/envs/test_r2egym.py b/tests/gym/envs/test_r2egym.py
index 43d25387..be32c2c2 100644
--- a/tests/gym/envs/test_r2egym.py
+++ b/tests/gym/envs/test_r2egym.py
@@ -7,7 +7,7 @@
 
 from debug_gym.agents.solution_agent import AgentSolution
 from debug_gym.gym.entities import Observation
-from debug_gym.gym.envs.r2egym import R2EGymEnv
+from debug_gym.gym.envs.r2egym import R2EGymEnv, load_r2egym_dataset
 from debug_gym.gym.terminals.docker import DockerTerminal
 from debug_gym.gym.tools.tool import ToolCall
 from debug_gym.gym.tools.toolbox import Toolbox
@@ -73,10 +73,12 @@ def test_load_dataset_from_parquet(mock_docker_from_env, tmp_path):
     mock_terminal = MagicMock(spec=DockerTerminal)
 
     # Load the dataset from the Parquet file
-    env = R2EGymEnv(dataset_id=str(parquet_file), split="train", terminal=mock_terminal)
+    dataset = load_r2egym_dataset(
+        dataset_id=str(parquet_file), split="train", terminal=mock_terminal
+    )
 
     # Verify the dataset contains the expected features
-    assert sorted(env.ds.features.keys()) == sorted(
+    assert sorted(dataset.features.keys()) == sorted(
         [
             "commit_hash",
             "docker_image",
@@ -96,10 +98,10 @@ def test_load_dataset_from_parquet(mock_docker_from_env, tmp_path):
     )
 
     # Verify the dataset has the expected data
-    assert len(env.ds) == 1
-    assert env.ds[0]["docker_image"] == "test_repo:test_hash_123"
-    assert env.ds[0]["commit_hash"] == "test_hash_123"
-    assert "Test problem statement" in env.ds[0]["problem_statement"]
+    assert len(dataset) == 1
+    assert dataset[0]["docker_image"] == "test_repo:test_hash_123"
+    assert dataset[0]["commit_hash"] == "test_hash_123"
+    assert "Test problem statement" in dataset[0]["problem_statement"]
 
 
 @pytest.if_docker_running

From cc8f813e1680cd1844c5bc72f3f860fe08f38e99 Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Thu, 27 Nov 2025 13:49:18 -0800
Subject: [PATCH 10/31] fix tests

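---
Review note: SWESmithEnv subclasses SWEBenchEnv (swe_smith.py declares
`class SWESmithEnv(SWEBenchEnv)`), so with issubclass() the first branch of
_make_env already matches SWESmithEnv and selects load_swebench_dataset; the
SWESmithEnv branch is unreachable in this order. Test the more specific
class first, unless a later patch in this series restructures this code.
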
 tests/gym/envs/conftest.py             | 6 +++---
 tests/gym/envs/test_r2egym.py          | 5 +----
 tests/gym/terminals/test_kubernetes.py | 2 +-
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/tests/gym/envs/conftest.py b/tests/gym/envs/conftest.py
index 4dd1c114..94e70a47 100644
--- a/tests/gym/envs/conftest.py
+++ b/tests/gym/envs/conftest.py
@@ -34,11 +34,11 @@ def make_env_factory(env_name, worker_id, tmp_path_factory):
     env_class = kwargs.pop("env_class")
 
     def _make_env():
-        if isinstance(env_class, (SWEBenchEnv, SWEBenchDebugEnv)):
+        if issubclass(env_class, (SWEBenchEnv, SWEBenchDebugEnv)):
             fn = load_swebench_dataset
-        elif isinstance(env_class, SWESmithEnv):
+        elif issubclass(env_class, SWESmithEnv):
             fn = load_swesmith_dataset
-        elif isinstance(env_class, R2EGymEnv):
+        elif issubclass(env_class, R2EGymEnv):
             fn = load_r2egym_dataset
         else:
             raise ValueError(f"Unknown env_class: {env_class}")
diff --git a/tests/gym/envs/test_r2egym.py b/tests/gym/envs/test_r2egym.py
index be32c2c2..f7bfd352 100644
--- a/tests/gym/envs/test_r2egym.py
+++ b/tests/gym/envs/test_r2egym.py
@@ -69,12 +69,9 @@ def test_load_dataset_from_parquet(mock_docker_from_env, tmp_path):
     table = pa.table(data)
     pq.write_table(table, str(parquet_file))
 
-    # Mock the terminal to avoid actual Docker operations
-    mock_terminal = MagicMock(spec=DockerTerminal)
-
     # Load the dataset from the Parquet file
     dataset = load_r2egym_dataset(
-        dataset_id=str(parquet_file), split="train", terminal=mock_terminal
+        dataset_id=str(parquet_file), split="train"
     )
 
     # Verify the dataset contains the expected features
diff --git a/tests/gym/terminals/test_kubernetes.py b/tests/gym/terminals/test_kubernetes.py
index 0161fbcc..dcde0a56 100644
--- a/tests/gym/terminals/test_kubernetes.py
+++ b/tests/gym/terminals/test_kubernetes.py
@@ -70,7 +70,7 @@ def test_kubernetes_terminal_init():
     assert terminal._pod is not None
 
     # Pod name should be automatically generated when not provided at initialization.
-    assert terminal.pod_name.startswith("dbg-gym.")
+    assert terminal.pod_name.startswith("dbg-gym-")
     assert terminal.pod.is_running()
     assert terminal.pod.exists()
 

From 77aeb783210a556ffff29605ce07ed5ed0573b90 Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Thu, 27 Nov 2025 14:14:59 -0800
Subject: [PATCH 11/31] simplify filtering

---
 debug_gym/gym/envs/r2egym.py          | 14 +++++++-------
 debug_gym/gym/envs/swe_bench.py       | 10 +++++-----
 debug_gym/gym/envs/swe_smith.py       | 11 +++++------
 debug_gym/gym/terminals/kubernetes.py |  2 +-
 debug_gym/gym/utils.py                | 12 ++++++------
 5 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/debug_gym/gym/envs/r2egym.py b/debug_gym/gym/envs/r2egym.py
index 66745dfd..1cab6740 100644
--- a/debug_gym/gym/envs/r2egym.py
+++ b/debug_gym/gym/envs/r2egym.py
@@ -89,14 +89,15 @@ def load_r2egym_dataset(
         custom_splits = yaml.safe_load(f)
         excluded_ids = custom_splits.get("excluded", [])
 
-    dataset = {id.split("/", 1)[-1]: i for i, id in enumerate(ds["docker_image"])}
-    problems = filter_problems(dataset, problems, custom_splits, excluded_ids)
-    dataset = {id: i for id, i in dataset.items() if id in problems}
+    # add instance id to each example (name of the image)
+    ds["instance_id"] = [id.split("/", 1)[-1] for id in ds["docker_image"]]
+    problems = filter_problems(ds["instance_id"], problems, custom_splits, excluded_ids)
+    ds = ds.filter(lambda example: example["instance_id"] in problems)
 
-    image_names = set(ds[dataset[id]]["docker_image"] for id in dataset)
+    image_names = set(ds["docker_image"])
     if logger is not None:
         logger.debug(
-            f"Loaded {len(dataset)} tasks accross {len(image_names)} Docker images from {dataset_id}."
+            f"Loaded {len(ds)} tasks across {len(image_names)} Docker images from {dataset_id}."
         )
 
     if prepull_images:
@@ -116,8 +117,7 @@ def load_r2egym_dataset(
                             f"Pulling Docker image {i + 1}/{len(missing_images)} `{image_name}`."
                         )
                     client.images.pull(image_name)
-
-    return dataset
+    return ds
 
 
 class R2EGymEnv(RepoEnv):
diff --git a/debug_gym/gym/envs/swe_bench.py b/debug_gym/gym/envs/swe_bench.py
index 9af6b8f6..891e0c2c 100644
--- a/debug_gym/gym/envs/swe_bench.py
+++ b/debug_gym/gym/envs/swe_bench.py
@@ -25,11 +25,11 @@ def load_swebench_dataset(
     logger: DebugGymLogger | None = None,
 ):
     ds = datasets.load_dataset(dataset_id, revision=dataset_revision)[split]
-    dataset = {id: i for i, id in enumerate(ds["instance_id"])}
-    problems = filter_problems(dataset, problems)
-    dataset = {id: i for id, i in dataset.items() if id in problems}
+    problems = filter_problems(ds["instance_id"], problems)
+
+    ds = ds.filter(lambda example: example["instance_id"] in problems)
+    instance_ids = ds["instance_id"]
 
-    instance_ids = [ds[dataset[id]]["instance_id"] for id in dataset]
     image_names = set(
         f"sweb.eval.x86_64.{id.replace('__', '_1776_')}" for id in instance_ids
     )
@@ -52,7 +52,7 @@ def load_swebench_dataset(
                         f"Pulling Docker images {i + 1}/{len(missing_images)}: `{image_name}`."
                     )
                 client.images.pull(image_name)
-    return dataset
+    return ds
 
 
 class SWEBenchEnv(RepoEnv):
diff --git a/debug_gym/gym/envs/swe_smith.py b/debug_gym/gym/envs/swe_smith.py
index 9dc934fb..e67c2be1 100644
--- a/debug_gym/gym/envs/swe_smith.py
+++ b/debug_gym/gym/envs/swe_smith.py
@@ -49,14 +49,13 @@ def load_swesmith_dataset(
         custom_splits = yaml.safe_load(f)
         excluded_ids = custom_splits.get("excluded", [])
 
-    dataset = {id: i for i, id in enumerate(ds["instance_id"])}
-    problems = filter_problems(dataset, problems, custom_splits, excluded_ids)
-    dataset = {id: i for id, i in dataset.items() if id in problems}
+    problems = filter_problems(ds["instance_id"], problems)
+    ds = ds.filter(lambda example: example["instance_id"] in problems)
 
-    image_names = set(ds[dataset[id]]["image_name"] for id in dataset)
+    image_names = set(ds["image_name"])
     if logger is not None:
         logger.debug(
-            f"Loaded {len(dataset)} tasks accross {len(image_names)} Docker images from {dataset_id}."
+            f"Loaded {len(ds)} tasks across {len(image_names)} Docker images from {dataset_id}."
         )
 
     if prepull_images:
@@ -81,7 +80,7 @@ def load_swesmith_dataset(
                 client.images.pull(docker_hub_image)
                 # Rename images via tagging
                 client.images.get(docker_hub_image).tag(image_name)
-    return dataset
+    return ds
 
 
 class SWESmithEnv(SWEBenchEnv):
diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py
index 5fa90ea0..f40ca38a 100644
--- a/debug_gym/gym/terminals/kubernetes.py
+++ b/debug_gym/gym/terminals/kubernetes.py
@@ -512,7 +512,7 @@ def setup_pod(self, max_retries: int = 3) -> None:
                     "restartPolicy": "Never",
                     "containers": [
                         {
-                            "name": pod_name,
+                            "name": "main",
                             "image": f"{self.registry}{self.base_image}",
                             "imagePullPolicy": "IfNotPresent",
                             "command": ["/bin/bash"],
diff --git a/debug_gym/gym/utils.py b/debug_gym/gym/utils.py
index 24372a44..1d95294b 100644
--- a/debug_gym/gym/utils.py
+++ b/debug_gym/gym/utils.py
@@ -196,7 +196,7 @@ def extract_reward_from_pytest_output(output):
 
 
 def filter_problems(
-    dataset: dict[str, Any],
+    dataset_instances: list[str],
     problems: str | list[str] | None = None,
     custom_splits: dict[str, Any] | None = None,
     excluded_ids: list[str] | None = None,
@@ -208,9 +208,9 @@ def filter_problems(
     if not isinstance(problems, str):
         # Check that all problems are valid task names.
         for problem in problems:
-            if problem not in dataset:
+            if problem not in dataset_instances:
                 raise ValueError(
-                    f"Invalid problem id: '{problem}'.\nChoose from: {sorted(dataset)}"
+                    f"Invalid problem id: '{problem}'.\nChoose from: {sorted(dataset_instances)}"
                 )
 
         # Make sure all problems are unique.
@@ -220,14 +220,14 @@ def filter_problems(
         return problems  # Assuming a list of problem IDs.
 
     if problems == "all":
-        return [k for k in dataset if k not in excluded_ids]
-    elif problems in dataset:
+        return [k for k in dataset_instances if k not in excluded_ids]
+    elif problems in dataset_instances:
         return [problems]  # Single task
     elif problems in custom_splits:
         return custom_splits[problems]
     else:
         raise ValueError(
-            f"Invalid split or problem id: '{problems}'.\nChoose from: {sorted(dataset) + ['all'] + sorted(custom_splits)}"
+            f"Invalid split or problem id: '{problems}'.\nChoose from: {sorted(dataset_instances) + ['all'] + sorted(custom_splits)}"
         )
 
 

From 28caf414215ab943192a57e18a4210cbebe08dcc Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Thu, 27 Nov 2025 14:49:36 -0800
Subject: [PATCH 12/31] remove deps on swesmith! also fix excluded_ids for
 swesmith

---
 debug_gym/gym/envs/r2egym.py              |  10 +-
 debug_gym/gym/envs/swe_smith.py           |  15 +-
 debug_gym/gym/envs/swe_smith_constants.py | 686 ++++++++++++++++++++++
 debug_gym/gym/envs/swe_smith_utils.py     | 190 ++++++
 debug_gym/gym/terminals/kubernetes.py     |  37 +-
 5 files changed, 925 insertions(+), 13 deletions(-)
 create mode 100755 debug_gym/gym/envs/swe_smith_constants.py
 create mode 100755 debug_gym/gym/envs/swe_smith_utils.py

diff --git a/debug_gym/gym/envs/r2egym.py b/debug_gym/gym/envs/r2egym.py
index 1cab6740..b843d4a4 100644
--- a/debug_gym/gym/envs/r2egym.py
+++ b/debug_gym/gym/envs/r2egym.py
@@ -90,7 +90,13 @@ def load_r2egym_dataset(
         excluded_ids = custom_splits.get("excluded", [])
 
     # add instance id to each example (name of the image)
-    ds["instance_id"] = [id.split("/", 1)[-1] for id in ds["docker_image"]]
+    def extract_instance_id(docker_image: str) -> str:
+        return docker_image.split("/", 1)[-1]
+
+    # create a column "instance_id" in the dataset
+    instance_ids = [extract_instance_id(id) for id in ds["docker_image"]]
+    ds = ds.add_column("instance_id", instance_ids)
+
     problems = filter_problems(ds["instance_id"], problems, custom_splits, excluded_ids)
     ds = ds.filter(lambda example: example["instance_id"] in problems)
 
@@ -154,7 +160,7 @@ def instructions(self) -> str:
 
     def setup_task(self, task_data: dict, options: dict = None):
         self.ds_row = task_data
-        self.task_name = task_data["instance_id"]
+        self.task_name = self.ds_row["instance_id"]
         self.base_image = self.ds_row["docker_image"]
         self.package_name = self.ds_row["repo_name"]
         self.expected_output = json.loads(self.ds_row["expected_output_json"])
diff --git a/debug_gym/gym/envs/swe_smith.py b/debug_gym/gym/envs/swe_smith.py
index e67c2be1..6ba6fe03 100644
--- a/debug_gym/gym/envs/swe_smith.py
+++ b/debug_gym/gym/envs/swe_smith.py
@@ -4,12 +4,13 @@
 import docker
 import yaml
 from datasets import load_dataset, load_from_disk
-from swesmith.build_repo.download_images import DOCKER_ORG, TAG
-from swesmith.constants import MAP_REPO_TO_SPECS
-from swesmith.harness.grading import TestStatus
-from swesmith.harness.log_parsers import MAP_REPO_TO_PARSER, parse_log_pytest
-from swesmith.harness.utils import get_test_command
-from swesmith.utils import get_repo_commit_from_image_name
+
+from .swe_smith_constants import DOCKER_ORG, TAG, MAP_REPO_TO_SPECS
+from .swe_smith_utils import get_test_command, get_repo_commit_from_image_name
+
+from swebench.harness.constants import TestStatus
+from swebench.harness.grading import MAP_REPO_TO_PARSER
+from swebench.harness.log_parsers.python import parse_log_pytest
 
 from debug_gym.constants import DEBUG_GYM_CACHE_DIR
 from debug_gym.gym.entities import EvalOutput
@@ -49,7 +50,7 @@ def load_swesmith_dataset(
         custom_splits = yaml.safe_load(f)
         excluded_ids = custom_splits.get("excluded", [])
 
-    problems = filter_problems(ds["instance_id"], problems)
+    problems = filter_problems(ds["instance_id"], problems, custom_splits, excluded_ids)
     ds = ds.filter(lambda example: example["instance_id"] in problems)
 
     image_names = set(ds["image_name"])
diff --git a/debug_gym/gym/envs/swe_smith_constants.py b/debug_gym/gym/envs/swe_smith_constants.py
new file mode 100755
index 00000000..4eff67c0
--- /dev/null
+++ b/debug_gym/gym/envs/swe_smith_constants.py
@@ -0,0 +1,686 @@
+
+DOCKER_ORG = "jyangballin"
+TAG = "latest"
+
+"""
+Pulled from official SWE-Smith repository.
+"""
+
+from pathlib import Path
+
+CONDA_VERSION = "py312_24.1.2-0"
+DEFAULT_PM_LIKELIHOOD = 0.2
+ENV_NAME = "testbed"
+KEY_IMAGE_NAME = "image_name"
+
+# If set, then subset of tests are run for post-bug validation
+# Affects get_test_command, get_valid_report
+KEY_MIN_TESTING = "minimal_testing"
+# If set, then for pre-bug validation, individual runs are
+# performed instead of running the entire test suite
+# Affects valid.py
+KEY_MIN_PREGOLD = "minimal_pregold"
+
+KEY_PATCH = "patch"
+KEY_TEST_CMD = "test_cmd"
+KEY_TIMED_OUT = "timed_out"
+LOG_DIR_BUG_GEN = Path("logs/bug_gen")
+LOG_DIR_ENV_RECORDS = Path("logs/build_images/records")
+LOG_DIR_ISSUE_GEN = Path("logs/issue_gen")
+LOG_DIR_RUN_VALIDATION = Path("logs/run_validation")
+LOG_DIR_TASKS = Path("logs/task_insts")
+LOG_TEST_OUTPUT_PRE_GOLD = "test_output_pre_gold.txt"
+MAX_INPUT_TOKENS = 128000
+ORG_NAME = "swesmith"
+PREFIX_BUG = "bug"
+PREFIX_METADATA = "metadata"
+REF_SUFFIX = ".ref"
+SGLANG_API_KEY = "swesmith"
+TEMP_PATCH = "_temp_patch_swesmith.diff"
+TEST_OUTPUT_END = ">>>>> End Test Output"
+TEST_OUTPUT_START = ">>>>> Start Test Output"
+TIMEOUT = 120
+UBUNTU_VERSION = "22.04"
+VOLUME_NAME_DATASET = "datasets"
+VOLUME_NAME_MODEL = "llm-weights"
+
+GIT_APPLY_CMDS = [
+    "git apply --verbose",
+    "git apply --verbose --reject",
+    "patch --batch --fuzz=5 -p1 -i",
+]
+
+_DOCKERFILE_BASE_EXTENDED = """
+RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 libgl1 -y
+"""
+
+
+"""
+Purpose: Mirroring the constants specified in SWE-bench, this file contains the installation
+specifications for specific commit(s) of different Python repositories. It is written to be
+compatible with the SWE-bench repository to leverage its ability to create docker images.
+"""
+
+### MARK: Commonly Used Installation / Testing Specifications ###
+
+TEST_PYTEST = "pytest --disable-warnings --color=no --tb=no --verbose -rA -p no:snail"
+
+DEFAULT_SPECS = {
+    "install": ["python -m pip install -e ."],
+    "python": "3.10",
+    KEY_TEST_CMD: TEST_PYTEST,
+}
+
+CMAKE_VERSIONS = ["3.15.7", "3.16.9", "3.17.5", "3.19.7", "3.23.5", "3.27.9"]
+INSTALL_CMAKE = (
+    [
+        f"wget https://github.com/Kitware/CMake/releases/download/v{v}/cmake-{v}-Linux-x86_64.tar.gz"
+        for v in CMAKE_VERSIONS
+    ]
+    + [
+        f"tar -xvzf cmake-{v}-Linux-x86_64.tar.gz && mv cmake-{v}-Linux-x86_64 /usr/share/cmake-{v}"
+        if v not in ["3.23.5", "3.27.9"]
+        else f"tar -xvzf cmake-{v}-Linux-x86_64.tar.gz && mv cmake-{v}-linux-x86_64 /usr/share/cmake-{v}"
+        for v in CMAKE_VERSIONS
+    ]
+    + [
+        f"update-alternatives --install /usr/bin/cmake cmake /usr/share/cmake-{v}/bin/cmake {(idx + 1) * 10}"
+        for idx, v in enumerate(CMAKE_VERSIONS)
+    ]
+)
+
+INSTALL_BAZEL = [
+    cmd
+    for v in ["6.5.0", "7.4.1", "8.0.0"]
+    for cmd in [
+        f"mkdir -p /usr/share/bazel-{v}/bin",
+        f"wget https://github.com/bazelbuild/bazel/releases/download/{v}/bazel-{v}-linux-x86_64",
+        f"chmod +x bazel-{v}-linux-x86_64",
+        f"mv bazel-{v}-linux-x86_64 /usr/share/bazel-{v}/bin/bazel",
+    ]
+]
+
+### MARK Repository/Commit specific installation instructions ###
+
+SPECS_REPO_ADDICT = {"75284f9593dfb929cadd900aff9e35e7c7aec54b": DEFAULT_SPECS}
+SPECS_REPO_ALIVE_PROGRESS = {"35853799b84ee682af121f7bc5967bd9b62e34c4": DEFAULT_SPECS}
+SPECS_REPO_APISPEC = {
+    "8b421526ea1015046de42599dd93da6a3473fe44": {
+        **DEFAULT_SPECS,
+        "install": ["pip install -e .[dev]"],
+    }
+}
+SPECS_REPO_ARROW = {"1d70d0091980ea489a64fa95a48e99b45f29f0e7": DEFAULT_SPECS}
+SPECS_REPO_ASTROID = {"b114f6b58e749b8ab47f80490dce73ea80d8015f": DEFAULT_SPECS}
+SPECS_REPO_ASYNC_TIMEOUT = {"d0baa9f162b866e91881ae6cfa4d68839de96fb5": DEFAULT_SPECS}
+SPECS_REPO_AUTOGRAD = {
+    "ac044f0de1185b725955595840135e9ade06aaed": {
+        **DEFAULT_SPECS,
+        "install": ["pip install -e '.[scipy,test]'"],
+    }
+}
+SPECS_REPO_BLEACH = {"73871d766de1e33a296eeb4f9faf2451f28bee39": DEFAULT_SPECS}
+SPECS_REPO_BOLTONS = {"3bfcfdd04395b6cc74a5c0cdc72c8f64cc4ac01f": DEFAULT_SPECS}
+SPECS_REPO_BOTTLE = {"a8dfef301dec35f13e7578306002c40796651629": DEFAULT_SPECS}
+SPECS_REPO_BOX = {"a23451d2869a511280eebe194efca41efadd2706": DEFAULT_SPECS}
+SPECS_REPO_CANTOOLS = {
+    "0c6a78711409e4307de34582f795ddb426d58dd8": {
+        **DEFAULT_SPECS,
+        "install": ["pip install -e .[dev,plot]"],
+    }
+}
+SPECS_REPO_CHANNELS = {
+    "a144b4b8881a93faa567a6bdf2d7f518f4c16cd2": {
+        **DEFAULT_SPECS,
+        "install": ["pip install -e .[tests,daphne]"],
+    }
+}
+SPECS_REPO_CHARDET = {"9630f2382faa50b81be2f96fd3dfab5f6739a0ef": DEFAULT_SPECS}
+SPECS_REPO_CHARDET_NORMALIZER = {
+    "1fdd64633572040ab60e62e8b24f29cb7e17660b": DEFAULT_SPECS
+}
+SPECS_REPO_CLICK = {"fde47b4b4f978f179b9dff34583cb2b99021f482": DEFAULT_SPECS}
+SPECS_REPO_CLOUDPICKLE = {"6220b0ce83ffee5e47e06770a1ee38ca9e47c850": DEFAULT_SPECS}
+SPECS_REPO_COLORLOG = {"dfa10f59186d3d716aec4165ee79e58f2265c0eb": DEFAULT_SPECS}
+SPECS_REPO_COOKIECUTTER = {"b4451231809fb9e4fc2a1e95d433cb030e4b9e06": DEFAULT_SPECS}
+SPECS_REPO_DAPHNE = {"32ac73e1a0fb87af0e3280c89fe4cc3ff1231b37": DEFAULT_SPECS}
+SPECS_REPO_DATASET = {
+    "5c2dc8d3af1e0af0290dcd7ae2cae92589f305a1": {
+        **DEFAULT_SPECS,
+        "install": ["python setup.py install"],
+    }
+}
+SPECS_REPO_DEEPDIFF = {
+    "ed2520229d0369813f6e54cdf9c7e68e8073ef62": {
+        **DEFAULT_SPECS,
+        "install": [
+            "pip install -r requirements-dev.txt",
+            "pip install -e .",
+        ],
+    }
+}
+SPECS_REPO_DJANGO_MONEY = {
+    "835c1ab867d11137b964b94936692bea67a038ec": {
+        **DEFAULT_SPECS,
+        "install": ["pip install -e .[test,exchange]"],
+    }
+}
+SPECS_REPO_DOMINATE = {"9082227e93f5a370012bb934286caf7385d3e7ac": DEFAULT_SPECS}
+SPECS_REPO_DOTENV = {"2b8635b79f1aa15cade0950117d4e7d12c298766": DEFAULT_SPECS}
+SPECS_REPO_DRF_NESTED_ROUTERS = {
+    "6144169d5c33a1c5134b2fedac1d6cfa312c174e": {
+        **DEFAULT_SPECS,
+        "install": [
+            "pip install -r requirements.txt",
+            "pip install -e .",
+        ],
+    }
+}
+SPECS_REPO_ENVIRONS = {
+    "73c372df71002312615ad0349ae11274bb3edc69": {
+        **DEFAULT_SPECS,
+        "install": ["pip install -e .[dev]"],
+    }
+}
+SPECS_REPO_EXCEPTIONGROUP = {"0b4f49378b585a338ae10abd72ec2006c5057d7b": DEFAULT_SPECS}
+SPECS_REPO_FAKER = {"8b401a7d68f5fda1276f36a8fc502ef32050ed72": DEFAULT_SPECS}
+SPECS_REPO_FEEDPARSER = {"cad965a3f52c4b077221a2142fb14ef7f68cd576": DEFAULT_SPECS}
+SPECS_REPO_FLAKE8 = {"cf1542cefa3e766670b2066dd75c4571d682a649": DEFAULT_SPECS}
+SPECS_REPO_FLASHTEXT = {"b316c7e9e54b6b4d078462b302a83db85f884a94": DEFAULT_SPECS}
+SPECS_REPO_FLASK = {"bc098406af9537aacc436cb2ea777fbc9ff4c5aa": DEFAULT_SPECS}
+SPECS_REPO_FREEZEGUN = {"5f171db0aaa02c4ade003bbc8885e0bb19efbc81": DEFAULT_SPECS}
+SPECS_REPO_FUNCY = {"207a7810c216c7408596d463d3f429686e83b871": DEFAULT_SPECS}
+SPECS_REPO_FURL = {"da386f68b8d077086c25adfd205a4c3d502c3012": DEFAULT_SPECS}
+SPECS_REPO_FVCORE = {
+    "a491d5b9a06746f387aca2f1f9c7c7f28e20bef9": {
+        **DEFAULT_SPECS,
+        "install": [
+            "pip install torch shapely",
+            "rm tests/test_focal_loss.py",
+            "pip install -e .",
+        ],
+    }
+}
+SPECS_REPO_GLOM = {"fb3c4e76f28816aebfd2538980e617742e98a7c2": DEFAULT_SPECS}
+SPECS_REPO_GPXPY = {
+    "09fc46b3cad16b5bf49edf8e7ae873794a959620": {
+        **DEFAULT_SPECS,
+        KEY_TEST_CMD: "pytest test.py --verbose --color=no --tb=no --disable-warnings -rA -p no:snail",
+    }
+}
+SPECS_REPO_GRAFANALIB = {"5c3b17edaa437f0bc09b5f1b9275dc8fb91689fb": DEFAULT_SPECS}
+SPECS_REPO_GRAPHENE = {"82903263080b3b7f22c2ad84319584d7a3b1a1f6": DEFAULT_SPECS}
+SPECS_REPO_GSPREAD = {"a8be3b96f9276779ab680d84a0982282fb184000": DEFAULT_SPECS}
+SPECS_REPO_GTTS = {"dbcda4f396074427172d4a1f798a172686ace6e0": DEFAULT_SPECS}
+SPECS_REPO_GUNICORN = {"bacbf8aa5152b94e44aa5d2a94aeaf0318a85248": DEFAULT_SPECS}
+SPECS_REPO_H11 = {"bed0dd4ae9774b962b19833941bb9ec4dc403da9": DEFAULT_SPECS}
+SPECS_REPO_ICECREAM = {"f76fef56b66b59fd9a89502c60a99fbe28ee36bd": DEFAULT_SPECS}
+SPECS_REPO_INFLECT = {"c079a96a573ece60b54bd5210bb0f414beb74dcd": DEFAULT_SPECS}
+SPECS_REPO_INICONFIG = {"16793eaddac67de0b8d621ae4e42e05b927e8d67": DEFAULT_SPECS}
+SPECS_REPO_ISODATE = {"17cb25eb7bc3556a68f3f7b241313e9bb8b23760": DEFAULT_SPECS}
+SPECS_REPO_JAX = {
+    "ebd90e06fa7caad087e2342431e3899cfd2fdf98": {
+        **DEFAULT_SPECS,
+        "install": ['pip install -e ".[cpu]"'],
+        KEY_TEST_CMD: f"{TEST_PYTEST} -n auto",
+        KEY_MIN_TESTING: True,
+        KEY_MIN_PREGOLD: True,
+    }
+}
+SPECS_REPO_JINJA = {"ada0a9a6fc265128b46949b5144d2eaa55e6df2c": DEFAULT_SPECS}
+SPECS_REPO_JSONSCHEMA = {"93e0caa5752947ec77333da81a634afe41a022ed": DEFAULT_SPECS}
+SPECS_REPO_LANGDETECT = {"a1598f1afcbfe9a758cfd06bd688fbc5780177b2": DEFAULT_SPECS}
+SPECS_REPO_LINE_PROFILER = {"a646bf0f9ab3d15264a1be14d0d4ee6894966f6a": DEFAULT_SPECS}
+SPECS_REPO_MARKDOWNIFY = {"6258f5c38b97ab443b4ddf03e6676ce29b392d06": DEFAULT_SPECS}
+SPECS_REPO_MARKUPSAFE = {"620c06c919c1bd7bb1ce3dbee402e1c0c56e7ac3": DEFAULT_SPECS}
+SPECS_REPO_MARSHMALLOW = {"9716fc629976c9d3ce30cd15d270d9ac235eb725": DEFAULT_SPECS}
+SPECS_REPO_MIDO = {
+    "a0158ff95a08f9a4eef628a2e7c793fd3a466640": {
+        **DEFAULT_SPECS,
+        KEY_TEST_CMD: f"{TEST_PYTEST} -rs -c /dev/null",
+    }
+}
+SPECS_REPO_MISTUNE = {"bf54ef67390e02a5cdee7495d4386d7770c1902b": DEFAULT_SPECS}
+SPECS_REPO_NIKOLA = {
+    "0f4c230e5159e4e937463eb8d6d2ddfcbb09def2": {
+        **DEFAULT_SPECS,
+        "install": ["pip install -e '.[extras,tests]'"],
+    }
+}
+SPECS_REPO_OAUTHLIB = {"1fd5253630c03e3f12719dd8c13d43111f66a8d2": DEFAULT_SPECS}
+SPECS_REPO_PARAMIKO = {
+    "23f92003898b060df0e2b8b1d889455264e63a3e": {
+        **DEFAULT_SPECS,
+        KEY_TEST_CMD: "pytest -rA --color=no --disable-warnings -p no:snail",
+    }
+}
+SPECS_REPO_PARSE = {"30da9e4f37fdd979487c9fe2673df35b6b204c72": DEFAULT_SPECS}
+SPECS_REPO_PARSIMONIOUS = {"0d3f5f93c98ae55707f0958366900275d1ce094f": DEFAULT_SPECS}
+SPECS_REPO_PARSO = {
+    "338a57602740ad0645b2881e8c105ffdc959e90d": {
+        **DEFAULT_SPECS,
+        "install": ["python setup.py install"],
+    }
+}
+SPECS_REPO_PATSY = {
+    "a5d1648401b0ea0649b077f4b98da27db947d2d0": {
+        **DEFAULT_SPECS,
+        "install": ["pip install -e .[test]"],
+    }
+}
+SPECS_REPO_PDFMINER = {"1a8bd2f730295b31d6165e4d95fcb5a03793c978": DEFAULT_SPECS}
+SPECS_REPO_PDFPLUMBER = {
+    "02ff4313f846380fefccec9c73fb4c8d8a80d0ee": {
+        **DEFAULT_SPECS,
+        "install": [
+            "apt-get update && apt-get install ghostscript -y",
+            "pip install -e .",
+        ],
+    }
+}
+SPECS_REPO_PIPDEPTREE = {
+    "c31b641817f8235df97adf178ffd8e4426585f7a": {
+        **DEFAULT_SPECS,
+        "install": [
+            "apt-get update && apt-get install graphviz -y",
+            "pip install -e .[test,graphviz]",
+        ],
+    }
+}
+SPECS_REPO_PRETTYTABLE = {"ca90b055f20a6e8a06dcc46c2e3afe8ff1e8d0f1": DEFAULT_SPECS}
+SPECS_REPO_PTYPROCESS = {"1067dbdaf5cc3ab4786ae355aba7b9512a798734": DEFAULT_SPECS}
+SPECS_REPO_PYASN1 = {"0f07d7242a78ab4d129b26256d7474f7168cf536": DEFAULT_SPECS}
+SPECS_REPO_PYDICOM = {
+    "7d361b3d764dbbb1f8ad7af015e80ce96f6bf286": {**DEFAULT_SPECS, "python": "3.11"}
+}
+SPECS_REPO_PYFIGLET = {"f8c5f35be70a4bbf93ac032334311b326bc61688": DEFAULT_SPECS}
+SPECS_REPO_PYGMENTS = {"27649ebbf5a2519725036b48ec99ef7745f100af": DEFAULT_SPECS}
+SPECS_REPO_PYOPENSSL = {"04766a496eb11f69f6226a5a0dfca4db90a5cbd1": DEFAULT_SPECS}
+SPECS_REPO_PYPARSING = {"533adf471f85b570006871e60a2e585fcda5b085": DEFAULT_SPECS}
+SPECS_REPO_PYPIKA = {"1c9646f0a019a167c32b649b6f5e6423c5ba2c9b": DEFAULT_SPECS}
+SPECS_REPO_PYQUERY = {"811cd048ffbe4e69fdc512863671131f98d691fb": DEFAULT_SPECS}
+SPECS_REPO_PYSNOOPER = {"57472b4677b6c041647950f28f2d5750c38326c6": DEFAULT_SPECS}
+SPECS_REPO_PYTHON_DOCX = {"0cf6d71fb47ede07ecd5de2a8655f9f46c5f083d": DEFAULT_SPECS}
+SPECS_REPO_PYTHON_JSON_LOGGER = {
+    "5f85723f4693c7289724fdcda84cfc0b62da74d4": DEFAULT_SPECS
+}
+SPECS_REPO_PYTHON_PINYIN = {"e42dede51abbc40e225da9a8ec8e5bd0043eed21": DEFAULT_SPECS}
+SPECS_REPO_PYTHON_PPTX = {"278b47b1dedd5b46ee84c286e77cdfb0bf4594be": DEFAULT_SPECS}
+SPECS_REPO_PYTHON_QRCODE = {"456b01d41f16e0cfb0f70c687848e276b78c3e8a": DEFAULT_SPECS}
+SPECS_REPO_PYTHON_READABILITY = {
+    "40256f40389c1f97be5e83d7838547581653c6aa": DEFAULT_SPECS
+}
+SPECS_REPO_PYTHON_SLUGIFY = {
+    "872b37509399a7f02e53f46ad9881f63f66d334b": {
+        **DEFAULT_SPECS,
+        KEY_TEST_CMD: "python test.py --verbose",
+    }
+}
+SPECS_REPO_PYVISTA = {
+    "3f0fad3f42d9b491679e6aa50e52d93c1a81c042": {
+        **DEFAULT_SPECS,
+        "install": [
+            "apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libxrender1",
+            "python -m pip install -e '.[dev]'",
+        ],
+    }
+}
+SPECS_REPO_RADON = {"54b88e5878b2724bf4d77f97349588b811abdff2": DEFAULT_SPECS}
+SPECS_REPO_RECORDS = {"5941ab2798cb91455b6424a9564c9cd680475fbe": DEFAULT_SPECS}
+SPECS_REPO_RED_DISCORDBOT = {"33e0eac741955ce5b7e89d9b8f2f2712727af770": DEFAULT_SPECS}
+SPECS_REPO_RESULT = {"0b855e1e38a08d6f0a4b0138b10c127c01e54ab4": DEFAULT_SPECS}
+SPECS_REPO_SAFETY = {"7654596be933f8310b294dbc85a7af6066d06e4f": DEFAULT_SPECS}
+SPECS_REPO_SCRAPY = {
+    "35212ec5b05a3af14c9f87a6193ab24e33d62f9f": {
+        **DEFAULT_SPECS,
+        "install": [
+            "apt-get update && apt-get install -y libxml2-dev libxslt-dev libjpeg-dev",
+            "python -m pip install -e .",
+            "rm tests/test_feedexport.py",
+            "rm tests/test_pipeline_files.py",
+        ],
+        KEY_MIN_TESTING: True,
+    }
+}
+SPECS_REPO_SCHEDULE = {"82a43db1b938d8fdf60103bd41f329e06c8d3651": DEFAULT_SPECS}
+SPECS_REPO_SCHEMA = {"24a3045773eac497c659f24b32f24a281be9f286": DEFAULT_SPECS}
+SPECS_REPO_SOUPSIEVE = {"a8080d97a0355e316981cb0c5c887a861c4244e3": DEFAULT_SPECS}
+SPECS_REPO_SPACY = {
+    "b3c46c315eb16ce644bddd106d31c3dd349f6bb2": {
+        **DEFAULT_SPECS,
+        "install": [
+            "pip install -r requirements.txt",
+            "pip install -e .",
+        ],
+        KEY_MIN_TESTING: True,
+    }
+}
+SPECS_REPO_SQLFLUFF = {
+    "50a1c4b6ff171188b6b70b39afe82a707b4919ac": {**DEFAULT_SPECS, KEY_MIN_TESTING: True}
+}
+SPECS_REPO_SQLGLOT = {
+    "036601ba9cbe4d175d6a9d38bc27587eab858968": {
+        **DEFAULT_SPECS,
+        "install": ['pip install -e ".[dev]"'],
+        KEY_MIN_TESTING: True,
+    }
+}
+SPECS_REPO_SQLPARSE = {"e57923b3aa823c524c807953cecc48cf6eec2cb2": DEFAULT_SPECS}
+SPECS_REPO_STACKPRINTER = {"219fcc522fa5fd6e440703358f6eb408f3ffc007": DEFAULT_SPECS}
+SPECS_REPO_STARLETTE = {"db5063c26030e019f7ee62aef9a1b564eca9f1d6": DEFAULT_SPECS}
+SPECS_REPO_STRING_SIM = {"115acaacf926b41a15664bd34e763d074682bda3": DEFAULT_SPECS}
+SPECS_REPO_SUNPY = {
+    "f8edfd5c4be873fbd28dec4583e7f737a045f546": {
+        **DEFAULT_SPECS,
+        "python": "3.11",
+        "install": ['pip install -e ".[dev]"'],
+        KEY_MIN_TESTING: True,
+    }
+}
+SPECS_REPO_SYMPY = {
+    "2ab64612efb287f09822419f4127878a4b664f71": {
+        **DEFAULT_SPECS,
+        "python": "3.10",
+        "install": ["pip install -e ."],
+        KEY_MIN_TESTING: True,
+        KEY_MIN_PREGOLD: True,
+    }
+}
+SPECS_REPO_TENACITY = {"0d40e76f7d06d631fb127e1ec58c8bd776e70d49": DEFAULT_SPECS}
+SPECS_REPO_TERMCOLOR = {"3a42086feb35647bc5aa5f1065b0327200da6b9b": DEFAULT_SPECS}
+SPECS_REPO_TEXTDISTANCE = {
+    "c3aca916bd756a8cb71114688b469ec90ef5b232": {
+        **DEFAULT_SPECS,
+        "install": ['pip install -e ".[benchmark,test]"'],
+    }
+}
+SPECS_REPO_TEXTFSM = {"c31b600743895f018e7583f93405a3738a9f4d55": DEFAULT_SPECS}
+SPECS_REPO_THEFUZZ = {"8a05a3ee38cbd00a2d2f4bb31db34693b37a1fdd": DEFAULT_SPECS}
+SPECS_REPO_TINYDB = {"10644a0e07ad180c5b756aba272ee6b0dbd12df8": DEFAULT_SPECS}
+SPECS_REPO_TLDEXTRACT = {
+    "3d1bf184d4f20fbdbadd6274560ccd438939160e": {
+        **DEFAULT_SPECS,
+        "install": ["pip install -e .[testing]"],
+    }
+}
+SPECS_REPO_TOMLI = {"443a0c1bc5da39b7ed84306912ee1900e6b72e2f": DEFAULT_SPECS}
+SPECS_REPO_TORNADO = {
+    "d5ac65c1f1453c2aeddd089d8e68c159645c13e1": {
+        **DEFAULT_SPECS,
+        KEY_TEST_CMD: "python -m tornado.test --verbose",
+    }
+}
+SPECS_REPO_TRIO = {"cfbbe2c1f96e93b19bc2577d2cab3f4fe2e81153": DEFAULT_SPECS}
+SPECS_REPO_TWEEPY = {
+    "91a41c6e1c955d278c370d51d5cf43b05f7cd979": {
+        **DEFAULT_SPECS,
+        "install": ["pip install -e '.[dev,test,async]'"],
+    }
+}
+SPECS_REPO_TYPEGUARD = {
+    "b6a7e4387c30a9f7d635712157c889eb073c1ea3": {
+        **DEFAULT_SPECS,
+        "install": ["pip install -e .[test,doc]"],
+    }
+}
+SPECS_REPO_USADDRESS = {
+    "a42a8f0c14bd2e273939fd51c604f10826301e73": {
+        **DEFAULT_SPECS,
+        "install": ["pip install -e .[dev]"],
+    }
+}
+SPECS_REPO_VOLUPTUOUS = {"a7a55f83b9fa7ba68b0669b3d78a61de703e0a16": DEFAULT_SPECS}
+SPECS_REPO_WEBARGS = {"dbde72fe5db8a999acd1716d5ef855ab7cc1a274": DEFAULT_SPECS}
+SPECS_REPO_WORDCLOUD = {"ec24191c64570d287032c5a4179c38237cd94043": DEFAULT_SPECS}
+SPECS_REPO_XMLTODICT = {"0952f382c2340bc8b86a5503ba765a35a49cf7c4": DEFAULT_SPECS}
+SPECS_REPO_YAMLLINT = {"8513d9b97da3b32453b3fccb221f4ab134a028d7": DEFAULT_SPECS}
+
+### MARK: SWE-gym Repositories
+SPECS_REPO_MOTO = {
+    "694ce1f4880c784fed0553bc19b2ace6691bc109": {
+        **DEFAULT_SPECS,
+        "python": "3.12",
+        "install": ["make init"],
+        KEY_MIN_TESTING: True,
+    }
+}
+SPECS_REPO_MYPY = {
+    "e93f06ceab81d8ff1f777c7587d04c339cfd5a16": {
+        "python": "3.12",
+        "install": [
+            "git submodule update --init mypy/typeshed || true",
+            "python -m pip install -r test-requirements.txt",
+            "python -m pip install -e .",
+            "hash -r",
+        ],
+        KEY_TEST_CMD: "pytest --color=no -rA -p no:snail -k",  # keep -k last: the keyword expression is appended right after it
+        KEY_MIN_TESTING: True,
+    }
+}
+SPECS_REPO_MONAI = {
+    "a09c1f08461cec3d2131fde3939ef38c3c4ad5fc": {
+        "python": "3.12",
+        "install": [
+            r"sed -i '/^git+https:\/\/github.com\/Project-MONAI\//d' requirements-dev.txt",  # raw string: "\/" is not a valid Python escape
+            "python -m pip install -U -r requirements-dev.txt",
+            "python -m pip install -e .",
+        ],
+        KEY_TEST_CMD: TEST_PYTEST,
+        KEY_MIN_PREGOLD: True,
+        KEY_MIN_TESTING: True,
+    }
+}
+SPECS_REPO_DVC = {
+    "1d6ea68133289ceab2637ce7095772678af792c6": {
+        **DEFAULT_SPECS,
+        "install": ['pip install -e ".[dev]"'],
+        KEY_MIN_TESTING: True,
+    }
+}
+SPECS_REPO_HYDRA = {
+    "0f03eb60c2ecd1fbdb25ede9a2c4faeac81de491": {
+        **DEFAULT_SPECS,
+        "install": [
+            "apt-get update && apt-get install -y openjdk-17-jdk openjdk-17-jre",
+            "pip install -e .",
+        ],
+        KEY_MIN_TESTING: True,
+    }
+}
+SPECS_REPO_DASK = {
+    "5f61e42324c3a6cd4da17b5d5ebe4663aa4b8783": {
+        **DEFAULT_SPECS,
+        "install": ["python -m pip install graphviz", "python -m pip install -e ."],
+        KEY_MIN_TESTING: True,
+    }
+}
+SPECS_REPO_MODIN = {
+    "8c7799fdbbc2fb0543224160dd928215852b7757": {
+        **DEFAULT_SPECS,
+        "install": ['pip install -e ".[all]"'],
+        KEY_MIN_PREGOLD: True,
+        KEY_MIN_TESTING: True,
+    }
+}
+SPECS_REPO_PYDANTIC = {
+    "acb0f10fda1c78441e052c57b4288bc91431f852": {
+        "python": "3.10",
+        "install": [
+            "apt-get update && apt-get install -y locales pipx",
+            "pipx install uv",
+            "pipx install pre-commit",
+            'export PATH="$HOME/.local/bin:$PATH"',
+            "make install",
+        ],
+        KEY_TEST_CMD: f"/root/.local/bin/uv run {TEST_PYTEST}",
+    }
+}
+SPECS_REPO_CONAN = {
+    "86f29e137a10bb6ed140c1a8c05c3099987b13c5": {
+        **DEFAULT_SPECS,
+        "install": INSTALL_CMAKE
+        + INSTALL_BAZEL
+        + [
+            "apt-get -y update && apt-get -y upgrade && apt-get install -y build-essential cmake automake autoconf pkg-config meson ninja-build",
+            "python -m pip install -r conans/requirements.txt",
+            "python -m pip install -r conans/requirements_server.txt",
+            "python -m pip install -r conans/requirements_dev.txt",
+            "python -m pip install -e .",
+        ],
+        KEY_MIN_TESTING: True,
+    }
+}
+SPECS_REPO_PANDAS = {
+    "95280573e15be59036f98d82a8792599c10c6603": {
+        **DEFAULT_SPECS,
+        "install": [
+            "git remote add upstream https://github.com/pandas-dev/pandas.git",
+            "git fetch upstream --tags",
+            "python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true",
+            """sed -i 's/__version__="[^"]*"/__version__="3.0.0.dev0+1992.g95280573e1"/' build/cp310/_version_meson.py""",
+        ],
+        KEY_MIN_PREGOLD: True,
+        KEY_MIN_TESTING: True,
+    }
+}
+SPECS_REPO_MONKEYTYPE = {
+    "70c3acf62950be5dfb28743c7a719bfdecebcd84": DEFAULT_SPECS,
+}
+
+
+MAP_REPO_TO_SPECS = {
+    "adrienverge/yamllint": SPECS_REPO_YAMLLINT,
+    "agronholm/exceptiongroup": SPECS_REPO_EXCEPTIONGROUP,
+    "agronholm/typeguard": SPECS_REPO_TYPEGUARD,
+    "aio-libs/async-timeout": SPECS_REPO_ASYNC_TIMEOUT,
+    "alanjds/drf-nested-routers": SPECS_REPO_DRF_NESTED_ROUTERS,
+    "alecthomas/voluptuous": SPECS_REPO_VOLUPTUOUS,
+    "amueller/word_cloud": SPECS_REPO_WORDCLOUD,
+    "andialbrecht/sqlparse": SPECS_REPO_SQLPARSE,
+    "arrow-py/arrow": SPECS_REPO_ARROW,
+    "benoitc/gunicorn": SPECS_REPO_GUNICORN,
+    "borntyping/python-colorlog": SPECS_REPO_COLORLOG,
+    "bottlepy/bottle": SPECS_REPO_BOTTLE,
+    "buriy/python-readability": SPECS_REPO_PYTHON_READABILITY,
+    "burnash/gspread": SPECS_REPO_GSPREAD,
+    "cantools/cantools": SPECS_REPO_CANTOOLS,
+    "cdgriffith/Box": SPECS_REPO_BOX,
+    "chardet/chardet": SPECS_REPO_CHARDET,
+    "cknd/stackprinter": SPECS_REPO_STACKPRINTER,
+    "cloudpipe/cloudpickle": SPECS_REPO_CLOUDPICKLE,
+    "Cog-Creators/Red-DiscordBot": SPECS_REPO_RED_DISCORDBOT,
+    "conan-io/conan": SPECS_REPO_CONAN,
+    "cookiecutter/cookiecutter": SPECS_REPO_COOKIECUTTER,
+    "cool-RR/PySnooper": SPECS_REPO_PYSNOOPER,
+    "dask/dask": SPECS_REPO_DASK,
+    "datamade/usaddress": SPECS_REPO_USADDRESS,
+    "davidhalter/parso": SPECS_REPO_PARSO,
+    "dbader/schedule": SPECS_REPO_SCHEDULE,
+    "django-money/django-money": SPECS_REPO_DJANGO_MONEY,
+    "django/channels": SPECS_REPO_CHANNELS,
+    "django/daphne": SPECS_REPO_DAPHNE,
+    "encode/starlette": SPECS_REPO_STARLETTE,
+    "erikrose/parsimonious": SPECS_REPO_PARSIMONIOUS,
+    "facebookresearch/fvcore": SPECS_REPO_FVCORE,
+    "facebookresearch/hydra": SPECS_REPO_HYDRA,
+    "facelessuser/soupsieve": SPECS_REPO_SOUPSIEVE,
+    "gawel/pyquery": SPECS_REPO_PYQUERY,
+    "getmoto/moto": SPECS_REPO_MOTO,
+    "getnikola/nikola": SPECS_REPO_NIKOLA,
+    "google/textfsm": SPECS_REPO_TEXTFSM,
+    "graphql-python/graphene": SPECS_REPO_GRAPHENE,
+    "gruns/furl": SPECS_REPO_FURL,
+    "gruns/icecream": SPECS_REPO_ICECREAM,
+    "gweis/isodate": SPECS_REPO_ISODATE,
+    "HIPS/autograd": SPECS_REPO_AUTOGRAD,
+    "hukkin/tomli": SPECS_REPO_TOMLI,
+    "Instagram/MonkeyType": SPECS_REPO_MONKEYTYPE,
+    "iterative/dvc": SPECS_REPO_DVC,
+    "jaraco/inflect": SPECS_REPO_INFLECT,
+    "jawah/charset_normalizer": SPECS_REPO_CHARDET_NORMALIZER,
+    "jax-ml/jax": SPECS_REPO_JAX,
+    "jd/tenacity": SPECS_REPO_TENACITY,
+    "john-kurkowski/tldextract": SPECS_REPO_TLDEXTRACT,
+    "joke2k/faker": SPECS_REPO_FAKER,
+    "jsvine/pdfplumber": SPECS_REPO_PDFPLUMBER,
+    "kayak/pypika": SPECS_REPO_PYPIKA,
+    "keleshev/schema": SPECS_REPO_SCHEMA,
+    "kennethreitz/records": SPECS_REPO_RECORDS,
+    "Knio/dominate": SPECS_REPO_DOMINATE,
+    "kurtmckee/feedparser": SPECS_REPO_FEEDPARSER,
+    "lepture/mistune": SPECS_REPO_MISTUNE,
+    "life4/textdistance": SPECS_REPO_TEXTDISTANCE,
+    "lincolnloop/python-qrcode": SPECS_REPO_PYTHON_QRCODE,
+    "luozhouyang/python-string-similarity": SPECS_REPO_STRING_SIM,
+    "madzak/python-json-logger": SPECS_REPO_PYTHON_JSON_LOGGER,
+    "mahmoud/boltons": SPECS_REPO_BOLTONS,
+    "mahmoud/glom": SPECS_REPO_GLOM,
+    "marshmallow-code/apispec": SPECS_REPO_APISPEC,
+    "marshmallow-code/marshmallow": SPECS_REPO_MARSHMALLOW,
+    "marshmallow-code/webargs": SPECS_REPO_WEBARGS,
+    "martinblech/xmltodict": SPECS_REPO_XMLTODICT,
+    "matthewwithanm/python-markdownify": SPECS_REPO_MARKDOWNIFY,
+    "mewwts/addict": SPECS_REPO_ADDICT,
+    "mido/mido": SPECS_REPO_MIDO,
+    "Mimino666/langdetect": SPECS_REPO_LANGDETECT,
+    "modin-project/modin": SPECS_REPO_MODIN,
+    "mozilla/bleach": SPECS_REPO_BLEACH,
+    "mozillazg/python-pinyin": SPECS_REPO_PYTHON_PINYIN,
+    "msiemens/tinydb": SPECS_REPO_TINYDB,
+    "oauthlib/oauthlib": SPECS_REPO_OAUTHLIB,
+    "pallets/click": SPECS_REPO_CLICK,
+    "pallets/flask": SPECS_REPO_FLASK,
+    "pallets/jinja": SPECS_REPO_JINJA,
+    "pallets/markupsafe": SPECS_REPO_MARKUPSAFE,
+    "pandas-dev/pandas": SPECS_REPO_PANDAS,
+    "paramiko/paramiko": SPECS_REPO_PARAMIKO,
+    "pdfminer/pdfminer.six": SPECS_REPO_PDFMINER,
+    "pexpect/ptyprocess": SPECS_REPO_PTYPROCESS,
+    "pndurette/gTTS": SPECS_REPO_GTTS,
+    "prettytable/prettytable": SPECS_REPO_PRETTYTABLE,
+    "Project-MONAI/MONAI": SPECS_REPO_MONAI,
+    "pudo/dataset": SPECS_REPO_DATASET,
+    "pwaller/pyfiglet": SPECS_REPO_PYFIGLET,
+    "pyasn1/pyasn1": SPECS_REPO_PYASN1,
+    "pyca/pyopenssl": SPECS_REPO_PYOPENSSL,
+    "PyCQA/flake8": SPECS_REPO_FLAKE8,
+    "pydantic/pydantic": SPECS_REPO_PYDANTIC,
+    "pydata/patsy": SPECS_REPO_PATSY,
+    "pydicom/pydicom": SPECS_REPO_PYDICOM,
+    "pygments/pygments": SPECS_REPO_PYGMENTS,
+    "pylint-dev/astroid": SPECS_REPO_ASTROID,
+    "pyparsing/pyparsing": SPECS_REPO_PYPARSING,
+    "pytest-dev/iniconfig": SPECS_REPO_INICONFIG,
+    "python-hyper/h11": SPECS_REPO_H11,
+    "python-jsonschema/jsonschema": SPECS_REPO_JSONSCHEMA,
+    "python-openxml/python-docx": SPECS_REPO_PYTHON_DOCX,
+    "python-trio/trio": SPECS_REPO_TRIO,
+    "python/mypy": SPECS_REPO_MYPY,
+    "pyupio/safety": SPECS_REPO_SAFETY,
+    "pyutils/line_profiler": SPECS_REPO_LINE_PROFILER,
+    "pyvista/pyvista": SPECS_REPO_PYVISTA,
+    "r1chardj0n3s/parse": SPECS_REPO_PARSE,
+    "rsalmei/alive-progress": SPECS_REPO_ALIVE_PROGRESS,
+    "rubik/radon": SPECS_REPO_RADON,
+    "rustedpy/result": SPECS_REPO_RESULT,
+    "scanny/python-pptx": SPECS_REPO_PYTHON_PPTX,
+    "scrapy/scrapy": SPECS_REPO_SCRAPY,
+    "seatgeek/thefuzz": SPECS_REPO_THEFUZZ,
+    "seperman/deepdiff": SPECS_REPO_DEEPDIFF,
+    "sloria/environs": SPECS_REPO_ENVIRONS,
+    "spulec/freezegun": SPECS_REPO_FREEZEGUN,
+    "sqlfluff/sqlfluff": SPECS_REPO_SQLFLUFF,
+    "sunpy/sunpy": SPECS_REPO_SUNPY,
+    "Suor/funcy": SPECS_REPO_FUNCY,
+    "sympy/sympy": SPECS_REPO_SYMPY,
+    "termcolor/termcolor": SPECS_REPO_TERMCOLOR,
+    "theskumar/python-dotenv": SPECS_REPO_DOTENV,
+    "tkrajina/gpxpy": SPECS_REPO_GPXPY,
+    "tobymao/sqlglot": SPECS_REPO_SQLGLOT,
+    "tornadoweb/tornado": SPECS_REPO_TORNADO,
+    "tox-dev/pipdeptree": SPECS_REPO_PIPDEPTREE,
+    "tweepy/tweepy": SPECS_REPO_TWEEPY,
+    "un33k/python-slugify": SPECS_REPO_PYTHON_SLUGIFY,
+    "vi3k6i5/flashtext": SPECS_REPO_FLASHTEXT,
+    "weaveworks/grafanalib": SPECS_REPO_GRAFANALIB,
+}
\ No newline at end of file
diff --git a/debug_gym/gym/envs/swe_smith_utils.py b/debug_gym/gym/envs/swe_smith_utils.py
new file mode 100755
index 00000000..65c085f8
--- /dev/null
+++ b/debug_gym/gym/envs/swe_smith_utils.py
@@ -0,0 +1,190 @@
+"""
+Pulled from official SWE-Smith repository.
+"""
+import os
+import re
+from pathlib import Path
+from unidiff import PatchSet
+
+from .swe_smith_constants import (
+    KEY_IMAGE_NAME,
+    KEY_MIN_TESTING,
+    KEY_PATCH,
+    KEY_TEST_CMD,
+    MAP_REPO_TO_SPECS,
+)
+
+
+FAIL_TO_PASS = "FAIL_TO_PASS"
+PASS_TO_PASS = "PASS_TO_PASS"
+INSTANCE_REF = "instance_ref"
+
+def get_repo_name(repo, commit) -> str:
+    """
+    Get the SWE-smith GitHub repository name for a repository at a specific commit: "/" in `repo` becomes "__" and the first 8 characters of `commit` are appended after a ".".
+    """
+    return f"{repo.replace('/', '__')}.{commit[:8]}"
+
+def get_test_paths(dir_path: str, ext: str = ".py") -> list[Path]:
+    """List test-like files found while walking `dir_path`, as paths relative to `dir_path`.
+    A file qualifies if its directory has a component named tests/test/specs, or its name starts with "test" (case-insensitive), or its name minus the last extension ends with "test"; `ext=None` disables the extension filter.
+    """
+    return [
+        Path(os.path.relpath(os.path.join(root, file), dir_path))
+        for root, _, files in os.walk(Path(dir_path).resolve())
+        for file in files
+        if (
+            (
+                any([x in root.split("/") for x in ["tests", "test", "specs"]])  # NOTE(review): root is the resolved absolute path, so ancestors of dir_path are matched too
+                or file.lower().startswith("test")
+                or file.rsplit(".", 1)[0].endswith("test")
+            )
+            and (ext is None or file.endswith(ext))
+        )
+    ]
+
+
+def get_full_commit(repo, partial_commit) -> str:
+    """
+    Return the first commit key under MAP_REPO_TO_SPECS[repo] that starts with `partial_commit`; raises ValueError when none does (an unknown `repo` raises KeyError from the lookup).
+    """
+    for commit in MAP_REPO_TO_SPECS[repo]:
+        if commit.startswith(partial_commit):
+            return commit
+
+    raise ValueError(f"Commit {partial_commit} not found for repository {repo}.")
+
+def get_repo_commit_from_image_name(image_name: str) -> tuple[str, str]:
+    """
+    Parse a docker image name into `(repo, full_commit)`: the first two "."-separated fields are dropped, the text after the last "." is the commit prefix, and "__" in the remainder becomes "/".
+    """
+    # Split from both ends so that repos with '.' in their name stay intact
+    image_name = image_name.split(".", 2)[-1]
+    repo = image_name.rsplit(".", 1)[0].replace("__", "/")
+    partial_commit = image_name.rsplit(".", 1)[-1]
+    for repo_name in MAP_REPO_TO_SPECS:
+        # Hack because docker image_name must be lowercase: recover the key's original casing
+        if repo_name.lower() == repo:
+            repo = repo_name
+            break
+    commit = get_full_commit(repo, partial_commit)
+    return repo, commit
+
+
+def get_test_command_mypy(instance: dict):  # returns the spec's test command followed by a quoted " or "-joined keyword expression
+    repo, commit = get_repo_commit_from_image_name(instance[KEY_IMAGE_NAME])
+    pattern = r"\[case ([^\]]+)\]"  # captures <name> from "[case <name>]" markers
+    if FAIL_TO_PASS in instance:  # keep only the text after the last "::" of each listed test
+        fail_to_pass_files = [x.rsplit("::", 1)[-1] for x in instance[FAIL_TO_PASS]]
+        if PASS_TO_PASS in instance:
+            pass_to_pass_files = [x.rsplit("::", 1)[-1] for x in instance[PASS_TO_PASS]]
+            all_files = list(set(fail_to_pass_files + pass_to_pass_files))
+        else:
+            all_files = list(set(fail_to_pass_files))
+        test_keys = " or ".join(all_files)
+    elif INSTANCE_REF in instance and "test_patch" in instance[INSTANCE_REF]:  # otherwise take the names from the test patch
+        test_keys = " or ".join(
+            re.findall(pattern, instance[INSTANCE_REF]["test_patch"])
+        )
+    return f'{MAP_REPO_TO_SPECS[repo][commit][KEY_TEST_CMD]} "{test_keys}"'  # NOTE(review): test_keys is unbound if neither branch above ran
+
+MAP_REPO_TO_TEST_CMD = {  # repo -> function(instance) that builds its test command
+    "python/mypy": get_test_command_mypy,
+}
+
+def get_test_command(instance: dict):
+    """
+    Return `(test_command, test_targets)` for `instance`; the spec's test command is narrowed to relevant test files/dirs when they can be determined, else returned as is.
+    """
+    repo, commit = get_repo_commit_from_image_name(instance[KEY_IMAGE_NAME])
+    specs = MAP_REPO_TO_SPECS[repo][commit]
+    test_command = specs[KEY_TEST_CMD]
+
+    if FAIL_TO_PASS in instance and "pytest" in specs[KEY_TEST_CMD]:
+        # NOTE: Using F2P key as indicator that this is eval instance, not validation
+        if repo in MAP_REPO_TO_TEST_CMD:
+            return MAP_REPO_TO_TEST_CMD[repo](instance), []
+        f2p_files = list(set([x.split("::", 1)[0] for x in instance[FAIL_TO_PASS]]))
+        p2p_files = []
+        if PASS_TO_PASS in instance:
+            p2p_files = list(set([x.split("::", 1)[0] for x in instance[PASS_TO_PASS]]))
+        all_files = list(set(f2p_files + p2p_files))
+        test_command += f" {' '.join(all_files)}"
+        return test_command, all_files
+
+    if KEY_MIN_TESTING not in specs or KEY_PATCH not in instance:
+        # If min testing is not enabled or there's no patch
+        # return test command as is (usually = run whole test suite)
+        return test_command, []
+
+    # Get all testing related file paths in the repo
+    test_paths = get_test_paths(get_repo_name(repo, commit))
+
+    if (
+        INSTANCE_REF in instance
+        and len(instance[INSTANCE_REF]["test_patch"].strip()) > 0
+    ):
+        test_patch = instance[INSTANCE_REF]["test_patch"]
+        # For PR Mirroring (SWE-bench style) instances,
+        # if test patch is available, use that information
+        if repo in MAP_REPO_TO_TEST_CMD:
+            return MAP_REPO_TO_TEST_CMD[repo](instance), []
+        rv = []
+        for x in PatchSet(test_patch):
+            for test_path in test_paths:
+                if str(test_path).endswith(x.path) or str(test_path).endswith(
+                    Path(x.path).name
+                ):
+                    rv.append(str(test_path))
+        if len(rv) > 0:
+            test_command += f" {' '.join(rv)}"
+            return test_command, rv
+
+    # Identify relevant test files based on the patch
+    patch_paths = [Path(f.path) for f in PatchSet(instance[KEY_PATCH])]
+    rv = []
+    for patch_path in patch_paths:
+        file_name = patch_path.name.removesuffix(".py")  # not strip(".py"): that trims any of '.', 'p', 'y' from both ends
+        parent_dir = patch_path.parent.name
+        for test_path in test_paths:
+            # Check for common test file naming conventions first
+            # If found, add to list and break
+            common_test_names = [
+                f"test_{file_name}.py",
+                f"test{file_name}.py",
+                f"{file_name}_test.py",
+                f"{file_name}test.py",
+            ]
+            if any(
+                [
+                    str(test_path).endswith(f"{parent_dir}/{name}")
+                    or str(test_path).endswith(name)
+                    for name in common_test_names
+                ]
+            ):
+                rv.append(str(test_path))
+                break
+        else:
+            for test_path in test_paths:
+                if parent_dir == test_path.parent.name:
+                    # If similar testing folder found, add to list and break
+                    rv.append(str(test_path.parent))
+                    break
+                elif any(
+                    [
+                        x.format(parent_dir) == test_path.name
+                        for x in ["test_{}.py", "test{}.py", "{}_test.py", "{}test.py"]
+                    ]
+                ):
+                    rv.append(str(test_path))
+
+    if len(rv) > 0:
+        # Remove duplicates
+        test_files = [x for x in rv if x.endswith(".py")]
+        final = [x for x in rv if not x.endswith(".py")]
+        for test_file in test_files:
+            if os.path.dirname(test_file) not in final:
+                final.append(test_file)
+        test_command += f" {' '.join(set(final))}"
+
+    return test_command, rv
\ No newline at end of file
diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py
index f40ca38a..54f22b0b 100644
--- a/debug_gym/gym/terminals/kubernetes.py
+++ b/debug_gym/gym/terminals/kubernetes.py
@@ -257,7 +257,9 @@ def __str__(self):
 
 
 class KubernetesTerminal(Terminal):
-
+    """
+    Note: reads values of env variables K8S_NAMESPACE, K8S_DOCKER_SECRET, K8S_NODE_CONSTRAINT (and HOSTNAME when K8S_NODE_CONSTRAINT is set).
+    """
     def __init__(
         self,
         working_dir: str | None = None,
@@ -268,8 +270,9 @@ def __init__(
         setup_commands: list[str] | None = None,
         pod_name: str | None = None,
         base_image: str | None = None,
-        registry: str = "",
-        namespace: str = "default",
+        image_pull_secret: str | None = None,
+        registry: str = "docker.io",
+        namespace: str | None = None,
         kube_config: str | None = None,
         kube_context: str | None = None,
         extra_labels: dict | None = None,
@@ -286,7 +289,9 @@ def __init__(
         self.base_image = base_image
         self._task_name = base_image
         self.setup_commands = setup_commands or []
-        self.namespace = namespace
+        self.namespace = namespace or os.environ.get("K8S_NAMESPACE", "default")
+        self.image_pull_secret = image_pull_secret or os.environ.get("K8S_DOCKER_SECRET")
+        self.in_node_constraint = os.environ.get("K8S_NODE_CONSTRAINT", False)
         self.kubernetes_kwargs = kwargs  # e.g., nodeSelector, tolerations
         self.registry = registry.rstrip("/") + "/" if registry else ""
         self._pod_name = pod_name
@@ -498,6 +503,30 @@ def setup_pod(self, max_retries: int = 3) -> None:
                 f"with image: {self.registry}{self.base_image}"
             )
 
+            # set image pull secrets, don't override imagePullSecrets
+            if self.image_pull_secret and not "imagePullSecrets" in pod_spec_kwargs:
+                pod_spec_kwargs["imagePullSecrets"] = [{"name": self.image_pull_secret}]
+
+            # set in node constraint, don't override affinity
+            if self.in_node_constraint and not "affinity" in pod_spec_kwargs:
+                pod_spec_kwargs["affinity"] = {
+                    "nodeAffinity": {
+                        "requiredDuringSchedulingIgnoredDuringExecution": {
+                            "nodeSelectorTerms": [
+                                {
+                                    "matchExpressions": [
+                                        {
+                                            "key": "kubernetes.io/hostname",
+                                            "operator": "In",
+                                            "values": [os.environ["HOSTNAME"]],
+                                        }
+                                    ]
+                                }
+                            ]
+                        }
+                    }
+                }
+
             # Create pod specification for Kubernetes.
             pod_body = {
                 "apiVersion": "v1",

From d9b76c781f2f17749df6838d39520cfc2504ebea Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Thu, 27 Nov 2025 14:50:56 -0800
Subject: [PATCH 13/31] remove swesmith

---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index e60c649c..afe42474 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,7 +10,6 @@ transformers==4.51.3
 tiktoken==0.9.0
 docker==7.1.0
 swebench==4.0.3
-swesmith==0.0.4
 prompt_toolkit==3.0.51
 anthropic==0.51.0
 jinja2==3.1.6

From 928c1d83997ccbd7d801e3cf9e8bc46dec876135 Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Fri, 28 Nov 2025 07:48:33 -0800
Subject: [PATCH 14/31] load dataset as class method / setup_task

---
 debug_gym/gym/envs/aider.py               |  29 ++--
 debug_gym/gym/envs/env.py                 |   5 +-
 debug_gym/gym/envs/mini_nightmare.py      |  38 +++--
 debug_gym/gym/envs/r2egym.py              | 162 +++++++++++-----------
 debug_gym/gym/envs/swe_bench.py           | 107 +++++++-------
 debug_gym/gym/envs/swe_smith.py           | 152 ++++++++++----------
 debug_gym/gym/envs/swe_smith_constants.py |  11 +-
 debug_gym/gym/envs/swe_smith_utils.py     |   8 +-
 debug_gym/gym/terminals/kubernetes.py     |   7 +-
 debug_gym/gym/utils.py                    |  12 +-
 10 files changed, 281 insertions(+), 250 deletions(-)

diff --git a/debug_gym/gym/envs/aider.py b/debug_gym/gym/envs/aider.py
index 26776448..3056e0dd 100644
--- a/debug_gym/gym/envs/aider.py
+++ b/debug_gym/gym/envs/aider.py
@@ -2,6 +2,7 @@
 import subprocess
 import tempfile
 from pathlib import Path
+from typing import List
 
 import debug_gym.gym.utils as utils
 from debug_gym.constants import DEBUG_GYM_CACHE_DIR
@@ -62,6 +63,7 @@ class AiderBenchmarkEnv(RepoEnv):
 
     def __init__(
         self,
+        task_data: dict,
         entrypoint: str = "python -m pytest --tb=no -s .",
         terminal: Terminal | None = None,
         **kwargs,
@@ -73,6 +75,7 @@ def __init__(
         if hasattr(terminal, "base_image") and terminal.base_image is None:
             terminal.base_image = DOCKER_AIDER_IMAGE_NAME
 
+        self.task_data = task_data
         super().__init__(entrypoint=entrypoint, terminal=terminal, **kwargs)
 
     @property
@@ -91,10 +94,8 @@ def eval(self, **kwargs) -> EvalOutput:
         self.last_eval = EvalOutput(success, output)
         return self.last_eval
 
-    def setup_task(self, task_name: str, options: dict = None):
-        if task_name not in self.dataset:
-            raise ValueError(f"Task {task_name} not found in the dataset.")
-        self.current_task = self.dataset[task_name]
+    def setup_task(self, options: dict = None):
+        pass
 
     def setup_workspace(self):
         self.workspace.reset()
@@ -122,14 +123,20 @@ def setup_terminal(self):
         )  # Aider tasks come with those.
         self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'")
 
-    def load_dataset(self, problems: str | list[str] | None = None):
-        if isinstance(self.terminal, DockerTerminal):
-            build_docker_image(self.logger)
+    @classmethod
+    def load_dataset(
+        cls,
+        problems: str | list[str] | None = None,
+        build_image: bool = False,
+        logger: object = None,
+    ) -> dict:
+        if build_image:
+            build_docker_image(logger)
 
-        if not os.path.exists(self.REPO_PATH):
-            subprocess.run(["git", "clone", self.REPO_URL, self.REPO_PATH], check=True)
+        if not os.path.exists(cls.REPO_PATH):
+            subprocess.run(["git", "clone", cls.REPO_URL, cls.REPO_PATH], check=True)
 
-        practice_path = self.REPO_PATH / "exercises" / "practice"
+        practice_path = cls.REPO_PATH / "exercises" / "practice"
         directories = [d for d in practice_path.iterdir() if d.is_dir()]
 
         dataset = {}
@@ -166,5 +173,5 @@ def load_dataset(self, problems: str | list[str] | None = None):
             }
 
         problems = utils.filter_problems(dataset, problems)
-        dataset = {id: i for id, i in dataset.items() if id in problems}
+        dataset = {id: data for id, data in dataset.items() if id in problems}
         return dataset
diff --git a/debug_gym/gym/envs/env.py b/debug_gym/gym/envs/env.py
index 48807546..0021dc2b 100644
--- a/debug_gym/gym/envs/env.py
+++ b/debug_gym/gym/envs/env.py
@@ -292,7 +292,7 @@ def instructions(self) -> str:
         Override in subclasses for different behavior."""
         return ""
 
-    def setup_task(self, task_name: str, options: dict = None) -> None:
+    def setup_task(self, options: dict = None) -> None:
         """Setup the task information.
         Override in subclasses for different behavior. Called once at reset."""
         pass
@@ -325,8 +325,7 @@ def reset(self, *, options: dict = None):
         self.options = options if options is not None else self.options
         self.logger.debug("Resetting environment")
         self.close()  # Clean up previous workspace and terminal.
-        self.task_name = self.options.get("task_name")
-        self.setup_task(task_name=self.task_name, options=self.options)
+        self.setup_task(options=self.options)
         self.setup_workspace()
         self.setup_terminal()
         self._reset_env_state()
diff --git a/debug_gym/gym/envs/mini_nightmare.py b/debug_gym/gym/envs/mini_nightmare.py
index b5cee0a8..32937bfb 100644
--- a/debug_gym/gym/envs/mini_nightmare.py
+++ b/debug_gym/gym/envs/mini_nightmare.py
@@ -74,6 +74,7 @@ class MiniNightmareEnv(RepoEnv):
 
     def __init__(
         self,
+        task_data: dict,
         entrypoint: str = "python -m pytest --tb=no -s test.py",
         terminal: Terminal | None = None,
         **kwargs,
@@ -85,6 +86,9 @@ def __init__(
         if hasattr(terminal, "base_image") and terminal.base_image is None:
             terminal.base_image = DOCKER_MINI_NIGHTMARE_IMAGE_NAME
 
+        self.task_data = task_data
+        self.task_name = task_data["task_name"]
+
         super().__init__(entrypoint=entrypoint, terminal=terminal, **kwargs)
 
     @property
@@ -107,10 +111,8 @@ def eval(self, **kwargs) -> EvalOutput:
         self.last_eval = EvalOutput(success, output)
         return self.last_eval
 
-    def setup_task(self, task_name: str, options: dict = None):
-        if task_name not in self.dataset:
-            raise ValueError(f"Task {task_name} not found in the dataset.")
-        self.current_task = self.dataset[task_name]
+    def setup_task(self, options: dict = None):
+        pass
 
     def setup_workspace(self):
         self.workspace.reset()
@@ -138,19 +140,27 @@ def setup_terminal(self):
         )  # Mini-nightmare tasks come with those.
         self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'")
 
-    def load_dataset(self, problems: str | list[str] | None = None):
-        if isinstance(self.terminal, DockerTerminal):
-            build_docker_image(self.logger)
-
-        if not self.DATA_PATH.exists():
+    @classmethod
+    def load_dataset(
+        cls,
+        problems: str | list[str] | None = None,
+        build_image: bool = False,
+        logger: object = None,
+    ) -> dict:
+        if build_image:
+            build_docker_image(logger)
+
+        if not MiniNightmareEnv.DATA_PATH.exists():
             zipped_data = utils.download(
-                self.DATA_URL, self.DATA_PATH, f"Downloading mini-nightmare dataset."
+                MiniNightmareEnv.DATA_URL,
+                MiniNightmareEnv.DATA_PATH,
+                f"Downloading mini-nightmare dataset.",
             )
-            utils.unzip(zipped_data, dst=self.DATA_PATH.parent)
+            utils.unzip(zipped_data, dst=cls.DATA_PATH.parent)
 
         dataset = {}
-        for task_name in self.TASK_NAMES:
-            task_path = self.DATA_PATH / task_name
+        for task_name in cls.TASK_NAMES:
+            task_path = cls.DATA_PATH / task_name
             assert (task_path / "test.py").exists()
             assert (task_path / f"{task_name}_code.py").exists()
             assert (task_path / ".debugignore").exists()
@@ -162,5 +172,5 @@ def load_dataset(self, problems: str | list[str] | None = None):
             }
 
         problems = utils.filter_problems(dataset, problems)
-        dataset = {id: i for id, i in dataset.items() if id in problems}
+        dataset = {id: data for id, data in dataset.items() if id in problems}
         return dataset
diff --git a/debug_gym/gym/envs/r2egym.py b/debug_gym/gym/envs/r2egym.py
index b843d4a4..8e650400 100644
--- a/debug_gym/gym/envs/r2egym.py
+++ b/debug_gym/gym/envs/r2egym.py
@@ -59,73 +59,6 @@ def parse_log_pytest(log: str | None) -> dict[str, str]:
     return test_status_map
 
 
-def load_r2egym_dataset(
-    dataset_id: str = "R2E-Gym/R2E-Gym-Lite",
-    dataset_revision: str = "8d3163011f01f9393bb3dc7700497a79a8686ae5",
-    split: str = "train",
-    problems: list | None = None,
-    prepull_images: bool = False,
-    logger: DebugGymLogger | None = None,
-):
-    data_path = Path(dataset_id)
-    if data_path.is_file():
-        # Loading from local file.
-        if data_path.suffix.lower() == ".json":
-            ds = load_dataset("json", data_files=dataset_id)
-        elif data_path.suffix.lower() == ".parquet":
-            ds = load_dataset("parquet", data_files=dataset_id)
-    elif data_path.is_dir():
-        # Loading from local folder.
-        ds = load_from_disk(dataset_id)
-    else:
-        # Loading from HuggingFace or a folder.
-        ds = load_dataset(dataset_id, revision=dataset_revision)
-
-    # Select the split.
-    ds = ds[split]
-
-    # Load custom dataset splits from config.
-    with open(R2EGymEnv.CONFIG) as f:
-        custom_splits = yaml.safe_load(f)
-        excluded_ids = custom_splits.get("excluded", [])
-
-    # add instance id to each example (name of the image)
-    def extract_instance_id(docker_image: str) -> str:
-        return docker_image.split("/", 1)[-1]
-
-    # create a column "instance_id" in the dataset
-    instance_ids = [extract_instance_id(id) for id in ds["docker_image"]]
-    ds = ds.add_column("instance_id", instance_ids)
-
-    problems = filter_problems(ds["instance_id"], problems, custom_splits, excluded_ids)
-    ds = ds.filter(lambda example: example["instance_id"] in problems)
-
-    image_names = set(ds["docker_image"])
-    if logger is not None:
-        logger.debug(
-            f"Loaded {len(ds)} tasks across {len(image_names)} Docker images from {dataset_id}."
-        )
-
-    if prepull_images:
-        # Download all images needed for R2E-Gym.
-        client = docker.from_env()
-
-        existing_images = set(
-            tag for image in client.images.list() for tag in image.tags
-        )
-        missing_images = image_names - existing_images
-        if missing_images:
-            if logger is not None:
-                logger.warning(f"Found {len(missing_images)} missing Docker images.")
-                for i, image_name in enumerate(missing_images):
-                    if logger is not None:
-                        logger.warning(
-                            f"Pulling Docker image {i + 1}/{len(missing_images)} `{image_name}`."
-                        )
-                    client.images.pull(image_name)
-    return ds
-
-
 class R2EGymEnv(RepoEnv):
     CACHE = DEBUG_GYM_CACHE_DIR / "r2e-gym"
     CONFIG = importlib_files("debug_gym") / "gym" / "envs" / "configs" / "r2egym.yaml"
@@ -142,8 +75,8 @@ def __init__(
                 "R2EGymEnv only supports DockerTerminal and KubernetesTerminal."
             )
 
-        self.ds_row = task_data
-        self.setup_task(task_data=task_data)
+        self.task_data = task_data
+        self.setup_task()
 
         self.session_commands = []
         super().__init__(terminal=terminal, **kwargs)
@@ -156,25 +89,24 @@ def instructions(self) -> str:
             content = self.ds_row["problem_statement"]
             return re.search(r"\[ISSUE\](.*)\[/ISSUE\]", content, re.DOTALL).group(1)
         except Exception as e:
-            return self.ds_row["problem_statement"]
-
-    def setup_task(self, task_data: dict, options: dict = None):
-        self.ds_row = task_data
-        self.task_name = self.ds_row["instance_id"]
-        self.base_image = self.ds_row["docker_image"]
-        self.package_name = self.ds_row["repo_name"]
-        self.expected_output = json.loads(self.ds_row["expected_output_json"])
+            return self.task_data["problem_statement"]
+
+    def setup_task(self, options: dict = None):
+        self.task_name = self.task_data["instance_id"]
+        self.base_image = self.task_data["docker_image"]
+        self.package_name = self.task_data["repo_name"]
+        self.expected_output = json.loads(self.task_data["expected_output_json"])
         self.expected_output = decolor_dict_keys(self.expected_output)
         self.expected_output = {
             k.split(" - ")[0]: self.expected_output[k]
             for k in sorted(self.expected_output.keys())
         }
 
-        self.commit_hash = self.ds_row["commit_hash"]
+        self.commit_hash = self.task_data["commit_hash"]
 
         self.entrypoint = "python -m pytest -W ignore -rA r2e_tests"
         if self.package_name == "pillow":
-            test_file_codes = json.loads(self.ds_row["execution_result_content"])[
+            test_file_codes = json.loads(self.task_data["execution_result_content"])[
                 "test_file_codes"
             ]
             if any(["unittest" in test_code for test_code in test_file_codes]):
@@ -319,3 +251,75 @@ def calculate_score(self, eval_output: EvalOutput) -> int:
             reward = 1 if match else 0
 
         return reward
+
+    @classmethod
+    def load_dataset(
+        dataset_id: str = "R2E-Gym/R2E-Gym-Lite",
+        dataset_revision: str = "8d3163011f01f9393bb3dc7700497a79a8686ae5",
+        split: str = "train",
+        problems: list | None = None,
+        prepull_images: bool = False,
+        logger: DebugGymLogger | None = None,
+    ) -> dict:
+        data_path = Path(dataset_id)
+        if data_path.is_file():
+            # Loading from local file.
+            if data_path.suffix.lower() == ".json":
+                ds = load_dataset("json", data_files=dataset_id)
+            elif data_path.suffix.lower() == ".parquet":
+                ds = load_dataset("parquet", data_files=dataset_id)
+        elif data_path.is_dir():
+            # Loading from local folder.
+            ds = load_from_disk(dataset_id)
+        else:
+            # Loading from HuggingFace or a folder.
+            ds = load_dataset(dataset_id, revision=dataset_revision)
+
+        # Select the split.
+        ds = ds[split]
+
+        # Load custom dataset splits from config.
+        with open(R2EGymEnv.CONFIG) as f:
+            custom_splits = yaml.safe_load(f)
+            excluded_ids = custom_splits.get("excluded", [])
+
+        # add instance id to each example (name of the image)
+        def extract_instance_id(docker_image: str) -> str:
+            return docker_image.split("/", 1)[-1]
+
+        # create a column "instance_id" in the dataset
+        dataset = {}
+        for example in ds:
+            instance_id = extract_instance_id(example["docker_image"])
+            example["instance_id"] = instance_id
+            dataset[instance_id] = example
+
+        problems = filter_problems(dataset, problems, custom_splits, excluded_ids)
+        dataset = {pid: dataset[pid] for pid in problems}
+
+        image_names = set(example["docker_image"] for example in dataset.values())
+        if logger is not None:
+            logger.debug(
+                f"Loaded {len(dataset)} tasks across {len(image_names)} Docker images from {dataset_id}."
+            )
+
+        if prepull_images:
+            # Download all images needed for R2E-Gym.
+            client = docker.from_env()
+
+            existing_images = set(
+                tag for image in client.images.list() for tag in image.tags
+            )
+            missing_images = image_names - existing_images
+            if missing_images:
+                if logger is not None:
+                    logger.warning(
+                        f"Found {len(missing_images)} missing Docker images."
+                    )
+                    for i, image_name in enumerate(missing_images):
+                        if logger is not None:
+                            logger.warning(
+                                f"Pulling Docker image {i + 1}/{len(missing_images)} `{image_name}`."
+                            )
+                        client.images.pull(image_name)
+        return dataset
diff --git a/debug_gym/gym/envs/swe_bench.py b/debug_gym/gym/envs/swe_bench.py
index 891e0c2c..68566230 100644
--- a/debug_gym/gym/envs/swe_bench.py
+++ b/debug_gym/gym/envs/swe_bench.py
@@ -16,45 +16,6 @@
 from debug_gym.gym.utils import filter_problems
 
 
-def load_swebench_dataset(
-    dataset_id: str = "SWE-bench/SWE-bench_Verified",
-    dataset_revision: str = "99450355ca8c611021187a57ffac304b66666738",
-    split: str = "test",
-    problems: list | None = None,
-    prepull_images: bool = False,
-    logger: DebugGymLogger | None = None,
-):
-    ds = datasets.load_dataset(dataset_id, revision=dataset_revision)[split]
-    problems = filter_problems(ds["instance_id"], problems)
-
-    ds = ds.filter(lambda example: example["instance_id"] in problems)
-    instance_ids = ds["instance_id"]
-
-    image_names = set(
-        f"sweb.eval.x86_64.{id.replace('__', '_1776_')}" for id in instance_ids
-    )
-
-    if prepull_images:
-        # Download all images needed for SWE-Bench.
-        client = docker.from_env()
-        tagged_image_names = set(f"swebench/{name}:latest" for name in image_names)
-
-        existing_images = set(
-            tag for image in client.images.list() for tag in image.tags
-        )
-        missing_images = tagged_image_names - existing_images
-        if missing_images:
-            if logger:
-                logger.info(f"Found {len(missing_images)} missing Docker images.")
-            for i, image_name in enumerate(missing_images):
-                if logger:
-                    logger.info(
-                        f"Pulling Docker images {i + 1}/{len(missing_images)}: `{image_name}`."
-                    )
-                client.images.pull(image_name)
-    return ds
-
-
 class SWEBenchEnv(RepoEnv):
     CACHE = DEBUG_GYM_CACHE_DIR / "swe-bench"
 
@@ -70,33 +31,32 @@ def __init__(
                 f"{self.__class__.__name__} only supports DockerTerminal and KubernetesTerminal."
             )
 
-        self.ds_row = task_data
-        self.setup_task(self.ds_row)
+        self.task_data = task_data
+        self.setup_task()
         self.test_directives = []
         super().__init__(terminal=terminal, **kwargs)
 
     @property
     def instructions(self) -> str:
-        return self.ds_row["problem_statement"]
+        return self.task_data["problem_statement"]
 
-    def setup_task(self, task_data: dict, options: dict = None):
-        self.ds_row = task_data
-        self.task_name = task_data["instance_id"]
-        self.repo = self.ds_row["repo"]
+    def setup_task(self, options: dict = None):
+        self.task_name = self.task_data["instance_id"]
+        self.repo = self.task_data["repo"]
         self.package_name = self.repo.split("/")[1]
-        self.version = self.ds_row["version"]
+        self.version = self.task_data["version"]
         self.install_configs = MAP_REPO_VERSION_TO_SPECS[self.repo][self.version]
-        self.gold_patch = self.ds_row["patch"]
-        self.test_spec = make_test_spec(self.ds_row)
+        self.gold_patch = self.task_data["patch"]
+        self.test_spec = make_test_spec(self.task_data)
         self.base_image = f"swebench/{self.test_spec.instance_image_key}".replace(
             "__", "_1776_"
         )
-        self.base_commit = self.ds_row["base_commit"]
-        self.test_patch = self.ds_row["test_patch"]
-        self.fail_to_pass = json.loads(self.ds_row["FAIL_TO_PASS"])
-        self.pass_to_pass = json.loads(self.ds_row["PASS_TO_PASS"])
+        self.base_commit = self.task_data["base_commit"]
+        self.test_patch = self.task_data["test_patch"]
+        self.fail_to_pass = json.loads(self.task_data["FAIL_TO_PASS"])
+        self.pass_to_pass = json.loads(self.task_data["PASS_TO_PASS"])
         self.test_cmd = self.install_configs["test_cmd"]
-        self.test_directives = get_test_directives(self.ds_row)
+        self.test_directives = get_test_directives(self.task_data)
 
         self.entrypoint = " ".join([self.test_cmd, *self.test_directives])
 
@@ -211,3 +171,42 @@ def calculate_score(self, eval_output: EvalOutput) -> int:
         )
         assert score <= self.max_score
         return score
+
+    @classmethod
+    def load_dataset(
+        dataset_id: str = "SWE-bench/SWE-bench_Verified",
+        dataset_revision: str = "99450355ca8c611021187a57ffac304b66666738",
+        split: str = "test",
+        problems: list | None = None,
+        prepull_images: bool = False,
+        logger: DebugGymLogger | None = None,
+    ) -> dict:
+        ds = datasets.load_dataset(dataset_id, revision=dataset_revision)[split]
+
+        dataset = {problem["instance_id"]: problem for problem in ds}
+        problems = filter_problems(dataset, problems)
+        dataset = {id: i for id, i in dataset.items() if id in problems}
+
+        image_names = set(
+            f"sweb.eval.x86_64.{id.replace('__', '_1776_')}" for id in problems
+        )
+
+        if prepull_images:
+            # Download all images needed for SWE-Bench.
+            client = docker.from_env()
+            tagged_image_names = set(f"swebench/{name}:latest" for name in image_names)
+
+            existing_images = set(
+                tag for image in client.images.list() for tag in image.tags
+            )
+            missing_images = tagged_image_names - existing_images
+            if missing_images:
+                if logger:
+                    logger.info(f"Found {len(missing_images)} missing Docker images.")
+                for i, image_name in enumerate(missing_images):
+                    if logger:
+                        logger.info(
+                            f"Pulling Docker images {i + 1}/{len(missing_images)}: `{image_name}`."
+                        )
+                    client.images.pull(image_name)
+        return dataset
diff --git a/debug_gym/gym/envs/swe_smith.py b/debug_gym/gym/envs/swe_smith.py
index 6ba6fe03..e973376e 100644
--- a/debug_gym/gym/envs/swe_smith.py
+++ b/debug_gym/gym/envs/swe_smith.py
@@ -1,5 +1,6 @@
 from importlib.resources import files as importlib_files
 from pathlib import Path
+from typing import List
 
 import docker
 import yaml
@@ -15,75 +16,10 @@
 from debug_gym.constants import DEBUG_GYM_CACHE_DIR
 from debug_gym.gym.entities import EvalOutput
 from debug_gym.gym.envs.swe_bench import SWEBenchEnv
-from debug_gym.gym.terminals.kubernetes import KubernetesTerminal
 from debug_gym.gym.terminals.terminal import DebugGymLogger, Terminal
 from debug_gym.gym.utils import filter_problems
 
 
-def load_swesmith_dataset(
-    dataset_id: str = "SWE-bench/SWE-smith",
-    dataset_revision: str = "699b53400d3855206a0fbf3ff4beaf1a52f4f232",
-    split: str = "train",
-    problems: list | None = None,
-    prepull_images: bool = False,
-    logger: DebugGymLogger | None = None,
-):
-    data_path = Path(dataset_id)
-    if data_path.is_file():
-        # Loading from local file.
-        if data_path.suffix.lower() == ".json":
-            ds = load_dataset("json", data_files=dataset_id)
-        elif data_path.suffix.lower() == ".parquet":
-            ds = load_dataset("parquet", data_files=dataset_id)
-    elif data_path.is_dir():
-        # Loading from local folder.
-        ds = load_from_disk(dataset_id)
-    else:
-        # Loading from HuggingFace or a folder.
-        ds = load_dataset(dataset_id, revision=dataset_revision)
-
-    # Select the split.
-    ds = ds[split]
-
-    # Load custom dataset splits from config.
-    with open(SWESmithEnv.CONFIG) as f:
-        custom_splits = yaml.safe_load(f)
-        excluded_ids = custom_splits.get("excluded", [])
-
-    problems = filter_problems(ds["instance_id"], problems, custom_splits, excluded_ids)
-    ds = ds.filter(lambda example: example["instance_id"] in problems)
-
-    image_names = set(ds["image_name"])
-    if logger is not None:
-        logger.debug(
-            f"Loaded {len(ds)} tasks across {len(image_names)} Docker images from {dataset_id}."
-        )
-
-    if prepull_images:
-        # Download all images needed for SWE-Smith.
-        client = docker.from_env()
-        tagged_image_names = set(f"{DOCKER_ORG}/{name}:{TAG}" for name in image_names)
-
-        existing_images = set(
-            tag for image in client.images.list() for tag in image.tags
-        )
-        missing_images = tagged_image_names - existing_images
-        if missing_images:
-            if logger is not None:
-                logger.info(f"Found {len(missing_images)} missing Docker images.")
-
-            for image_name in missing_images:
-                docker_hub_image = image_name.replace("__", "_1776_")
-                if logger is not None:
-                    logger.info(
-                        f"Pulling Docker image `{docker_hub_image}` to `{image_name}`."
-                    )
-                client.images.pull(docker_hub_image)
-                # Rename images via tagging
-                client.images.get(docker_hub_image).tag(image_name)
-    return ds
-
-
 class SWESmithEnv(SWEBenchEnv):
     CACHE = DEBUG_GYM_CACHE_DIR / "swe-smith"
     CONFIG = (
@@ -102,22 +38,21 @@ def __init__(
             **kwargs,
         )
 
-    def setup_task(self, task_data: dict, options: dict = None):
-        self.task_name = task_data["instance_id"]
-        self.ds_row = task_data
+    def setup_task(self, options: dict = None):
+        self.task_name = self.task_data["instance_id"]
         self.base_commit = (
-            self.ds_row["base_commit"] if "base_commit" in self.ds_row else "main"
+            self.task_data["base_commit"] if "base_commit" in self.task_data else "main"
         )
-        self.branch_name = self.ds_row["instance_id"]
-        self.bug_patch = self.ds_row["patch"]
-        self.image_name = self.ds_row["image_name"]
+        self.branch_name = self.task_data["instance_id"]
+        self.bug_patch = self.task_data["patch"]
+        self.image_name = self.task_data["image_name"]
         self.repo, self.commit = get_repo_commit_from_image_name(self.image_name)
         self.install_configs = MAP_REPO_TO_SPECS[self.repo][self.commit]
         self.base_image = f"{DOCKER_ORG}/{self.image_name}:{TAG}"
         self.package_name = self.repo.split("/")[1]
-        self.test_cmd, self.test_directives = get_test_command(self.ds_row)
-        self.fail_to_pass = self.ds_row["FAIL_TO_PASS"]
-        self.pass_to_pass = self.ds_row["PASS_TO_PASS"]
+        self.test_cmd, self.test_directives = get_test_command(self.task_data)
+        self.fail_to_pass = self.task_data["FAIL_TO_PASS"]
+        self.pass_to_pass = self.task_data["PASS_TO_PASS"]
         self.log_parser = MAP_REPO_TO_PARSER.get(self.repo, parse_log_pytest)
 
         if self.package_name == "python-colorlog":
@@ -219,3 +154,70 @@ def eval(self, **kwargs) -> EvalOutput:
         success, output = self.terminal.run(self.entrypoint, timeout=self.run_timeout)
         self.last_eval = EvalOutput(success, output)
         return self.last_eval
+
+    @classmethod
+    def load_dataset(
+        dataset_id: str = "SWE-bench/SWE-smith",
+        dataset_revision: str = "699b53400d3855206a0fbf3ff4beaf1a52f4f232",
+        split: str = "train",
+        problems: list | None = None,
+        prepull_images: bool = False,
+        logger: DebugGymLogger | None = None,
+    ) -> dict:
+        data_path = Path(dataset_id)
+        if data_path.is_file():
+            # Loading from local file.
+            if data_path.suffix.lower() == ".json":
+                ds = load_dataset("json", data_files=dataset_id)
+            elif data_path.suffix.lower() == ".parquet":
+                ds = load_dataset("parquet", data_files=dataset_id)
+        elif data_path.is_dir():
+            # Loading from local folder.
+            ds = load_from_disk(dataset_id)
+        else:
+            # Loading from HuggingFace or a folder.
+            ds = load_dataset(dataset_id, revision=dataset_revision)
+
+        # Select the split.
+        ds = ds[split]
+
+        # Load custom dataset splits from config.
+        with open(SWESmithEnv.CONFIG) as f:
+            custom_splits = yaml.safe_load(f)
+            excluded_ids = custom_splits.get("excluded", [])
+
+        dataset = {d["instance_id"]: d for d in ds}
+        problems = filter_problems(dataset, problems, custom_splits, excluded_ids)
+        dataset = {pid: dataset[pid] for pid in problems}
+
+        image_names = set([problem["image_name"] for problem in dataset.values()])
+        if logger is not None:
+            logger.debug(
+                f"Loaded {len(dataset)} tasks across {len(image_names)} Docker images from {dataset_id}."
+            )
+
+        if prepull_images:
+            # Download all images needed for SWE-Smith.
+            client = docker.from_env()
+            tagged_image_names = set(
+                f"{DOCKER_ORG}/{name}:{TAG}" for name in image_names
+            )
+
+            existing_images = set(
+                tag for image in client.images.list() for tag in image.tags
+            )
+            missing_images = tagged_image_names - existing_images
+            if missing_images:
+                if logger is not None:
+                    logger.info(f"Found {len(missing_images)} missing Docker images.")
+
+                for image_name in missing_images:
+                    docker_hub_image = image_name.replace("__", "_1776_")
+                    if logger is not None:
+                        logger.info(
+                            f"Pulling Docker image `{docker_hub_image}` to `{image_name}`."
+                        )
+                    client.images.pull(docker_hub_image)
+                    # Rename images via tagging
+                    client.images.get(docker_hub_image).tag(image_name)
+        return dataset
diff --git a/debug_gym/gym/envs/swe_smith_constants.py b/debug_gym/gym/envs/swe_smith_constants.py
index 4eff67c0..7d877245 100755
--- a/debug_gym/gym/envs/swe_smith_constants.py
+++ b/debug_gym/gym/envs/swe_smith_constants.py
@@ -1,4 +1,3 @@
-
 DOCKER_ORG = "jyangballin"
 TAG = "latest"
 
@@ -78,9 +77,11 @@
         for v in CMAKE_VERSIONS
     ]
     + [
-        f"tar -xvzf cmake-{v}-Linux-x86_64.tar.gz && mv cmake-{v}-Linux-x86_64 /usr/share/cmake-{v}"
-        if v not in ["3.23.5", "3.27.9"]
-        else f"tar -xvzf cmake-{v}-Linux-x86_64.tar.gz && mv cmake-{v}-linux-x86_64 /usr/share/cmake-{v}"
+        (
+            f"tar -xvzf cmake-{v}-Linux-x86_64.tar.gz && mv cmake-{v}-Linux-x86_64 /usr/share/cmake-{v}"
+            if v not in ["3.23.5", "3.27.9"]
+            else f"tar -xvzf cmake-{v}-Linux-x86_64.tar.gz && mv cmake-{v}-linux-x86_64 /usr/share/cmake-{v}"
+        )
         for v in CMAKE_VERSIONS
     ]
     + [
@@ -683,4 +684,4 @@
     "un33k/python-slugify": SPECS_REPO_PYTHON_SLUGIFY,
     "vi3k6i5/flashtext": SPECS_REPO_FLASHTEXT,
     "weaveworks/grafanalib": SPECS_REPO_GRAFANALIB,
-}
\ No newline at end of file
+}
diff --git a/debug_gym/gym/envs/swe_smith_utils.py b/debug_gym/gym/envs/swe_smith_utils.py
index 65c085f8..496200f8 100755
--- a/debug_gym/gym/envs/swe_smith_utils.py
+++ b/debug_gym/gym/envs/swe_smith_utils.py
@@ -1,6 +1,7 @@
 """
 Pulled from official SWE-Smith repository.
 """
+
 import os
 import re
 from pathlib import Path
@@ -19,12 +20,14 @@
 PASS_TO_PASS = "PASS_TO_PASS"
 INSTANCE_REF = "instance_ref"
 
+
 def get_repo_name(repo, commit) -> str:
     """
     Get the SWE-smith GitHub repository name for a repository at a specific commit.
     """
     return f"{repo.replace('/', '__')}.{commit[:8]}"
 
+
 def get_test_paths(dir_path: str, ext: str = ".py") -> list[Path]:
     """
     Get all testing file paths relative to the given directory.
@@ -54,6 +57,7 @@ def get_full_commit(repo, partial_commit) -> str:
 
     raise ValueError(f"Commit {partial_commit} not found for repository {repo}.")
 
+
 def get_repo_commit_from_image_name(image_name: str) -> tuple[str, str]:
     """
     Get the repository and commit from a docker image ID.
@@ -88,10 +92,12 @@ def get_test_command_mypy(instance: dict):
         )
     return f'{MAP_REPO_TO_SPECS[repo][commit][KEY_TEST_CMD]} "{test_keys}"'
 
+
 MAP_REPO_TO_TEST_CMD = {
     "python/mypy": get_test_command_mypy,
 }
 
+
 def get_test_command(instance: dict):
     """
     Given a repo/commit pair and a (gold) patch, return the test command to run
@@ -187,4 +193,4 @@ def get_test_command(instance: dict):
                 final.append(test_file)
         test_command += f" {' '.join(set(final))}"
 
-    return test_command, rv
\ No newline at end of file
+    return test_command, rv
diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py
index 54f22b0b..1b6f14f0 100644
--- a/debug_gym/gym/terminals/kubernetes.py
+++ b/debug_gym/gym/terminals/kubernetes.py
@@ -260,6 +260,7 @@ class KubernetesTerminal(Terminal):
     """
     Note: reads values of env variables K8S_NAMESPACE, K8S_DOCKER_SECRET, K8S_DOCKER_CONSTRAINT.
     """
+
     def __init__(
         self,
         working_dir: str | None = None,
@@ -290,8 +291,10 @@ def __init__(
         self._task_name = base_image
         self.setup_commands = setup_commands or []
         self.namespace = namespace or os.environ.get("K8S_NAMESPACE", "default")
-        self.image_pull_secret = image_pull_secret or os.environ.get("K8S_DOCKER_SECRET")
-        self.in_node_constraint = os.environ.get("K8S_NODE_CONSTRAINT", False)
+        self.image_pull_secret = image_pull_secret or os.environ.get(
+            "K8S_DOCKER_SECRET"
+        )
+        self.in_node_constraint = os.environ.get("K8S_IN_NODE_CONSTRAINT", False)
         self.kubernetes_kwargs = kwargs  # e.g., nodeSelector, tolerations
         self.registry = registry.rstrip("/") + "/" if registry else ""
         self._pod_name = pod_name
diff --git a/debug_gym/gym/utils.py b/debug_gym/gym/utils.py
index 1d95294b..24372a44 100644
--- a/debug_gym/gym/utils.py
+++ b/debug_gym/gym/utils.py
@@ -196,7 +196,7 @@ def extract_reward_from_pytest_output(output):
 
 
 def filter_problems(
-    dataset_instances: list[str],
+    dataset: dict[str, Any],
     problems: str | list[str] | None = None,
     custom_splits: dict[str, Any] | None = None,
     excluded_ids: list[str] | None = None,
@@ -208,9 +208,9 @@ def filter_problems(
     if not isinstance(problems, str):
         # Check that all problems are valid task names.
         for problem in problems:
-            if problem not in dataset_instances:
+            if problem not in dataset:
                 raise ValueError(
-                    f"Invalid problem id: '{problem}'.\nChoose from: {sorted(dataset_instances)}"
+                    f"Invalid problem id: '{problem}'.\nChoose from: {sorted(dataset)}"
                 )
 
         # Make sure all problems are unique.
@@ -220,14 +220,14 @@ def filter_problems(
         return problems  # Assuming a list of problem IDs.
 
     if problems == "all":
-        return [k for k in dataset_instances if k not in excluded_ids]
-    elif problems in dataset_instances:
+        return [k for k in dataset if k not in excluded_ids]
+    elif problems in dataset:
         return [problems]  # Single task
     elif problems in custom_splits:
         return custom_splits[problems]
     else:
         raise ValueError(
-            f"Invalid split or problem id: '{problems}'.\nChoose from: {sorted(dataset_instances) + ['all'] + sorted(custom_splits)}"
+            f"Invalid split or problem id: '{problems}'.\nChoose from: {sorted(dataset) + ['all'] + sorted(custom_splits)}"
         )
 
 

From b338e1ceefb3f2097ad7a8118879ea1cd6c7adf9 Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Fri, 28 Nov 2025 07:53:01 -0800
Subject: [PATCH 15/31] fix tests

---
 tests/gym/envs/conftest.py    | 11 ++---------
 tests/gym/envs/test_r2egym.py | 10 +++++-----
 2 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/tests/gym/envs/conftest.py b/tests/gym/envs/conftest.py
index 94e70a47..60afdefa 100644
--- a/tests/gym/envs/conftest.py
+++ b/tests/gym/envs/conftest.py
@@ -34,15 +34,8 @@ def make_env_factory(env_name, worker_id, tmp_path_factory):
     env_class = kwargs.pop("env_class")
 
     def _make_env():
-        if issubclass(env_class, (SWEBenchEnv, SWEBenchDebugEnv)):
-            fn = load_swebench_dataset
-        elif issubclass(env_class, SWESmithEnv):
-            fn = load_swesmith_dataset
-        elif issubclass(env_class, R2EGymEnv):
-            fn = load_r2egym_dataset
-        else:
-            raise ValueError(f"Unknown env_class: {env_class}")
-        task_data = fn(problems=kwargs["problems"])[0]
+        dataset = env_class.load_dataset(problems=kwargs["problems"])
+        task_data = next(iter(dataset.values()))
         return env_class(task_data=task_data)
 
     if worker_id == "master":
diff --git a/tests/gym/envs/test_r2egym.py b/tests/gym/envs/test_r2egym.py
index e0b1a635..6fda7e9b 100644
--- a/tests/gym/envs/test_r2egym.py
+++ b/tests/gym/envs/test_r2egym.py
@@ -7,7 +7,7 @@
 
 from debug_gym.agents.solution_agent import AgentSolution
 from debug_gym.gym.entities import Observation
-from debug_gym.gym.envs.r2egym import R2EGymEnv, load_r2egym_dataset
+from debug_gym.gym.envs.r2egym import R2EGymEnv
 from debug_gym.gym.terminals.docker import DockerTerminal
 from debug_gym.gym.tools.tool import ToolCall
 from debug_gym.gym.tools.toolbox import Toolbox
@@ -70,17 +70,17 @@ def test_load_dataset_from_parquet(mock_docker_from_env, tmp_path):
     pq.write_table(table, str(parquet_file))
 
     # Load the dataset from the Parquet file
-    dataset = load_r2egym_dataset(
-        dataset_id=str(parquet_file), split="train"
-    )
+    dataset = R2EGymEnv.load_dataset(dataset_id=str(parquet_file), split="train")
+    dataset_entry = next(iter(dataset.values()))
 
     # Verify the dataset contains the expected features
-    assert sorted(dataset.features.keys()) == sorted(
+    assert sorted(dataset_entry) == sorted(
         [
             "commit_hash",
             "docker_image",
             "execution_result_content",
             "expected_output_json",
+            "instance_id",
             "modified_entity_summaries",
             "modified_files",
             "num_non_test_files",

From 0858bea62bd2399ae4cc887e7d336e68625affc8 Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Fri, 28 Nov 2025 07:54:08 -0800
Subject: [PATCH 16/31] change run.py

---
 scripts/run.py | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/scripts/run.py b/scripts/run.py
index e9f0b64a..fcbde3fb 100644
--- a/scripts/run.py
+++ b/scripts/run.py
@@ -259,20 +259,9 @@ def main():
         "problems": config.get("problems", "all"),
         "prepull_images": config.env_kwargs.get("prepull_images", False),
     }
-    load_dataset_fn = {
-        "swebench": load_swebench_dataset,
-        "swebench-debug": load_swebench_dataset,
-        "swesmith": load_swesmith_dataset,
-        "r2egym": load_r2egym_dataset,
-    }
-
-    if config["benchmark"] in load_dataset_fn:
-        dataset = load_dataset_fn[config["benchmark"]](
-            **dataset_info,
-        )
-    else:
-        raise ValueError(f"Unsupported benchmark: {config['benchmark']}")
-
+    dataset = select_env(config.get("benchmark")).load_dataset(
+        **dataset_info
+    )
     problems = sorted(dataset)
 
     if args.list:

From 35a4f666f03af5ac9f223deaba98dc107ffe1e5e Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Fri, 28 Nov 2025 08:18:44 -0800
Subject: [PATCH 17/31] blacked

---
 scripts/run.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/scripts/run.py b/scripts/run.py
index fcbde3fb..1e119327 100644
--- a/scripts/run.py
+++ b/scripts/run.py
@@ -259,9 +259,7 @@ def main():
         "problems": config.get("problems", "all"),
         "prepull_images": config.env_kwargs.get("prepull_images", False),
     }
-    dataset = select_env(config.get("benchmark")).load_dataset(
-        **dataset_info
-    )
+    dataset = select_env(config.get("benchmark")).load_dataset(**dataset_info)
     problems = sorted(dataset)
 
     if args.list:

From e9600ed550ef1461ae2a9b284e5499cecc1fe99e Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Fri, 28 Nov 2025 08:19:56 -0800
Subject: [PATCH 18/31] remove imports

---
 tests/gym/envs/conftest.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/gym/envs/conftest.py b/tests/gym/envs/conftest.py
index 60afdefa..8806dcee 100644
--- a/tests/gym/envs/conftest.py
+++ b/tests/gym/envs/conftest.py
@@ -2,10 +2,8 @@
 from filelock import FileLock
 
 from debug_gym.gym.envs import R2EGymEnv, SWEBenchEnv, SWESmithEnv
-from debug_gym.gym.envs.r2egym import load_r2egym_dataset
-from debug_gym.gym.envs.swe_bench import load_swebench_dataset
 from debug_gym.gym.envs.swe_bench_debug import SWEBenchDebugEnv
-from debug_gym.gym.envs.swe_smith import load_swesmith_dataset
+
 
 BUILD_ENV_CONFIGS = {
     "swe_smith": {

From 81b2eda1bd2da47b27d42d0282130f176830c945 Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Fri, 28 Nov 2025 08:23:50 -0800
Subject: [PATCH 19/31] task name / task data adaptation

---
 scripts/run.py | 45 ++++++++++++++++++++++-----------------------
 1 file changed, 22 insertions(+), 23 deletions(-)

diff --git a/scripts/run.py b/scripts/run.py
index 1e119327..928b921f 100644
--- a/scripts/run.py
+++ b/scripts/run.py
@@ -44,7 +44,7 @@ def timeout_handler(signum, frame):
         signal.alarm(timeout_seconds)
 
 
-def run_agent(args, problem: dict, config: dict):
+def run_agent(args, task_name: str, task_data: dict, config: dict):
     set_signal(args.timeout)
     success = True
     env = None
@@ -54,22 +54,22 @@ def run_agent(args, problem: dict, config: dict):
     report_progress_error = True
 
     exp_path = Path(config["output_path"]) / config["uuid"]
-    problem_path = exp_path / problem
+    task_path = exp_path / task_name
 
     task_logger = DebugGymLogger(
-        problem,
-        log_dir=problem_path,
+        task_name,
+        log_dir=task_path,
         level=args.logging_level,
         mode="w" if args.force_all else "a",
     )
     try:
-        previous_run = load_previous_run_status(problem_path, problem)
+        previous_run = load_previous_run_status(task_path, task_name)
         if (
             not args.force_all
             and previous_run is not None
             and previous_run.status in ["resolved", "unresolved"]
         ):
-            task_logger.debug(f"Previous run found: {problem_path}")
+            task_logger.debug(f"Previous run found: {task_path}")
             success = previous_run.status == "resolved"
             task_logger.debug(f"Previous run status: {previous_run.status}")
             if not args.force_failed or success:
@@ -82,11 +82,11 @@ def run_agent(args, problem: dict, config: dict):
                     max_score=previous_run.max_score,
                     status=status,
                 )
-                task_logger.debug(f"Skipping {problem}, already done.")
+                task_logger.debug(f"Skipping {task_name}, already done.")
                 return success
 
         task_logger.report_progress(
-            problem_id=problem,
+            problem_id=task_name,
             step=0,
             total_steps=1,
             score=0,
@@ -94,7 +94,7 @@ def run_agent(args, problem: dict, config: dict):
             status="running",
         )
 
-        env = create_env(config, problem, task_logger)
+        env = create_env(config, task_data, task_logger)
         add_tools(env, config, task_logger)
 
         llm = LLM.instantiate(
@@ -107,17 +107,16 @@ def run_agent(args, problem: dict, config: dict):
         agent = create_agent(
             config["agent_type"],
             agent_args=agent_args,
-            env=env,
             llm=llm,
             logger=task_logger,
         )
 
         try:
-            success = agent.run(task_name=problem, debug=args.debug)
+            success = agent.run(env, debug=args.debug)
         except KeyboardInterrupt:
             task_logger.error("Agent run was interrupted by user.")
             task_logger.report_progress(
-                problem_id=problem,
+                problem_id=task_name,
                 step=1,
                 total_steps=1,
                 score=0,
@@ -128,11 +127,11 @@ def run_agent(args, problem: dict, config: dict):
             raise
         except AgentTimeoutException:
             task_logger.error(
-                f"Timeout: Problem `{problem}` exceeded "
+                f"Timeout: Problem `{task_name}` exceeded "
                 f"the time limit of {args.timeout} seconds."
             )
             task_logger.report_progress(
-                problem_id=problem,
+                problem_id=task_name,
                 step=1,
                 total_steps=1,
                 score=0,
@@ -146,23 +145,23 @@ def run_agent(args, problem: dict, config: dict):
             raise
 
         # save trajectory
-        save_trajectory(agent, problem, problem_path, task_logger)
+        save_trajectory(agent, task_name, task_path, task_logger)
 
         # optionally apply patch
         if config["save_patch"]:
-            save_patch(env, problem_path, task_logger)
+            save_patch(env, task_path, task_logger)
 
     except Exception as e:
         task_logger.error(
-            f"Task Error: {problem} - {e!r}. Run with --very-verbose "
+            f"Task Error: {task_name} - {e!r}. Run with --very-verbose "
             f"or check {task_logger.log_file} for more information."
         )
         task_logger.debug(
-            f"Task {problem} generated an exception: {e!r}. Traceback: {traceback.format_exc()}"
+            f"Task {task_name} generated an exception: {e!r}. Traceback: {traceback.format_exc()}"
         )
         if report_progress_error:
             task_logger.report_progress(
-                problem_id=problem,
+                problem_id=task_name,
                 step=1,
                 total_steps=1,
                 score=0,
@@ -181,11 +180,11 @@ def run_agent(args, problem: dict, config: dict):
     return success
 
 
-def create_env(config: dict, problem: dict, logger: DebugGymLogger):
+def create_env(config: dict, task_data: dict, logger: DebugGymLogger):
     terminal = select_terminal(config.get("terminal"), logger, uuid=config["uuid"])
     env_class = select_env(config.get("benchmark"))
     env = env_class(
-        task_data=problem,
+        task_data=task_data,
         terminal=terminal,
         logger=logger,
         **config["env_kwargs"],
@@ -297,7 +296,7 @@ def main():
         if num_workers == 1:  # run sequentially for easier debugging
             for problem in problems:
                 try:
-                    success = run_agent(args, problem, config)
+                    success = run_agent(args, problem, dataset[problem], config)
                 except AgentTimeoutException:
                     pass  # Handled in run_agent, just continue
                 except (KeyboardInterrupt, Exception) as e:
@@ -307,7 +306,7 @@ def main():
                 num_workers, initializer=DebugGymLogger.set_as_worker
             ) as executor:
                 futures = {
-                    executor.submit(run_agent, args, problem, config): problem
+                    executor.submit(run_agent, args, problem, dataset[problem], config): problem
                     for problem in problems
                 }
                 for future in as_completed(futures):

From 3468a627004643d26d1ff704bc830641e2821d91 Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Fri, 28 Nov 2025 08:26:25 -0800
Subject: [PATCH 20/31] pre commit

---
 debug_gym/gym/envs/swe_smith.py       |  7 +++----
 debug_gym/gym/envs/swe_smith_utils.py |  2 +-
 debug_gym/gym/terminals/kubernetes.py |  2 +-
 scripts/run.py                        | 11 ++++++-----
 tests/gym/envs/conftest.py            |  1 -
 5 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/debug_gym/gym/envs/swe_smith.py b/debug_gym/gym/envs/swe_smith.py
index e973376e..9ab436a1 100644
--- a/debug_gym/gym/envs/swe_smith.py
+++ b/debug_gym/gym/envs/swe_smith.py
@@ -5,10 +5,6 @@
 import docker
 import yaml
 from datasets import load_dataset, load_from_disk
-
-from .swe_smith_constants import DOCKER_ORG, TAG, MAP_REPO_TO_SPECS
-from .swe_smith_utils import get_test_command, get_repo_commit_from_image_name
-
 from swebench.harness.constants import TestStatus
 from swebench.harness.grading import MAP_REPO_TO_PARSER
 from swebench.harness.log_parsers.python import parse_log_pytest
@@ -19,6 +15,9 @@
 from debug_gym.gym.terminals.terminal import DebugGymLogger, Terminal
 from debug_gym.gym.utils import filter_problems
 
+from .swe_smith_constants import DOCKER_ORG, MAP_REPO_TO_SPECS, TAG
+from .swe_smith_utils import get_repo_commit_from_image_name, get_test_command
+
 
 class SWESmithEnv(SWEBenchEnv):
     CACHE = DEBUG_GYM_CACHE_DIR / "swe-smith"
diff --git a/debug_gym/gym/envs/swe_smith_utils.py b/debug_gym/gym/envs/swe_smith_utils.py
index 496200f8..727ef233 100755
--- a/debug_gym/gym/envs/swe_smith_utils.py
+++ b/debug_gym/gym/envs/swe_smith_utils.py
@@ -5,6 +5,7 @@
 import os
 import re
 from pathlib import Path
+
 from unidiff import PatchSet
 
 from .swe_smith_constants import (
@@ -15,7 +16,6 @@
     MAP_REPO_TO_SPECS,
 )
 
-
 FAIL_TO_PASS = "FAIL_TO_PASS"
 PASS_TO_PASS = "PASS_TO_PASS"
 INSTANCE_REF = "instance_ref"
diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py
index 1b6f14f0..7830d65b 100644
--- a/debug_gym/gym/terminals/kubernetes.py
+++ b/debug_gym/gym/terminals/kubernetes.py
@@ -1,9 +1,9 @@
 import atexit
+import hashlib
 import json
 import os
 import random
 import subprocess
-import hashlib
 import time
 import uuid
 from pathlib import Path
diff --git a/scripts/run.py b/scripts/run.py
index 928b921f..bc339b3c 100644
--- a/scripts/run.py
+++ b/scripts/run.py
@@ -12,16 +12,15 @@
 from debug_gym.agents.base_agent import AGENT_REGISTRY, AgentArgs, create_agent
 from debug_gym.agents.utils import load_config, save_patch, save_trajectory
 from debug_gym.gym.envs import select_env
+from debug_gym.gym.envs.r2egym import load_r2egym_dataset
+from debug_gym.gym.envs.swe_bench import load_swebench_dataset
+from debug_gym.gym.envs.swe_smith import load_swesmith_dataset
 from debug_gym.gym.terminals import select_terminal
 from debug_gym.gym.tools.toolbox import Toolbox
 from debug_gym.llms.base import LLM
 from debug_gym.llms.human import Human
 from debug_gym.logger import DebugGymLogger, load_previous_run_status
 
-from debug_gym.gym.envs.swe_bench import load_swebench_dataset
-from debug_gym.gym.envs.swe_smith import load_swesmith_dataset
-from debug_gym.gym.envs.r2egym import load_r2egym_dataset
-
 
 class AgentTimeoutException(BaseException):
     """Custom exception to handle timeouts in agent
@@ -306,7 +305,9 @@ def main():
                 num_workers, initializer=DebugGymLogger.set_as_worker
             ) as executor:
                 futures = {
-                    executor.submit(run_agent, args, problem, dataset[problem], config): problem
+                    executor.submit(
+                        run_agent, args, problem, dataset[problem], config
+                    ): problem
                     for problem in problems
                 }
                 for future in as_completed(futures):
diff --git a/tests/gym/envs/conftest.py b/tests/gym/envs/conftest.py
index 8806dcee..e79af5ba 100644
--- a/tests/gym/envs/conftest.py
+++ b/tests/gym/envs/conftest.py
@@ -4,7 +4,6 @@
 from debug_gym.gym.envs import R2EGymEnv, SWEBenchEnv, SWESmithEnv
 from debug_gym.gym.envs.swe_bench_debug import SWEBenchDebugEnv
 
-
 BUILD_ENV_CONFIGS = {
     "swe_smith": {
         "env_class": SWESmithEnv,

From c56579caa2209d3e4d562740a1290ee995524661 Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Fri, 28 Nov 2025 08:34:27 -0800
Subject: [PATCH 21/31] cls keyword

---
 debug_gym/gym/envs/r2egym.py    | 1 +
 debug_gym/gym/envs/swe_bench.py | 1 +
 debug_gym/gym/envs/swe_smith.py | 1 +
 3 files changed, 3 insertions(+)

diff --git a/debug_gym/gym/envs/r2egym.py b/debug_gym/gym/envs/r2egym.py
index 8e650400..9746c537 100644
--- a/debug_gym/gym/envs/r2egym.py
+++ b/debug_gym/gym/envs/r2egym.py
@@ -254,6 +254,7 @@ def calculate_score(self, eval_output: EvalOutput) -> int:
 
     @classmethod
     def load_dataset(
+        cls,
         dataset_id: str = "R2E-Gym/R2E-Gym-Lite",
         dataset_revision: str = "8d3163011f01f9393bb3dc7700497a79a8686ae5",
         split: str = "train",
diff --git a/debug_gym/gym/envs/swe_bench.py b/debug_gym/gym/envs/swe_bench.py
index 68566230..6c43437b 100644
--- a/debug_gym/gym/envs/swe_bench.py
+++ b/debug_gym/gym/envs/swe_bench.py
@@ -174,6 +174,7 @@ def calculate_score(self, eval_output: EvalOutput) -> int:
 
     @classmethod
     def load_dataset(
+        cls,
         dataset_id: str = "SWE-bench/SWE-bench_Verified",
         dataset_revision: str = "99450355ca8c611021187a57ffac304b66666738",
         split: str = "test",
diff --git a/debug_gym/gym/envs/swe_smith.py b/debug_gym/gym/envs/swe_smith.py
index 9ab436a1..1f8ce79c 100644
--- a/debug_gym/gym/envs/swe_smith.py
+++ b/debug_gym/gym/envs/swe_smith.py
@@ -156,6 +156,7 @@ def eval(self, **kwargs) -> EvalOutput:
 
     @classmethod
     def load_dataset(
+        cls,
         dataset_id: str = "SWE-bench/SWE-smith",
         dataset_revision: str = "699b53400d3855206a0fbf3ff4beaf1a52f4f232",
         split: str = "train",

From 4b01ac849e4d389e5c1af8b7bd12df57590f9fc2 Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni <alsordon@microsoft.com>
Date: Fri, 28 Nov 2025 08:39:23 -0800
Subject: [PATCH 22/31] remove load dataset

---
 debug_gym/gym/envs/env.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/debug_gym/gym/envs/env.py b/debug_gym/gym/envs/env.py
index 0021dc2b..d86e93c4 100644
--- a/debug_gym/gym/envs/env.py
+++ b/debug_gym/gym/envs/env.py
@@ -235,7 +235,6 @@ def __init__(
             )
 
         self.workspace = Workspace(self.terminal, logger=self.logger)
-        self.dataset = self.load_dataset(problems)
         self.set_entrypoints(self._entrypoint, self._debug_entrypoint)
 
     def _reset_env_state(self):
@@ -503,6 +502,3 @@ def close(self):
 
     def __del__(self):
         self.close()
-
-    def load_dataset(self, problems: str | list[str] | None = None):
-        return {"custom": None}

From 0dd0f4ed18e48a9ebcf490a1a5026103702eb512 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= <marc.cote.19@gmail.com>
Date: Fri, 28 Nov 2025 12:52:45 -0800
Subject: [PATCH 23/31] Working on tests + refactoring

---
 debug_gym/gym/envs/__init__.py       |  3 ++
 debug_gym/gym/envs/aider.py          |  2 +-
 debug_gym/gym/envs/env.py            | 58 ++++++++-------------
 debug_gym/gym/envs/local.py          | 52 +++++++++++++++++++
 debug_gym/gym/envs/mini_nightmare.py |  2 +-
 debug_gym/gym/envs/r2egym.py         | 14 ++---
 debug_gym/gym/envs/swe_bench.py      | 11 ++--
 debug_gym/gym/envs/swe_smith.py      | 24 +++------
 scripts/config.yaml                  | 18 +++----
 scripts/config_aider.yaml            | 11 ++--
 scripts/config_mini_nightmare.yaml   | 13 ++---
 scripts/config_r2egym.yaml           | 13 +++--
 scripts/config_swebench.yaml         | 17 +++---
 scripts/config_swesmith.yaml         | 18 +++----
 scripts/run.py                       |  8 +--
 tests/gym/envs/test_r2egym.py        | 64 +++++++++--------------
 tests/gym/envs/test_swe_bench.py     | 75 ++++++++++++++-------------
 tests/gym/envs/test_swe_smith.py     | 77 +++++++++++-----------------
 18 files changed, 237 insertions(+), 243 deletions(-)
 create mode 100644 debug_gym/gym/envs/local.py

diff --git a/debug_gym/gym/envs/__init__.py b/debug_gym/gym/envs/__init__.py
index 86ef4cab..4cad96f0 100644
--- a/debug_gym/gym/envs/__init__.py
+++ b/debug_gym/gym/envs/__init__.py
@@ -1,5 +1,6 @@
 from debug_gym.gym.envs.aider import AiderBenchmarkEnv
 from debug_gym.gym.envs.env import RepoEnv, TooledEnv
+from debug_gym.gym.envs.local import LocalEnv
 from debug_gym.gym.envs.mini_nightmare import MiniNightmareEnv
 from debug_gym.gym.envs.r2egym import R2EGymEnv
 from debug_gym.gym.envs.swe_bench import SWEBenchEnv
@@ -11,6 +12,8 @@ def select_env(env_type: str = None) -> type[RepoEnv]:
     match env_type:
         case None:
             return RepoEnv
+        case "local":
+            return LocalEnv
         case "aider":
             return AiderBenchmarkEnv
         case "swebench":
diff --git a/debug_gym/gym/envs/aider.py b/debug_gym/gym/envs/aider.py
index 3056e0dd..98421e57 100644
--- a/debug_gym/gym/envs/aider.py
+++ b/debug_gym/gym/envs/aider.py
@@ -94,7 +94,7 @@ def eval(self, **kwargs) -> EvalOutput:
         self.last_eval = EvalOutput(success, output)
         return self.last_eval
 
-    def setup_task(self, options: dict = None):
+    def setup_task(self):
         pass
 
     def setup_workspace(self):
diff --git a/debug_gym/gym/envs/env.py b/debug_gym/gym/envs/env.py
index d86e93c4..13b54d76 100644
--- a/debug_gym/gym/envs/env.py
+++ b/debug_gym/gym/envs/env.py
@@ -201,38 +201,27 @@ class RepoEnv(TooledEnv):
 
     def __init__(
         self,
-        path: str | None = None,
+        task_data: dict,
         entrypoint: str = "python -m pytest -sq .",
         debug_entrypoint: str | None = None,
         max_score: int | None = None,
-        readonly_patterns: list[str] | None = None,  # TODO: remove
         run_timeout: int | None = None,
         terminal: Terminal | None = None,
         logger: DebugGymLogger | None = None,
-        problems: str | list[str] | None = None,
         **kwargs,
     ):
         super().__init__()
 
-        self.path = path
+        self.task_data = task_data
         self.max_score = max_score
         self.run_timeout = run_timeout
-        self.terminal = terminal or LocalTerminal()  # TODO: default to DockerTerminal
+        self.terminal = terminal
         self._entrypoint = entrypoint
         self._debug_entrypoint = debug_entrypoint
         self.logger = logger or DebugGymLogger("debug-gym")
         self.infos: EnvInfo | None = None
         self.rng = None
         self.additional_kwargs = kwargs
-        self.task_name: str | None = None
-        self.options: dict = {}
-
-        if "auto_eval_on_rewrite" in kwargs:
-            raise ValueError(
-                "The 'auto_eval_on_rewrite' parameter is no longer supported. "
-                "Please remove it from your initialization arguments."
-                "Instead, set 'auto_eval_on_rewrite' in the EvalTool instance."
-            )
 
         self.workspace = Workspace(self.terminal, logger=self.logger)
         self.set_entrypoints(self._entrypoint, self._debug_entrypoint)
@@ -289,44 +278,39 @@ def working_dir(self) -> Path:
     def instructions(self) -> str:
         """Instructions for the current task.
         Override in subclasses for different behavior."""
-        return ""
+        raise NotImplementedError(
+            "Subclasses must implement the instructions property."
+        )
 
-    def setup_task(self, options: dict = None) -> None:
+    @property
+    def task_name(self) -> str:
+        raise NotImplementedError("Subclasses must implement the task_name property.")
+
+    def setup_task(self) -> None:
         """Setup the task information.
         Override in subclasses for different behavior. Called once at reset."""
-        pass
+        raise NotImplementedError("Subclasses must implement setup_task method.")
 
     def setup_workspace(self) -> None:
         """Setup the workspace.
         Override in subclasses for different behavior. Called once at reset."""
-        self.workspace.reset()
-        self.workspace.copy_content(self.path)
-        self.workspace.setup_file_filters()
+        raise NotImplementedError("Subclasses must implement setup_workspace method.")
 
     def setup_terminal(self) -> None:
         """Setup the terminal.
         Override in subclasses for different behavior. Called once at reset."""
-
-        self.logger.debug(f"Configuring {self.terminal}...")
-
-        self.terminal.run("git init -b main")
-        self.terminal.run("git config user.name 'debug-gym'")
-        self.terminal.run("git config user.email '<>'")
-
-        self.terminal.run("git add *")
-        self.terminal.run("git commit -am 'Init'")
-
-        self.terminal.run("git add .debugignore .debugreadonly")
-        self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'")
+        raise NotImplementedError("Subclasses must implement setup_terminal method.")
 
     def reset(self, *, options: dict = None):
         """Resets the environment and returns eval as the initial observation."""
-        self.options = options if options is not None else self.options
+        options = options if options is not None else {}
         self.logger.debug("Resetting environment")
-        self.close()  # Clean up previous workspace and terminal.
-        self.setup_task(options=self.options)
-        self.setup_workspace()
-        self.setup_terminal()
+        if options.get("reset_runtime", True):
+            self.close()  # Clean up previous workspace and terminal.
+            self.setup_task()
+            self.setup_workspace()
+            self.setup_terminal()
+
         self._reset_env_state()
 
         # Notify all tools that the environment is reset and get their observations
diff --git a/debug_gym/gym/envs/local.py b/debug_gym/gym/envs/local.py
new file mode 100644
index 00000000..c3b8d54e
--- /dev/null
+++ b/debug_gym/gym/envs/local.py
@@ -0,0 +1,52 @@
+from debug_gym.gym.envs.env import RepoEnv
+
+
+class LocalEnv(RepoEnv):
+
+    def __init__(
+        self,
+        path: str,
+        entrypoint: str = "python -m pytest -sq .",
+        debug_entrypoint: str | None = None,
+        **kwargs,
+    ):
+        task_data = {"path": path}
+        super().__init__(
+            task_data=task_data,
+            entrypoint=entrypoint,
+            debug_entrypoint=debug_entrypoint,
+            **kwargs,
+        )
+
+    @property
+    def instruction(self) -> str:
+        return f"Debug the local codebase at {self.path}. Investigate the repository, figure out the root cause, then rewrite the code to fix the issue."
+
+    @property
+    def task(self) -> str:
+        return self.task_data["path"].split("/")[-1]
+
+    def setup_task(self) -> None:
+        """Setup the task information. Called once at reset."""
+        self.path = self.task_data["path"]
+
+    def setup_workspace(self) -> None:
+        """Setup the workspace. Called once at reset."""
+        self.workspace.reset()
+        self.workspace.copy_content(self.path)
+        self.workspace.setup_file_filters()
+
+    def setup_terminal(self) -> None:
+        """Setup the terminal. Called once at reset."""
+
+        self.logger.debug(f"Configuring {self.terminal}...")
+
+        self.terminal.run("git init -b main")
+        self.terminal.run("git config user.name 'debug-gym'")
+        self.terminal.run("git config user.email '<>'")
+
+        self.terminal.run("git add *")
+        self.terminal.run("git commit -am 'Init'")
+
+        self.terminal.run("git add .debugignore .debugreadonly")
+        self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'")
diff --git a/debug_gym/gym/envs/mini_nightmare.py b/debug_gym/gym/envs/mini_nightmare.py
index 32937bfb..2c59213b 100644
--- a/debug_gym/gym/envs/mini_nightmare.py
+++ b/debug_gym/gym/envs/mini_nightmare.py
@@ -111,7 +111,7 @@ def eval(self, **kwargs) -> EvalOutput:
         self.last_eval = EvalOutput(success, output)
         return self.last_eval
 
-    def setup_task(self, options: dict = None):
+    def setup_task(self):
         pass
 
     def setup_workspace(self):
diff --git a/debug_gym/gym/envs/r2egym.py b/debug_gym/gym/envs/r2egym.py
index 9746c537..17cd5ac7 100644
--- a/debug_gym/gym/envs/r2egym.py
+++ b/debug_gym/gym/envs/r2egym.py
@@ -75,24 +75,24 @@ def __init__(
                 "R2EGymEnv only supports DockerTerminal and KubernetesTerminal."
             )
 
-        self.task_data = task_data
-        self.setup_task()
-
+        super().__init__(task_data=task_data, terminal=terminal, **kwargs)
         self.session_commands = []
-        super().__init__(terminal=terminal, **kwargs)
+
+    @property
+    def task_name(self) -> str:
+        return self.task_data["instance_id"]
 
     @property
     def instructions(self) -> str:
         # try getting the content inside of [ISSUE] [/ISSUE] using regex tags for ds['problem_statement'] else return ds['problem_statement']
         # ref: https://github.com/R2E-Gym/R2E-Gym/blob/main/src/r2egym/agenthub/runtime/docker.py#L592
         try:
-            content = self.ds_row["problem_statement"]
+            content = self.task_data["problem_statement"]
             return re.search(r"\[ISSUE\](.*)\[/ISSUE\]", content, re.DOTALL).group(1)
         except Exception as e:
             return self.task_data["problem_statement"]
 
-    def setup_task(self, options: dict = None):
-        self.task_name = self.task_data["instance_id"]
+    def setup_task(self):
         self.base_image = self.task_data["docker_image"]
         self.package_name = self.task_data["repo_name"]
         self.expected_output = json.loads(self.task_data["expected_output_json"])
diff --git a/debug_gym/gym/envs/swe_bench.py b/debug_gym/gym/envs/swe_bench.py
index 6c43437b..1f7d8a41 100644
--- a/debug_gym/gym/envs/swe_bench.py
+++ b/debug_gym/gym/envs/swe_bench.py
@@ -31,17 +31,18 @@ def __init__(
                 f"{self.__class__.__name__} only supports DockerTerminal and KubernetesTerminal."
             )
 
-        self.task_data = task_data
-        self.setup_task()
         self.test_directives = []
-        super().__init__(terminal=terminal, **kwargs)
+        super().__init__(task_data=task_data, terminal=terminal, **kwargs)
 
     @property
     def instructions(self) -> str:
         return self.task_data["problem_statement"]
 
-    def setup_task(self, options: dict = None):
-        self.task_name = self.task_data["instance_id"]
+    @property
+    def task_name(self) -> str:
+        return self.task_data["instance_id"]
+
+    def setup_task(self):
         self.repo = self.task_data["repo"]
         self.package_name = self.repo.split("/")[1]
         self.version = self.task_data["version"]
diff --git a/debug_gym/gym/envs/swe_smith.py b/debug_gym/gym/envs/swe_smith.py
index 1f8ce79c..8511a1a7 100644
--- a/debug_gym/gym/envs/swe_smith.py
+++ b/debug_gym/gym/envs/swe_smith.py
@@ -5,9 +5,12 @@
 import docker
 import yaml
 from datasets import load_dataset, load_from_disk
-from swebench.harness.constants import TestStatus
-from swebench.harness.grading import MAP_REPO_TO_PARSER
-from swebench.harness.log_parsers.python import parse_log_pytest
+from swesmith.build_repo.download_images import DOCKER_ORG, TAG
+from swesmith.constants import MAP_REPO_TO_SPECS
+from swesmith.harness.grading import TestStatus
+from swesmith.harness.log_parsers import MAP_REPO_TO_PARSER, parse_log_pytest
+from swesmith.harness.utils import get_test_command
+from swesmith.utils import get_repo_commit_from_image_name
 
 from debug_gym.constants import DEBUG_GYM_CACHE_DIR
 from debug_gym.gym.entities import EvalOutput
@@ -25,20 +28,7 @@ class SWESmithEnv(SWEBenchEnv):
         importlib_files("debug_gym") / "gym" / "envs" / "configs" / "swe_smith.yaml"
     )
 
-    def __init__(
-        self,
-        task_data: dict,
-        terminal: Terminal | None = None,
-        **kwargs,
-    ):
-        super().__init__(
-            task_data=task_data,
-            terminal=terminal,
-            **kwargs,
-        )
-
-    def setup_task(self, options: dict = None):
-        self.task_name = self.task_data["instance_id"]
+    def setup_task(self):
         self.base_commit = (
             self.task_data["base_commit"] if "base_commit" in self.task_data else "main"
         )
diff --git a/scripts/config.yaml b/scripts/config.yaml
index ee3952c5..62e0f075 100644
--- a/scripts/config.yaml
+++ b/scripts/config.yaml
@@ -1,16 +1,14 @@
 base:
     # Environment configs
     output_path: "exps/pytorch"
-    env_kwargs: {
-        "path": "data/pytorch",
-        "entrypoint": "python -m pytest -sv test.py",
-        "debug_entrypoint": "python -m pdb -m pytest -s test.py",
-        "run_timeout": 10,
-    }
-    tools: ["pdb", "view", "rewrite"]
-    terminal: {
-        type: "docker",  # "local", "docker", or "kubernetes"
-    }
+    env:
+        type: "local"
+        path: "data/pytorch"
+        entrypoint: "python -m pytest -sv test.py"
+        debug_entrypoint: "python -m pdb -m pytest -s test.py"
+        run_timeout: 10
+    terminal:
+        type: "docker"  # "local", "docker", or "kubernetes"
 
     # LLM configs
     llm_name: "gpt-4o"
diff --git a/scripts/config_aider.yaml b/scripts/config_aider.yaml
index 88dd68fb..09f25411 100644
--- a/scripts/config_aider.yaml
+++ b/scripts/config_aider.yaml
@@ -3,12 +3,11 @@ base:
     output_path: "exps/aider"
     benchmark: "aider"
     problems: "all"  # list of problems, e.g., ["wordy"], or "all"
-    env_kwargs: {
-        "run_timeout": 20,
-    }
-    terminal: {
-        type: "docker",  # "docker", "kubernetes", or "local"
-    }
+    env:
+        type: "aider"
+        run_timeout: 20
+    terminal:
+        type: "docker"  # "docker", "kubernetes", or "local"
 
     # LLM configs
     llm_name: "gpt-4o"
diff --git a/scripts/config_mini_nightmare.yaml b/scripts/config_mini_nightmare.yaml
index 88fbc08a..e97b4345 100644
--- a/scripts/config_mini_nightmare.yaml
+++ b/scripts/config_mini_nightmare.yaml
@@ -3,14 +3,11 @@ base:
     output_path: "exps/mini_nightmare"
     benchmark: "mini_nightmare"
     problems: "all"  # list of problems, e.g., ["config"], or "all"
-    env_kwargs: {
-        "run_timeout": 30,
-        # shortcut features
-    }
-
-    terminal: {
-        type: "docker",  # "docker", "kubernetes", or "local"
-    }
+    env:
+        type: "mini_nightmare"
+        run_timeout: 30
+    terminal:
+        type: "docker"  # "docker", "kubernetes", or "local"
 
     # LLM configs
     llm_name: "gpt-4o"
diff --git a/scripts/config_r2egym.yaml b/scripts/config_r2egym.yaml
index 8d14b79e..57829fd9 100644
--- a/scripts/config_r2egym.yaml
+++ b/scripts/config_r2egym.yaml
@@ -3,14 +3,13 @@ base:
     output_path: "exps/re2gym"
     benchmark: "r2egym"
     problems: "all"  # list of problems, e.g., ["astropy__astropy-12907"], or strings like "test-125" (defined in gym/envs/configs), or "all",
-    env_kwargs: {
-        "run_timeout": 300,
-        dataset_id: "R2E-Gym/R2E-Gym-Lite",
+    env:
+        type: "r2egym"
+        run_timeout: 300
+        dataset_id: "R2E-Gym/R2E-Gym-Lite"
         dataset_revision: "8d3163011f01f9393bb3dc7700497a79a8686ae5"
-    }
-    terminal: {
-        type: "docker",  # "docker", "kubernetes"
-    }
+    terminal:
+        type: "docker"  # "docker", "kubernetes"
 
     # LLM configs
     llm_name: "gpt-4o"
diff --git a/scripts/config_swebench.yaml b/scripts/config_swebench.yaml
index 8bc0ba55..b19b5b36 100644
--- a/scripts/config_swebench.yaml
+++ b/scripts/config_swebench.yaml
@@ -3,14 +3,13 @@ base:
     output_path: "exps/swebench-verified"
     benchmark: "swebench-debug"
     problems: "all"  # list of problems, e.g., ["astropy__astropy-12907"], or "all"
-    env_kwargs: {
-        "run_timeout": 300,
-        "dataset_id": "SWE-bench/SWE-bench_Verified",
-        "dataset_revision": "99450355ca8c611021187a57ffac304b66666738",
-    }
-    terminal: {
-        type: "docker",  # "docker", "kubernetes"
-    }
+    env:
+        type: "swebench-debug"
+        run_timeout: 300
+        dataset_id: "SWE-bench/SWE-bench_Verified"
+        dataset_revision: "99450355ca8c611021187a57ffac304b66666738"
+    terminal:
+        type: "docker"  # "docker", "kubernetes"
 
     # LLM configs
     llm_name: "gpt-4o"
@@ -66,6 +65,8 @@ solution_agent:
 
 swe_agent:
     benchmark: "swebench"
+    env:
+        type: "swebench"
     max_steps: 100
     max_rewrite_steps: 20
     tools:
diff --git a/scripts/config_swesmith.yaml b/scripts/config_swesmith.yaml
index 5862e240..26bb08da 100644
--- a/scripts/config_swesmith.yaml
+++ b/scripts/config_swesmith.yaml
@@ -1,15 +1,15 @@
 base:
     # Environment configs
     output_path: "exps/swesmith"
-    benchmark: "swesmith"
-    problems: "all"  # list of problems, e.g., ["astropy__astropy-12907"], or strings like "test-125" (defined in gym/envs/configs), or "all",
-    env_kwargs: {
-        "run_timeout": 300,
-        "dataset_id": "SWE-bench/SWE-smith"
-    }
-    terminal: {
-        type: "docker",  # "docker", "kubernetes"
-    }
+    env:
+        # type: "swesmith"  # Not needed; it will be inferred from the dataset.
+        run_timeout: 300
+    terminal:
+        type: "docker"  # "docker", "kubernetes"
+    dataset:
+        type: "swesmith"
+        dataset_id: "SWE-bench/SWE-smith"
+        problems: "all"  # list of problems, e.g., ["astropy__astropy-12907"], or strings like "test-125" (defined in gym/envs/configs), or "all",
 
     # LLM configs
     llm_name: "gpt-4o"
diff --git a/scripts/run.py b/scripts/run.py
index bc339b3c..a2aac006 100644
--- a/scripts/run.py
+++ b/scripts/run.py
@@ -186,7 +186,7 @@ def create_env(config: dict, task_data: dict, logger: DebugGymLogger):
         task_data=task_data,
         terminal=terminal,
         logger=logger,
-        **config["env_kwargs"],
+        **config.get("env", {}),
     )
     return env
 
@@ -252,10 +252,10 @@ def main():
 
     # Create the environment to get the list of problems to run.
     dataset_info = {
-        "dataset_id": config.env_kwargs.get("dataset_id"),
-        "dataset_revision": config.env_kwargs.get("dataset_revision"),
+        "dataset_id": config.get("env", {}).get("dataset_id"),
+        "dataset_revision": config.get("env", {}).get("dataset_revision"),
         "problems": config.get("problems", "all"),
-        "prepull_images": config.env_kwargs.get("prepull_images", False),
+        "prepull_images": config.get("env", {}).get("prepull_images", False),
     }
     dataset = select_env(config.get("benchmark")).load_dataset(**dataset_info)
     problems = sorted(dataset)
diff --git a/tests/gym/envs/test_r2egym.py b/tests/gym/envs/test_r2egym.py
index 6fda7e9b..3aaa663a 100644
--- a/tests/gym/envs/test_r2egym.py
+++ b/tests/gym/envs/test_r2egym.py
@@ -1,4 +1,3 @@
-from pathlib import Path
 from unittest.mock import MagicMock, patch
 
 import pyarrow as pa
@@ -8,7 +7,6 @@
 from debug_gym.agents.solution_agent import AgentSolution
 from debug_gym.gym.entities import Observation
 from debug_gym.gym.envs.r2egym import R2EGymEnv
-from debug_gym.gym.terminals.docker import DockerTerminal
 from debug_gym.gym.tools.tool import ToolCall
 from debug_gym.gym.tools.toolbox import Toolbox
 
@@ -16,14 +14,19 @@
 @pytest.if_docker_running
 def test_load_dataset(get_r2egym_env):
     env = get_r2egym_env()
-    assert env.dataset_id == "R2E-Gym/R2E-Gym-Lite"
-    # check if the dataset contains features that R2EGymEnv expects
-    assert sorted(env.ds.features.keys()) == sorted(
+
+    dataset = env.load_dataset()
+    task_name = "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"
+    assert task_name in dataset
+
+    task_data = next(iter(dataset.values()))
+    assert sorted(task_data.keys()) == sorted(
         [
             "commit_hash",
             "docker_image",
             "execution_result_content",
             "expected_output_json",
+            "instance_id",
             "modified_entity_summaries",
             "modified_files",
             "num_non_test_files",
@@ -38,20 +41,15 @@ def test_load_dataset(get_r2egym_env):
     )
 
 
-@patch("docker.from_env")
-def test_load_dataset_from_parquet(mock_docker_from_env, tmp_path):
+def test_load_dataset_from_parquet(tmp_path):
     """Test loading R2EGym dataset from a local Parquet file."""
-    # Mock Docker client to avoid trying to pull images
-    mock_docker_client = MagicMock()
-    mock_docker_client.images.list.return_value = []
-    mock_docker_from_env.return_value = mock_docker_client
 
     # Create a minimal test Parquet file with expected schema
     parquet_file = tmp_path / "test_dataset.parquet"
-
+    docker_image = "test_repo:test_hash_123"
     data = {
         "commit_hash": ["test_hash_123"],
-        "docker_image": ["test_repo:test_hash_123"],
+        "docker_image": [docker_image],
         "execution_result_content": ["test execution result"],
         "expected_output_json": ['{"test": "output"}'],
         "modified_entity_summaries": ["test summaries"],
@@ -96,25 +94,25 @@ def test_load_dataset_from_parquet(mock_docker_from_env, tmp_path):
 
     # Verify the dataset has the expected data
     assert len(dataset) == 1
-    assert dataset[0]["docker_image"] == "test_repo:test_hash_123"
-    assert dataset[0]["commit_hash"] == "test_hash_123"
-    assert "Test problem statement" in dataset[0]["problem_statement"]
+    task_name = docker_image  # For R2EGym, we use docker_image as instance_id
+    assert docker_image in dataset
+    assert dataset[task_name]["docker_image"] == "test_repo:test_hash_123"
+    assert dataset[task_name]["commit_hash"] == "test_hash_123"
+    assert "Test problem statement" in dataset[task_name]["problem_statement"]
 
 
 @pytest.if_docker_running
 def test_instructions(get_r2egym_env):
     env = get_r2egym_env()
-    env.setup_task("aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324")
     # Instructions might be wrapped by [ISSUE] [/ISSUE]
-    assert env.instructions in env.ds_row["problem_statement"]
+    assert env.instructions in env.task_data["problem_statement"]
 
 
 @pytest.if_docker_running
 def test_setup_task(get_r2egym_env):
     env = get_r2egym_env()
-    task_name = "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"
-    env.setup_task(task_name)
-    assert env.task_name == task_name
+    assert env.task_name == "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"
+    env.setup_task()
     assert (
         env.base_image
         == "namanjain12/aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"
@@ -127,8 +125,7 @@ def test_setup_task(get_r2egym_env):
 @pytest.if_docker_running
 def test_setup_terminal(get_r2egym_env):
     env = get_r2egym_env()
-    task_name = "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"
-    env.reset(options={"task_name": task_name})
+    env.reset()
     _, output = env.terminal.run(f"ls -a")
     assert ".git" in output
     assert "r2e_tests" in output
@@ -139,9 +136,7 @@ def test_setup_terminal(get_r2egym_env):
 def test_reset_and_step(get_r2egym_env):
     env = get_r2egym_env()
     env.add_tool(Toolbox.get_tool("eval"))
-    env_info = env.reset(
-        options={"task_name": "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"}
-    )
+    env_info = env.reset()
 
     assert env.instructions == env_info.step_observation.observation
     assert "short test summary info" in env_info.eval_observation.observation
@@ -196,9 +191,7 @@ def test_reset_and_step(get_r2egym_env):
 @pytest.if_docker_running
 def test_readonly_file(get_r2egym_env):
     env = get_r2egym_env()
-    env_info = env.reset(
-        options={"task_name": "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"}
-    )
+    env_info = env.reset()
     assert env.workspace._is_readonly_func("/testbed/r2e_tests/test_1.py")
 
     env.add_tool(Toolbox.get_tool("view"))
@@ -228,10 +221,7 @@ def test_readonly_file(get_r2egym_env):
 def test_apply_gold_patch(get_r2egym_env):
     env = get_r2egym_env()
     env.add_tool(Toolbox.get_tool("eval"))
-    env_info = env.reset(
-        options={"task_name": "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"}
-    )
-
+    env_info = env.reset()
     assert not env_info.terminated
     assert not env_info.resolved
     assert env_info.score == env.score == 0
@@ -246,19 +236,17 @@ def test_apply_gold_patch(get_r2egym_env):
 def test_running_solution_agent(get_r2egym_env, tmp_path):
     """End-to-end SolutionAgent run for R2E-Gym environment, asserting successful resolution after gold patch."""
     env = get_r2egym_env()
-    task_name = "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"
     config = {
         "output_path": str(tmp_path),
         "random_seed": 0,
         "memory_size": 8,
         "max_steps": 1,
         "max_rewrite_steps": 1,
-        "env_kwargs": {},
     }
     for tool_name in ["pdb", "eval", "submit"]:
         env.add_tool(Toolbox.get_tool(tool_name))
     agent = AgentSolution(agent_args=config, llm=None, logger=env.logger)
-    env.reset(options={"task_name": task_name})
+    env.reset()
     success = agent.run(env)
     assert success
 
@@ -267,9 +255,7 @@ def test_running_solution_agent(get_r2egym_env, tmp_path):
 def test_debug_entrypoint_contains_pdb(get_r2egym_env):
     """Ensure the environment's debug_entrypoint includes '-m pdb' for interactive debugging."""
     env = get_r2egym_env()
-    env.reset(
-        options={"task_name": "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"}
-    )
+    env.reset()
     assert (
         "python -m pdb" in env.debug_entrypoint
     ), f"Expected '-m pdb' in debug_entrypoint, got: {env.debug_entrypoint}"
diff --git a/tests/gym/envs/test_swe_bench.py b/tests/gym/envs/test_swe_bench.py
index c8f86cb4..c198751e 100644
--- a/tests/gym/envs/test_swe_bench.py
+++ b/tests/gym/envs/test_swe_bench.py
@@ -10,16 +10,14 @@
 @pytest.if_docker_running
 def test_instructions(get_swe_bench_env):
     env = get_swe_bench_env()
-    env.ds_row = {"problem_statement": "Test problem statement"}
-    expected_instructions = "Test problem statement"
-    assert env.instructions == expected_instructions
+    assert env.instructions == env.task_data["problem_statement"]
 
 
 @pytest.if_docker_running
 def test_reset_and_step(get_swe_bench_env):
     env = get_swe_bench_env()
     env.add_tool(Toolbox.get_tool("eval"))
-    env_info = env.reset(options={"task_name": "astropy__astropy-14096"})
+    env_info = env.reset()
 
     assert env.instructions == env_info.step_observation.observation
     assert "short test summary info" in env_info.eval_observation.observation
@@ -99,46 +97,51 @@ def test_readonly_file(get_swe_bench_env):
     assert "|-- test_sky_coord.py (read-only)" in env_info.step_observation.observation
 
 
-@pytest.if_docker_running
 def test_load_dataset(get_swe_bench_env):
     env = get_swe_bench_env()
-    assert env.dataset_id == "SWE-bench/SWE-bench_Verified"
+
+    dataset = env.load_dataset()
     task_name = "astropy__astropy-14096"
-    assert task_name in env.dataset.keys()
-    assert list(env.ds.features.keys()) == [
-        "repo",
-        "instance_id",
-        "base_commit",
-        "patch",
-        "test_patch",
-        "problem_statement",
-        "hints_text",
-        "created_at",
-        "version",
-        "FAIL_TO_PASS",
-        "PASS_TO_PASS",
-        "environment_setup_commit",
-        "difficulty",
-    ]
+    assert task_name in dataset
+
+    task_data = next(iter(dataset.values()))
+    assert sorted(task_data.keys()) == sorted(
+        [
+            "repo",
+            "instance_id",
+            "base_commit",
+            "patch",
+            "test_patch",
+            "problem_statement",
+            "hints_text",
+            "created_at",
+            "version",
+            "FAIL_TO_PASS",
+            "PASS_TO_PASS",
+            "environment_setup_commit",
+            "difficulty",
+        ]
+    )
 
 
-@pytest.if_docker_running
 def test_setup_task(get_swe_bench_env):
     env = get_swe_bench_env()
     task_name = "astropy__astropy-14096"
-    env.setup_task(task_name)
     assert env.task_name == task_name
-    assert env.ds_row["repo"] == "astropy/astropy"
-    assert env.ds_row["version"] == "5.1"
-    assert isinstance(env.ds_row, dict)
-    assert isinstance(env.install_configs, dict)
+    env.setup_task()
+    assert env.repo == "astropy/astropy"
+    assert env.version == "5.1"
+    assert env.package_name == "astropy"
+    assert (
+        env.base_image == "swebench/sweb.eval.x86_64.astropy_1776_astropy-14096:latest"
+    )
 
 
 @pytest.if_docker_running
 def test_setup_terminal(get_swe_bench_env):
     env = get_swe_bench_env()
     task_name = "astropy__astropy-14096"
-    env.reset(options={"task_name": task_name})
+    env.reset()
     _, git_logs = env.terminal.run("git log -n 4")
     assert env.base_commit in git_logs
     assert f"Applying test patch for {task_name}" not in git_logs
@@ -167,7 +170,7 @@ def test_patch_property(tmp_path, get_swe_bench_env):
     env = get_swe_bench_env()
 
     # Reset with a task to set up the environment
-    env.reset(options={"task_name": "astropy__astropy-14096"})
+    env.reset()
 
     # Initially, there should be no changes (empty patch)
     initial_patch = env.patch
@@ -218,7 +221,7 @@ def new_function():
 def test_apply_gold_patch(get_swe_bench_env):
     env = get_swe_bench_env()
     env.add_tool(Toolbox.get_tool("eval"))
-    env_info = env.reset(options={"task_name": "astropy__astropy-14096"})
+    env_info = env.reset()
 
     assert not env_info.terminated
     assert not env_info.resolved
@@ -242,12 +245,11 @@ def test_running_solution_agent(get_swe_bench_env, tmp_path):
         # Optional values that BaseAgent.run would use; harmless to include here.
         "max_steps": 1,
         "max_rewrite_steps": 1,
-        "env_kwargs": {},
     }
     for tool_name in ["pdb", "submit"]:
         env.add_tool(Toolbox.get_tool(tool_name))
     agent = AgentSolution(agent_args=config, llm=None, logger=env.logger)
-    env.reset(options={"task_name": "astropy__astropy-14096"})
+    env.reset()
     success = agent.run(env)
     assert success
 
@@ -256,7 +258,7 @@ def test_running_solution_agent(get_swe_bench_env, tmp_path):
 def test_debug_entrypoint_contains_pdb(get_swe_bench_env):
     """Ensure the environment's debug_entrypoint includes '-m pdb' for interactive debugging."""
     env = get_swe_bench_env()
-    env.reset(options={"task_name": "astropy__astropy-14096"})
+    env.reset()
     assert (
         "python -m pdb" in env.debug_entrypoint
     ), f"Expected '-m pdb' in debug_entrypoint, got: {env.debug_entrypoint}"
@@ -266,7 +268,7 @@ def test_debug_entrypoint_contains_pdb(get_swe_bench_env):
 def test_setup_terminal_debug_mode(get_swe_bench_debug_env):
     env = get_swe_bench_debug_env()
     task_name = "astropy__astropy-14096"
-    env.reset(options={"task_name": task_name})
+    env.reset()
     _, git_logs = env.terminal.run("git log -n 4")
     assert env.base_commit in git_logs
     assert f"Applying test patch for {task_name}" in git_logs
@@ -287,11 +289,10 @@ def test_running_solution_agent_in_debug_mode(get_swe_bench_debug_env, tmp_path)
         # Optional values that BaseAgent.run would use; harmless to include here.
         "max_steps": 1,
         "max_rewrite_steps": 1,
-        "env_kwargs": {},
     }
     for tool_name in ["pdb", "eval", "submit"]:
         env.add_tool(Toolbox.get_tool(tool_name))
     agent = AgentSolution(agent_args=config, llm=None, logger=env.logger)
-    env.reset(options={"task_name": "astropy__astropy-14096"})
+    env.reset()
     success = agent.run(env)
     assert success
diff --git a/tests/gym/envs/test_swe_smith.py b/tests/gym/envs/test_swe_smith.py
index 8c46befc..26b02c9f 100644
--- a/tests/gym/envs/test_swe_smith.py
+++ b/tests/gym/envs/test_swe_smith.py
@@ -1,11 +1,13 @@
 from pathlib import Path
 
 import datasets
+import pyarrow as pa
+import pyarrow.parquet as pq
 import pytest
 
 from debug_gym.agents.solution_agent import AgentSolution
 from debug_gym.gym.entities import Observation
-from debug_gym.gym.envs import SWESmithEnv
+from debug_gym.gym.envs.swe_smith import SWESmithEnv
 from debug_gym.gym.tools.tool import ToolCall
 from debug_gym.gym.tools.toolbox import Toolbox
 
@@ -13,9 +15,14 @@
 @pytest.if_docker_running
 def test_load_dataset(get_swe_smith_env):
     env = get_swe_smith_env()
-    assert env.dataset_id == "SWE-bench/SWE-smith"
+
+    dataset = env.load_dataset()
+    task_name = "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4"
+    assert task_name in dataset
+
     # check if the dataset contains features that SWESmithEnv expects
-    assert sorted(env.ds.features.keys()) == sorted(
+    task_data = next(iter(dataset.values()))
+    assert sorted(task_data.keys()) == sorted(
         [
             "instance_id",
             "repo",
@@ -32,8 +39,9 @@ def test_load_dataset(get_swe_smith_env):
 
 def test_load_dataset_from_parquet(tmp_path):
     """Test that loading from a local Parquet file works correctly."""
+
     # Create a sample parquet file with the required features
-    sample_data = {
+    data = {
         "instance_id": ["test-instance-1", "test-instance-2"],
         "repo": ["test/repo1", "test/repo2"],
         "patch": ["diff --git a/file.py", "diff --git b/file2.py"],
@@ -44,18 +52,17 @@ def test_load_dataset_from_parquet(tmp_path):
         "base_commit": ["abc123", "def456"],
         "problem_statement": ["Problem 1", "Problem 2"],
     }
+    parquet_file = tmp_path / "test_dataset.parquet"
 
-    # Create a dataset and save as parquet
-    ds = datasets.Dataset.from_dict(sample_data)
-    parquet_path = tmp_path / "test_dataset.parquet"
-    ds.to_parquet(str(parquet_path))
+    table = pa.table(data)
+    pq.write_table(table, str(parquet_file))
 
-    # Test that the parquet file can be loaded using datasets library
-    # mimicking what SWESmithEnv.load_dataset() does for parquet files
-    loaded_ds = datasets.load_dataset("parquet", data_files=str(parquet_path))["train"]
+    # Load the dataset from the Parquet file
+    dataset = SWESmithEnv.load_dataset(dataset_id=str(parquet_file), split="train")
+    dataset_entry = next(iter(dataset.values()))
 
     # Verify that the dataset was loaded correctly with expected features
-    assert sorted(loaded_ds.features.keys()) == sorted(
+    assert sorted(dataset_entry.keys()) == sorted(
         [
             "instance_id",
             "repo",
@@ -69,25 +76,20 @@ def test_load_dataset_from_parquet(tmp_path):
         ]
     )
     # Verify that the data is accessible
-    assert len(loaded_ds) == 2
-    assert loaded_ds[0]["instance_id"] == "test-instance-1"
-    assert loaded_ds[1]["instance_id"] == "test-instance-2"
+    assert len(dataset) == 2
+    assert sorted(dataset.keys()) == ["test-instance-1", "test-instance-2"]
 
 
-@pytest.if_docker_running
 def test_instructions(get_swe_smith_env):
     env = get_swe_smith_env()
-    env.ds_row = {"problem_statement": "Test problem statement"}
-    expected_instructions = "Test problem statement"
-    assert env.instructions == expected_instructions
+    assert env.instructions == env.task_data["problem_statement"]
 
 
-@pytest.if_docker_running
 def test_setup_task(get_swe_smith_env):
     env = get_swe_smith_env()
     task_name = "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4"
-    env.setup_task(task_name)
     assert env.task_name == task_name
+    env.setup_task()
     assert env.repo == "john-kurkowski/tldextract"
     assert env.branch_name == task_name
     assert env.package_name == "tldextract"
@@ -97,7 +99,7 @@ def test_setup_task(get_swe_smith_env):
 def test_setup_terminal(get_swe_smith_env):
     env = get_swe_smith_env()
     task_name = "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4"
-    env.reset(options={"task_name": task_name})
+    env.reset()
     _, git_logs = env.terminal.run("git log -n 4")
     # For SWE-Smith the base commit is found in the branch associated to the
     # instance id and is different from the one in the main branch.
@@ -112,11 +114,7 @@ def test_setup_terminal(get_swe_smith_env):
 def test_reset_and_step(get_swe_smith_env):
     env = get_swe_smith_env()
     env.add_tool(Toolbox.get_tool("eval"))
-    env_info = env.reset(
-        options={
-            "task_name": "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4"
-        }
-    )
+    env_info = env.reset()
 
     assert env.instructions == env_info.step_observation.observation
     assert "short test summary info" in env_info.eval_observation.observation
@@ -156,11 +154,7 @@ def test_reset_and_step(get_swe_smith_env):
 @pytest.if_docker_running
 def test_readonly_file(get_swe_smith_env):
     env = get_swe_smith_env()
-    env_info = env.reset(
-        options={
-            "task_name": "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4"
-        }
-    )
+    env_info = env.reset()
 
     env.add_tool(Toolbox.get_tool("view"))
     env.add_tool(Toolbox.get_tool("listdir"))
@@ -199,11 +193,7 @@ def test_readonly_file(get_swe_smith_env):
 def test_apply_gold_patch(get_swe_smith_env):
     env = get_swe_smith_env()
     env.add_tool(Toolbox.get_tool("eval"))
-    env_info = env.reset(
-        options={
-            "task_name": "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4"
-        }
-    )
+    env_info = env.reset()
 
     assert not env_info.terminated
     assert not env_info.resolved
@@ -220,8 +210,7 @@ def test_calculate_score_with_pytest_error(get_swe_smith_env):
     """Test that the indentation error in pytest is handled correctly."""
     env = get_swe_smith_env()
     env.add_tool(Toolbox.get_tool("eval"))
-    task_name = "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4"
-    env.reset(options={"task_name": task_name})
+    env.reset()
 
     # Modify 'tldextract/tldextract.py' in the working_dir to introduce an indentation error.
     content = env.workspace.read_file("tldextract/tldextract.py").split("\n")
@@ -253,19 +242,17 @@ def test_calculate_score_with_pytest_error(get_swe_smith_env):
 def test_running_solution_agent(get_swe_smith_env, tmp_path):
     """Analogous to SWE Bench solution agent test: run SolutionAgent end-to-end and assert success."""
     env = get_swe_smith_env()
-    task_name = "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4"
     config = {
         "output_path": str(tmp_path),
         "random_seed": 0,
         "memory_size": 8,
         "max_steps": 1,
         "max_rewrite_steps": 1,
-        "env_kwargs": {},
     }
     for tool_name in ["pdb", "eval", "submit"]:
         env.add_tool(Toolbox.get_tool(tool_name))
     agent = AgentSolution(agent_args=config, llm=None, logger=env.logger)
-    env.reset(options={"task_name": task_name})
+    env.reset()
     success = agent.run(env)
     assert success
 
@@ -274,11 +261,7 @@ def test_running_solution_agent(get_swe_smith_env, tmp_path):
 def test_debug_entrypoint_contains_pdb(get_swe_smith_env):
     """Ensure the environment's debug_entrypoint includes '-m pdb' for interactive debugging."""
     env = get_swe_smith_env()
-    env.reset(
-        options={
-            "task_name": "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4"
-        }
-    )
+    env.reset()
     assert (
         "python -m pdb" in env.debug_entrypoint
     ), f"Expected '-m pdb' in debug_entrypoint, got: {env.debug_entrypoint}"

From e6fcd586c464e826ea52dc76b7014e51786ee810 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= <marc.cote.19@gmail.com>
Date: Fri, 28 Nov 2025 13:11:17 -0800
Subject: [PATCH 24/31] Adding back swesmith

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index a4cfd455..b81e53e8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,6 +10,7 @@ transformers==4.51.3
 tiktoken
 docker==7.1.0
 swebench==4.0.3
+swesmith==0.0.4
 prompt_toolkit==3.0.51
 anthropic==0.51.0
 jinja2==3.1.6

From c80e6d87a50d069a8246b7cb5446fb2b28789582 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= <marc.cote.19@gmail.com>
Date: Mon, 1 Dec 2025 08:43:51 -0800
Subject: [PATCH 25/31] Fixing tests.

---
 debug_gym/gym/envs/aider.py           | 22 +++++++++----
 debug_gym/gym/envs/local.py           |  7 +++-
 debug_gym/gym/envs/mini_nightmare.py  | 21 ++++++++----
 tests/gym/envs/test_aider.py          | 12 ++++---
 tests/gym/envs/test_env.py            | 46 +++++++++++++--------------
 tests/gym/envs/test_mini_nightmare.py | 18 ++++++-----
 tests/gym/test_utils.py               |  8 ++---
 tests/gym/tools/test_bash.py          |  4 +--
 tests/gym/tools/test_eval.py          |  4 +--
 tests/gym/tools/test_grep.py          | 12 +++----
 tests/gym/tools/test_listdir.py       |  4 +--
 tests/gym/tools/test_pdb.py           | 37 ++++++++-------------
 tests/gym/tools/test_rewrite.py       |  4 +--
 tests/gym/tools/test_tool.py          | 29 +++++++++--------
 tests/gym/tools/test_view.py          |  4 +--
 15 files changed, 124 insertions(+), 108 deletions(-)

diff --git a/debug_gym/gym/envs/aider.py b/debug_gym/gym/envs/aider.py
index 98421e57..d80927ec 100644
--- a/debug_gym/gym/envs/aider.py
+++ b/debug_gym/gym/envs/aider.py
@@ -1,23 +1,27 @@
+import logging
 import os
 import subprocess
 import tempfile
 from pathlib import Path
-from typing import List
 
 import debug_gym.gym.utils as utils
 from debug_gym.constants import DEBUG_GYM_CACHE_DIR
 from debug_gym.gym.entities import EvalOutput
 from debug_gym.gym.envs.env import RepoEnv
+from debug_gym.gym.envs.local import LocalEnv
 from debug_gym.gym.terminals.docker import DockerTerminal
 from debug_gym.gym.terminals.terminal import Terminal
+from debug_gym.logger import DebugGymLogger
 
 DOCKER_AIDER_IMAGE_NAME = "debug-gym:aider"
 
 
-def build_docker_image(logger):
+def build_docker_image(logger: logging.Logger | None = None):
     """
     Build a Docker image for the Mini Nightmare environment.
     """
+    logger = logger or DebugGymLogger("debug-gym")
+
     # Check if Docker image is built.
     import docker
 
@@ -75,8 +79,13 @@ def __init__(
         if hasattr(terminal, "base_image") and terminal.base_image is None:
             terminal.base_image = DOCKER_AIDER_IMAGE_NAME
 
-        self.task_data = task_data
-        super().__init__(entrypoint=entrypoint, terminal=terminal, **kwargs)
+        super().__init__(
+            task_data=task_data, entrypoint=entrypoint, terminal=terminal, **kwargs
+        )
+
+    @property
+    def task_name(self) -> str:
+        return self.current_task["task_name"]
 
     @property
     def instructions(self) -> str:
@@ -95,7 +104,7 @@ def eval(self, **kwargs) -> EvalOutput:
         return self.last_eval
 
     def setup_task(self):
-        pass
+        self.current_task = self.task_data
 
     def setup_workspace(self):
         self.workspace.reset()
@@ -127,7 +136,7 @@ def setup_terminal(self):
     def load_dataset(
         cls,
         problems: str | list[str] | None = None,
-        build_image: bool = False,
+        build_image: bool = True,
         logger: object = None,
     ) -> dict:
         if build_image:
@@ -167,6 +176,7 @@ def load_dataset(
             )
 
             dataset[task_name] = {
+                "task_name": task_name,
                 "codebase": directory,
                 "instructions": instructions,
                 "filename": task_name + ".py",
diff --git a/debug_gym/gym/envs/local.py b/debug_gym/gym/envs/local.py
index c3b8d54e..e2134014 100644
--- a/debug_gym/gym/envs/local.py
+++ b/debug_gym/gym/envs/local.py
@@ -1,4 +1,6 @@
 from debug_gym.gym.envs.env import RepoEnv
+from debug_gym.gym.terminals.local import LocalTerminal
+from debug_gym.gym.terminals.terminal import Terminal
 
 
 class LocalEnv(RepoEnv):
@@ -6,20 +8,23 @@ class LocalEnv(RepoEnv):
     def __init__(
         self,
         path: str,
+        terminal: Terminal | None = None,
         entrypoint: str = "python -m pytest -sq .",
         debug_entrypoint: str | None = None,
         **kwargs,
     ):
         task_data = {"path": path}
+        terminal = terminal or LocalTerminal()
         super().__init__(
             task_data=task_data,
+            terminal=terminal,
             entrypoint=entrypoint,
             debug_entrypoint=debug_entrypoint,
             **kwargs,
         )
 
     @property
-    def instruction(self) -> str:
+    def instructions(self) -> str:
         return f"Debug the local codebase at {self.path}. Investigate the repository, figure out the root cause, then rewrite the code to fix the issue."
 
     @property
diff --git a/debug_gym/gym/envs/mini_nightmare.py b/debug_gym/gym/envs/mini_nightmare.py
index 2c59213b..83c128be 100644
--- a/debug_gym/gym/envs/mini_nightmare.py
+++ b/debug_gym/gym/envs/mini_nightmare.py
@@ -1,3 +1,4 @@
+import logging
 import tempfile
 from pathlib import Path
 
@@ -7,14 +8,16 @@
 from debug_gym.gym.envs.env import RepoEnv
 from debug_gym.gym.terminals.docker import DockerTerminal
 from debug_gym.gym.terminals.terminal import Terminal
+from debug_gym.logger import DebugGymLogger
 
 DOCKER_MINI_NIGHTMARE_IMAGE_NAME = "debug-gym:mini-nightmare"
 
 
-def build_docker_image(logger):
+def build_docker_image(logger: logging.Logger | None = None):
     """
     Build a Docker image for the Mini Nightmare environment.
     """
+    logger = logger or DebugGymLogger("debug-gym")
     # Check if Docker image is built.
     import docker
 
@@ -86,10 +89,9 @@ def __init__(
         if hasattr(terminal, "base_image") and terminal.base_image is None:
             terminal.base_image = DOCKER_MINI_NIGHTMARE_IMAGE_NAME
 
-        self.task_data = task_data
-        self.task_name = task_data["task_name"]
-
-        super().__init__(entrypoint=entrypoint, terminal=terminal, **kwargs)
+        super().__init__(
+            task_data=task_data, entrypoint=entrypoint, terminal=terminal, **kwargs
+        )
 
     @property
     def instructions(self) -> str:
@@ -99,6 +101,10 @@ def instructions(self) -> str:
             " Beaware that the bug may not be in the code you initially see."
         )
 
+    @property
+    def task_name(self) -> str:
+        return self.current_task["task_name"]
+
     def calculate_max_score(self, eval_output: EvalOutput) -> int:
         return utils.extract_max_score_from_pytest_output(eval_output.output)
 
@@ -112,7 +118,7 @@ def eval(self, **kwargs) -> EvalOutput:
         return self.last_eval
 
     def setup_task(self):
-        pass
+        self.current_task = self.task_data
 
     def setup_workspace(self):
         self.workspace.reset()
@@ -144,7 +150,7 @@ def setup_terminal(self):
     def load_dataset(
         cls,
         problems: str | list[str] | None = None,
-        build_image: bool = False,
+        build_image: bool = True,
         logger: object = None,
     ) -> dict:
         if build_image:
@@ -167,6 +173,7 @@ def load_dataset(
             assert (task_path / ".debugreadonly").exists()
 
             dataset[task_name] = {
+                "task_name": task_name,
                 "codebase": task_path,
                 "filename": task_name + "_code.py",
             }
diff --git a/tests/gym/envs/test_aider.py b/tests/gym/envs/test_aider.py
index 8786e291..ed2a2ac6 100644
--- a/tests/gym/envs/test_aider.py
+++ b/tests/gym/envs/test_aider.py
@@ -37,8 +37,10 @@ def setup_aider_repo(tmp_path_factory):
 @pytest.fixture
 def env(setup_aider_repo):
     terminal = LocalTerminal()
-    env = AiderBenchmarkEnv(terminal=terminal)
-    env.reset(options={"task_name": "clock"})
+    dataset = AiderBenchmarkEnv.load_dataset()
+    task_data = dataset["clock"]
+    env = AiderBenchmarkEnv(task_data=task_data, terminal=terminal)
+    env.reset()
     return env
 
 
@@ -103,13 +105,15 @@ def test_instructions(env):
 
 @patch("debug_gym.gym.envs.aider.build_docker_image")
 def test_build_docker_image(mock_build_docker_image):
-    AiderBenchmarkEnv()
+    dataset = AiderBenchmarkEnv.load_dataset()
     mock_build_docker_image.assert_called_once()
 
 
 @pytest.if_docker_running
 def test_reset_with_docker_terminal(setup_aider_repo):
-    env = AiderBenchmarkEnv()
+    dataset = AiderBenchmarkEnv.load_dataset()
+    task_data = dataset["clock"]
+    env = AiderBenchmarkEnv(task_data=task_data)
     env.add_tool(Toolbox.get_tool("eval"))
     assert isinstance(env.terminal, DockerTerminal)
 
diff --git a/tests/gym/envs/test_env.py b/tests/gym/envs/test_env.py
index 1a77a1ed..6a036893 100644
--- a/tests/gym/envs/test_env.py
+++ b/tests/gym/envs/test_env.py
@@ -6,13 +6,14 @@
 
 from debug_gym.gym.entities import EvalOutput, Event, Observation
 from debug_gym.gym.envs.env import EnvInfo, EventHooks, RepoEnv, TooledEnv
+from debug_gym.gym.envs.local import LocalEnv
 from debug_gym.gym.tools.tool import ToolCall
 from debug_gym.gym.tools.toolbox import Toolbox
 
 
 @pytest.fixture
-def env_mock():
-    env = RepoEnv()
+def env_mock(tmp_path):
+    env = LocalEnv(path=tmp_path)
     return env
 
 
@@ -109,7 +110,7 @@ def test_tool_names(env_mock):
     assert env_mock.tool_names == "tool1, tool2"
 
 
-def test_env_tools():
+def test_env_tools(env_mock):
     tool1 = MagicMock()
     tool1.name = "tool1"
     tool1.description = "instructions1"
@@ -129,11 +130,10 @@ def test_env_tools():
         },
     }
 
-    env = RepoEnv()
-    env.add_tool(tool1)
-    env.add_tool(tool2)
+    env_mock.add_tool(tool1)
+    env_mock.add_tool(tool2)
 
-    assert env.tools == [tool1, tool2]
+    assert env_mock.tools == [tool1, tool2]
 
 
 @pytest.fixture
@@ -147,7 +147,7 @@ def env(tmp_path):
     (repo_path / "file2.txt").touch()
     (subdir_path / "subfile1.txt").touch()
 
-    env = RepoEnv(path=repo_path)
+    env = LocalEnv(path=repo_path)
     return env
 
 
@@ -186,7 +186,7 @@ def test_step(
     mock_pdb_tool.current_frame_file = "file.py"
     mock_get_tool.return_value = None
 
-    env = RepoEnv(path=tmp_path)
+    env = LocalEnv(path=tmp_path)
     env.reset()
     env.last_eval = EvalOutput(success=False, output="1 failed, 0 passed")
     tool_call = ToolCall(id="123", name="pdb", arguments={"command": "b 10"})
@@ -210,7 +210,7 @@ def test_reset(tmp_path):
     (tmp_path / "test.py").write_text("def test_1():\n  assert False\n")
     (tmp_path / ".debugignore").write_text("__pycache__/\n.git/\n.pytest_cache/\n")
 
-    env = RepoEnv(path=tmp_path, entrypoint="pytest test.py")
+    env = LocalEnv(path=tmp_path, entrypoint="pytest test.py")
     infos = env.reset()
 
     assert env.last_eval is None
@@ -224,7 +224,7 @@ def test_reset(tmp_path):
         action_reasoning=None,
         action_content=None,
         action_tool_call=None,
-        instructions="",
+        instructions=env.instructions,
         score=0,
         max_score=None,
         terminated=False,
@@ -276,7 +276,7 @@ def test_eval(tmp_path):
     (tmp_path / "test.py").write_text("def test_1():\n  assert False\n")
     (tmp_path / ".debugignore").write_text("__pycache__/\n.git/\n.pytest_cache/\n")
 
-    env = RepoEnv(path=tmp_path, entrypoint="pytest test.py")
+    env = LocalEnv(path=tmp_path, entrypoint="pytest test.py")
     env.reset()
     env.eval()
     assert "FAILED test.py::test_1 - assert False" in env.last_eval.output
@@ -287,7 +287,7 @@ def test_eval_success(tmp_path):
     # create a dummy file
     with open(tmp_path / "file.py", "w") as f:
         f.write("print('Hello, World!')")
-    env = RepoEnv(path=working_dir, entrypoint="python file.py")
+    env = LocalEnv(path=working_dir, entrypoint="python file.py")
     env.reset()
     output = env.eval()
     assert output == EvalOutput(success=True, output="Hello, World!")
@@ -298,7 +298,7 @@ def test_eval_timeout(tmp_path):
     # runs for longer than the timeout
     with open(tmp_path / "file.py", "w") as f:
         f.write("import time; time.sleep(5)")
-    env = RepoEnv(path=working_dir, entrypoint="python file.py", run_timeout=1)
+    env = LocalEnv(path=working_dir, entrypoint="python file.py", run_timeout=1)
     env.reset()
     output = env.eval()
     assert output == EvalOutput(success=False, output="Timeout expired.")
@@ -371,22 +371,20 @@ def test_event_hooks_notify():
     subscriber.on_env_start.assert_called_once()
 
 
-def test_current_breakpoints_no_breakpoints():
-    env = RepoEnv()
-    env.current_breakpoints_state = {}
-    result = env.current_breakpoints()
+def test_current_breakpoints_no_breakpoints(env_mock):
+    env_mock.current_breakpoints_state = {}
+    result = env_mock.current_breakpoints()
     assert result == "No breakpoints are set."
 
 
-def test_current_breakpoints_with_breakpoints(tmp_path):
-    env = RepoEnv()
-    env.current_breakpoints_state = {
+def test_current_breakpoints_with_breakpoints(tmp_path, env_mock):
+    env_mock.current_breakpoints_state = {
         "file1.py|||10": "b file1.py:10",
         "file1.py|||20": "b file1.py:20",
         "file1.py|||30": "b file1.py:30",
         "file2.py|||15": "b file2.py:15",
     }
-    result = env.current_breakpoints()
+    result = env_mock.current_breakpoints()
     expected_result = (
         "line 10 in file1.py\n"
         "line 20 in file1.py\n"
@@ -424,7 +422,7 @@ def test_queue_and_process_events():
 
 
 def test_has_breakpoint_true_and_false(tmp_path):
-    env = RepoEnv(path=tmp_path)
+    env = LocalEnv(path=tmp_path)
     env.reset()
     file_path = env.working_dir / "test.py"
     file_path.write_text("print('hello')")
@@ -438,7 +436,7 @@ def test_has_breakpoint_true_and_false(tmp_path):
 
 
 def test_has_breakpoint_relative_path(tmp_path):
-    env = RepoEnv(path=tmp_path)
+    env = LocalEnv(path=tmp_path)
     env.reset()
     file_path = env.working_dir / "foo.py"
     file_path.write_text("print('foo')")
diff --git a/tests/gym/envs/test_mini_nightmare.py b/tests/gym/envs/test_mini_nightmare.py
index eee46ee4..0a8590c5 100644
--- a/tests/gym/envs/test_mini_nightmare.py
+++ b/tests/gym/envs/test_mini_nightmare.py
@@ -12,23 +12,23 @@
 def mini_nightmare_env():
     # Initialize the MiniNightmareEnv with LocalTerminal
     terminal = LocalTerminal()
-    env = MiniNightmareEnv(terminal=terminal)
+    dataset = MiniNightmareEnv.load_dataset()
+    task_data = dataset["config"]
+    env = MiniNightmareEnv(task_data=task_data, terminal=terminal)
     env.add_tool(Toolbox.get_tool("eval"))
     return env
 
 
 def test_load_dataset(mini_nightmare_env):
-    dataset = mini_nightmare_env.load_dataset()
-    assert mini_nightmare_env.dataset == dataset
-
+    dataset = MiniNightmareEnv.load_dataset()
     subproblems = list(dataset.keys())[::2]
-    subset = mini_nightmare_env.load_dataset(problems=subproblems)
+    subset = MiniNightmareEnv.load_dataset(problems=subproblems)
     assert list(subset.keys()) == subproblems
 
 
 @patch("debug_gym.gym.envs.mini_nightmare.build_docker_image")
 def test_build_docker_image(mock_build_docker_image):
-    MiniNightmareEnv()
+    dataset = MiniNightmareEnv.load_dataset()
     mock_build_docker_image.assert_called_once()
 
 
@@ -53,11 +53,13 @@ def test_reset(mini_nightmare_env):
 
 @pytest.if_docker_running
 def test_reset_with_docker_terminal():
-    env = MiniNightmareEnv()
+    dataset = MiniNightmareEnv.load_dataset()
+    task_data = dataset["config"]
+    env = MiniNightmareEnv(task_data=task_data)
     env.add_tool(Toolbox.get_tool("eval"))
     assert isinstance(env.terminal, DockerTerminal)
 
-    infos = env.reset(options={"task_name": "config"})
+    infos = env.reset()
     assert env.instructions == infos.step_observation.observation
     assert "2 failed" in infos.eval_observation.observation
     assert infos.max_score == 2
diff --git a/tests/gym/test_utils.py b/tests/gym/test_utils.py
index 6a51b583..47f50335 100644
--- a/tests/gym/test_utils.py
+++ b/tests/gym/test_utils.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from debug_gym.gym.envs.env import RepoEnv
+from debug_gym.gym.envs.local import LocalEnv
 from debug_gym.gym.utils import (
     _walk,
     cleanup_pytest_output,
@@ -45,7 +45,7 @@ def test_show_line_number_no_code_path_no_breakpoints():
 
 
 def test_show_line_number_with_code_path(tmp_path):
-    env = RepoEnv(path=tmp_path)
+    env = LocalEnv(path=tmp_path)
     env.reset()
     code_path = f"{env.working_dir}/code.py"
     breakpoints_state = {f"{code_path}|||2": "b 2"}
@@ -65,7 +65,7 @@ def test_show_line_number_with_code_path(tmp_path):
 
 
 def test_show_line_number_multiple_breakpoints(tmp_path):
-    env = RepoEnv(path=tmp_path)
+    env = LocalEnv(path=tmp_path)
     env.reset()
     code_path = f"{env.working_dir}/code.py"
     breakpoints_state = {
@@ -92,7 +92,7 @@ def test_show_line_number_multiple_breakpoints(tmp_path):
 
 
 def test_show_line_number_multiple_breakpoints_with_start_index(tmp_path):
-    env = RepoEnv(path=tmp_path)
+    env = LocalEnv(path=tmp_path)
     env.reset()
     code_path = f"{env.working_dir}/code.py"
     breakpoints_state = {
diff --git a/tests/gym/tools/test_bash.py b/tests/gym/tools/test_bash.py
index 5e7d860e..5644066a 100644
--- a/tests/gym/tools/test_bash.py
+++ b/tests/gym/tools/test_bash.py
@@ -4,7 +4,7 @@
 import pytest
 
 from debug_gym.gym.entities import Observation
-from debug_gym.gym.envs.env import RepoEnv
+from debug_gym.gym.envs.local import LocalEnv
 from debug_gym.gym.tools.bash import BashTool
 from debug_gym.gym.tools.tool import ToolCall
 from debug_gym.gym.tools.toolbox import Toolbox
@@ -30,7 +30,7 @@ def env(tmp_path):
     with open(subdir / "nested.txt", "w") as f:
         f.write("nested file content")
 
-    env = RepoEnv(path=repo_path)
+    env = LocalEnv(path=repo_path)
     bash_tool = Toolbox.get_tool("bash")
     env.add_tool(bash_tool)
     env.reset()
diff --git a/tests/gym/tools/test_eval.py b/tests/gym/tools/test_eval.py
index 7279de81..4bae1026 100644
--- a/tests/gym/tools/test_eval.py
+++ b/tests/gym/tools/test_eval.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from debug_gym.gym.envs.env import RepoEnv
+from debug_gym.gym.envs.local import LocalEnv
 from debug_gym.gym.tools.tool import ToolCall
 from debug_gym.gym.tools.toolbox import Toolbox
 
@@ -15,7 +15,7 @@ def env(tmp_path):
     with open(repo_path / "test_1.py", "w") as f:
         f.write("def test_1():\n  assert False\n")
 
-    env = RepoEnv(path=repo_path)
+    env = LocalEnv(path=repo_path)
     env.reset()
     return env
 
diff --git a/tests/gym/tools/test_grep.py b/tests/gym/tools/test_grep.py
index 9d3e7b4e..b594bd6f 100644
--- a/tests/gym/tools/test_grep.py
+++ b/tests/gym/tools/test_grep.py
@@ -1,10 +1,6 @@
-import os
-import tempfile
-from pathlib import Path
-
 import pytest
 
-from debug_gym.gym.envs.env import RepoEnv
+from debug_gym.gym.envs.local import LocalEnv
 from debug_gym.gym.tools.grep import GrepTool
 
 
@@ -35,7 +31,7 @@ def hello_world():
 class TestClass:
     def __init__(self):
         self.value = 42
-        
+
     def method_with_bug(self):
         # TODO: Fix this bug
         return self.value / 0  # This will cause a division by zero error
@@ -62,7 +58,7 @@ def load_config(filename):
 class EmailValidator:
     def __init__(self):
         self.pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
-    
+
     def validate(self, email):
         return re.match(self.pattern, email) is not None
 """
@@ -209,7 +205,7 @@ def _setup_grep_repo_env(base_dir, ignore_patterns=None, readonly_patterns=None)
             with (test_repo / ".debugreadonly").open("w") as f:
                 f.write("\n".join(readonly_patterns))
 
-        env = RepoEnv(path=str(test_repo))
+        env = LocalEnv(path=str(test_repo))
         grep_tool = GrepTool()
         env.reset()
         return grep_tool, env
diff --git a/tests/gym/tools/test_listdir.py b/tests/gym/tools/test_listdir.py
index 4198266a..c405ae05 100644
--- a/tests/gym/tools/test_listdir.py
+++ b/tests/gym/tools/test_listdir.py
@@ -1,6 +1,6 @@
 import pytest
 
-from debug_gym.gym.envs.env import RepoEnv
+from debug_gym.gym.envs.local import LocalEnv
 from debug_gym.gym.tools.listdir import ListdirTool
 
 
@@ -8,7 +8,7 @@
 def setup_listdir_repo_env(setup_test_repo):
     def _setup_listdir_repo_env(base_dir):
         test_repo = setup_test_repo(base_dir)
-        env = RepoEnv(path=str(test_repo))
+        env = LocalEnv(path=str(test_repo))
         listdir_tool = ListdirTool()
         listdir_tool.register(env)
         env.reset()
diff --git a/tests/gym/tools/test_pdb.py b/tests/gym/tools/test_pdb.py
index 23232ce9..0b6caf13 100644
--- a/tests/gym/tools/test_pdb.py
+++ b/tests/gym/tools/test_pdb.py
@@ -7,10 +7,8 @@
 import pytest
 
 from debug_gym.gym.entities import Event
-from debug_gym.gym.envs.env import RepoEnv
+from debug_gym.gym.envs.local import LocalEnv
 from debug_gym.gym.terminals.docker import DockerTerminal
-from debug_gym.gym.terminals.local import LocalTerminal
-from debug_gym.gym.terminals.shell_session import ProcessNotRunningError
 from debug_gym.gym.tools.pdb import PDBTool
 
 
@@ -60,7 +58,7 @@ def _breakpoints_state(working_dir):
 def setup_pdb_repo_env(setup_test_repo, setup_breakpoints_state):
     def _setup_pdb_repo_env(base_dir):
         test_repo = setup_test_repo(base_dir)
-        env = RepoEnv(path=str(test_repo))
+        env = LocalEnv(path=str(test_repo))
         pdb_tool = PDBTool(persistent_breakpoints=True, auto_list=True)
         pdb_tool.register(env)
         env.reset()
@@ -75,10 +73,8 @@ def _setup_pdb_repo_env(base_dir):
 def test_pdb_use(tmp_path, setup_test_repo):
     # Test PDBTool with LocalTerminal, verbose pytest
     tests_path = str(setup_test_repo(tmp_path))
-    terminal = LocalTerminal()
-    env = RepoEnv(
+    env = LocalEnv(
         path=tests_path,
-        terminal=terminal,
         debug_entrypoint="python -m pdb -m pytest -sv .",
     )
     env.reset()
@@ -103,10 +99,8 @@ def test_pdb_use(tmp_path, setup_test_repo):
 def test_pdb_use_empty_command(tmp_path, setup_test_repo):
     # Test PDBTool with LocalTerminal, verbose pytest
     tests_path = str(setup_test_repo(tmp_path))
-    terminal = LocalTerminal()
-    env = RepoEnv(
+    env = LocalEnv(
         path=tests_path,
-        terminal=terminal,
         debug_entrypoint="python -m pdb -m pytest -sv .",
     )
     env.reset()
@@ -120,10 +114,8 @@ def test_pdb_use_empty_command(tmp_path, setup_test_repo):
 def test_pdb_b_fail_blank_or_comment(tmp_path, setup_test_repo):
     # Test PDBTool with LocalTerminal, verbose pytest
     tests_path = str(setup_test_repo(tmp_path))
-    terminal = LocalTerminal()
-    env = RepoEnv(
+    env = LocalEnv(
         path=tests_path,
-        terminal=terminal,
         debug_entrypoint="python -m pdb -m pytest -sv .",
     )
     env.reset()
@@ -141,10 +133,8 @@ def test_pdb_b_fail_blank_or_comment(tmp_path, setup_test_repo):
 def test_pdb_pass_empty_path_if_in_session(tmp_path, setup_test_repo):
     # Test PDBTool with LocalTerminal, verbose pytest
     tests_path = str(setup_test_repo(tmp_path))
-    terminal = LocalTerminal()
-    env = RepoEnv(
+    env = LocalEnv(
         path=tests_path,
-        terminal=terminal,
         debug_entrypoint="python -m pdb -m pytest -sv .",
     )
     env.reset()
@@ -164,8 +154,7 @@ def test_pdb_pass_empty_path_if_in_session(tmp_path, setup_test_repo):
 def test_pdb_use_default_env_entrypoint(tmp_path, setup_test_repo):
     # Test PDBTool with default env entrypoint, quiet pytest
     tests_path = str(setup_test_repo(tmp_path))
-    terminal = LocalTerminal()
-    env = RepoEnv(path=tests_path, terminal=terminal)
+    env = LocalEnv(path=tests_path)
     env.reset()
     pdb = PDBTool()
     initial_output = pdb.start_pdb(env)  # "python -m pdb -m pytest -sq ."
@@ -202,7 +191,9 @@ def test_pdb_use_docker_terminal(tmp_path, setup_test_repo):
     )
     # no:cacheprovider to avoid .pytest_cache, --tb=short to reduce output
     debug_entrypoint = "python -m pdb -m pytest -p no:cacheprovider --color=no -sv ."
-    env = RepoEnv(path=tests_path, terminal=terminal, debug_entrypoint=debug_entrypoint)
+    env = LocalEnv(
+        path=tests_path, terminal=terminal, debug_entrypoint=debug_entrypoint
+    )
     env.reset()
     pdb = PDBTool()
     pdb.start_pdb(env)
@@ -228,8 +219,8 @@ def test_initialization():
     assert pdb_tool._session is None
 
 
-def test_register():
-    env = RepoEnv()
+def test_register(tmp_path):
+    env = LocalEnv(path=tmp_path)
     pdb_tool = PDBTool()
     pdb_tool.register(env)
     # every tool listen to ENV_RESET event to track history
@@ -369,7 +360,7 @@ def test_pdb_crashing(tmp_path, setup_test_repo):
     with open(tests_path / "test_fail.py", "w") as f:
         f.write("def test_fail():\nassert False")  # IndentationError
 
-    env = RepoEnv(
+    env = LocalEnv(
         path=tests_path,
         entrypoint="python -m pytest -s test.py",
         debug_entrypoint="python -m pdb -m pytest -s test_fail.py",
@@ -390,7 +381,7 @@ def test_pdb_timeout(tmp_path, setup_test_repo):
             "def test_fail():\n  print('Sleeping...'); import time; time.sleep(10)"
         )  # IndentationError
 
-    env = RepoEnv(
+    env = LocalEnv(
         path=tests_path,
         entrypoint="python -m pytest -s test.py",
         debug_entrypoint="python -m pdb -m pytest -sv test_fail.py",
diff --git a/tests/gym/tools/test_rewrite.py b/tests/gym/tools/test_rewrite.py
index e8ad0772..003f31e6 100644
--- a/tests/gym/tools/test_rewrite.py
+++ b/tests/gym/tools/test_rewrite.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from debug_gym.gym.envs.env import RepoEnv
+from debug_gym.gym.envs.local import LocalEnv
 from debug_gym.gym.tools.rewrite import RewriteTool
 
 
@@ -23,7 +23,7 @@ def env(tmp_path):
     with open(repo_path / "test.py", "w") as f:
         f.write(file_content)
 
-    env = RepoEnv(path=repo_path)
+    env = LocalEnv(path=repo_path)
 
     rewrite_tool = RewriteTool()
     env.add_tool(rewrite_tool)
diff --git a/tests/gym/tools/test_tool.py b/tests/gym/tools/test_tool.py
index 010526cd..a724befe 100644
--- a/tests/gym/tools/test_tool.py
+++ b/tests/gym/tools/test_tool.py
@@ -1,7 +1,10 @@
+from pathlib import Path
+
 import pytest
 
 from debug_gym.gym.entities import Observation
-from debug_gym.gym.envs.env import Event, RepoEnv
+from debug_gym.gym.envs.env import Event
+from debug_gym.gym.envs.local import LocalEnv
 from debug_gym.gym.tools.tool import EnvironmentTool, Record
 from debug_gym.gym.tools.toolbox import Toolbox
 
@@ -13,9 +16,14 @@ def use(self, env, action):
         return Observation("FakeTool", action)
 
 
-def test_register_valid_environment():
+@pytest.fixture
+def env(tmp_path):
+    env = LocalEnv(path=tmp_path)
+    return env
+
+
+def test_register_valid_environment(env):
     tool = FakeTool()
-    env = RepoEnv()
     tool.register(env)
     # every tool listen to ENV_RESET event to track history
     assert tool in env.event_hooks.event_listeners[Event.ENV_RESET]
@@ -46,7 +54,7 @@ class CompletelyFakeTool(EnvironmentTool):
         tool = CompletelyFakeTool()
 
 
-def test_auto_subscribe(monkeypatch):
+def test_auto_subscribe(monkeypatch, env):
 
     @Toolbox.register()
     class ToolWithHandler(FakeTool):
@@ -55,7 +63,6 @@ def on_env_reset(self, **kwargs):
 
     tool = ToolWithHandler()
 
-    env = RepoEnv()
     env.add_tool(tool)
 
     assert tool in env.event_hooks.event_listeners[Event.ENV_RESET]
@@ -65,9 +72,8 @@ def on_env_reset(self, **kwargs):
             assert tool not in env.event_hooks.event_listeners[channel]
 
 
-def test_track_history():
+def test_track_history(env):
     tool = FakeTool()
-    env = RepoEnv()
 
     assert hasattr(tool, "history")
     assert isinstance(tool.history, list)
@@ -90,18 +96,16 @@ def test_track_history():
     )
 
 
-def test_unknown_args():
+def test_unknown_args(env):
     tool = FakeTool()
-    env = RepoEnv()
     obs = tool(env, unknown_arg="unknown_value")
     assert obs == Observation(
         "FakeTool", "FakeTool.use() got an unexpected keyword argument 'unknown_arg'"
     )
 
 
-def test_unregister():
+def test_unregister(env):
     tool = FakeTool()
-    env = RepoEnv()
     tool.register(env)
 
     # Verify tool is registered
@@ -120,7 +124,7 @@ def test_unregister_invalid_environment():
         tool.unregister(object())
 
 
-def test_unregister_with_multiple_handlers():
+def test_unregister_with_multiple_handlers(env):
     class ToolWithMultipleHandlers(FakeTool):
         def on_env_reset(self, environment, **kwargs):
             return "Handler for Event.ENV_RESET"
@@ -129,7 +133,6 @@ def on_env_step(self, environment, **kwargs):
             return "Handler for Event.ENV_STEP"
 
     tool = ToolWithMultipleHandlers()
-    env = RepoEnv()
     tool.register(env)
 
     # Verify tool is registered for both events
diff --git a/tests/gym/tools/test_view.py b/tests/gym/tools/test_view.py
index ec2742bb..5d9f5e10 100644
--- a/tests/gym/tools/test_view.py
+++ b/tests/gym/tools/test_view.py
@@ -3,7 +3,7 @@
 import pytest
 
 from debug_gym.gym.entities import Observation
-from debug_gym.gym.envs.env import RepoEnv
+from debug_gym.gym.envs.local import LocalEnv
 from debug_gym.gym.tools.tool import ToolCall
 from debug_gym.gym.tools.toolbox import Toolbox
 
@@ -29,7 +29,7 @@ def env(tmp_path):
 
     (repo_path / "empty.py").touch()  # Create an empty file
 
-    env = RepoEnv(path=repo_path)
+    env = LocalEnv(path=repo_path)
     view_tool = Toolbox.get_tool("view")
     env.add_tool(view_tool)
     env.reset()

From 7d8268ebc64b116511bc08d24d07df2fecf00544 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= <marc.cote.19@gmail.com>
Date: Mon, 1 Dec 2025 09:53:58 -0800
Subject: [PATCH 26/31] Print disk space after installing library.

---
 .github/actions/test-if-changes/action.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/actions/test-if-changes/action.yml b/.github/actions/test-if-changes/action.yml
index a2c872cf..dcda5618 100644
--- a/.github/actions/test-if-changes/action.yml
+++ b/.github/actions/test-if-changes/action.yml
@@ -39,6 +39,7 @@ runs:
         else
           pip install "debug-gym[dev]==${{ inputs.version }}"
         fi
+        df -h
     - name: Run tests
       env:
         DEBUG_GYM_DEBUG: 1

From 866ac3955166841fbe46014ead3897a754d8cbfd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= <marc.cote.19@gmail.com>
Date: Mon, 1 Dec 2025 11:08:21 -0800
Subject: [PATCH 27/31] When creating fixture env, reset the env in master
 thread first

---
 tests/gym/envs/conftest.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/gym/envs/conftest.py b/tests/gym/envs/conftest.py
index e79af5ba..660cc1f5 100644
--- a/tests/gym/envs/conftest.py
+++ b/tests/gym/envs/conftest.py
@@ -31,13 +31,16 @@ def make_env_factory(env_name, worker_id, tmp_path_factory):
     env_class = kwargs.pop("env_class")
 
     def _make_env():
-        dataset = env_class.load_dataset(problems=kwargs["problems"])
+        dataset = env_class.load_dataset(
+            problems=kwargs["problems"], prepull_images=True
+        )
         task_data = next(iter(dataset.values()))
         return env_class(task_data=task_data)
 
     if worker_id == "master":
         # Not running with pytest-xdist or we are in the master process
-        _make_env()
+        env = _make_env()
+        env.reset()
     else:
         # When running with pytest-xdist, synchronize between workers using a lock
         root_tmp_dir = tmp_path_factory.getbasetemp().parent

From 1f4661a1c56a3ac682075439c933356271c0979a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= <marc.cote.19@gmail.com>
Date: Mon, 1 Dec 2025 11:53:15 -0800
Subject: [PATCH 28/31] Disabling async pytests

---
 .github/actions/test-if-changes/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/test-if-changes/action.yml b/.github/actions/test-if-changes/action.yml
index dcda5618..a7f96449 100644
--- a/.github/actions/test-if-changes/action.yml
+++ b/.github/actions/test-if-changes/action.yml
@@ -45,7 +45,7 @@ runs:
         DEBUG_GYM_DEBUG: 1
       shell: bash
       run: |
-        pytest ${{ inputs.test-files }} -vv -n 16 --timeout=600 --cov=debug_gym --cov-report=term-missing
+        pytest ${{ inputs.test-files }} -vv --timeout=600 --cov=debug_gym --cov-report=term-missing
     - name: Store coverage report
       uses: actions/upload-artifact@v4
       with:

From 424b3dd8d16a2fbba7328845b58878af1829719b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= <marc.cote.19@gmail.com>
Date: Mon, 1 Dec 2025 12:11:47 -0800
Subject: [PATCH 29/31] Reenable async pytests + make sure to provide specific
 problem to load_dataset

---
 .github/actions/test-if-changes/action.yml | 2 +-
 tests/gym/envs/conftest.py                 | 3 +--
 tests/gym/envs/test_r2egym.py              | 2 +-
 tests/gym/envs/test_swe_smith.py           | 2 +-
 4 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/.github/actions/test-if-changes/action.yml b/.github/actions/test-if-changes/action.yml
index a7f96449..dcda5618 100644
--- a/.github/actions/test-if-changes/action.yml
+++ b/.github/actions/test-if-changes/action.yml
@@ -45,7 +45,7 @@ runs:
         DEBUG_GYM_DEBUG: 1
       shell: bash
       run: |
-        pytest ${{ inputs.test-files }} -vv --timeout=600 --cov=debug_gym --cov-report=term-missing
+        pytest ${{ inputs.test-files }} -vv -n 16 --timeout=600 --cov=debug_gym --cov-report=term-missing
     - name: Store coverage report
       uses: actions/upload-artifact@v4
       with:
diff --git a/tests/gym/envs/conftest.py b/tests/gym/envs/conftest.py
index 660cc1f5..db78b192 100644
--- a/tests/gym/envs/conftest.py
+++ b/tests/gym/envs/conftest.py
@@ -39,8 +39,7 @@ def _make_env():
 
     if worker_id == "master":
         # Not running with pytest-xdist or we are in the master process
-        env = _make_env()
-        env.reset()
+        _make_env()
     else:
         # When running with pytest-xdist, synchronize between workers using a lock
         root_tmp_dir = tmp_path_factory.getbasetemp().parent
diff --git a/tests/gym/envs/test_r2egym.py b/tests/gym/envs/test_r2egym.py
index 3aaa663a..dd31c776 100644
--- a/tests/gym/envs/test_r2egym.py
+++ b/tests/gym/envs/test_r2egym.py
@@ -15,8 +15,8 @@
 def test_load_dataset(get_r2egym_env):
     env = get_r2egym_env()
 
-    dataset = env.load_dataset()
     task_name = "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"
+    dataset = env.load_dataset(problems=[task_name])
     assert task_name in dataset
 
     task_data = next(iter(dataset.values()))
diff --git a/tests/gym/envs/test_swe_smith.py b/tests/gym/envs/test_swe_smith.py
index 26b02c9f..65c9e906 100644
--- a/tests/gym/envs/test_swe_smith.py
+++ b/tests/gym/envs/test_swe_smith.py
@@ -16,8 +16,8 @@
 def test_load_dataset(get_swe_smith_env):
     env = get_swe_smith_env()
 
-    dataset = env.load_dataset()
     task_name = "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4"
+    dataset = env.load_dataset(problems=[task_name])
     assert task_name in dataset
 
     # check if the dataset contains features that SWESmithEnv expects

From e0263a225a87b7aef0e721305329cd4bc0212b8a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= <marc.cote.19@gmail.com>
Date: Mon, 1 Dec 2025 12:31:45 -0800
Subject: [PATCH 30/31] Fixing load_dataset

---
 debug_gym/gym/envs/r2egym.py    | 20 ++++++++------------
 debug_gym/gym/envs/swe_smith.py | 18 ++++++++----------
 2 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/debug_gym/gym/envs/r2egym.py b/debug_gym/gym/envs/r2egym.py
index 17cd5ac7..ee9bfa12 100644
--- a/debug_gym/gym/envs/r2egym.py
+++ b/debug_gym/gym/envs/r2egym.py
@@ -262,6 +262,7 @@ def load_dataset(
         prepull_images: bool = False,
         logger: DebugGymLogger | None = None,
     ) -> dict:
+        logger = logger or DebugGymLogger("debug_gym")
         data_path = Path(dataset_id)
         if data_path.is_file():
             # Loading from local file.
@@ -299,10 +300,9 @@ def extract_instance_id(docker_image: str) -> str:
         dataset = {pid: dataset[pid] for pid in problems}
 
         image_names = set(example["docker_image"] for example in dataset.values())
-        if logger is not None:
-            logger.debug(
-                f"Loaded {len(dataset)} tasks across {len(image_names)} Docker images from {dataset_id}."
-            )
+        logger.debug(
+            f"Loaded {len(dataset)} tasks across {len(image_names)} Docker images from {dataset_id}."
+        )
 
         if prepull_images:
             # Download all images needed for R2E-Gym.
@@ -313,14 +313,10 @@ def extract_instance_id(docker_image: str) -> str:
             )
             missing_images = image_names - existing_images
             if missing_images:
-                if logger is not None:
+                logger.warning(f"Found {len(missing_images)} missing Docker images.")
+                for i, image_name in enumerate(missing_images):
                     logger.warning(
-                        f"Found {len(missing_images)} missing Docker images."
+                        f"Pulling Docker image {i + 1}/{len(missing_images)} `{image_name}`."
                     )
-                    for i, image_name in enumerate(missing_images):
-                        if logger is not None:
-                            logger.warning(
-                                f"Pulling Docker image {i + 1}/{len(missing_images)} `{image_name}`."
-                            )
-                        client.images.pull(image_name)
+                    client.images.pull(image_name)
         return dataset
diff --git a/debug_gym/gym/envs/swe_smith.py b/debug_gym/gym/envs/swe_smith.py
index 8511a1a7..fc507032 100644
--- a/debug_gym/gym/envs/swe_smith.py
+++ b/debug_gym/gym/envs/swe_smith.py
@@ -154,6 +154,7 @@ def load_dataset(
         prepull_images: bool = False,
         logger: DebugGymLogger | None = None,
     ) -> dict:
+        logger = logger or DebugGymLogger("debug_gym")
         data_path = Path(dataset_id)
         if data_path.is_file():
             # Loading from local file.
@@ -181,10 +182,9 @@ def load_dataset(
         dataset = {pid: dataset[pid] for pid in problems}
 
         image_names = set([problem["image_name"] for problem in dataset.values()])
-        if logger is not None:
-            logger.debug(
-                f"Loaded {len(dataset)} tasks across {len(image_names)} Docker images from {dataset_id}."
-            )
+        logger.debug(
+            f"Loaded {len(dataset)} tasks across {len(image_names)} Docker images from {dataset_id}."
+        )
 
         if prepull_images:
             # Download all images needed for SWE-Smith.
@@ -198,15 +198,13 @@ def load_dataset(
             )
             missing_images = tagged_image_names - existing_images
             if missing_images:
-                if logger is not None:
-                    logger.info(f"Found {len(missing_images)} missing Docker images.")
+                logger.info(f"Found {len(missing_images)} missing Docker images.")
 
                 for image_name in missing_images:
                     docker_hub_image = image_name.replace("__", "_1776_")
-                    if logger is not None:
-                        logger.info(
-                            f"Pulling Docker image `{docker_hub_image}` to `{image_name}`."
-                        )
+                    logger.info(
+                        f"Pulling Docker image `{docker_hub_image}` to `{image_name}`."
+                    )
                     client.images.pull(docker_hub_image)
                     # Rename images via tagging
                     client.images.get(docker_hub_image).tag(image_name)

From e5cda5a3a782dba9e416904d8198b39692f462ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= <marc.cote.19@gmail.com>
Date: Mon, 1 Dec 2025 12:59:35 -0800
Subject: [PATCH 31/31] Limiting workers for async pytest

---
 .github/actions/test-if-changes/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/test-if-changes/action.yml b/.github/actions/test-if-changes/action.yml
index dcda5618..3d19d261 100644
--- a/.github/actions/test-if-changes/action.yml
+++ b/.github/actions/test-if-changes/action.yml
@@ -45,7 +45,7 @@ runs:
         DEBUG_GYM_DEBUG: 1
       shell: bash
       run: |
-        pytest ${{ inputs.test-files }} -vv -n 16 --timeout=600 --cov=debug_gym --cov-report=term-missing
+        pytest ${{ inputs.test-files }} -vv -n 4 --timeout=600 --cov=debug_gym --cov-report=term-missing
     - name: Store coverage report
       uses: actions/upload-artifact@v4
       with: