From 7e78e53127c7f64d729a7d8c16d6bd6433b72640 Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Thu, 27 Nov 2025 10:56:48 -0800 Subject: [PATCH 01/31] kube --- debug_gym/gym/terminals/kubernetes.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py index 5c5f39bc..5d145eef 100644 --- a/debug_gym/gym/terminals/kubernetes.py +++ b/debug_gym/gym/terminals/kubernetes.py @@ -508,7 +508,7 @@ def setup_pod(self, max_retries: int = 3) -> None: "restartPolicy": "Never", "containers": [ { - "name": "main", + "name": pod_name, "image": f"{self.registry}{self.base_image}", "imagePullPolicy": "IfNotPresent", "command": ["/bin/bash"], @@ -518,8 +518,7 @@ def setup_pod(self, max_retries: int = 3) -> None: "stdinOnce": False, "tty": True, "env": [ - {"name": k, "value": v} - for k, v in self.env_vars.items() + {"name": k, "value": v} for k, v in self.env_vars.items() ], "resources": { "requests": {"cpu": "0.5", "memory": "1Gi"}, @@ -527,6 +526,24 @@ def setup_pod(self, max_retries: int = 3) -> None: }, } ], + "tolerations": [ + { + "key": "node.kubernetes.io/disk-pressure", + "operator": "Exists", + "effect": "NoExecute", + "tolerationSeconds": 10800 + }, + { + "key": "kubernetes.azure.com/scalesetpriority", + "operator": "Equal", + "value": "spot", + "effect": "NoSchedule" + }, + { + "key": "CriticalAddonsOnly", + "operator": "Exists" + }, + ], **pod_spec_kwargs, # e.g., nodeSelector, tolerations }, } From b44876738c7c16eb75b09561b77eeaf1e534c94c Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Thu, 27 Nov 2025 11:40:34 -0800 Subject: [PATCH 02/31] uuid as name --- debug_gym/gym/terminals/kubernetes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py index 5d145eef..af390473 100644 --- a/debug_gym/gym/terminals/kubernetes.py +++ b/debug_gym/gym/terminals/kubernetes.py @@ -508,7 +508,7 @@ def setup_pod(self, max_retries: int = 3) -> None: "restartPolicy": "Never", "containers": [ { - "name": pod_name, + "name": str(uuid.uuid4())[:8], "image": f"{self.registry}{self.base_image}", "imagePullPolicy": "IfNotPresent", "command": ["/bin/bash"], From 9ac9226983a14916c1fe3af22da77479023371af Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Thu, 27 Nov 2025 12:38:21 -0800 Subject: [PATCH 03/31] normalize pod name --- debug_gym/gym/terminals/kubernetes.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py index af390473..fae333ae 100644 --- a/debug_gym/gym/terminals/kubernetes.py +++ b/debug_gym/gym/terminals/kubernetes.py @@ -3,6 +3,7 @@ import os import random import subprocess +import hashlib import time import uuid from pathlib import Path @@ -37,6 +38,9 @@ def _clean_for_kubernetes(name: str) -> str: # replace any characters not in the regex with hyphens cleaned = "".join(c if c.isalnum() or c in "-." else "-" for c in name).lower() # ensure it starts and ends with alphanumeric character + cleaned = cleaned.replace("/", "-") + cleaned = cleaned.replace(":", "-") + cleaned = cleaned.replace(".", "-") cleaned = cleaned.strip("-").strip(".") # truncate to 253 characters return cleaned[:253] @@ -487,7 +491,7 @@ def setup_pod(self, max_retries: int = 3) -> None: for attempt in range(max_retries): # Generate a new pod name for each attempt to avoid sandbox conflicts pod_name = _clean_for_kubernetes( - self._pod_name or f"dbg-gym.{self.task_name}.{str(uuid.uuid4())[:8]}" + self._pod_name or f"dbg-gym-{self.task_name}-{str(uuid.uuid4())[:8]}" ) self.logger.debug( f"Setting up pod {pod_name} (attempt {attempt + 1}/{max_retries}) " @@ -508,7 +512,7 @@ def setup_pod(self, max_retries: int = 3) -> None: "restartPolicy": "Never", "containers": [ { - "name": str(uuid.uuid4())[:8], + "name": pod_name, "image": f"{self.registry}{self.base_image}", "imagePullPolicy": "IfNotPresent", "command": ["/bin/bash"], From cebbf374763088f265abfbf8df8eb29251cb4963 Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Thu, 27 Nov 2025 12:55:41 -0800 Subject: [PATCH 04/31] load dataset outside --- debug_gym/gym/envs/r2egym.py | 138 +++++++++++++------------ debug_gym/gym/envs/swe_bench.py | 93 ++++++++--------- debug_gym/gym/envs/swe_smith.py | 141 +++++++++++++------------- debug_gym/gym/terminals/kubernetes.py | 12 +-- 4 files changed, 188 insertions(+), 196 deletions(-) diff --git a/debug_gym/gym/envs/r2egym.py b/debug_gym/gym/envs/r2egym.py index 47bc135c..de02d93d 100644 --- a/debug_gym/gym/envs/r2egym.py +++ b/debug_gym/gym/envs/r2egym.py @@ -58,15 +58,74 @@ def parse_log_pytest(log: str | None) -> dict[str, str]: return test_status_map +def load_dataset( + dataset_id: str = "R2E-Gym/R2E-Gym-Lite", + dataset_revision: str = "8d3163011f01f9393bb3dc7700497a79a8686ae5", + split: str = "train", + problems=None, + prepull_images=False, + logger=None, +): + data_path = Path(dataset_id) + if data_path.is_file(): + # Loading from local file. + if data_path.suffix.lower() == ".json": + ds = load_dataset("json", data_files=dataset_id) + elif data_path.suffix.lower() == ".parquet": + ds = load_dataset("parquet", data_files=dataset_id) + elif data_path.is_dir(): + # Loading from local folder. + ds = load_from_disk(dataset_id) + else: + # Loading from HuggingFace or a folder. + ds = load_dataset(dataset_id, revision=dataset_revision) + + # Select the split. + ds = ds[split] + + # Load custom dataset splits from config. + with open(R2EGymEnv.CONFIG) as f: + custom_splits = yaml.safe_load(f) + excluded_ids = custom_splits.get("excluded", []) + + dataset = {id.split("/", 1)[-1]: i for i, id in enumerate(ds["docker_image"])} + problems = filter_problems(dataset, problems, custom_splits, excluded_ids) + dataset = {id: i for id, i in dataset.items() if id in problems} + + image_names = set(ds[dataset[id]]["docker_image"] for id in dataset) + if logger is not None: + logger.debug( + f"Loaded {len(dataset)} tasks accross {len(image_names)} Docker images from {dataset_id}." + ) + + if prepull_images: + # Download all images needed for R2E-Gym. + client = docker.from_env() + + existing_images = set( + tag for image in client.images.list() for tag in image.tags + ) + missing_images = image_names - existing_images + if missing_images: + if logger is not None: + logger.warning(f"Found {len(missing_images)} missing Docker images.") + for i, image_name in enumerate(missing_images): + if logger is not None: + logger.warning( + f"Pulling Docker image {i + 1}/{len(missing_images)} `{image_name}`." + ) + client.images.pull(image_name) + + return dataset + + class R2EGymEnv(RepoEnv): CACHE = DEBUG_GYM_CACHE_DIR / "r2e-gym" CONFIG = importlib_files("debug_gym") / "gym" / "envs" / "configs" / "r2egym.yaml" def __init__( self, - dataset_id: str = "R2E-Gym/R2E-Gym-Lite", - dataset_revision: str = "8d3163011f01f9393bb3dc7700497a79a8686ae5", - split: str = "train", + task_data: dict, terminal: Terminal | None = None, **kwargs, ): @@ -76,11 +135,10 @@ def __init__( "R2EGymEnv only supports DockerTerminal and KubernetesTerminal." ) - self.dataset_id = dataset_id - self.dataset_revision = dataset_revision - self.split = split - self.session_commands = [] + self.ds_row = task_data + self.setup_task(task_data=task_data) + self.session_commands = [] super().__init__(terminal=terminal, **kwargs) @property @@ -93,69 +151,9 @@ def instructions(self) -> str: except Exception as e: return self.ds_row["problem_statement"] - def load_dataset(self, problems: str | list[str] | None = None): - data_path = Path(self.dataset_id) - if data_path.is_file(): - # Loading from local file. - if data_path.suffix.lower() == ".json": - self.ds = load_dataset("json", data_files=self.dataset_id) - elif data_path.suffix.lower() == ".parquet": - self.ds = load_dataset("parquet", data_files=self.dataset_id) - elif data_path.is_dir(): - # Loading from local folder. - self.ds = load_from_disk(self.dataset_id) - else: - # Loading from HuggingFace or a folder. - self.ds = load_dataset(self.dataset_id, revision=self.dataset_revision) - - # Select the split. - self.ds = self.ds[self.split] - - # Load custom dataset splits from config. - with open(R2EGymEnv.CONFIG) as f: - custom_splits = yaml.safe_load(f) - excluded_ids = custom_splits.get("excluded", []) - - dataset = { - id.split("/", 1)[-1]: i for i, id in enumerate(self.ds["docker_image"]) - } - problems = filter_problems(dataset, problems, custom_splits, excluded_ids) - dataset = {id: i for id, i in dataset.items() if id in problems} - - image_names = set(self.ds[dataset[id]]["docker_image"] for id in dataset) - self.logger.debug( - f"Loaded {len(dataset)} tasks accross {len(image_names)} Docker images from {self.dataset_id}." - ) - - if not isinstance(self.terminal, KubernetesTerminal): - # Download all images needed for R2E-Gym. - client = docker.from_env() - - existing_images = set( - tag for image in client.images.list() for tag in image.tags - ) - missing_images = image_names - existing_images - if missing_images: - self.logger.warning( - f"Found {len(missing_images)} missing Docker images." - ) - for i, image_name in enumerate(missing_images): - self.logger.warning( - f"Pulling Docker image {i + 1}/{len(missing_images)} `{image_name}`." - ) - client.images.pull(image_name) - - return dataset - - def setup_task(self, task_name: str, options: dict = None): - if task_name not in self.dataset: - raise ValueError( - f"Task `{task_name}` was not found in dataset. The available tasks are: {self.dataset}.\n" - "Please provide a valid task or initialize the environment without problems to load all tasks." - ) - - self.task_name = task_name - self.ds_row = self.ds[self.dataset[self.task_name]] + def setup_task(self, task_data: dict, options: dict = None): + self.ds_row = task_data + self.task_name = task_data["instance_id"] self.base_image = self.ds_row["docker_image"] self.package_name = self.ds_row["repo_name"] self.expected_output = json.loads(self.ds_row["expected_output_json"]) diff --git a/debug_gym/gym/envs/swe_bench.py b/debug_gym/gym/envs/swe_bench.py index b438dbe0..6a584fc5 100644 --- a/debug_gym/gym/envs/swe_bench.py +++ b/debug_gym/gym/envs/swe_bench.py @@ -16,14 +16,51 @@ from debug_gym.gym.utils import filter_problems +def load_swebench_dataset( + dataset_id: str = "SWE-bench/SWE-bench_Verified", + dataset_revision: str = "99450355ca8c611021187a57ffac304b66666738", + split="test", + problems=None, + prepull_images=False, + logger=None, +): + ds = datasets.load_dataset(dataset_id, revision=dataset_revision)[split] + dataset = {id: i for i, id in enumerate(ds["instance_id"])} + problems = filter_problems(dataset, problems) + dataset = {id: i for id, i in dataset.items() if id in problems} + + instance_ids = [ds[dataset[id]]["instance_id"] for id in dataset] + image_names = set( + f"sweb.eval.x86_64.{id.replace('__', '_1776_')}" for id in instance_ids + ) + + if prepull_images: + # Download all images needed for SWE-Bench. + client = docker.from_env() + tagged_image_names = set(f"swebench/{name}:latest" for name in image_names) + + existing_images = set( + tag for image in client.images.list() for tag in image.tags + ) + missing_images = tagged_image_names - existing_images + if missing_images: + if logger: + logger.info(f"Found {len(missing_images)} missing Docker images.") + for i, image_name in enumerate(missing_images): + if logger: + logger.info( + f"Pulling Docker images {i + 1}/{len(missing_images)}: `{image_name}`." + ) + client.images.pull(image_name) + return dataset + + class SWEBenchEnv(RepoEnv): CACHE = DEBUG_GYM_CACHE_DIR / "swe-bench" def __init__( self, - dataset_id: str = "SWE-bench/SWE-bench_Verified", - dataset_revision: str = "99450355ca8c611021187a57ffac304b66666738", - split: str = "test", + task_data: dict, terminal: Terminal | None = None, **kwargs, ): @@ -33,58 +70,18 @@ def __init__( f"{self.__class__.__name__} only supports DockerTerminal and KubernetesTerminal." ) - self.dataset_id = dataset_id - self.dataset_revision = dataset_revision - self.split = split + self.ds_row = task_data + self.setup_task(self.ds_row) self.test_directives = [] - super().__init__(terminal=terminal, **kwargs) @property def instructions(self) -> str: return self.ds_row["problem_statement"] - def load_dataset(self, problems: str | list[str] | None = None): - self.ds = datasets.load_dataset( - self.dataset_id, revision=self.dataset_revision - )[self.split] - dataset = {id: i for i, id in enumerate(self.ds["instance_id"])} - problems = filter_problems(dataset, problems) - dataset = {id: i for id, i in dataset.items() if id in problems} - - instance_ids = [self.ds[dataset[id]]["instance_id"] for id in dataset] - image_names = set( - f"sweb.eval.x86_64.{id.replace('__', '_1776_')}" for id in instance_ids - ) - - if not isinstance(self.terminal, KubernetesTerminal): - # Download all images needed for SWE-Bench. - client = docker.from_env() - tagged_image_names = set(f"swebench/{name}:latest" for name in image_names) - - existing_images = set( - tag for image in client.images.list() for tag in image.tags - ) - missing_images = tagged_image_names - existing_images - if missing_images: - self.logger.info(f"Found {len(missing_images)} missing Docker images.") - for i, image_name in enumerate(missing_images): - self.logger.info( - f"Pulling Docker images {i + 1}/{len(missing_images)}: `{image_name}`." - ) - client.images.pull(image_name) - - return dataset - - def setup_task(self, task_name: str, options: dict = None): - if task_name not in self.dataset: - raise ValueError( - f"Task `{task_name}` was not found in dataset. The available tasks are: {sorted(self.dataset)}.\n" - "Please provide a valid task or initialize the environment without problems to load all tasks." - ) - - self.task_name = task_name - self.ds_row = self.ds[self.dataset[self.task_name]] + def setup_task(self, task_data: dict, options: dict = None): + self.ds_row = task_data + self.task_name = task_data["instance_id"] self.repo = self.ds_row["repo"] self.package_name = self.repo.split("/")[1] self.version = self.ds_row["version"] diff --git a/debug_gym/gym/envs/swe_smith.py b/debug_gym/gym/envs/swe_smith.py index ce7ef627..f2162fe3 100644 --- a/debug_gym/gym/envs/swe_smith.py +++ b/debug_gym/gym/envs/swe_smith.py @@ -19,6 +19,71 @@ from debug_gym.gym.utils import filter_problems +def load_swesmith_dataset( + dataset_id: str = "SWE-bench/SWE-smith", + dataset_revision: str = "699b53400d3855206a0fbf3ff4beaf1a52f4f232", + split: str = "train", + problems: str | list[str] | None = None, + prepull_images: bool = False, + logger=None, +): + data_path = Path(dataset_id) + if data_path.is_file(): + # Loading from local file. + if data_path.suffix.lower() == ".json": + ds = load_dataset("json", data_files=dataset_id) + elif data_path.suffix.lower() == ".parquet": + ds = load_dataset("parquet", data_files=dataset_id) + elif data_path.is_dir(): + # Loading from local folder. + ds = load_from_disk(dataset_id) + else: + # Loading from HuggingFace or a folder. + ds = load_dataset(dataset_id, revision=dataset_revision) + + # Select the split. + ds = ds[split] + + # Load custom dataset splits from config. + with open(SWESmithEnv.CONFIG) as f: + custom_splits = yaml.safe_load(f) + excluded_ids = custom_splits.get("excluded", []) + + dataset = {id: i for i, id in enumerate(ds["instance_id"])} + problems = filter_problems(dataset, problems, custom_splits, excluded_ids) + dataset = {id: i for id, i in dataset.items() if id in problems} + + image_names = set(ds[dataset[id]]["image_name"] for id in dataset) + if logger is not None: + logger.debug( + f"Loaded {len(dataset)} tasks accross {len(image_names)} Docker images from {dataset_id}." + ) + + if prepull_images: + # Download all images needed for SWE-Smith. + client = docker.from_env() + tagged_image_names = set(f"{DOCKER_ORG}/{name}:{TAG}" for name in image_names) + + existing_images = set( + tag for image in client.images.list() for tag in image.tags + ) + missing_images = tagged_image_names - existing_images + if missing_images: + if logger is not None: + logger.info(f"Found {len(missing_images)} missing Docker images.") + + for image_name in missing_images: + docker_hub_image = image_name.replace("__", "_1776_") + if logger is not None: + logger.info( + f"Pulling Docker image `{docker_hub_image}` to `{image_name}`." + ) + client.images.pull(docker_hub_image) + # Rename images via tagging + client.images.get(docker_hub_image).tag(image_name) + return dataset + + class SWESmithEnv(SWEBenchEnv): CACHE = DEBUG_GYM_CACHE_DIR / "swe-smith" CONFIG = ( @@ -27,85 +92,19 @@ class SWESmithEnv(SWEBenchEnv): def __init__( self, - dataset_id: str = "SWE-bench/SWE-smith", - dataset_revision: str = "699b53400d3855206a0fbf3ff4beaf1a52f4f232", - split: str = "train", + task_data: dict, terminal: Terminal | None = None, **kwargs, ): super().__init__( - dataset_id=dataset_id, - dataset_revision=dataset_revision, - split=split, + task_data=task_data, terminal=terminal, **kwargs, ) - def load_dataset(self, problems: str | list[str] | None = None): - data_path = Path(self.dataset_id) - if data_path.is_file(): - # Loading from local file. - if data_path.suffix.lower() == ".json": - self.ds = load_dataset("json", data_files=self.dataset_id) - elif data_path.suffix.lower() == ".parquet": - self.ds = load_dataset("parquet", data_files=self.dataset_id) - elif data_path.is_dir(): - # Loading from local folder. - self.ds = load_from_disk(self.dataset_id) - else: - # Loading from HuggingFace or a folder. - self.ds = load_dataset(self.dataset_id, revision=self.dataset_revision) - - # Select the split. - self.ds = self.ds[self.split] - - # Load custom dataset splits from config. - with open(SWESmithEnv.CONFIG) as f: - custom_splits = yaml.safe_load(f) - excluded_ids = custom_splits.get("excluded", []) - - dataset = {id: i for i, id in enumerate(self.ds["instance_id"])} - problems = filter_problems(dataset, problems, custom_splits, excluded_ids) - dataset = {id: i for id, i in dataset.items() if id in problems} - - image_names = set(self.ds[dataset[id]]["image_name"] for id in dataset) - self.logger.debug( - f"Loaded {len(dataset)} tasks accross {len(image_names)} Docker images from {self.dataset_id}." - ) - - if not isinstance(self.terminal, KubernetesTerminal): - # Download all images needed for SWE-Smith. - client = docker.from_env() - tagged_image_names = set( - f"{DOCKER_ORG}/{name}:{TAG}" for name in image_names - ) - - existing_images = set( - tag for image in client.images.list() for tag in image.tags - ) - missing_images = tagged_image_names - existing_images - if missing_images: - self.logger.info(f"Found {len(missing_images)} missing Docker images.") - for image_name in missing_images: - docker_hub_image = image_name.replace("__", "_1776_") - self.logger.info( - f"Pulling Docker image `{docker_hub_image}` to `{image_name}`." - ) - client.images.pull(docker_hub_image) - # Rename images via tagging - client.images.get(docker_hub_image).tag(image_name) - - return dataset - - def setup_task(self, task_name: str, options: dict = None): - if task_name not in self.dataset: - raise ValueError( - f"Task `{task_name}` was not found in dataset. The available tasks are: {sorted(self.dataset)}.\n" - "Please provide a valid task or initialize the environment without problems to load all tasks." - ) - - self.task_name = task_name - self.ds_row = self.ds[self.dataset[self.task_name]] + def setup_task(self, task_data: dict, options: dict = None): + self.task_name = task_data["instance_id"] + self.ds_row = task_data self.base_commit = ( self.ds_row["base_commit"] if "base_commit" in self.ds_row else "main" ) diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py index fae333ae..7d005f89 100644 --- a/debug_gym/gym/terminals/kubernetes.py +++ b/debug_gym/gym/terminals/kubernetes.py @@ -522,7 +522,8 @@ def setup_pod(self, max_retries: int = 3) -> None: "stdinOnce": False, "tty": True, "env": [ - {"name": k, "value": v} for k, v in self.env_vars.items() + {"name": k, "value": v} + for k, v in self.env_vars.items() ], "resources": { "requests": {"cpu": "0.5", "memory": "1Gi"}, @@ -535,18 +536,15 @@ def setup_pod(self, max_retries: int = 3) -> None: "key": "node.kubernetes.io/disk-pressure", "operator": "Exists", "effect": "NoExecute", - "tolerationSeconds": 10800 + "tolerationSeconds": 10800, }, { "key": "kubernetes.azure.com/scalesetpriority", "operator": "Equal", "value": "spot", - "effect": "NoSchedule" - }, - { - "key": "CriticalAddonsOnly", - "operator": "Exists" + "effect": "NoSchedule", }, + {"key": "CriticalAddonsOnly", "operator": "Exists"}, ], **pod_spec_kwargs, # e.g., nodeSelector, tolerations }, From ffd87cf37b1824048a29bc846c6ea13f5642d6e6 Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Thu, 27 Nov 2025 12:56:56 -0800 Subject: [PATCH 05/31] remove tolerations --- debug_gym/gym/terminals/kubernetes.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py index 7d005f89..5fa90ea0 100644 --- a/debug_gym/gym/terminals/kubernetes.py +++ b/debug_gym/gym/terminals/kubernetes.py @@ -531,21 +531,6 @@ def setup_pod(self, max_retries: int = 3) -> None: }, } ], - "tolerations": [ - { - "key": "node.kubernetes.io/disk-pressure", - "operator": "Exists", - "effect": "NoExecute", - "tolerationSeconds": 10800, - }, - { - "key": "kubernetes.azure.com/scalesetpriority", - "operator": "Equal", - "value": "spot", - "effect": "NoSchedule", - }, - {"key": "CriticalAddonsOnly", "operator": "Exists"}, - ], **pod_spec_kwargs, # e.g., nodeSelector, tolerations }, } From 0d2996745da3523e0fafe8fa91e0603b0df90479 Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Thu, 27 Nov 2025 13:05:09 -0800 Subject: [PATCH 06/31] incorporate dataset loading --- scripts/run.py | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/scripts/run.py b/scripts/run.py index 5139afd1..4c1e458a 100644 --- a/scripts/run.py +++ b/scripts/run.py @@ -18,7 +18,11 @@ from debug_gym.llms.human import Human from debug_gym.logger import DebugGymLogger, load_previous_run_status - +from debug_gym.gym.envs.swe_bench import load_swebench_dataset +from debug_gym.gym.envs.swe_smith import load_swesmith_dataset +from debug_gym.gym.envs.r2egym import load_r2egym_dataset + + class AgentTimeoutException(BaseException): """Custom exception to handle timeouts in agent execution. Inherits from BaseException to ensure @@ -40,7 +44,11 @@ def timeout_handler(signum, frame): signal.alarm(timeout_seconds) -def run_agent(args, problem, config): +def run_agent( + args, + problem: dict, + config: dict +): set_signal(args.timeout) success = True env = None @@ -90,7 +98,7 @@ def run_agent(args, problem, config): status="running", ) - env = create_env(config, task_logger) + env = create_env(config, problem, task_logger) add_tools(env, config, task_logger) llm = LLM.instantiate( @@ -176,14 +184,14 @@ def run_agent(args, problem, config): return success -def create_env(config: dict, logger: DebugGymLogger): +def create_env(config: dict, problem: dict, logger: DebugGymLogger): terminal = select_terminal(config.get("terminal"), logger, uuid=config["uuid"]) env_class = select_env(config.get("benchmark")) env = env_class( - **config["env_kwargs"], - problems=config.get("problems", ["custom"]), + task_data=problem, terminal=terminal, logger=logger, + **config["env_kwargs"], ) return env @@ -248,8 +256,27 @@ def main(): dump_experiment_info(config, args) # Create the environment to get the list of problems to run. - env = create_env(config, logger=logger) - problems = sorted(env.dataset) + dataset_info = { + "dataset_id": config.env_kwargs.get("dataset_id"), + "dataset_revision": config.env_kwargs.get("dataset_revision"), + "problems": config.get("problems", "all"), + "prepull_images": config.env_kwargs.get("prepull_images", False) + } + load_dataset_fn = { + "swebench": load_swebench_dataset, + "swebench-debug": load_swebench_dataset, + "swesmith": load_swesmith_dataset, + "r2egym": load_r2egym_dataset, + } + + if config['benchmark'] in load_dataset_fn: + dataset = load_dataset_fn[config['benchmark']]( + **dataset_info, + ) + else: + raise ValueError(f"Unsupported benchmark: {config['benchmark']}") + + problems = sorted(dataset) if args.list: print(f"\n# Available problems in {config.get('benchmark', 'config')}:") From c7afaa2c593caf87cf7b526b8ca33f231fbbe0ed Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Thu, 27 Nov 2025 13:08:53 -0800 Subject: [PATCH 07/31] some type annotations --- debug_gym/gym/envs/r2egym.py | 9 +++++---- debug_gym/gym/envs/swe_bench.py | 10 +++++----- debug_gym/gym/envs/swe_smith.py | 6 +++--- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/debug_gym/gym/envs/r2egym.py b/debug_gym/gym/envs/r2egym.py index de02d93d..66745dfd 100644 --- a/debug_gym/gym/envs/r2egym.py +++ b/debug_gym/gym/envs/r2egym.py @@ -14,6 +14,7 @@ from debug_gym.gym.terminals.kubernetes import KubernetesTerminal from debug_gym.gym.terminals.terminal import Terminal from debug_gym.gym.utils import filter_problems +from debug_gym.logger import DebugGymLogger def decolor_dict_keys(key): @@ -58,13 +59,13 @@ def parse_log_pytest(log: str | None) -> dict[str, str]: return test_status_map -def load_dataset( +def load_r2egym_dataset( dataset_id: str = "R2E-Gym/R2E-Gym-Lite", dataset_revision: str = "8d3163011f01f9393bb3dc7700497a79a8686ae5", split: str = "train", - problems=None, - prepull_images=False, - logger=None, + problems: list | None = None, + prepull_images: bool = False, + logger: DebugGymLogger | None = None, ): data_path = Path(dataset_id) if data_path.is_file(): diff --git a/debug_gym/gym/envs/swe_bench.py b/debug_gym/gym/envs/swe_bench.py index 6a584fc5..9af6b8f6 100644 --- a/debug_gym/gym/envs/swe_bench.py +++ b/debug_gym/gym/envs/swe_bench.py @@ -12,17 +12,17 @@ from debug_gym.gym.envs.env import RepoEnv from debug_gym.gym.terminals.docker import DockerTerminal from debug_gym.gym.terminals.kubernetes import KubernetesTerminal -from debug_gym.gym.terminals.terminal import Terminal +from debug_gym.gym.terminals.terminal import DebugGymLogger, Terminal from debug_gym.gym.utils import filter_problems def load_swebench_dataset( dataset_id: str = "SWE-bench/SWE-bench_Verified", dataset_revision: str = "99450355ca8c611021187a57ffac304b66666738", - split="test", - problems=None, - prepull_images=False, - logger=None, + split: str = "test", + problems: list | None = None, + prepull_images: bool = False, + logger: DebugGymLogger | None = None, ): ds = datasets.load_dataset(dataset_id, revision=dataset_revision)[split] dataset = {id: i for i, id in enumerate(ds["instance_id"])} diff --git a/debug_gym/gym/envs/swe_smith.py b/debug_gym/gym/envs/swe_smith.py index f2162fe3..9dc934fb 100644 --- a/debug_gym/gym/envs/swe_smith.py +++ b/debug_gym/gym/envs/swe_smith.py @@ -15,7 +15,7 @@ from debug_gym.gym.entities import EvalOutput from debug_gym.gym.envs.swe_bench import SWEBenchEnv from debug_gym.gym.terminals.kubernetes import KubernetesTerminal -from debug_gym.gym.terminals.terminal import Terminal +from debug_gym.gym.terminals.terminal import DebugGymLogger, Terminal from debug_gym.gym.utils import filter_problems @@ -23,9 +23,9 @@ def load_swesmith_dataset( dataset_id: str = "SWE-bench/SWE-smith", dataset_revision: str = "699b53400d3855206a0fbf3ff4beaf1a52f4f232", split: str = "train", - problems: str | list[str] | None = None, + problems: list | None = None, prepull_images: bool = False, - logger=None, + logger: DebugGymLogger | None = None, ): data_path = Path(dataset_id) if data_path.is_file(): From c506fe16a1557699469e3938c6ef8ec772c77614 Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Thu, 27 Nov 2025 13:13:06 -0800 Subject: [PATCH 08/31] fixture first fix --- scripts/run.py | 2 +- tests/gym/envs/conftest.py | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/scripts/run.py b/scripts/run.py index 4c1e458a..5b9bc0fc 100644 --- a/scripts/run.py +++ b/scripts/run.py @@ -315,7 +315,7 @@ def main(): try: success = run_agent(args, problem, config) except AgentTimeoutException: - pass # Handleled in run_agent, just continue + pass # Handled in run_agent, just continue except (KeyboardInterrupt, Exception) as e: raise e else: diff --git a/tests/gym/envs/conftest.py b/tests/gym/envs/conftest.py index 1d4056ce..3f7792a6 100644 --- a/tests/gym/envs/conftest.py +++ b/tests/gym/envs/conftest.py @@ -2,7 +2,10 @@ from filelock import FileLock from debug_gym.gym.envs import R2EGymEnv, SWEBenchEnv, SWESmithEnv +from debug_gym.gym.envs.r2egym import load_r2egym_dataset +from debug_gym.gym.envs.swe_bench import load_swebench_dataset from debug_gym.gym.envs.swe_bench_debug import SWEBenchDebugEnv +from debug_gym.gym.envs.swe_smith import load_swesmith_dataset BUILD_ENV_CONFIGS = { "swe_smith": { @@ -31,7 +34,16 @@ def make_env_factory(env_name, worker_id, tmp_path_factory): env_class = kwargs.pop("env_class") def _make_env(): - return env_class(**kwargs) + if type(env_class) in [SWEBenchEnv, SWEBenchDebugEnv]: + fn = load_swebench_dataset + elif type(env_class) == SWESmithEnv: + fn = load_swesmith_dataset + elif type(env_class) == R2EGymEnv: + fn = load_r2egym_dataset + else: + raise ValueError(f"Unknown env_class: {env_class}") + task_data = fn(problems=kwargs["problems"])[0] + return env_class(task_data=task_data) if worker_id == "master": # Not running with pytest-xdist or we are in the master process From aab04ff99894f89f940b93504405608b36059f71 Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Thu, 27 Nov 2025 13:27:27 -0800 Subject: [PATCH 09/31] fix --- scripts/run.py | 16 ++++++---------- tests/gym/envs/conftest.py | 6 +++--- tests/gym/envs/test_r2egym.py | 16 +++++++++------- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/scripts/run.py b/scripts/run.py index 5b9bc0fc..d6a22c08 100644 --- a/scripts/run.py +++ b/scripts/run.py @@ -21,8 +21,8 @@ from debug_gym.gym.envs.swe_bench import load_swebench_dataset from debug_gym.gym.envs.swe_smith import load_swesmith_dataset from debug_gym.gym.envs.r2egym import load_r2egym_dataset - - + + class AgentTimeoutException(BaseException): """Custom exception to handle timeouts in agent execution. Inherits from BaseException to ensure @@ -44,11 +44,7 @@ def timeout_handler(signum, frame): signal.alarm(timeout_seconds) -def run_agent( - args, - problem: dict, - config: dict -): +def run_agent(args, problem: dict, config: dict): set_signal(args.timeout) success = True env = None @@ -260,7 +256,7 @@ def main(): "dataset_id": config.env_kwargs.get("dataset_id"), "dataset_revision": config.env_kwargs.get("dataset_revision"), "problems": config.get("problems", "all"), - "prepull_images": config.env_kwargs.get("prepull_images", False) + "prepull_images": config.env_kwargs.get("prepull_images", False), } load_dataset_fn = { "swebench": load_swebench_dataset, @@ -269,8 +265,8 @@ def main(): "r2egym": load_r2egym_dataset, } - if config['benchmark'] in load_dataset_fn: - dataset = load_dataset_fn[config['benchmark']]( + if config["benchmark"] in load_dataset_fn: + dataset = load_dataset_fn[config["benchmark"]]( **dataset_info, ) else: diff --git a/tests/gym/envs/conftest.py b/tests/gym/envs/conftest.py index 3f7792a6..4dd1c114 100644 --- a/tests/gym/envs/conftest.py +++ b/tests/gym/envs/conftest.py @@ -34,11 +34,11 @@ def make_env_factory(env_name, worker_id, tmp_path_factory): env_class = kwargs.pop("env_class") def _make_env(): - if type(env_class) in [SWEBenchEnv, SWEBenchDebugEnv]: + if isinstance(env_class, (SWEBenchEnv, SWEBenchDebugEnv)): fn = load_swebench_dataset - elif type(env_class) == SWESmithEnv: + elif isinstance(env_class, SWESmithEnv): fn = load_swesmith_dataset - elif type(env_class) == R2EGymEnv: + elif isinstance(env_class, R2EGymEnv): fn = load_r2egym_dataset else: raise ValueError(f"Unknown env_class: {env_class}") diff --git a/tests/gym/envs/test_r2egym.py b/tests/gym/envs/test_r2egym.py index 43d25387..be32c2c2 100644 --- a/tests/gym/envs/test_r2egym.py +++ b/tests/gym/envs/test_r2egym.py @@ -7,7 +7,7 @@ from debug_gym.agents.solution_agent import AgentSolution from debug_gym.gym.entities import Observation -from debug_gym.gym.envs.r2egym import R2EGymEnv +from debug_gym.gym.envs.r2egym import R2EGymEnv, load_r2egym_dataset from debug_gym.gym.terminals.docker import DockerTerminal from debug_gym.gym.tools.tool import ToolCall from debug_gym.gym.tools.toolbox import Toolbox @@ -73,10 +73,12 @@ def test_load_dataset_from_parquet(mock_docker_from_env, tmp_path): mock_terminal = MagicMock(spec=DockerTerminal) # Load the dataset from the Parquet file - env = R2EGymEnv(dataset_id=str(parquet_file), split="train", terminal=mock_terminal) + dataset = load_r2egym_dataset( + dataset_id=str(parquet_file), split="train", terminal=mock_terminal + ) # Verify the dataset contains the expected features - assert sorted(env.ds.features.keys()) == sorted( + assert sorted(dataset.features.keys()) == sorted( [ "commit_hash", "docker_image", @@ -96,10 +98,10 @@ def test_load_dataset_from_parquet(mock_docker_from_env, tmp_path): ) # Verify the dataset has the expected data - assert len(env.ds) == 1 - assert env.ds[0]["docker_image"] == "test_repo:test_hash_123" - assert env.ds[0]["commit_hash"] == "test_hash_123" - assert "Test problem statement" in env.ds[0]["problem_statement"] + assert len(dataset) == 1 + assert dataset[0]["docker_image"] == "test_repo:test_hash_123" + assert dataset[0]["commit_hash"] == "test_hash_123" + assert "Test problem statement" in dataset[0]["problem_statement"] @pytest.if_docker_running From cc8f813e1680cd1844c5bc72f3f860fe08f38e99 Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Thu, 27 Nov 2025 13:49:18 -0800 Subject: [PATCH 10/31] fix tests --- tests/gym/envs/conftest.py | 6 +++--- tests/gym/envs/test_r2egym.py | 5 +---- tests/gym/terminals/test_kubernetes.py | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/tests/gym/envs/conftest.py b/tests/gym/envs/conftest.py index 4dd1c114..94e70a47 100644 --- a/tests/gym/envs/conftest.py +++ b/tests/gym/envs/conftest.py @@ -34,11 +34,11 @@ def make_env_factory(env_name, worker_id, tmp_path_factory): env_class = kwargs.pop("env_class") def _make_env(): - if isinstance(env_class, (SWEBenchEnv, SWEBenchDebugEnv)): + if issubclass(env_class, (SWEBenchEnv, SWEBenchDebugEnv)): fn = load_swebench_dataset - elif isinstance(env_class, SWESmithEnv): + elif issubclass(env_class, SWESmithEnv): fn = load_swesmith_dataset - elif isinstance(env_class, R2EGymEnv): + elif issubclass(env_class, R2EGymEnv): fn = load_r2egym_dataset else: raise ValueError(f"Unknown env_class: {env_class}") diff --git a/tests/gym/envs/test_r2egym.py b/tests/gym/envs/test_r2egym.py index be32c2c2..f7bfd352 100644 --- a/tests/gym/envs/test_r2egym.py +++ b/tests/gym/envs/test_r2egym.py @@ -69,12 +69,9 @@ def test_load_dataset_from_parquet(mock_docker_from_env, tmp_path): table = pa.table(data) pq.write_table(table, str(parquet_file)) - # Mock the terminal to avoid actual Docker operations - mock_terminal = MagicMock(spec=DockerTerminal) - # Load the dataset from the Parquet file dataset = load_r2egym_dataset( - dataset_id=str(parquet_file), split="train", terminal=mock_terminal + dataset_id=str(parquet_file), split="train" ) # Verify the dataset contains the expected features diff --git a/tests/gym/terminals/test_kubernetes.py b/tests/gym/terminals/test_kubernetes.py index 0161fbcc..dcde0a56 100644 --- a/tests/gym/terminals/test_kubernetes.py +++ b/tests/gym/terminals/test_kubernetes.py @@ -70,7 +70,7 @@ def test_kubernetes_terminal_init(): assert terminal._pod is not None # Pod name should be automatically generated when not provided at initialization. - assert terminal.pod_name.startswith("dbg-gym.") + assert terminal.pod_name.startswith("dbg-gym-") assert terminal.pod.is_running() assert terminal.pod.exists() From 77aeb783210a556ffff29605ce07ed5ed0573b90 Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Thu, 27 Nov 2025 14:14:59 -0800 Subject: [PATCH 11/31] simplify filtering --- debug_gym/gym/envs/r2egym.py | 14 +++++++------- debug_gym/gym/envs/swe_bench.py | 10 +++++----- debug_gym/gym/envs/swe_smith.py | 11 +++++------ debug_gym/gym/terminals/kubernetes.py | 2 +- debug_gym/gym/utils.py | 12 ++++++------ 5 files changed, 24 insertions(+), 25 deletions(-) diff --git a/debug_gym/gym/envs/r2egym.py b/debug_gym/gym/envs/r2egym.py index 66745dfd..1cab6740 100644 --- a/debug_gym/gym/envs/r2egym.py +++ b/debug_gym/gym/envs/r2egym.py @@ -89,14 +89,15 @@ def load_r2egym_dataset( custom_splits = yaml.safe_load(f) excluded_ids = custom_splits.get("excluded", []) - dataset = {id.split("/", 1)[-1]: i for i, id in enumerate(ds["docker_image"])} - problems = filter_problems(dataset, problems, custom_splits, excluded_ids) - dataset = {id: i for id, i in dataset.items() if id in problems} + # add instance id to each example (name of the image) + ds["instance_id"] = [id.split("/", 1)[-1] for id in ds["docker_image"]] + problems = filter_problems(ds["instance_id"], problems, custom_splits, excluded_ids) + ds = ds.filter(lambda example: example["instance_id"] in problems) - image_names = set(ds[dataset[id]]["docker_image"] for id in dataset) + image_names = set(ds["docker_image"]) if logger is not None: logger.debug( - f"Loaded {len(dataset)} tasks accross {len(image_names)} Docker images from {dataset_id}." + f"Loaded {len(ds)} tasks across {len(image_names)} Docker images from {dataset_id}." ) if prepull_images: @@ -116,8 +117,7 @@ def load_r2egym_dataset( f"Pulling Docker image {i + 1}/{len(missing_images)} `{image_name}`." ) client.images.pull(image_name) - - return dataset + return ds class R2EGymEnv(RepoEnv): diff --git a/debug_gym/gym/envs/swe_bench.py b/debug_gym/gym/envs/swe_bench.py index 9af6b8f6..891e0c2c 100644 --- a/debug_gym/gym/envs/swe_bench.py +++ b/debug_gym/gym/envs/swe_bench.py @@ -25,11 +25,11 @@ def load_swebench_dataset( logger: DebugGymLogger | None = None, ): ds = datasets.load_dataset(dataset_id, revision=dataset_revision)[split] - dataset = {id: i for i, id in enumerate(ds["instance_id"])} - problems = filter_problems(dataset, problems) - dataset = {id: i for id, i in dataset.items() if id in problems} + problems = filter_problems(ds["instance_id"], problems) + + ds = ds.filter(lambda example: example["instance_id"] in problems) + instance_ids = ds["instance_id"] - instance_ids = [ds[dataset[id]]["instance_id"] for id in dataset] image_names = set( f"sweb.eval.x86_64.{id.replace('__', '_1776_')}" for id in instance_ids ) @@ -52,7 +52,7 @@ def load_swebench_dataset( f"Pulling Docker images {i + 1}/{len(missing_images)}: `{image_name}`." ) client.images.pull(image_name) - return dataset + return ds class SWEBenchEnv(RepoEnv): diff --git a/debug_gym/gym/envs/swe_smith.py b/debug_gym/gym/envs/swe_smith.py index 9dc934fb..e67c2be1 100644 --- a/debug_gym/gym/envs/swe_smith.py +++ b/debug_gym/gym/envs/swe_smith.py @@ -49,14 +49,13 @@ def load_swesmith_dataset( custom_splits = yaml.safe_load(f) excluded_ids = custom_splits.get("excluded", []) - dataset = {id: i for i, id in enumerate(ds["instance_id"])} - problems = filter_problems(dataset, problems, custom_splits, excluded_ids) - dataset = {id: i for id, i in dataset.items() if id in problems} + problems = filter_problems(ds["instance_id"], problems) + ds = ds.filter(lambda example: example["instance_id"] in problems) - image_names = set(ds[dataset[id]]["image_name"] for id in dataset) + image_names = set(ds["image_name"]) if logger is not None: logger.debug( - f"Loaded {len(dataset)} tasks accross {len(image_names)} Docker images from {dataset_id}." + f"Loaded {len(ds)} tasks across {len(image_names)} Docker images from {dataset_id}." ) if prepull_images: @@ -81,7 +80,7 @@ def load_swesmith_dataset( client.images.pull(docker_hub_image) # Rename images via tagging client.images.get(docker_hub_image).tag(image_name) - return dataset + return ds class SWESmithEnv(SWEBenchEnv): diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py index 5fa90ea0..f40ca38a 100644 --- a/debug_gym/gym/terminals/kubernetes.py +++ b/debug_gym/gym/terminals/kubernetes.py @@ -512,7 +512,7 @@ def setup_pod(self, max_retries: int = 3) -> None: "restartPolicy": "Never", "containers": [ { - "name": pod_name, + "name": "main", "image": f"{self.registry}{self.base_image}", "imagePullPolicy": "IfNotPresent", "command": ["/bin/bash"], diff --git a/debug_gym/gym/utils.py b/debug_gym/gym/utils.py index 24372a44..1d95294b 100644 --- a/debug_gym/gym/utils.py +++ b/debug_gym/gym/utils.py @@ -196,7 +196,7 @@ def extract_reward_from_pytest_output(output): def filter_problems( - dataset: dict[str, Any], + dataset_instances: list[str], problems: str | list[str] | None = None, custom_splits: dict[str, Any] | None = None, excluded_ids: list[str] | None = None, @@ -208,9 +208,9 @@ def filter_problems( if not isinstance(problems, str): # Check that all problems are valid task names. for problem in problems: - if problem not in dataset: + if problem not in dataset_instances: raise ValueError( - f"Invalid problem id: '{problem}'.\nChoose from: {sorted(dataset)}" + f"Invalid problem id: '{problem}'.\nChoose from: {sorted(dataset_instances)}" ) # Make sure all problems are unique. @@ -220,14 +220,14 @@ def filter_problems( return problems # Assuming a list of problem IDs. if problems == "all": - return [k for k in dataset if k not in excluded_ids] - elif problems in dataset: + return [k for k in dataset_instances if k not in excluded_ids] + elif problems in dataset_instances: return [problems] # Single task elif problems in custom_splits: return custom_splits[problems] else: raise ValueError( - f"Invalid split or problem id: '{problems}'.\nChoose from: {sorted(dataset) + ['all'] + sorted(custom_splits)}" + f"Invalid split or problem id: '{problems}'.\nChoose from: {sorted(dataset_instances) + ['all'] + sorted(custom_splits)}" ) From 28caf414215ab943192a57e18a4210cbebe08dcc Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Thu, 27 Nov 2025 14:49:36 -0800 Subject: [PATCH 12/31] remove deps on swesmith! also fix excluded_ids for swesmith --- debug_gym/gym/envs/r2egym.py | 10 +- debug_gym/gym/envs/swe_smith.py | 15 +- debug_gym/gym/envs/swe_smith_constants.py | 686 ++++++++++++++++++++++ debug_gym/gym/envs/swe_smith_utils.py | 190 ++++++ debug_gym/gym/terminals/kubernetes.py | 37 +- 5 files changed, 925 insertions(+), 13 deletions(-) create mode 100755 debug_gym/gym/envs/swe_smith_constants.py create mode 100755 debug_gym/gym/envs/swe_smith_utils.py diff --git a/debug_gym/gym/envs/r2egym.py b/debug_gym/gym/envs/r2egym.py index 1cab6740..b843d4a4 100644 --- a/debug_gym/gym/envs/r2egym.py +++ b/debug_gym/gym/envs/r2egym.py @@ -90,7 +90,13 @@ def load_r2egym_dataset( excluded_ids = custom_splits.get("excluded", []) # add instance id to each example (name of the image) - ds["instance_id"] = [id.split("/", 1)[-1] for id in ds["docker_image"]] + def extract_instance_id(docker_image: str) -> str: + return docker_image.split("/", 1)[-1] + + # create a column "instance_id" in the dataset + instance_ids = [extract_instance_id(id) for id in ds["docker_image"]] + ds = ds.add_column("instance_id", instance_ids) + problems = filter_problems(ds["instance_id"], problems, custom_splits, excluded_ids) ds = ds.filter(lambda example: example["instance_id"] in problems) @@ -154,7 +160,7 @@ def instructions(self) -> str: def setup_task(self, task_data: dict, options: dict = None): self.ds_row = task_data - self.task_name = task_data["instance_id"] + self.task_name = self.ds_row["instance_id"] self.base_image = self.ds_row["docker_image"] self.package_name = self.ds_row["repo_name"] self.expected_output = json.loads(self.ds_row["expected_output_json"]) diff --git a/debug_gym/gym/envs/swe_smith.py b/debug_gym/gym/envs/swe_smith.py index e67c2be1..6ba6fe03 100644 --- a/debug_gym/gym/envs/swe_smith.py +++ b/debug_gym/gym/envs/swe_smith.py @@ -4,12 +4,13 @@ import docker import yaml from datasets import load_dataset, load_from_disk -from swesmith.build_repo.download_images import DOCKER_ORG, TAG -from swesmith.constants import MAP_REPO_TO_SPECS -from swesmith.harness.grading import TestStatus -from swesmith.harness.log_parsers import MAP_REPO_TO_PARSER, parse_log_pytest -from swesmith.harness.utils import get_test_command -from swesmith.utils import get_repo_commit_from_image_name + +from .swe_smith_constants import DOCKER_ORG, TAG, MAP_REPO_TO_SPECS +from .swe_smith_utils import get_test_command, get_repo_commit_from_image_name + +from swebench.harness.constants import TestStatus +from swebench.harness.grading import MAP_REPO_TO_PARSER +from swebench.harness.log_parsers.python import parse_log_pytest from debug_gym.constants import DEBUG_GYM_CACHE_DIR from debug_gym.gym.entities import EvalOutput @@ -49,7 +50,7 @@ def load_swesmith_dataset( custom_splits = yaml.safe_load(f) excluded_ids = custom_splits.get("excluded", []) - problems = filter_problems(ds["instance_id"], problems) + problems = filter_problems(ds["instance_id"], problems, custom_splits, excluded_ids) ds = ds.filter(lambda example: example["instance_id"] in problems) image_names = set(ds["image_name"]) diff --git a/debug_gym/gym/envs/swe_smith_constants.py b/debug_gym/gym/envs/swe_smith_constants.py new file mode 100755 index 00000000..4eff67c0 --- /dev/null +++ b/debug_gym/gym/envs/swe_smith_constants.py @@ -0,0 +1,686 @@ + +DOCKER_ORG = "jyangballin" +TAG = "latest" + +""" +Pulled from official SWE-Smith repository. +""" + +from pathlib import Path + +CONDA_VERSION = "py312_24.1.2-0" +DEFAULT_PM_LIKELIHOOD = 0.2 +ENV_NAME = "testbed" +KEY_IMAGE_NAME = "image_name" + +# If set, then subset of tests are run for post-bug validation +# Affects get_test_command, get_valid_report +KEY_MIN_TESTING = "minimal_testing" +# If set, then for pre-bug validation, individual runs are +# performed instead of running the entire test suite +# Affects valid.py +KEY_MIN_PREGOLD = "minimal_pregold" + +KEY_PATCH = "patch" +KEY_TEST_CMD = "test_cmd" +KEY_TIMED_OUT = "timed_out" +LOG_DIR_BUG_GEN = Path("logs/bug_gen") +LOG_DIR_ENV_RECORDS = Path("logs/build_images/records") +LOG_DIR_ISSUE_GEN = Path("logs/issue_gen") +LOG_DIR_RUN_VALIDATION = Path("logs/run_validation") +LOG_DIR_TASKS = Path("logs/task_insts") +LOG_TEST_OUTPUT_PRE_GOLD = "test_output_pre_gold.txt" +MAX_INPUT_TOKENS = 128000 +ORG_NAME = "swesmith" +PREFIX_BUG = "bug" +PREFIX_METADATA = "metadata" +REF_SUFFIX = ".ref" +SGLANG_API_KEY = "swesmith" +TEMP_PATCH = "_temp_patch_swesmith.diff" +TEST_OUTPUT_END = ">>>>> End Test Output" +TEST_OUTPUT_START = ">>>>> Start Test Output" +TIMEOUT = 120 +UBUNTU_VERSION = "22.04" +VOLUME_NAME_DATASET = "datasets" +VOLUME_NAME_MODEL = "llm-weights" + +GIT_APPLY_CMDS = [ + "git apply --verbose", + "git apply --verbose --reject", + "patch --batch --fuzz=5 -p1 -i", +] + +_DOCKERFILE_BASE_EXTENDED = """ +RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 libgl1 -y +""" + + +""" +Purpose: Mirroring the constants specified in SWE-bench, this file contains the installation +specifications for specific commit(s) of different Python repositories. It is written to be +compatible with the SWE-bench repository to leverage its ability to create docker images. +""" + +### MARK: Commonly Used Installion / Testing Specifications ### + +TEST_PYTEST = "pytest --disable-warnings --color=no --tb=no --verbose -rA -p no:snail" + +DEFAULT_SPECS = { + "install": ["python -m pip install -e ."], + "python": "3.10", + KEY_TEST_CMD: TEST_PYTEST, +} + +CMAKE_VERSIONS = ["3.15.7", "3.16.9", "3.17.5", "3.19.7", "3.23.5", "3.27.9"] +INSTALL_CMAKE = ( + [ + f"wget https://github.com/Kitware/CMake/releases/download/v{v}/cmake-{v}-Linux-x86_64.tar.gz" + for v in CMAKE_VERSIONS + ] + + [ + f"tar -xvzf cmake-{v}-Linux-x86_64.tar.gz && mv cmake-{v}-Linux-x86_64 /usr/share/cmake-{v}" + if v not in ["3.23.5", "3.27.9"] + else f"tar -xvzf cmake-{v}-Linux-x86_64.tar.gz && mv cmake-{v}-linux-x86_64 /usr/share/cmake-{v}" + for v in CMAKE_VERSIONS + ] + + [ + f"update-alternatives --install /usr/bin/cmake cmake /usr/share/cmake-{v}/bin/cmake {(idx + 1) * 10}" + for idx, v in enumerate(CMAKE_VERSIONS) + ] +) + +INSTALL_BAZEL = [ + cmd + for v in ["6.5.0", "7.4.1", "8.0.0"] + for cmd in [ + f"mkdir -p /usr/share/bazel-{v}/bin", + f"wget https://github.com/bazelbuild/bazel/releases/download/{v}/bazel-{v}-linux-x86_64", + f"chmod +x bazel-{v}-linux-x86_64", + f"mv bazel-{v}-linux-x86_64 /usr/share/bazel-{v}/bin/bazel", + ] +] + +### MARK Repository/Commit specific installation instructions ### + +SPECS_REPO_ADDICT = {"75284f9593dfb929cadd900aff9e35e7c7aec54b": DEFAULT_SPECS} +SPECS_REPO_ALIVE_PROGRESS = {"35853799b84ee682af121f7bc5967bd9b62e34c4": DEFAULT_SPECS} +SPECS_REPO_APISPEC = { + "8b421526ea1015046de42599dd93da6a3473fe44": { + **DEFAULT_SPECS, + "install": ["pip install -e .[dev]"], + } +} +SPECS_REPO_ARROW = {"1d70d0091980ea489a64fa95a48e99b45f29f0e7": DEFAULT_SPECS} +SPECS_REPO_ASTROID = {"b114f6b58e749b8ab47f80490dce73ea80d8015f": DEFAULT_SPECS} +SPECS_REPO_ASYNC_TIMEOUT = {"d0baa9f162b866e91881ae6cfa4d68839de96fb5": DEFAULT_SPECS} +SPECS_REPO_AUTOGRAD = { + "ac044f0de1185b725955595840135e9ade06aaed": { + **DEFAULT_SPECS, + "install": ["pip install -e '.[scipy,test]'"], + } +} +SPECS_REPO_BLEACH = {"73871d766de1e33a296eeb4f9faf2451f28bee39": DEFAULT_SPECS} +SPECS_REPO_BOLTONS = {"3bfcfdd04395b6cc74a5c0cdc72c8f64cc4ac01f": DEFAULT_SPECS} +SPECS_REPO_BOTTLE = {"a8dfef301dec35f13e7578306002c40796651629": DEFAULT_SPECS} +SPECS_REPO_BOX = {"a23451d2869a511280eebe194efca41efadd2706": DEFAULT_SPECS} +SPECS_REPO_CANTOOLS = { + "0c6a78711409e4307de34582f795ddb426d58dd8": { + **DEFAULT_SPECS, + "install": ["pip install -e .[dev,plot]"], + } +} +SPECS_REPO_CHANNELS = { + "a144b4b8881a93faa567a6bdf2d7f518f4c16cd2": { + **DEFAULT_SPECS, + "install": ["pip install -e .[tests,daphne]"], + } +} +SPECS_REPO_CHARDET = {"9630f2382faa50b81be2f96fd3dfab5f6739a0ef": DEFAULT_SPECS} +SPECS_REPO_CHARDET_NORMALIZER = { + "1fdd64633572040ab60e62e8b24f29cb7e17660b": DEFAULT_SPECS +} +SPECS_REPO_CLICK = {"fde47b4b4f978f179b9dff34583cb2b99021f482": DEFAULT_SPECS} +SPECS_REPO_CLOUDPICKLE = {"6220b0ce83ffee5e47e06770a1ee38ca9e47c850": DEFAULT_SPECS} +SPECS_REPO_COLORLOG = {"dfa10f59186d3d716aec4165ee79e58f2265c0eb": DEFAULT_SPECS} +SPECS_REPO_COOKIECUTTER = {"b4451231809fb9e4fc2a1e95d433cb030e4b9e06": DEFAULT_SPECS} +SPECS_REPO_DAPHNE = {"32ac73e1a0fb87af0e3280c89fe4cc3ff1231b37": DEFAULT_SPECS} +SPECS_REPO_DATASET = { + "5c2dc8d3af1e0af0290dcd7ae2cae92589f305a1": { + **DEFAULT_SPECS, + "install": ["python setup.py install"], + } +} +SPECS_REPO_DEEPDIFF = { + "ed2520229d0369813f6e54cdf9c7e68e8073ef62": { + **DEFAULT_SPECS, + "install": [ + "pip install -r requirements-dev.txt", + "pip install -e .", + ], + } +} +SPECS_REPO_DJANGO_MONEY = { + "835c1ab867d11137b964b94936692bea67a038ec": { + **DEFAULT_SPECS, + "install": ["pip install -e .[test,exchange]"], + } +} +SPECS_REPO_DOMINATE = {"9082227e93f5a370012bb934286caf7385d3e7ac": DEFAULT_SPECS} +SPECS_REPO_DOTENV = {"2b8635b79f1aa15cade0950117d4e7d12c298766": DEFAULT_SPECS} +SPECS_REPO_DRF_NESTED_ROUTERS = { + "6144169d5c33a1c5134b2fedac1d6cfa312c174e": { + **DEFAULT_SPECS, + "install": [ + "pip install -r requirements.txt", + "pip install -e .", + ], + } +} +SPECS_REPO_ENVIRONS = { + "73c372df71002312615ad0349ae11274bb3edc69": { + **DEFAULT_SPECS, + "install": ["pip install -e .[dev]"], + } +} +SPECS_REPO_EXCEPTIONGROUP = {"0b4f49378b585a338ae10abd72ec2006c5057d7b": DEFAULT_SPECS} +SPECS_REPO_FAKER = {"8b401a7d68f5fda1276f36a8fc502ef32050ed72": DEFAULT_SPECS} +SPECS_REPO_FEEDPARSER = {"cad965a3f52c4b077221a2142fb14ef7f68cd576": DEFAULT_SPECS} +SPECS_REPO_FLAKE8 = {"cf1542cefa3e766670b2066dd75c4571d682a649": DEFAULT_SPECS} +SPECS_REPO_FLASHTEXT = {"b316c7e9e54b6b4d078462b302a83db85f884a94": DEFAULT_SPECS} +SPECS_REPO_FLASK = {"bc098406af9537aacc436cb2ea777fbc9ff4c5aa": DEFAULT_SPECS} +SPECS_REPO_FREEZEGUN = {"5f171db0aaa02c4ade003bbc8885e0bb19efbc81": DEFAULT_SPECS} +SPECS_REPO_FUNCY = {"207a7810c216c7408596d463d3f429686e83b871": DEFAULT_SPECS} +SPECS_REPO_FURL = {"da386f68b8d077086c25adfd205a4c3d502c3012": DEFAULT_SPECS} +SPECS_REPO_FVCORE = { + "a491d5b9a06746f387aca2f1f9c7c7f28e20bef9": { + **DEFAULT_SPECS, + "install": [ + "pip install torch shapely", + "rm tests/test_focal_loss.py", + "pip install -e .", + ], + } +} +SPECS_REPO_GLOM = {"fb3c4e76f28816aebfd2538980e617742e98a7c2": DEFAULT_SPECS} +SPECS_REPO_GPXPY = { + "09fc46b3cad16b5bf49edf8e7ae873794a959620": { + **DEFAULT_SPECS, + KEY_TEST_CMD: "pytest test.py --verbose --color=no --tb=no --disable-warnings -rA -p no:snail", + } +} +SPECS_REPO_GRAFANALIB = {"5c3b17edaa437f0bc09b5f1b9275dc8fb91689fb": DEFAULT_SPECS} +SPECS_REPO_GRAPHENE = {"82903263080b3b7f22c2ad84319584d7a3b1a1f6": DEFAULT_SPECS} +SPECS_REPO_GSPREAD = {"a8be3b96f9276779ab680d84a0982282fb184000": DEFAULT_SPECS} +SPECS_REPO_GTTS = {"dbcda4f396074427172d4a1f798a172686ace6e0": DEFAULT_SPECS} +SPECS_REPO_GUNICORN = {"bacbf8aa5152b94e44aa5d2a94aeaf0318a85248": DEFAULT_SPECS} +SPECS_REPO_H11 = {"bed0dd4ae9774b962b19833941bb9ec4dc403da9": DEFAULT_SPECS} +SPECS_REPO_ICECREAM = {"f76fef56b66b59fd9a89502c60a99fbe28ee36bd": DEFAULT_SPECS} +SPECS_REPO_INFLECT = {"c079a96a573ece60b54bd5210bb0f414beb74dcd": DEFAULT_SPECS} +SPECS_REPO_INICONFIG = {"16793eaddac67de0b8d621ae4e42e05b927e8d67": DEFAULT_SPECS} +SPECS_REPO_ISODATE = {"17cb25eb7bc3556a68f3f7b241313e9bb8b23760": DEFAULT_SPECS} +SPECS_REPO_JAX = { + "ebd90e06fa7caad087e2342431e3899cfd2fdf98": { + **DEFAULT_SPECS, + "install": ['pip install -e ".[cpu]"'], + KEY_TEST_CMD: f"{TEST_PYTEST} -n auto", + KEY_MIN_TESTING: True, + KEY_MIN_PREGOLD: True, + } +} +SPECS_REPO_JINJA = {"ada0a9a6fc265128b46949b5144d2eaa55e6df2c": DEFAULT_SPECS} +SPECS_REPO_JSONSCHEMA = {"93e0caa5752947ec77333da81a634afe41a022ed": DEFAULT_SPECS} +SPECS_REPO_LANGDETECT = {"a1598f1afcbfe9a758cfd06bd688fbc5780177b2": DEFAULT_SPECS} +SPECS_REPO_LINE_PROFILER = {"a646bf0f9ab3d15264a1be14d0d4ee6894966f6a": DEFAULT_SPECS} +SPECS_REPO_MARKDOWNIFY = {"6258f5c38b97ab443b4ddf03e6676ce29b392d06": DEFAULT_SPECS} +SPECS_REPO_MARKUPSAFE = {"620c06c919c1bd7bb1ce3dbee402e1c0c56e7ac3": DEFAULT_SPECS} +SPECS_REPO_MARSHMALLOW = {"9716fc629976c9d3ce30cd15d270d9ac235eb725": DEFAULT_SPECS} +SPECS_REPO_MIDO = { + "a0158ff95a08f9a4eef628a2e7c793fd3a466640": { + **DEFAULT_SPECS, + KEY_TEST_CMD: f"{TEST_PYTEST} -rs -c /dev/null", + } +} +SPECS_REPO_MISTUNE = {"bf54ef67390e02a5cdee7495d4386d7770c1902b": DEFAULT_SPECS} +SPECS_REPO_NIKOLA = { + "0f4c230e5159e4e937463eb8d6d2ddfcbb09def2": { + **DEFAULT_SPECS, + "install": ["pip install -e '.[extras,tests]'"], + } +} +SPECS_REPO_OAUTHLIB = {"1fd5253630c03e3f12719dd8c13d43111f66a8d2": DEFAULT_SPECS} +SPECS_REPO_PARAMIKO = { + "23f92003898b060df0e2b8b1d889455264e63a3e": { + **DEFAULT_SPECS, + KEY_TEST_CMD: "pytest -rA --color=no --disable-warnings -p no:snail", + } +} +SPECS_REPO_PARSE = {"30da9e4f37fdd979487c9fe2673df35b6b204c72": DEFAULT_SPECS} +SPECS_REPO_PARSIMONIOUS = {"0d3f5f93c98ae55707f0958366900275d1ce094f": DEFAULT_SPECS} +SPECS_REPO_PARSO = { + "338a57602740ad0645b2881e8c105ffdc959e90d": { + **DEFAULT_SPECS, + "install": ["python setup.py install"], + } +} +SPECS_REPO_PATSY = { + "a5d1648401b0ea0649b077f4b98da27db947d2d0": { + **DEFAULT_SPECS, + "install": ["pip install -e .[test]"], + } +} +SPECS_REPO_PDFMINER = {"1a8bd2f730295b31d6165e4d95fcb5a03793c978": DEFAULT_SPECS} +SPECS_REPO_PDFPLUMBER = { + "02ff4313f846380fefccec9c73fb4c8d8a80d0ee": { + **DEFAULT_SPECS, + "install": [ + "apt-get update && apt-get install ghostscript -y", + "pip install -e .", + ], + } +} +SPECS_REPO_PIPDEPTREE = { + "c31b641817f8235df97adf178ffd8e4426585f7a": { + **DEFAULT_SPECS, + "install": [ + "apt-get update && apt-get install graphviz -y", + "pip install -e .[test,graphviz]", + ], + } +} +SPECS_REPO_PRETTYTABLE = {"ca90b055f20a6e8a06dcc46c2e3afe8ff1e8d0f1": DEFAULT_SPECS} +SPECS_REPO_PTYPROCESS = {"1067dbdaf5cc3ab4786ae355aba7b9512a798734": DEFAULT_SPECS} +SPECS_REPO_PYASN1 = {"0f07d7242a78ab4d129b26256d7474f7168cf536": DEFAULT_SPECS} +SPECS_REPO_PYDICOM = { + "7d361b3d764dbbb1f8ad7af015e80ce96f6bf286": {**DEFAULT_SPECS, "python": "3.11"} +} +SPECS_REPO_PYFIGLET = {"f8c5f35be70a4bbf93ac032334311b326bc61688": DEFAULT_SPECS} +SPECS_REPO_PYGMENTS = {"27649ebbf5a2519725036b48ec99ef7745f100af": DEFAULT_SPECS} +SPECS_REPO_PYOPENSSL = {"04766a496eb11f69f6226a5a0dfca4db90a5cbd1": DEFAULT_SPECS} +SPECS_REPO_PYPARSING = {"533adf471f85b570006871e60a2e585fcda5b085": DEFAULT_SPECS} +SPECS_REPO_PYPIKA = {"1c9646f0a019a167c32b649b6f5e6423c5ba2c9b": DEFAULT_SPECS} +SPECS_REPO_PYQUERY = {"811cd048ffbe4e69fdc512863671131f98d691fb": DEFAULT_SPECS} +SPECS_REPO_PYSNOOPER = {"57472b4677b6c041647950f28f2d5750c38326c6": DEFAULT_SPECS} +SPECS_REPO_PYTHON_DOCX = {"0cf6d71fb47ede07ecd5de2a8655f9f46c5f083d": DEFAULT_SPECS} +SPECS_REPO_PYTHON_JSON_LOGGER = { + "5f85723f4693c7289724fdcda84cfc0b62da74d4": DEFAULT_SPECS +} +SPECS_REPO_PYTHON_PINYIN = {"e42dede51abbc40e225da9a8ec8e5bd0043eed21": DEFAULT_SPECS} +SPECS_REPO_PYTHON_PPTX = {"278b47b1dedd5b46ee84c286e77cdfb0bf4594be": DEFAULT_SPECS} +SPECS_REPO_PYTHON_QRCODE = {"456b01d41f16e0cfb0f70c687848e276b78c3e8a": DEFAULT_SPECS} +SPECS_REPO_PYTHON_READABILITY = { + "40256f40389c1f97be5e83d7838547581653c6aa": DEFAULT_SPECS +} +SPECS_REPO_PYTHON_SLUGIFY = { + "872b37509399a7f02e53f46ad9881f63f66d334b": { + **DEFAULT_SPECS, + KEY_TEST_CMD: "python test.py --verbose", + } +} +SPECS_REPO_PYVISTA = { + "3f0fad3f42d9b491679e6aa50e52d93c1a81c042": { + **DEFAULT_SPECS, + "install": [ + "apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libxrender1", + "python -m pip install -e '.[dev]'", + ], + } +} +SPECS_REPO_RADON = {"54b88e5878b2724bf4d77f97349588b811abdff2": DEFAULT_SPECS} +SPECS_REPO_RECORDS = {"5941ab2798cb91455b6424a9564c9cd680475fbe": DEFAULT_SPECS} +SPECS_REPO_RED_DISCORDBOT = {"33e0eac741955ce5b7e89d9b8f2f2712727af770": DEFAULT_SPECS} +SPECS_REPO_RESULT = {"0b855e1e38a08d6f0a4b0138b10c127c01e54ab4": DEFAULT_SPECS} +SPECS_REPO_SAFETY = {"7654596be933f8310b294dbc85a7af6066d06e4f": DEFAULT_SPECS} +SPECS_REPO_SCRAPY = { + "35212ec5b05a3af14c9f87a6193ab24e33d62f9f": { + **DEFAULT_SPECS, + "install": [ + "apt-get update && apt-get install -y libxml2-dev libxslt-dev libjpeg-dev", + "python -m pip install -e .", + "rm tests/test_feedexport.py", + "rm tests/test_pipeline_files.py", + ], + KEY_MIN_TESTING: True, + } +} +SPECS_REPO_SCHEDULE = {"82a43db1b938d8fdf60103bd41f329e06c8d3651": DEFAULT_SPECS} +SPECS_REPO_SCHEMA = {"24a3045773eac497c659f24b32f24a281be9f286": DEFAULT_SPECS} +SPECS_REPO_SOUPSIEVE = {"a8080d97a0355e316981cb0c5c887a861c4244e3": DEFAULT_SPECS} +SPECS_REPO_SPACY = { + "b3c46c315eb16ce644bddd106d31c3dd349f6bb2": { + **DEFAULT_SPECS, + "install": [ + "pip install -r requirements.txt", + "pip install -e .", + ], + KEY_MIN_TESTING: True, + } +} +SPECS_REPO_SQLFLUFF = { + "50a1c4b6ff171188b6b70b39afe82a707b4919ac": {**DEFAULT_SPECS, KEY_MIN_TESTING: True} +} +SPECS_REPO_SQLGLOT = { + "036601ba9cbe4d175d6a9d38bc27587eab858968": { + **DEFAULT_SPECS, + "install": ['pip install -e ".[dev]"'], + KEY_MIN_TESTING: True, + } +} +SPECS_REPO_SQLPARSE = {"e57923b3aa823c524c807953cecc48cf6eec2cb2": DEFAULT_SPECS} +SPECS_REPO_STACKPRINTER = {"219fcc522fa5fd6e440703358f6eb408f3ffc007": DEFAULT_SPECS} +SPECS_REPO_STARLETTE = {"db5063c26030e019f7ee62aef9a1b564eca9f1d6": DEFAULT_SPECS} +SPECS_REPO_STRING_SIM = {"115acaacf926b41a15664bd34e763d074682bda3": DEFAULT_SPECS} +SPECS_REPO_SUNPY = { + "f8edfd5c4be873fbd28dec4583e7f737a045f546": { + **DEFAULT_SPECS, + "python": "3.11", + "install": ['pip install -e ".[dev]"'], + KEY_MIN_TESTING: True, + } +} +SPECS_REPO_SYMPY = { + "2ab64612efb287f09822419f4127878a4b664f71": { + **DEFAULT_SPECS, + "python": "3.10", + "install": ["pip install -e ."], + KEY_MIN_TESTING: True, + KEY_MIN_PREGOLD: True, + } +} +SPECS_REPO_TENACITY = {"0d40e76f7d06d631fb127e1ec58c8bd776e70d49": DEFAULT_SPECS} +SPECS_REPO_TERMCOLOR = {"3a42086feb35647bc5aa5f1065b0327200da6b9b": DEFAULT_SPECS} +SPECS_REPO_TEXTDISTANCE = { + "c3aca916bd756a8cb71114688b469ec90ef5b232": { + **DEFAULT_SPECS, + "install": ['pip install -e ".[benchmark,test]"'], + } +} +SPECS_REPO_TEXTFSM = {"c31b600743895f018e7583f93405a3738a9f4d55": DEFAULT_SPECS} +SPECS_REPO_THEFUZZ = {"8a05a3ee38cbd00a2d2f4bb31db34693b37a1fdd": DEFAULT_SPECS} +SPECS_REPO_TINYDB = {"10644a0e07ad180c5b756aba272ee6b0dbd12df8": DEFAULT_SPECS} +SPECS_REPO_TLDEXTRACT = { + "3d1bf184d4f20fbdbadd6274560ccd438939160e": { + **DEFAULT_SPECS, + "install": ["pip install -e .[testing]"], + } +} +SPECS_REPO_TOMLI = {"443a0c1bc5da39b7ed84306912ee1900e6b72e2f": DEFAULT_SPECS} +SPECS_REPO_TORNADO = { + "d5ac65c1f1453c2aeddd089d8e68c159645c13e1": { + **DEFAULT_SPECS, + KEY_TEST_CMD: "python -m tornado.test --verbose", + } +} +SPECS_REPO_TRIO = {"cfbbe2c1f96e93b19bc2577d2cab3f4fe2e81153": DEFAULT_SPECS} +SPECS_REPO_TWEEPY = { + "91a41c6e1c955d278c370d51d5cf43b05f7cd979": { + **DEFAULT_SPECS, + "install": ["pip install -e '.[dev,test,async]'"], + } +} +SPECS_REPO_TYPEGUARD = { + "b6a7e4387c30a9f7d635712157c889eb073c1ea3": { + **DEFAULT_SPECS, + "install": ["pip install -e .[test,doc]"], + } +} +SPECS_REPO_USADDRESS = { + "a42a8f0c14bd2e273939fd51c604f10826301e73": { + **DEFAULT_SPECS, + "install": ["pip install -e .[dev]"], + } +} +SPECS_REPO_VOLUPTUOUS = {"a7a55f83b9fa7ba68b0669b3d78a61de703e0a16": DEFAULT_SPECS} +SPECS_REPO_WEBARGS = {"dbde72fe5db8a999acd1716d5ef855ab7cc1a274": DEFAULT_SPECS} +SPECS_REPO_WORDCLOUD = {"ec24191c64570d287032c5a4179c38237cd94043": DEFAULT_SPECS} +SPECS_REPO_XMLTODICT = {"0952f382c2340bc8b86a5503ba765a35a49cf7c4": DEFAULT_SPECS} +SPECS_REPO_YAMLLINT = {"8513d9b97da3b32453b3fccb221f4ab134a028d7": DEFAULT_SPECS} + +### MARK: SWE-gym Repositories +SPECS_REPO_MOTO = { + "694ce1f4880c784fed0553bc19b2ace6691bc109": { + **DEFAULT_SPECS, + "python": "3.12", + "install": ["make init"], + KEY_MIN_TESTING: True, + } +} +SPECS_REPO_MYPY = { + "e93f06ceab81d8ff1f777c7587d04c339cfd5a16": { + "python": "3.12", + "install": [ + "git submodule update --init mypy/typeshed || true", + "python -m pip install -r test-requirements.txt", + "python -m pip install -e .", + "hash -r", + ], + KEY_TEST_CMD: "pytest --color=no -rA -k -p no:snail", + KEY_MIN_TESTING: True, + } +} +SPECS_REPO_MONAI = { + "a09c1f08461cec3d2131fde3939ef38c3c4ad5fc": { + "python": "3.12", + "install": [ + "sed -i '/^git+https:\/\/github.com\/Project-MONAI\//d' requirements-dev.txt", + "python -m pip install -U -r requirements-dev.txt", + "python -m pip install -e .", + ], + KEY_TEST_CMD: TEST_PYTEST, + KEY_MIN_PREGOLD: True, + KEY_MIN_TESTING: True, + } +} +SPECS_REPO_DVC = { + "1d6ea68133289ceab2637ce7095772678af792c6": { + **DEFAULT_SPECS, + "install": ['pip install -e ".[dev]"'], + KEY_MIN_TESTING: True, + } +} +SPECS_REPO_HYDRA = { + "0f03eb60c2ecd1fbdb25ede9a2c4faeac81de491": { + **DEFAULT_SPECS, + "install": [ + "apt-get update && apt-get install -y openjdk-17-jdk openjdk-17-jre", + "pip install -e .", + ], + KEY_MIN_TESTING: True, + } +} +SPECS_REPO_DASK = { + "5f61e42324c3a6cd4da17b5d5ebe4663aa4b8783": { + **DEFAULT_SPECS, + "install": ["python -m pip install graphviz", "python -m pip install -e ."], + KEY_MIN_TESTING: True, + } +} +SPECS_REPO_MODIN = { + "8c7799fdbbc2fb0543224160dd928215852b7757": { + **DEFAULT_SPECS, + "install": ['pip install -e ".[all]"'], + KEY_MIN_PREGOLD: True, + KEY_MIN_TESTING: True, + } +} +SPECS_REPO_PYDANTIC = { + "acb0f10fda1c78441e052c57b4288bc91431f852": { + "python": "3.10", + "install": [ + "apt-get update && apt-get install -y locales pipx", + "pipx install uv", + "pipx install pre-commit", + 'export PATH="$HOME/.local/bin:$PATH"', + "make install", + ], + KEY_TEST_CMD: f"/root/.local/bin/uv run {TEST_PYTEST}", + } +} +SPECS_REPO_CONAN = { + "86f29e137a10bb6ed140c1a8c05c3099987b13c5": { + **DEFAULT_SPECS, + "install": INSTALL_CMAKE + + INSTALL_BAZEL + + [ + "apt-get -y update && apt-get -y upgrade && apt-get install -y build-essential cmake automake autoconf pkg-config meson ninja-build", + "python -m pip install -r conans/requirements.txt", + "python -m pip install -r conans/requirements_server.txt", + "python -m pip install -r conans/requirements_dev.txt", + "python -m pip install -e .", + ], + KEY_MIN_TESTING: True, + } +} +SPECS_REPO_PANDAS = { + "95280573e15be59036f98d82a8792599c10c6603": { + **DEFAULT_SPECS, + "install": [ + "git remote add upstream https://github.com/pandas-dev/pandas.git", + "git fetch upstream --tags", + "python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true", + """sed -i 's/__version__="[^"]*"/__version__="3.0.0.dev0+1992.g95280573e1"/' build/cp310/_version_meson.py""", + ], + KEY_MIN_PREGOLD: True, + KEY_MIN_TESTING: True, + } +} +SPECS_REPO_MONKEYTYPE = { + "70c3acf62950be5dfb28743c7a719bfdecebcd84": DEFAULT_SPECS, +} + + +MAP_REPO_TO_SPECS = { + "adrienverge/yamllint": SPECS_REPO_YAMLLINT, + "agronholm/exceptiongroup": SPECS_REPO_EXCEPTIONGROUP, + "agronholm/typeguard": SPECS_REPO_TYPEGUARD, + "aio-libs/async-timeout": SPECS_REPO_ASYNC_TIMEOUT, + "alanjds/drf-nested-routers": SPECS_REPO_DRF_NESTED_ROUTERS, + "alecthomas/voluptuous": SPECS_REPO_VOLUPTUOUS, + "amueller/word_cloud": SPECS_REPO_WORDCLOUD, + "andialbrecht/sqlparse": SPECS_REPO_SQLPARSE, + "arrow-py/arrow": SPECS_REPO_ARROW, + "benoitc/gunicorn": SPECS_REPO_GUNICORN, + "borntyping/python-colorlog": SPECS_REPO_COLORLOG, + "bottlepy/bottle": SPECS_REPO_BOTTLE, + "buriy/python-readability": SPECS_REPO_PYTHON_READABILITY, + "burnash/gspread": SPECS_REPO_GSPREAD, + "cantools/cantools": SPECS_REPO_CANTOOLS, + "cdgriffith/Box": SPECS_REPO_BOX, + "chardet/chardet": SPECS_REPO_CHARDET, + "cknd/stackprinter": SPECS_REPO_STACKPRINTER, + "cloudpipe/cloudpickle": SPECS_REPO_CLOUDPICKLE, + "Cog-Creators/Red-DiscordBot": SPECS_REPO_RED_DISCORDBOT, + "conan-io/conan": SPECS_REPO_CONAN, + "cookiecutter/cookiecutter": SPECS_REPO_COOKIECUTTER, + "cool-RR/PySnooper": SPECS_REPO_PYSNOOPER, + "dask/dask": SPECS_REPO_DASK, + "datamade/usaddress": SPECS_REPO_USADDRESS, + "davidhalter/parso": SPECS_REPO_PARSO, + "dbader/schedule": SPECS_REPO_SCHEDULE, + "django-money/django-money": SPECS_REPO_DJANGO_MONEY, + "django/channels": SPECS_REPO_CHANNELS, + "django/daphne": SPECS_REPO_DAPHNE, + "encode/starlette": SPECS_REPO_STARLETTE, + "erikrose/parsimonious": SPECS_REPO_PARSIMONIOUS, + "facebookresearch/fvcore": SPECS_REPO_FVCORE, + "facebookresearch/hydra": SPECS_REPO_HYDRA, + "facelessuser/soupsieve": SPECS_REPO_SOUPSIEVE, + "gawel/pyquery": SPECS_REPO_PYQUERY, + "getmoto/moto": SPECS_REPO_MOTO, + "getnikola/nikola": SPECS_REPO_NIKOLA, + "google/textfsm": SPECS_REPO_TEXTFSM, + "graphql-python/graphene": SPECS_REPO_GRAPHENE, + "gruns/furl": SPECS_REPO_FURL, + "gruns/icecream": SPECS_REPO_ICECREAM, + "gweis/isodate": SPECS_REPO_ISODATE, + "HIPS/autograd": SPECS_REPO_AUTOGRAD, + "hukkin/tomli": SPECS_REPO_TOMLI, + "Instagram/MonkeyType": SPECS_REPO_MONKEYTYPE, + "iterative/dvc": SPECS_REPO_DVC, + "jaraco/inflect": SPECS_REPO_INFLECT, + "jawah/charset_normalizer": SPECS_REPO_CHARDET_NORMALIZER, + "jax-ml/jax": SPECS_REPO_JAX, + "jd/tenacity": SPECS_REPO_TENACITY, + "john-kurkowski/tldextract": SPECS_REPO_TLDEXTRACT, + "joke2k/faker": SPECS_REPO_FAKER, + "jsvine/pdfplumber": SPECS_REPO_PDFPLUMBER, + "kayak/pypika": SPECS_REPO_PYPIKA, + "keleshev/schema": SPECS_REPO_SCHEMA, + "kennethreitz/records": SPECS_REPO_RECORDS, + "Knio/dominate": SPECS_REPO_DOMINATE, + "kurtmckee/feedparser": SPECS_REPO_FEEDPARSER, + "lepture/mistune": SPECS_REPO_MISTUNE, + "life4/textdistance": SPECS_REPO_TEXTDISTANCE, + "lincolnloop/python-qrcode": SPECS_REPO_PYTHON_QRCODE, + "luozhouyang/python-string-similarity": SPECS_REPO_STRING_SIM, + "madzak/python-json-logger": SPECS_REPO_PYTHON_JSON_LOGGER, + "mahmoud/boltons": SPECS_REPO_BOLTONS, + "mahmoud/glom": SPECS_REPO_GLOM, + "marshmallow-code/apispec": SPECS_REPO_APISPEC, + "marshmallow-code/marshmallow": SPECS_REPO_MARSHMALLOW, + "marshmallow-code/webargs": SPECS_REPO_WEBARGS, + "martinblech/xmltodict": SPECS_REPO_XMLTODICT, + "matthewwithanm/python-markdownify": SPECS_REPO_MARKDOWNIFY, + "mewwts/addict": SPECS_REPO_ADDICT, + "mido/mido": SPECS_REPO_MIDO, + "Mimino666/langdetect": SPECS_REPO_LANGDETECT, + "modin-project/modin": SPECS_REPO_MODIN, + "mozilla/bleach": SPECS_REPO_BLEACH, + "mozillazg/python-pinyin": SPECS_REPO_PYTHON_PINYIN, + "msiemens/tinydb": SPECS_REPO_TINYDB, + "oauthlib/oauthlib": SPECS_REPO_OAUTHLIB, + "pallets/click": SPECS_REPO_CLICK, + "pallets/flask": SPECS_REPO_FLASK, + "pallets/jinja": SPECS_REPO_JINJA, + "pallets/markupsafe": SPECS_REPO_MARKUPSAFE, + "pandas-dev/pandas": SPECS_REPO_PANDAS, + "paramiko/paramiko": SPECS_REPO_PARAMIKO, + "pdfminer/pdfminer.six": SPECS_REPO_PDFMINER, + "pexpect/ptyprocess": SPECS_REPO_PTYPROCESS, + "pndurette/gTTS": SPECS_REPO_GTTS, + "prettytable/prettytable": SPECS_REPO_PRETTYTABLE, + "Project-MONAI/MONAI": SPECS_REPO_MONAI, + "pudo/dataset": SPECS_REPO_DATASET, + "pwaller/pyfiglet": SPECS_REPO_PYFIGLET, + "pyasn1/pyasn1": SPECS_REPO_PYASN1, + "pyca/pyopenssl": SPECS_REPO_PYOPENSSL, + "PyCQA/flake8": SPECS_REPO_FLAKE8, + "pydantic/pydantic": SPECS_REPO_PYDANTIC, + "pydata/patsy": SPECS_REPO_PATSY, + "pydicom/pydicom": SPECS_REPO_PYDICOM, + "pygments/pygments": SPECS_REPO_PYGMENTS, + "pylint-dev/astroid": SPECS_REPO_ASTROID, + "pyparsing/pyparsing": SPECS_REPO_PYPARSING, + "pytest-dev/iniconfig": SPECS_REPO_INICONFIG, + "python-hyper/h11": SPECS_REPO_H11, + "python-jsonschema/jsonschema": SPECS_REPO_JSONSCHEMA, + "python-openxml/python-docx": SPECS_REPO_PYTHON_DOCX, + "python-trio/trio": SPECS_REPO_TRIO, + "python/mypy": SPECS_REPO_MYPY, + "pyupio/safety": SPECS_REPO_SAFETY, + "pyutils/line_profiler": SPECS_REPO_LINE_PROFILER, + "pyvista/pyvista": SPECS_REPO_PYVISTA, + "r1chardj0n3s/parse": SPECS_REPO_PARSE, + "rsalmei/alive-progress": SPECS_REPO_ALIVE_PROGRESS, + "rubik/radon": SPECS_REPO_RADON, + "rustedpy/result": SPECS_REPO_RESULT, + "scanny/python-pptx": SPECS_REPO_PYTHON_PPTX, + "scrapy/scrapy": SPECS_REPO_SCRAPY, + "seatgeek/thefuzz": SPECS_REPO_THEFUZZ, + "seperman/deepdiff": SPECS_REPO_DEEPDIFF, + "sloria/environs": SPECS_REPO_ENVIRONS, + "spulec/freezegun": SPECS_REPO_FREEZEGUN, + "sqlfluff/sqlfluff": SPECS_REPO_SQLFLUFF, + "sunpy/sunpy": SPECS_REPO_SUNPY, + "Suor/funcy": SPECS_REPO_FUNCY, + "sympy/sympy": SPECS_REPO_SYMPY, + "termcolor/termcolor": SPECS_REPO_TERMCOLOR, + "theskumar/python-dotenv": SPECS_REPO_DOTENV, + "tkrajina/gpxpy": SPECS_REPO_GPXPY, + "tobymao/sqlglot": SPECS_REPO_SQLGLOT, + "tornadoweb/tornado": SPECS_REPO_TORNADO, + "tox-dev/pipdeptree": SPECS_REPO_PIPDEPTREE, + "tweepy/tweepy": SPECS_REPO_TWEEPY, + "un33k/python-slugify": SPECS_REPO_PYTHON_SLUGIFY, + "vi3k6i5/flashtext": SPECS_REPO_FLASHTEXT, + "weaveworks/grafanalib": SPECS_REPO_GRAFANALIB, +} \ No newline at end of file diff --git a/debug_gym/gym/envs/swe_smith_utils.py b/debug_gym/gym/envs/swe_smith_utils.py new file mode 100755 index 00000000..65c085f8 --- /dev/null +++ b/debug_gym/gym/envs/swe_smith_utils.py @@ -0,0 +1,190 @@ +""" +Pulled from official SWE-Smith repository. +""" +import os +import re +from pathlib import Path +from unidiff import PatchSet + +from .swe_smith_constants import ( + KEY_IMAGE_NAME, + KEY_MIN_TESTING, + KEY_PATCH, + KEY_TEST_CMD, + MAP_REPO_TO_SPECS, +) + + +FAIL_TO_PASS = "FAIL_TO_PASS" +PASS_TO_PASS = "PASS_TO_PASS" +INSTANCE_REF = "instance_ref" + +def get_repo_name(repo, commit) -> str: + """ + Get the SWE-smith GitHub repository name for a repository at a specific commit. + """ + return f"{repo.replace('/', '__')}.{commit[:8]}" + +def get_test_paths(dir_path: str, ext: str = ".py") -> list[Path]: + """ + Get all testing file paths relative to the given directory. + """ + return [ + Path(os.path.relpath(os.path.join(root, file), dir_path)) + for root, _, files in os.walk(Path(dir_path).resolve()) + for file in files + if ( + ( + any([x in root.split("/") for x in ["tests", "test", "specs"]]) + or file.lower().startswith("test") + or file.rsplit(".", 1)[0].endswith("test") + ) + and (ext is None or file.endswith(ext)) + ) + ] + + +def get_full_commit(repo, partial_commit) -> str: + """ + Get the full commit hash for a repository at a specific commit. + """ + for commit in MAP_REPO_TO_SPECS[repo]: + if commit.startswith(partial_commit): + return commit + + raise ValueError(f"Commit {partial_commit} not found for repository {repo}.") + +def get_repo_commit_from_image_name(image_name: str) -> tuple[str, str]: + """ + Get the repository and commit from a docker image ID. + """ + # Parsing supports repos with '.' in their name + image_name = image_name.split(".", 2)[-1] + repo = image_name.rsplit(".", 1)[0].replace("__", "/") + partial_commit = image_name.rsplit(".", 1)[-1] + for repo_name in MAP_REPO_TO_SPECS: + # Hack because docker image_name must be lowercase + if repo_name.lower() == repo: + repo = repo_name + break + commit = get_full_commit(repo, partial_commit) + return repo, commit + + +def get_test_command_mypy(instance: dict): + repo, commit = get_repo_commit_from_image_name(instance[KEY_IMAGE_NAME]) + pattern = r"\[case ([^\]]+)\]" + if FAIL_TO_PASS in instance: + fail_to_pass_files = [x.rsplit("::", 1)[-1] for x in instance[FAIL_TO_PASS]] + if PASS_TO_PASS in instance: + pass_to_pass_files = [x.rsplit("::", 1)[-1] for x in instance[PASS_TO_PASS]] + all_files = list(set(fail_to_pass_files + pass_to_pass_files)) + else: + all_files = list(set(fail_to_pass_files)) + test_keys = " or ".join(all_files) + elif INSTANCE_REF in instance and "test_patch" in instance[INSTANCE_REF]: + test_keys = " or ".join( + re.findall(pattern, instance[INSTANCE_REF]["test_patch"]) + ) + return f'{MAP_REPO_TO_SPECS[repo][commit][KEY_TEST_CMD]} "{test_keys}"' + +MAP_REPO_TO_TEST_CMD = { + "python/mypy": get_test_command_mypy, +} + +def get_test_command(instance: dict): + """ + Given a repo/commit pair and a (gold) patch, return the test command to run + """ + repo, commit = get_repo_commit_from_image_name(instance[KEY_IMAGE_NAME]) + specs = MAP_REPO_TO_SPECS[repo][commit] + test_command = specs[KEY_TEST_CMD] + + if FAIL_TO_PASS in instance and "pytest" in specs[KEY_TEST_CMD]: + # NOTE: Using F2P key as indicator that this is eval instance, not validation + if repo in MAP_REPO_TO_TEST_CMD: + return MAP_REPO_TO_TEST_CMD[repo](instance), [] + f2p_files = list(set([x.split("::", 1)[0] for x in instance[FAIL_TO_PASS]])) + p2p_files = [] + if PASS_TO_PASS in instance: + p2p_files = list(set([x.split("::", 1)[0] for x in instance[PASS_TO_PASS]])) + all_files = list(set(f2p_files + p2p_files)) + test_command += f" {' '.join(all_files)}" + return test_command, all_files + + if KEY_MIN_TESTING not in specs or KEY_PATCH not in instance: + # If min testing is not enabled or there's no patch + # return test command as is (usually = run whole test suite) + return test_command, [] + + # Get all testing related file paths in the repo + test_paths = get_test_paths(get_repo_name(repo, commit)) + + if ( + INSTANCE_REF in instance + and len(instance[INSTANCE_REF]["test_patch"].strip()) > 0 + ): + test_patch = instance[INSTANCE_REF]["test_patch"] + # For PR Mirroring (SWE-bench style) instances, + # if test patch is available, use that information + if repo in MAP_REPO_TO_TEST_CMD: + return MAP_REPO_TO_TEST_CMD[repo](instance), [] + rv = [] + for x in PatchSet(test_patch): + for test_path in test_paths: + if str(test_path).endswith(x.path) or str(test_path).endswith( + Path(x.path).name + ): + rv.append(str(test_path)) + if len(rv) > 0: + test_command += f" {' '.join(rv)}" + return test_command, rv + + # Identify relevant test files based on the patch + patch_paths = [Path(f.path) for f in PatchSet(instance[KEY_PATCH])] + rv = [] + for patch_path in patch_paths: + file_name = patch_path.name.strip(".py") + parent_dir = patch_path.parent.name + for test_path in test_paths: + # Check for common test file naming conventions first + # If found, add to list and break + common_test_names = [ + f"test_{file_name}.py", + f"test{file_name}.py", + f"{file_name}_test.py", + f"{file_name}test.py", + ] + if any( + [ + str(test_path).endswith(f"{parent_dir}/{name}") + or str(test_path).endswith(name) + for name in common_test_names + ] + ): + rv.append(str(test_path)) + break + else: + for test_path in test_paths: + if parent_dir == test_path.parent.name: + # If similar testing folder found, add to list and break + rv.append(str(test_path.parent)) + break + elif any( + [ + x.format(parent_dir) == test_path.name + for x in ["test_{}.py", "test{}.py", "{}_test.py", "{}test.py"] + ] + ): + rv.append(str(test_path)) + + if len(rv) > 0: + # Remove duplicates + test_files = [x for x in rv if x.endswith(".py")] + final = [x for x in rv if not x.endswith(".py")] + for test_file in test_files: + if os.path.dirname(test_file) not in final: + final.append(test_file) + test_command += f" {' '.join(set(final))}" + + return test_command, rv \ No newline at end of file diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py index f40ca38a..54f22b0b 100644 --- a/debug_gym/gym/terminals/kubernetes.py +++ b/debug_gym/gym/terminals/kubernetes.py @@ -257,7 +257,9 @@ def __str__(self): class KubernetesTerminal(Terminal): - + """ + Note: reads values of env variables K8S_NAMESPACE, K8S_DOCKER_SECRET, K8S_DOCKER_CONSTRAINT. + """ def __init__( self, working_dir: str | None = None, @@ -268,8 +270,9 @@ def __init__( setup_commands: list[str] | None = None, pod_name: str | None = None, base_image: str | None = None, - registry: str = "", - namespace: str = "default", + image_pull_secret: str | None = None, + registry: str = "docker.io", + namespace: str | None = None, kube_config: str | None = None, kube_context: str | None = None, extra_labels: dict | None = None, @@ -286,7 +289,9 @@ def __init__( self.base_image = base_image self._task_name = base_image self.setup_commands = setup_commands or [] - self.namespace = namespace + self.namespace = namespace or os.environ.get("K8S_NAMESPACE", "default") + self.image_pull_secret = image_pull_secret or os.environ.get("K8S_DOCKER_SECRET") + self.in_node_constraint = os.environ.get("K8S_NODE_CONSTRAINT", False) self.kubernetes_kwargs = kwargs # e.g., nodeSelector, tolerations self.registry = registry.rstrip("/") + "/" if registry else "" self._pod_name = pod_name @@ -498,6 +503,30 @@ def setup_pod(self, max_retries: int = 3) -> None: f"with image: {self.registry}{self.base_image}" ) + # set image pull secrets, don't override imagePullSecrets + if self.image_pull_secret and not "imagePullSecrets" in pod_spec_kwargs: + pod_spec_kwargs["imagePullSecrets"] = [{"name": self.image_pull_secret}] + + # set in node constraint, don't override affinity + if self.in_node_constraint and not "affinity" in pod_spec_kwargs: + pod_spec_kwargs["affinity"] = { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "kubernetes.io/hostname", + "operator": "In", + "values": [os.environ["HOSTNAME"]], + } + ] + } + ] + } + } + } + # Create pod specification for Kubernetes. pod_body = { "apiVersion": "v1", From d9b76c781f2f17749df6838d39520cfc2504ebea Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Thu, 27 Nov 2025 14:50:56 -0800 Subject: [PATCH 13/31] remove swesmith --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e60c649c..afe42474 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,6 @@ transformers==4.51.3 tiktoken==0.9.0 docker==7.1.0 swebench==4.0.3 -swesmith==0.0.4 prompt_toolkit==3.0.51 anthropic==0.51.0 jinja2==3.1.6 From 928c1d83997ccbd7d801e3cf9e8bc46dec876135 Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Fri, 28 Nov 2025 07:48:33 -0800 Subject: [PATCH 14/31] load dataset as class method / setup_task --- debug_gym/gym/envs/aider.py | 29 ++-- debug_gym/gym/envs/env.py | 5 +- debug_gym/gym/envs/mini_nightmare.py | 38 +++-- debug_gym/gym/envs/r2egym.py | 162 +++++++++++----------- debug_gym/gym/envs/swe_bench.py | 107 +++++++------- debug_gym/gym/envs/swe_smith.py | 152 ++++++++++---------- debug_gym/gym/envs/swe_smith_constants.py | 11 +- debug_gym/gym/envs/swe_smith_utils.py | 8 +- debug_gym/gym/terminals/kubernetes.py | 7 +- debug_gym/gym/utils.py | 12 +- 10 files changed, 281 insertions(+), 250 deletions(-) diff --git a/debug_gym/gym/envs/aider.py b/debug_gym/gym/envs/aider.py index 26776448..3056e0dd 100644 --- a/debug_gym/gym/envs/aider.py +++ b/debug_gym/gym/envs/aider.py @@ -2,6 +2,7 @@ import subprocess import tempfile from pathlib import Path +from typing import List import debug_gym.gym.utils as utils from debug_gym.constants import DEBUG_GYM_CACHE_DIR @@ -62,6 +63,7 @@ class AiderBenchmarkEnv(RepoEnv): def __init__( self, + task_data: dict, entrypoint: str = "python -m pytest --tb=no -s .", terminal: Terminal | None = None, **kwargs, @@ -73,6 +75,7 @@ def __init__( if hasattr(terminal, "base_image") and terminal.base_image is None: terminal.base_image = DOCKER_AIDER_IMAGE_NAME + self.task_data = task_data super().__init__(entrypoint=entrypoint, terminal=terminal, **kwargs) @property @@ -91,10 +94,8 @@ def eval(self, **kwargs) -> EvalOutput: self.last_eval = EvalOutput(success, output) return self.last_eval - def setup_task(self, task_name: str, options: dict = None): - if task_name not in self.dataset: - raise ValueError(f"Task {task_name} not found in the dataset.") - self.current_task = self.dataset[task_name] + def setup_task(self, options: dict = None): + pass def setup_workspace(self): self.workspace.reset() @@ -122,14 +123,20 @@ def setup_terminal(self): ) # Aider tasks come with those. self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'") - def load_dataset(self, problems: str | list[str] | None = None): - if isinstance(self.terminal, DockerTerminal): - build_docker_image(self.logger) + @classmethod + def load_dataset( + cls, + problems: str | list[str] | None = None, + build_image: bool = False, + logger: object = None, + ) -> dict: + if build_image: + build_docker_image(logger) - if not os.path.exists(self.REPO_PATH): - subprocess.run(["git", "clone", self.REPO_URL, self.REPO_PATH], check=True) + if not os.path.exists(cls.REPO_PATH): + subprocess.run(["git", "clone", cls.REPO_URL, cls.REPO_PATH], check=True) - practice_path = self.REPO_PATH / "exercises" / "practice" + practice_path = cls.REPO_PATH / "exercises" / "practice" directories = [d for d in practice_path.iterdir() if d.is_dir()] dataset = {} @@ -166,5 +173,5 @@ def load_dataset(self, problems: str | list[str] | None = None): } problems = utils.filter_problems(dataset, problems) - dataset = {id: i for id, i in dataset.items() if id in problems} + dataset = {id: data for id, data in dataset.items() if id in problems} return dataset diff --git a/debug_gym/gym/envs/env.py b/debug_gym/gym/envs/env.py index 48807546..0021dc2b 100644 --- a/debug_gym/gym/envs/env.py +++ b/debug_gym/gym/envs/env.py @@ -292,7 +292,7 @@ def instructions(self) -> str: Override in subclasses for different behavior.""" return "" - def setup_task(self, task_name: str, options: dict = None) -> None: + def setup_task(self, options: dict = None) -> None: """Setup the task information. Override in subclasses for different behavior. Called once at reset.""" pass @@ -325,8 +325,7 @@ def reset(self, *, options: dict = None): self.options = options if options is not None else self.options self.logger.debug("Resetting environment") self.close() # Clean up previous workspace and terminal. - self.task_name = self.options.get("task_name") - self.setup_task(task_name=self.task_name, options=self.options) + self.setup_task(options=self.options) self.setup_workspace() self.setup_terminal() self._reset_env_state() diff --git a/debug_gym/gym/envs/mini_nightmare.py b/debug_gym/gym/envs/mini_nightmare.py index b5cee0a8..32937bfb 100644 --- a/debug_gym/gym/envs/mini_nightmare.py +++ b/debug_gym/gym/envs/mini_nightmare.py @@ -74,6 +74,7 @@ class MiniNightmareEnv(RepoEnv): def __init__( self, + task_data: dict, entrypoint: str = "python -m pytest --tb=no -s test.py", terminal: Terminal | None = None, **kwargs, @@ -85,6 +86,9 @@ def __init__( if hasattr(terminal, "base_image") and terminal.base_image is None: terminal.base_image = DOCKER_MINI_NIGHTMARE_IMAGE_NAME + self.task_data = task_data + self.task_name = task_data["task_name"] + super().__init__(entrypoint=entrypoint, terminal=terminal, **kwargs) @property @@ -107,10 +111,8 @@ def eval(self, **kwargs) -> EvalOutput: self.last_eval = EvalOutput(success, output) return self.last_eval - def setup_task(self, task_name: str, options: dict = None): - if task_name not in self.dataset: - raise ValueError(f"Task {task_name} not found in the dataset.") - self.current_task = self.dataset[task_name] + def setup_task(self, options: dict = None): + pass def setup_workspace(self): self.workspace.reset() @@ -138,19 +140,27 @@ def setup_terminal(self): ) # Mini-nightmare tasks come with those. self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'") - def load_dataset(self, problems: str | list[str] | None = None): - if isinstance(self.terminal, DockerTerminal): - build_docker_image(self.logger) - - if not self.DATA_PATH.exists(): + @classmethod + def load_dataset( + cls, + problems: str | list[str] | None = None, + build_image: bool = False, + logger: object = None, + ) -> dict: + if build_image: + build_docker_image(logger) + + if not MiniNightmareEnv.DATA_PATH.exists(): zipped_data = utils.download( - self.DATA_URL, self.DATA_PATH, f"Downloading mini-nightmare dataset." + MiniNightmareEnv.DATA_URL, + MiniNightmareEnv.DATA_PATH, + f"Downloading mini-nightmare dataset.", ) - utils.unzip(zipped_data, dst=self.DATA_PATH.parent) + utils.unzip(zipped_data, dst=cls.DATA_PATH.parent) dataset = {} - for task_name in self.TASK_NAMES: - task_path = self.DATA_PATH / task_name + for task_name in cls.TASK_NAMES: + task_path = cls.DATA_PATH / task_name assert (task_path / "test.py").exists() assert (task_path / f"{task_name}_code.py").exists() assert (task_path / ".debugignore").exists() @@ -162,5 +172,5 @@ def load_dataset(self, problems: str | list[str] | None = None): } problems = utils.filter_problems(dataset, problems) - dataset = {id: i for id, i in dataset.items() if id in problems} + dataset = {id: data for id, data in dataset.items() if id in problems} return dataset diff --git a/debug_gym/gym/envs/r2egym.py b/debug_gym/gym/envs/r2egym.py index b843d4a4..8e650400 100644 --- a/debug_gym/gym/envs/r2egym.py +++ b/debug_gym/gym/envs/r2egym.py @@ -59,73 +59,6 @@ def parse_log_pytest(log: str | None) -> dict[str, str]: return test_status_map -def load_r2egym_dataset( - dataset_id: str = "R2E-Gym/R2E-Gym-Lite", - dataset_revision: str = "8d3163011f01f9393bb3dc7700497a79a8686ae5", - split: str = "train", - problems: list | None = None, - prepull_images: bool = False, - logger: DebugGymLogger | None = None, -): - data_path = Path(dataset_id) - if data_path.is_file(): - # Loading from local file. - if data_path.suffix.lower() == ".json": - ds = load_dataset("json", data_files=dataset_id) - elif data_path.suffix.lower() == ".parquet": - ds = load_dataset("parquet", data_files=dataset_id) - elif data_path.is_dir(): - # Loading from local folder. - ds = load_from_disk(dataset_id) - else: - # Loading from HuggingFace or a folder. - ds = load_dataset(dataset_id, revision=dataset_revision) - - # Select the split. - ds = ds[split] - - # Load custom dataset splits from config. - with open(R2EGymEnv.CONFIG) as f: - custom_splits = yaml.safe_load(f) - excluded_ids = custom_splits.get("excluded", []) - - # add instance id to each example (name of the image) - def extract_instance_id(docker_image: str) -> str: - return docker_image.split("/", 1)[-1] - - # create a column "instance_id" in the dataset - instance_ids = [extract_instance_id(id) for id in ds["docker_image"]] - ds = ds.add_column("instance_id", instance_ids) - - problems = filter_problems(ds["instance_id"], problems, custom_splits, excluded_ids) - ds = ds.filter(lambda example: example["instance_id"] in problems) - - image_names = set(ds["docker_image"]) - if logger is not None: - logger.debug( - f"Loaded {len(ds)} tasks across {len(image_names)} Docker images from {dataset_id}." - ) - - if prepull_images: - # Download all images needed for R2E-Gym. - client = docker.from_env() - - existing_images = set( - tag for image in client.images.list() for tag in image.tags - ) - missing_images = image_names - existing_images - if missing_images: - if logger is not None: - logger.warning(f"Found {len(missing_images)} missing Docker images.") - for i, image_name in enumerate(missing_images): - if logger is not None: - logger.warning( - f"Pulling Docker image {i + 1}/{len(missing_images)} `{image_name}`." - ) - client.images.pull(image_name) - return ds - - class R2EGymEnv(RepoEnv): CACHE = DEBUG_GYM_CACHE_DIR / "r2e-gym" CONFIG = importlib_files("debug_gym") / "gym" / "envs" / "configs" / "r2egym.yaml" @@ -142,8 +75,8 @@ def __init__( "R2EGymEnv only supports DockerTerminal and KubernetesTerminal." ) - self.ds_row = task_data - self.setup_task(task_data=task_data) + self.task_data = task_data + self.setup_task() self.session_commands = [] super().__init__(terminal=terminal, **kwargs) @@ -156,25 +89,24 @@ def instructions(self) -> str: content = self.ds_row["problem_statement"] return re.search(r"\[ISSUE\](.*)\[/ISSUE\]", content, re.DOTALL).group(1) except Exception as e: - return self.ds_row["problem_statement"] - - def setup_task(self, task_data: dict, options: dict = None): - self.ds_row = task_data - self.task_name = self.ds_row["instance_id"] - self.base_image = self.ds_row["docker_image"] - self.package_name = self.ds_row["repo_name"] - self.expected_output = json.loads(self.ds_row["expected_output_json"]) + return self.task_data["problem_statement"] + + def setup_task(self, options: dict = None): + self.task_name = self.task_data["instance_id"] + self.base_image = self.task_data["docker_image"] + self.package_name = self.task_data["repo_name"] + self.expected_output = json.loads(self.task_data["expected_output_json"]) self.expected_output = decolor_dict_keys(self.expected_output) self.expected_output = { k.split(" - ")[0]: self.expected_output[k] for k in sorted(self.expected_output.keys()) } - self.commit_hash = self.ds_row["commit_hash"] + self.commit_hash = self.task_data["commit_hash"] self.entrypoint = "python -m pytest -W ignore -rA r2e_tests" if self.package_name == "pillow": - test_file_codes = json.loads(self.ds_row["execution_result_content"])[ + test_file_codes = json.loads(self.task_data["execution_result_content"])[ "test_file_codes" ] if any(["unittest" in test_code for test_code in test_file_codes]): @@ -319,3 +251,75 @@ def calculate_score(self, eval_output: EvalOutput) -> int: reward = 1 if match else 0 return reward + + @classmethod + def load_dataset( + dataset_id: str = "R2E-Gym/R2E-Gym-Lite", + dataset_revision: str = "8d3163011f01f9393bb3dc7700497a79a8686ae5", + split: str = "train", + problems: list | None = None, + prepull_images: bool = False, + logger: DebugGymLogger | None = None, + ) -> dict: + data_path = Path(dataset_id) + if data_path.is_file(): + # Loading from local file. + if data_path.suffix.lower() == ".json": + ds = load_dataset("json", data_files=dataset_id) + elif data_path.suffix.lower() == ".parquet": + ds = load_dataset("parquet", data_files=dataset_id) + elif data_path.is_dir(): + # Loading from local folder. + ds = load_from_disk(dataset_id) + else: + # Loading from HuggingFace or a folder. + ds = load_dataset(dataset_id, revision=dataset_revision) + + # Select the split. + ds = ds[split] + + # Load custom dataset splits from config. + with open(R2EGymEnv.CONFIG) as f: + custom_splits = yaml.safe_load(f) + excluded_ids = custom_splits.get("excluded", []) + + # add instance id to each example (name of the image) + def extract_instance_id(docker_image: str) -> str: + return docker_image.split("/", 1)[-1] + + # create a column "instance_id" in the dataset + dataset = {} + for example in ds: + instance_id = extract_instance_id(example["docker_image"]) + example["instance_id"] = instance_id + dataset[instance_id] = example + + problems = filter_problems(dataset, problems, custom_splits, excluded_ids) + dataset = {pid: dataset[pid] for pid in problems} + + image_names = set(example["docker_image"] for example in dataset.values()) + if logger is not None: + logger.debug( + f"Loaded {len(dataset)} tasks across {len(image_names)} Docker images from {dataset_id}." + ) + + if prepull_images: + # Download all images needed for R2E-Gym. + client = docker.from_env() + + existing_images = set( + tag for image in client.images.list() for tag in image.tags + ) + missing_images = image_names - existing_images + if missing_images: + if logger is not None: + logger.warning( + f"Found {len(missing_images)} missing Docker images." + ) + for i, image_name in enumerate(missing_images): + if logger is not None: + logger.warning( + f"Pulling Docker image {i + 1}/{len(missing_images)} `{image_name}`." + ) + client.images.pull(image_name) + return dataset diff --git a/debug_gym/gym/envs/swe_bench.py b/debug_gym/gym/envs/swe_bench.py index 891e0c2c..68566230 100644 --- a/debug_gym/gym/envs/swe_bench.py +++ b/debug_gym/gym/envs/swe_bench.py @@ -16,45 +16,6 @@ from debug_gym.gym.utils import filter_problems -def load_swebench_dataset( - dataset_id: str = "SWE-bench/SWE-bench_Verified", - dataset_revision: str = "99450355ca8c611021187a57ffac304b66666738", - split: str = "test", - problems: list | None = None, - prepull_images: bool = False, - logger: DebugGymLogger | None = None, -): - ds = datasets.load_dataset(dataset_id, revision=dataset_revision)[split] - problems = filter_problems(ds["instance_id"], problems) - - ds = ds.filter(lambda example: example["instance_id"] in problems) - instance_ids = ds["instance_id"] - - image_names = set( - f"sweb.eval.x86_64.{id.replace('__', '_1776_')}" for id in instance_ids - ) - - if prepull_images: - # Download all images needed for SWE-Bench. - client = docker.from_env() - tagged_image_names = set(f"swebench/{name}:latest" for name in image_names) - - existing_images = set( - tag for image in client.images.list() for tag in image.tags - ) - missing_images = tagged_image_names - existing_images - if missing_images: - if logger: - logger.info(f"Found {len(missing_images)} missing Docker images.") - for i, image_name in enumerate(missing_images): - if logger: - logger.info( - f"Pulling Docker images {i + 1}/{len(missing_images)}: `{image_name}`." - ) - client.images.pull(image_name) - return ds - - class SWEBenchEnv(RepoEnv): CACHE = DEBUG_GYM_CACHE_DIR / "swe-bench" @@ -70,33 +31,32 @@ def __init__( f"{self.__class__.__name__} only supports DockerTerminal and KubernetesTerminal." ) - self.ds_row = task_data - self.setup_task(self.ds_row) + self.task_data = task_data + self.setup_task() self.test_directives = [] super().__init__(terminal=terminal, **kwargs) @property def instructions(self) -> str: - return self.ds_row["problem_statement"] + return self.task_data["problem_statement"] - def setup_task(self, task_data: dict, options: dict = None): - self.ds_row = task_data - self.task_name = task_data["instance_id"] - self.repo = self.ds_row["repo"] + def setup_task(self, options: dict = None): + self.task_name = self.task_data["instance_id"] + self.repo = self.task_data["repo"] self.package_name = self.repo.split("/")[1] - self.version = self.ds_row["version"] + self.version = self.task_data["version"] self.install_configs = MAP_REPO_VERSION_TO_SPECS[self.repo][self.version] - self.gold_patch = self.ds_row["patch"] - self.test_spec = make_test_spec(self.ds_row) + self.gold_patch = self.task_data["patch"] + self.test_spec = make_test_spec(self.task_data) self.base_image = f"swebench/{self.test_spec.instance_image_key}".replace( "__", "_1776_" ) - self.base_commit = self.ds_row["base_commit"] - self.test_patch = self.ds_row["test_patch"] - self.fail_to_pass = json.loads(self.ds_row["FAIL_TO_PASS"]) - self.pass_to_pass = json.loads(self.ds_row["PASS_TO_PASS"]) + self.base_commit = self.task_data["base_commit"] + self.test_patch = self.task_data["test_patch"] + self.fail_to_pass = json.loads(self.task_data["FAIL_TO_PASS"]) + self.pass_to_pass = json.loads(self.task_data["PASS_TO_PASS"]) self.test_cmd = self.install_configs["test_cmd"] - self.test_directives = get_test_directives(self.ds_row) + self.test_directives = get_test_directives(self.task_data) self.entrypoint = " ".join([self.test_cmd, *self.test_directives]) @@ -211,3 +171,42 @@ def calculate_score(self, eval_output: EvalOutput) -> int: ) assert score <= self.max_score return score + + @classmethod + def load_dataset( + dataset_id: str = "SWE-bench/SWE-bench_Verified", + dataset_revision: str = "99450355ca8c611021187a57ffac304b66666738", + split: str = "test", + problems: list | None = None, + prepull_images: bool = False, + logger: DebugGymLogger | None = None, + ) -> dict: + ds = datasets.load_dataset(dataset_id, revision=dataset_revision)[split] + + dataset = {problem["instance_id"]: problem for problem in ds} + problems = filter_problems(dataset, problems) + dataset = {id: i for id, i in dataset.items() if id in problems} + + image_names = set( + f"sweb.eval.x86_64.{id.replace('__', '_1776_')}" for id in problems + ) + + if prepull_images: + # Download all images needed for SWE-Bench. + client = docker.from_env() + tagged_image_names = set(f"swebench/{name}:latest" for name in image_names) + + existing_images = set( + tag for image in client.images.list() for tag in image.tags + ) + missing_images = tagged_image_names - existing_images + if missing_images: + if logger: + logger.info(f"Found {len(missing_images)} missing Docker images.") + for i, image_name in enumerate(missing_images): + if logger: + logger.info( + f"Pulling Docker images {i + 1}/{len(missing_images)}: `{image_name}`." + ) + client.images.pull(image_name) + return dataset diff --git a/debug_gym/gym/envs/swe_smith.py b/debug_gym/gym/envs/swe_smith.py index 6ba6fe03..e973376e 100644 --- a/debug_gym/gym/envs/swe_smith.py +++ b/debug_gym/gym/envs/swe_smith.py @@ -1,5 +1,6 @@ from importlib.resources import files as importlib_files from pathlib import Path +from typing import List import docker import yaml @@ -15,75 +16,10 @@ from debug_gym.constants import DEBUG_GYM_CACHE_DIR from debug_gym.gym.entities import EvalOutput from debug_gym.gym.envs.swe_bench import SWEBenchEnv -from debug_gym.gym.terminals.kubernetes import KubernetesTerminal from debug_gym.gym.terminals.terminal import DebugGymLogger, Terminal from debug_gym.gym.utils import filter_problems -def load_swesmith_dataset( - dataset_id: str = "SWE-bench/SWE-smith", - dataset_revision: str = "699b53400d3855206a0fbf3ff4beaf1a52f4f232", - split: str = "train", - problems: list | None = None, - prepull_images: bool = False, - logger: DebugGymLogger | None = None, -): - data_path = Path(dataset_id) - if data_path.is_file(): - # Loading from local file. - if data_path.suffix.lower() == ".json": - ds = load_dataset("json", data_files=dataset_id) - elif data_path.suffix.lower() == ".parquet": - ds = load_dataset("parquet", data_files=dataset_id) - elif data_path.is_dir(): - # Loading from local folder. - ds = load_from_disk(dataset_id) - else: - # Loading from HuggingFace or a folder. - ds = load_dataset(dataset_id, revision=dataset_revision) - - # Select the split. - ds = ds[split] - - # Load custom dataset splits from config. - with open(SWESmithEnv.CONFIG) as f: - custom_splits = yaml.safe_load(f) - excluded_ids = custom_splits.get("excluded", []) - - problems = filter_problems(ds["instance_id"], problems, custom_splits, excluded_ids) - ds = ds.filter(lambda example: example["instance_id"] in problems) - - image_names = set(ds["image_name"]) - if logger is not None: - logger.debug( - f"Loaded {len(ds)} tasks across {len(image_names)} Docker images from {dataset_id}." - ) - - if prepull_images: - # Download all images needed for SWE-Smith. - client = docker.from_env() - tagged_image_names = set(f"{DOCKER_ORG}/{name}:{TAG}" for name in image_names) - - existing_images = set( - tag for image in client.images.list() for tag in image.tags - ) - missing_images = tagged_image_names - existing_images - if missing_images: - if logger is not None: - logger.info(f"Found {len(missing_images)} missing Docker images.") - - for image_name in missing_images: - docker_hub_image = image_name.replace("__", "_1776_") - if logger is not None: - logger.info( - f"Pulling Docker image `{docker_hub_image}` to `{image_name}`." - ) - client.images.pull(docker_hub_image) - # Rename images via tagging - client.images.get(docker_hub_image).tag(image_name) - return ds - - class SWESmithEnv(SWEBenchEnv): CACHE = DEBUG_GYM_CACHE_DIR / "swe-smith" CONFIG = ( @@ -102,22 +38,21 @@ def __init__( **kwargs, ) - def setup_task(self, task_data: dict, options: dict = None): - self.task_name = task_data["instance_id"] - self.ds_row = task_data + def setup_task(self, options: dict = None): + self.task_name = self.task_data["instance_id"] self.base_commit = ( - self.ds_row["base_commit"] if "base_commit" in self.ds_row else "main" + self.task_data["base_commit"] if "base_commit" in self.task_data else "main" ) - self.branch_name = self.ds_row["instance_id"] - self.bug_patch = self.ds_row["patch"] - self.image_name = self.ds_row["image_name"] + self.branch_name = self.task_data["instance_id"] + self.bug_patch = self.task_data["patch"] + self.image_name = self.task_data["image_name"] self.repo, self.commit = get_repo_commit_from_image_name(self.image_name) self.install_configs = MAP_REPO_TO_SPECS[self.repo][self.commit] self.base_image = f"{DOCKER_ORG}/{self.image_name}:{TAG}" self.package_name = self.repo.split("/")[1] - self.test_cmd, self.test_directives = get_test_command(self.ds_row) - self.fail_to_pass = self.ds_row["FAIL_TO_PASS"] - self.pass_to_pass = self.ds_row["PASS_TO_PASS"] + self.test_cmd, self.test_directives = get_test_command(self.task_data) + self.fail_to_pass = self.task_data["FAIL_TO_PASS"] + self.pass_to_pass = self.task_data["PASS_TO_PASS"] self.log_parser = MAP_REPO_TO_PARSER.get(self.repo, parse_log_pytest) if self.package_name == "python-colorlog": @@ -219,3 +154,70 @@ def eval(self, **kwargs) -> EvalOutput: success, output = self.terminal.run(self.entrypoint, timeout=self.run_timeout) self.last_eval = EvalOutput(success, output) return self.last_eval + + @classmethod + def load_dataset( + dataset_id: str = "SWE-bench/SWE-smith", + dataset_revision: str = "699b53400d3855206a0fbf3ff4beaf1a52f4f232", + split: str = "train", + problems: list | None = None, + prepull_images: bool = False, + logger: DebugGymLogger | None = None, + ) -> dict: + data_path = Path(dataset_id) + if data_path.is_file(): + # Loading from local file. + if data_path.suffix.lower() == ".json": + ds = load_dataset("json", data_files=dataset_id) + elif data_path.suffix.lower() == ".parquet": + ds = load_dataset("parquet", data_files=dataset_id) + elif data_path.is_dir(): + # Loading from local folder. + ds = load_from_disk(dataset_id) + else: + # Loading from HuggingFace or a folder. + ds = load_dataset(dataset_id, revision=dataset_revision) + + # Select the split. + ds = ds[split] + + # Load custom dataset splits from config. + with open(SWESmithEnv.CONFIG) as f: + custom_splits = yaml.safe_load(f) + excluded_ids = custom_splits.get("excluded", []) + + dataset = {d["instance_id"]: d for d in ds} + problems = filter_problems(dataset, problems, custom_splits, excluded_ids) + dataset = {pid: dataset[pid] for pid in problems} + + image_names = set([problem["image_name"] for problem in dataset.values()]) + if logger is not None: + logger.debug( + f"Loaded {len(dataset)} tasks across {len(image_names)} Docker images from {dataset_id}." + ) + + if prepull_images: + # Download all images needed for SWE-Smith. + client = docker.from_env() + tagged_image_names = set( + f"{DOCKER_ORG}/{name}:{TAG}" for name in image_names + ) + + existing_images = set( + tag for image in client.images.list() for tag in image.tags + ) + missing_images = tagged_image_names - existing_images + if missing_images: + if logger is not None: + logger.info(f"Found {len(missing_images)} missing Docker images.") + + for image_name in missing_images: + docker_hub_image = image_name.replace("__", "_1776_") + if logger is not None: + logger.info( + f"Pulling Docker image `{docker_hub_image}` to `{image_name}`." + ) + client.images.pull(docker_hub_image) + # Rename images via tagging + client.images.get(docker_hub_image).tag(image_name) + return dataset diff --git a/debug_gym/gym/envs/swe_smith_constants.py b/debug_gym/gym/envs/swe_smith_constants.py index 4eff67c0..7d877245 100755 --- a/debug_gym/gym/envs/swe_smith_constants.py +++ b/debug_gym/gym/envs/swe_smith_constants.py @@ -1,4 +1,3 @@ - DOCKER_ORG = "jyangballin" TAG = "latest" @@ -78,9 +77,11 @@ for v in CMAKE_VERSIONS ] + [ - f"tar -xvzf cmake-{v}-Linux-x86_64.tar.gz && mv cmake-{v}-Linux-x86_64 /usr/share/cmake-{v}" - if v not in ["3.23.5", "3.27.9"] - else f"tar -xvzf cmake-{v}-Linux-x86_64.tar.gz && mv cmake-{v}-linux-x86_64 /usr/share/cmake-{v}" + ( + f"tar -xvzf cmake-{v}-Linux-x86_64.tar.gz && mv cmake-{v}-Linux-x86_64 /usr/share/cmake-{v}" + if v not in ["3.23.5", "3.27.9"] + else f"tar -xvzf cmake-{v}-Linux-x86_64.tar.gz && mv cmake-{v}-linux-x86_64 /usr/share/cmake-{v}" + ) for v in CMAKE_VERSIONS ] + [ @@ -683,4 +684,4 @@ "un33k/python-slugify": SPECS_REPO_PYTHON_SLUGIFY, "vi3k6i5/flashtext": SPECS_REPO_FLASHTEXT, "weaveworks/grafanalib": SPECS_REPO_GRAFANALIB, -} \ No newline at end of file +} diff --git a/debug_gym/gym/envs/swe_smith_utils.py b/debug_gym/gym/envs/swe_smith_utils.py index 65c085f8..496200f8 100755 --- a/debug_gym/gym/envs/swe_smith_utils.py +++ b/debug_gym/gym/envs/swe_smith_utils.py @@ -1,6 +1,7 @@ """ Pulled from official SWE-Smith repository. """ + import os import re from pathlib import Path @@ -19,12 +20,14 @@ PASS_TO_PASS = "PASS_TO_PASS" INSTANCE_REF = "instance_ref" + def get_repo_name(repo, commit) -> str: """ Get the SWE-smith GitHub repository name for a repository at a specific commit. """ return f"{repo.replace('/', '__')}.{commit[:8]}" + def get_test_paths(dir_path: str, ext: str = ".py") -> list[Path]: """ Get all testing file paths relative to the given directory. @@ -54,6 +57,7 @@ def get_full_commit(repo, partial_commit) -> str: raise ValueError(f"Commit {partial_commit} not found for repository {repo}.") + def get_repo_commit_from_image_name(image_name: str) -> tuple[str, str]: """ Get the repository and commit from a docker image ID. @@ -88,10 +92,12 @@ def get_test_command_mypy(instance: dict): ) return f'{MAP_REPO_TO_SPECS[repo][commit][KEY_TEST_CMD]} "{test_keys}"' + MAP_REPO_TO_TEST_CMD = { "python/mypy": get_test_command_mypy, } + def get_test_command(instance: dict): """ Given a repo/commit pair and a (gold) patch, return the test command to run @@ -187,4 +193,4 @@ def get_test_command(instance: dict): final.append(test_file) test_command += f" {' '.join(set(final))}" - return test_command, rv \ No newline at end of file + return test_command, rv diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py index 54f22b0b..1b6f14f0 100644 --- a/debug_gym/gym/terminals/kubernetes.py +++ b/debug_gym/gym/terminals/kubernetes.py @@ -260,6 +260,7 @@ class KubernetesTerminal(Terminal): """ Note: reads values of env variables K8S_NAMESPACE, K8S_DOCKER_SECRET, K8S_DOCKER_CONSTRAINT. """ + def __init__( self, working_dir: str | None = None, @@ -290,8 +291,10 @@ def __init__( self._task_name = base_image self.setup_commands = setup_commands or [] self.namespace = namespace or os.environ.get("K8S_NAMESPACE", "default") - self.image_pull_secret = image_pull_secret or os.environ.get("K8S_DOCKER_SECRET") - self.in_node_constraint = os.environ.get("K8S_NODE_CONSTRAINT", False) + self.image_pull_secret = image_pull_secret or os.environ.get( + "K8S_DOCKER_SECRET" + ) + self.in_node_constraint = os.environ.get("K8S_IN_NODE_CONSTRAINT", False) self.kubernetes_kwargs = kwargs # e.g., nodeSelector, tolerations self.registry = registry.rstrip("/") + "/" if registry else "" self._pod_name = pod_name diff --git a/debug_gym/gym/utils.py b/debug_gym/gym/utils.py index 1d95294b..24372a44 100644 --- a/debug_gym/gym/utils.py +++ b/debug_gym/gym/utils.py @@ -196,7 +196,7 @@ def extract_reward_from_pytest_output(output): def filter_problems( - dataset_instances: list[str], + dataset: dict[str, Any], problems: str | list[str] | None = None, custom_splits: dict[str, Any] | None = None, excluded_ids: list[str] | None = None, @@ -208,9 +208,9 @@ def filter_problems( if not isinstance(problems, str): # Check that all problems are valid task names. for problem in problems: - if problem not in dataset_instances: + if problem not in dataset: raise ValueError( - f"Invalid problem id: '{problem}'.\nChoose from: {sorted(dataset_instances)}" + f"Invalid problem id: '{problem}'.\nChoose from: {sorted(dataset)}" ) # Make sure all problems are unique. @@ -220,14 +220,14 @@ def filter_problems( return problems # Assuming a list of problem IDs. if problems == "all": - return [k for k in dataset_instances if k not in excluded_ids] - elif problems in dataset_instances: + return [k for k in dataset if k not in excluded_ids] + elif problems in dataset: return [problems] # Single task elif problems in custom_splits: return custom_splits[problems] else: raise ValueError( - f"Invalid split or problem id: '{problems}'.\nChoose from: {sorted(dataset_instances) + ['all'] + sorted(custom_splits)}" + f"Invalid split or problem id: '{problems}'.\nChoose from: {sorted(dataset) + ['all'] + sorted(custom_splits)}" ) From b338e1ceefb3f2097ad7a8118879ea1cd6c7adf9 Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Fri, 28 Nov 2025 07:53:01 -0800 Subject: [PATCH 15/31] fix tests --- tests/gym/envs/conftest.py | 11 ++--------- tests/gym/envs/test_r2egym.py | 10 +++++----- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/tests/gym/envs/conftest.py b/tests/gym/envs/conftest.py index 94e70a47..60afdefa 100644 --- a/tests/gym/envs/conftest.py +++ b/tests/gym/envs/conftest.py @@ -34,15 +34,8 @@ def make_env_factory(env_name, worker_id, tmp_path_factory): env_class = kwargs.pop("env_class") def _make_env(): - if issubclass(env_class, (SWEBenchEnv, SWEBenchDebugEnv)): - fn = load_swebench_dataset - elif issubclass(env_class, SWESmithEnv): - fn = load_swesmith_dataset - elif issubclass(env_class, R2EGymEnv): - fn = load_r2egym_dataset - else: - raise ValueError(f"Unknown env_class: {env_class}") - task_data = fn(problems=kwargs["problems"])[0] + dataset = env_class.load_dataset(problems=kwargs["problems"]) + task_data = next(iter(dataset.values())) return env_class(task_data=task_data) if worker_id == "master": diff --git a/tests/gym/envs/test_r2egym.py b/tests/gym/envs/test_r2egym.py index e0b1a635..6fda7e9b 100644 --- a/tests/gym/envs/test_r2egym.py +++ b/tests/gym/envs/test_r2egym.py @@ -7,7 +7,7 @@ from debug_gym.agents.solution_agent import AgentSolution from debug_gym.gym.entities import Observation -from debug_gym.gym.envs.r2egym import R2EGymEnv, load_r2egym_dataset +from debug_gym.gym.envs.r2egym import R2EGymEnv from debug_gym.gym.terminals.docker import DockerTerminal from debug_gym.gym.tools.tool import ToolCall from debug_gym.gym.tools.toolbox import Toolbox @@ -70,17 +70,17 @@ def test_load_dataset_from_parquet(mock_docker_from_env, tmp_path): pq.write_table(table, str(parquet_file)) # Load the dataset from the Parquet file - dataset = load_r2egym_dataset( - dataset_id=str(parquet_file), split="train" - ) + dataset = R2EGymEnv.load_dataset(dataset_id=str(parquet_file), split="train") + dataset_entry = next(iter(dataset.values())) # Verify the dataset contains the expected features - assert sorted(dataset.features.keys()) == sorted( + assert sorted(dataset_entry) == sorted( [ "commit_hash", "docker_image", "execution_result_content", "expected_output_json", + "instance_id", "modified_entity_summaries", "modified_files", "num_non_test_files", From 0858bea62bd2399ae4cc887e7d336e68625affc8 Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Fri, 28 Nov 2025 07:54:08 -0800 Subject: [PATCH 16/31] change run.py --- scripts/run.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/scripts/run.py b/scripts/run.py index e9f0b64a..fcbde3fb 100644 --- a/scripts/run.py +++ b/scripts/run.py @@ -259,20 +259,9 @@ def main(): "problems": config.get("problems", "all"), "prepull_images": config.env_kwargs.get("prepull_images", False), } - load_dataset_fn = { - "swebench": load_swebench_dataset, - "swebench-debug": load_swebench_dataset, - "swesmith": load_swesmith_dataset, - "r2egym": load_r2egym_dataset, - } - - if config["benchmark"] in load_dataset_fn: - dataset = load_dataset_fn[config["benchmark"]]( - **dataset_info, - ) - else: - raise ValueError(f"Unsupported benchmark: {config['benchmark']}") - + dataset = select_env(config.get("benchmark")).load_dataset( + **dataset_info + ) problems = sorted(dataset) if args.list: From 35a4f666f03af5ac9f223deaba98dc107ffe1e5e Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Fri, 28 Nov 2025 08:18:44 -0800 Subject: [PATCH 17/31] blacked --- scripts/run.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/run.py b/scripts/run.py index fcbde3fb..1e119327 100644 --- a/scripts/run.py +++ b/scripts/run.py @@ -259,9 +259,7 @@ def main(): "problems": config.get("problems", "all"), "prepull_images": config.env_kwargs.get("prepull_images", False), } - dataset = select_env(config.get("benchmark")).load_dataset( - **dataset_info - ) + dataset = select_env(config.get("benchmark")).load_dataset(**dataset_info) problems = sorted(dataset) if args.list: From e9600ed550ef1461ae2a9b284e5499cecc1fe99e Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Fri, 28 Nov 2025 08:19:56 -0800 Subject: [PATCH 18/31] remove imports --- tests/gym/envs/conftest.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/gym/envs/conftest.py b/tests/gym/envs/conftest.py index 60afdefa..8806dcee 100644 --- a/tests/gym/envs/conftest.py +++ b/tests/gym/envs/conftest.py @@ -2,10 +2,8 @@ from filelock import FileLock from debug_gym.gym.envs import R2EGymEnv, SWEBenchEnv, SWESmithEnv -from debug_gym.gym.envs.r2egym import load_r2egym_dataset -from debug_gym.gym.envs.swe_bench import load_swebench_dataset from debug_gym.gym.envs.swe_bench_debug import SWEBenchDebugEnv -from debug_gym.gym.envs.swe_smith import load_swesmith_dataset + BUILD_ENV_CONFIGS = { "swe_smith": { From 81b2eda1bd2da47b27d42d0282130f176830c945 Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Fri, 28 Nov 2025 08:23:50 -0800 Subject: [PATCH 19/31] task name / task data adaptation --- scripts/run.py | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/scripts/run.py b/scripts/run.py index 1e119327..928b921f 100644 --- a/scripts/run.py +++ b/scripts/run.py @@ -44,7 +44,7 @@ def timeout_handler(signum, frame): signal.alarm(timeout_seconds) -def run_agent(args, problem: dict, config: dict): +def run_agent(args, task_name: str, task_data: dict, config: dict): set_signal(args.timeout) success = True env = None @@ -54,22 +54,22 @@ def run_agent(args, problem: dict, config: dict): report_progress_error = True exp_path = Path(config["output_path"]) / config["uuid"] - problem_path = exp_path / problem + task_path = exp_path / task_name task_logger = DebugGymLogger( - problem, - log_dir=problem_path, + task_name, + log_dir=task_path, level=args.logging_level, mode="w" if args.force_all else "a", ) try: - previous_run = load_previous_run_status(problem_path, problem) + previous_run = load_previous_run_status(task_path, task_name) if ( not args.force_all and previous_run is not None and previous_run.status in ["resolved", "unresolved"] ): - task_logger.debug(f"Previous run found: {problem_path}") + task_logger.debug(f"Previous run found: {task_path}") success = previous_run.status == "resolved" task_logger.debug(f"Previous run status: {previous_run.status}") if not args.force_failed or success: @@ -82,11 +82,11 @@ def run_agent(args, problem: dict, config: dict): max_score=previous_run.max_score, status=status, ) - task_logger.debug(f"Skipping {problem}, already done.") + task_logger.debug(f"Skipping {task_name}, already done.") return success task_logger.report_progress( - problem_id=problem, + problem_id=task_name, step=0, total_steps=1, score=0, @@ -94,7 +94,7 @@ def run_agent(args, problem: dict, config: dict): status="running", ) - env = create_env(config, problem, task_logger) + env = create_env(config, task_data, task_logger) add_tools(env, config, task_logger) llm = LLM.instantiate( @@ -107,17 +107,16 @@ def run_agent(args, problem: dict, config: dict): agent = create_agent( config["agent_type"], agent_args=agent_args, - env=env, llm=llm, logger=task_logger, ) try: - success = agent.run(task_name=problem, debug=args.debug) + success = agent.run(env, debug=args.debug) except KeyboardInterrupt: task_logger.error("Agent run was interrupted by user.") task_logger.report_progress( - problem_id=problem, + problem_id=task_name, step=1, total_steps=1, score=0, @@ -128,11 +127,11 @@ def run_agent(args, problem: dict, config: dict): raise except AgentTimeoutException: task_logger.error( - f"Timeout: Problem `{problem}` exceeded " + f"Timeout: Problem `{task_name}` exceeded " f"the time limit of {args.timeout} seconds." ) task_logger.report_progress( - problem_id=problem, + problem_id=task_name, step=1, total_steps=1, score=0, @@ -146,23 +145,23 @@ def run_agent(args, problem: dict, config: dict): raise # save trajectory - save_trajectory(agent, problem, problem_path, task_logger) + save_trajectory(agent, task_name, task_path, task_logger) # optionally apply patch if config["save_patch"]: - save_patch(env, problem_path, task_logger) + save_patch(env, task_path, task_logger) except Exception as e: task_logger.error( - f"Task Error: {problem} - {e!r}. Run with --very-verbose " + f"Task Error: {task_name} - {e!r}. Run with --very-verbose " f"or check {task_logger.log_file} for more information." ) task_logger.debug( - f"Task {problem} generated an exception: {e!r}. Traceback: {traceback.format_exc()}" + f"Task {task_name} generated an exception: {e!r}. Traceback: {traceback.format_exc()}" ) if report_progress_error: task_logger.report_progress( - problem_id=problem, + problem_id=task_name, step=1, total_steps=1, score=0, @@ -181,11 +180,11 @@ def run_agent(args, problem: dict, config: dict): return success -def create_env(config: dict, problem: dict, logger: DebugGymLogger): +def create_env(config: dict, task_data: dict, logger: DebugGymLogger): terminal = select_terminal(config.get("terminal"), logger, uuid=config["uuid"]) env_class = select_env(config.get("benchmark")) env = env_class( - task_data=problem, + task_data=task_data, terminal=terminal, logger=logger, **config["env_kwargs"], @@ -297,7 +296,7 @@ def main(): if num_workers == 1: # run sequentially for easier debugging for problem in problems: try: - success = run_agent(args, problem, config) + success = run_agent(args, problem, dataset[problem], config) except AgentTimeoutException: pass # Handled in run_agent, just continue except (KeyboardInterrupt, Exception) as e: @@ -307,7 +306,7 @@ def main(): num_workers, initializer=DebugGymLogger.set_as_worker ) as executor: futures = { - executor.submit(run_agent, args, problem, config): problem + executor.submit(run_agent, args, problem, dataset[problem], config): problem for problem in problems } for future in as_completed(futures): From 3468a627004643d26d1ff704bc830641e2821d91 Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Fri, 28 Nov 2025 08:26:25 -0800 Subject: [PATCH 20/31] pre commit --- debug_gym/gym/envs/swe_smith.py | 7 +++---- debug_gym/gym/envs/swe_smith_utils.py | 2 +- debug_gym/gym/terminals/kubernetes.py | 2 +- scripts/run.py | 11 ++++++----- tests/gym/envs/conftest.py | 1 - 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/debug_gym/gym/envs/swe_smith.py b/debug_gym/gym/envs/swe_smith.py index e973376e..9ab436a1 100644 --- a/debug_gym/gym/envs/swe_smith.py +++ b/debug_gym/gym/envs/swe_smith.py @@ -5,10 +5,6 @@ import docker import yaml from datasets import load_dataset, load_from_disk - -from .swe_smith_constants import DOCKER_ORG, TAG, MAP_REPO_TO_SPECS -from .swe_smith_utils import get_test_command, get_repo_commit_from_image_name - from swebench.harness.constants import TestStatus from swebench.harness.grading import MAP_REPO_TO_PARSER from swebench.harness.log_parsers.python import parse_log_pytest @@ -19,6 +15,9 @@ from debug_gym.gym.terminals.terminal import DebugGymLogger, Terminal from debug_gym.gym.utils import filter_problems +from .swe_smith_constants import DOCKER_ORG, MAP_REPO_TO_SPECS, TAG +from .swe_smith_utils import get_repo_commit_from_image_name, get_test_command + class SWESmithEnv(SWEBenchEnv): CACHE = DEBUG_GYM_CACHE_DIR / "swe-smith" diff --git a/debug_gym/gym/envs/swe_smith_utils.py b/debug_gym/gym/envs/swe_smith_utils.py index 496200f8..727ef233 100755 --- a/debug_gym/gym/envs/swe_smith_utils.py +++ b/debug_gym/gym/envs/swe_smith_utils.py @@ -5,6 +5,7 @@ import os import re from pathlib import Path + from unidiff import PatchSet from .swe_smith_constants import ( @@ -15,7 +16,6 @@ MAP_REPO_TO_SPECS, ) - FAIL_TO_PASS = "FAIL_TO_PASS" PASS_TO_PASS = "PASS_TO_PASS" INSTANCE_REF = "instance_ref" diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py index 1b6f14f0..7830d65b 100644 --- a/debug_gym/gym/terminals/kubernetes.py +++ b/debug_gym/gym/terminals/kubernetes.py @@ -1,9 +1,9 @@ import atexit +import hashlib import json import os import random import subprocess -import hashlib import time import uuid from pathlib import Path diff --git a/scripts/run.py b/scripts/run.py index 928b921f..bc339b3c 100644 --- a/scripts/run.py +++ b/scripts/run.py @@ -12,16 +12,15 @@ from debug_gym.agents.base_agent import AGENT_REGISTRY, AgentArgs, create_agent from debug_gym.agents.utils import load_config, save_patch, save_trajectory from debug_gym.gym.envs import select_env +from debug_gym.gym.envs.r2egym import load_r2egym_dataset +from debug_gym.gym.envs.swe_bench import load_swebench_dataset +from debug_gym.gym.envs.swe_smith import load_swesmith_dataset from debug_gym.gym.terminals import select_terminal from debug_gym.gym.tools.toolbox import Toolbox from debug_gym.llms.base import LLM from debug_gym.llms.human import Human from debug_gym.logger import DebugGymLogger, load_previous_run_status -from debug_gym.gym.envs.swe_bench import load_swebench_dataset -from debug_gym.gym.envs.swe_smith import load_swesmith_dataset -from debug_gym.gym.envs.r2egym import load_r2egym_dataset - class AgentTimeoutException(BaseException): """Custom exception to handle timeouts in agent @@ -306,7 +305,9 @@ def main(): num_workers, initializer=DebugGymLogger.set_as_worker ) as executor: futures = { - executor.submit(run_agent, args, problem, dataset[problem], config): problem + executor.submit( + run_agent, args, problem, dataset[problem], config + ): problem for problem in problems } for future in as_completed(futures): diff --git a/tests/gym/envs/conftest.py b/tests/gym/envs/conftest.py index 8806dcee..e79af5ba 100644 --- a/tests/gym/envs/conftest.py +++ b/tests/gym/envs/conftest.py @@ -4,7 +4,6 @@ from debug_gym.gym.envs import R2EGymEnv, SWEBenchEnv, SWESmithEnv from debug_gym.gym.envs.swe_bench_debug import SWEBenchDebugEnv - BUILD_ENV_CONFIGS = { "swe_smith": { "env_class": SWESmithEnv, From c56579caa2209d3e4d562740a1290ee995524661 Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Fri, 28 Nov 2025 08:34:27 -0800 Subject: [PATCH 21/31] cls keyword --- debug_gym/gym/envs/r2egym.py | 1 + debug_gym/gym/envs/swe_bench.py | 1 + debug_gym/gym/envs/swe_smith.py | 1 + 3 files changed, 3 insertions(+) diff --git a/debug_gym/gym/envs/r2egym.py b/debug_gym/gym/envs/r2egym.py index 8e650400..9746c537 100644 --- a/debug_gym/gym/envs/r2egym.py +++ b/debug_gym/gym/envs/r2egym.py @@ -254,6 +254,7 @@ def calculate_score(self, eval_output: EvalOutput) -> int: @classmethod def load_dataset( + cls, dataset_id: str = "R2E-Gym/R2E-Gym-Lite", dataset_revision: str = "8d3163011f01f9393bb3dc7700497a79a8686ae5", split: str = "train", diff --git a/debug_gym/gym/envs/swe_bench.py b/debug_gym/gym/envs/swe_bench.py index 68566230..6c43437b 100644 --- a/debug_gym/gym/envs/swe_bench.py +++ b/debug_gym/gym/envs/swe_bench.py @@ -174,6 +174,7 @@ def calculate_score(self, eval_output: EvalOutput) -> int: @classmethod def load_dataset( + cls, dataset_id: str = "SWE-bench/SWE-bench_Verified", dataset_revision: str = "99450355ca8c611021187a57ffac304b66666738", split: str = "test", diff --git a/debug_gym/gym/envs/swe_smith.py b/debug_gym/gym/envs/swe_smith.py index 9ab436a1..1f8ce79c 100644 --- a/debug_gym/gym/envs/swe_smith.py +++ b/debug_gym/gym/envs/swe_smith.py @@ -156,6 +156,7 @@ def eval(self, **kwargs) -> EvalOutput: @classmethod def load_dataset( + cls, dataset_id: str = "SWE-bench/SWE-smith", dataset_revision: str = "699b53400d3855206a0fbf3ff4beaf1a52f4f232", split: str = "train", From 4b01ac849e4d389e5c1af8b7bd12df57590f9fc2 Mon Sep 17 00:00:00 2001 From: Alessandro Sordoni Date: Fri, 28 Nov 2025 08:39:23 -0800 Subject: [PATCH 22/31] remove load dataset --- debug_gym/gym/envs/env.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/debug_gym/gym/envs/env.py b/debug_gym/gym/envs/env.py index 0021dc2b..d86e93c4 100644 --- a/debug_gym/gym/envs/env.py +++ b/debug_gym/gym/envs/env.py @@ -235,7 +235,6 @@ def __init__( ) self.workspace = Workspace(self.terminal, logger=self.logger) - self.dataset = self.load_dataset(problems) self.set_entrypoints(self._entrypoint, self._debug_entrypoint) def _reset_env_state(self): @@ -503,6 +502,3 @@ def close(self): def __del__(self): self.close() - - def load_dataset(self, problems: str | list[str] | None = None): - return {"custom": None} From 0dd0f4ed18e48a9ebcf490a1a5026103702eb512 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= Date: Fri, 28 Nov 2025 12:52:45 -0800 Subject: [PATCH 23/31] Working on tests + refactoring --- debug_gym/gym/envs/__init__.py | 3 ++ debug_gym/gym/envs/aider.py | 2 +- debug_gym/gym/envs/env.py | 58 ++++++++------------- debug_gym/gym/envs/local.py | 52 +++++++++++++++++++ debug_gym/gym/envs/mini_nightmare.py | 2 +- debug_gym/gym/envs/r2egym.py | 14 ++--- debug_gym/gym/envs/swe_bench.py | 11 ++-- debug_gym/gym/envs/swe_smith.py | 24 +++------ scripts/config.yaml | 18 +++---- scripts/config_aider.yaml | 11 ++-- scripts/config_mini_nightmare.yaml | 13 ++--- scripts/config_r2egym.yaml | 13 +++-- scripts/config_swebench.yaml | 17 +++--- scripts/config_swesmith.yaml | 18 +++---- scripts/run.py | 8 +-- tests/gym/envs/test_r2egym.py | 64 +++++++++-------------- tests/gym/envs/test_swe_bench.py | 75 ++++++++++++++------------- tests/gym/envs/test_swe_smith.py | 77 +++++++++++----------------- 18 files changed, 237 insertions(+), 243 deletions(-) create mode 100644 debug_gym/gym/envs/local.py diff --git a/debug_gym/gym/envs/__init__.py b/debug_gym/gym/envs/__init__.py index 86ef4cab..4cad96f0 100644 --- a/debug_gym/gym/envs/__init__.py +++ b/debug_gym/gym/envs/__init__.py @@ -1,5 +1,6 @@ from debug_gym.gym.envs.aider import AiderBenchmarkEnv from debug_gym.gym.envs.env import RepoEnv, TooledEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.envs.mini_nightmare import MiniNightmareEnv from debug_gym.gym.envs.r2egym import R2EGymEnv from debug_gym.gym.envs.swe_bench import SWEBenchEnv @@ -11,6 +12,8 @@ def select_env(env_type: str = None) -> type[RepoEnv]: match env_type: case None: return RepoEnv + case "local": + return LocalEnv case "aider": return AiderBenchmarkEnv case "swebench": diff --git a/debug_gym/gym/envs/aider.py b/debug_gym/gym/envs/aider.py index 3056e0dd..98421e57 100644 --- a/debug_gym/gym/envs/aider.py +++ b/debug_gym/gym/envs/aider.py @@ -94,7 +94,7 @@ def eval(self, **kwargs) -> EvalOutput: self.last_eval = EvalOutput(success, output) return self.last_eval - def setup_task(self, options: dict = None): + def setup_task(self): pass def setup_workspace(self): diff --git a/debug_gym/gym/envs/env.py b/debug_gym/gym/envs/env.py index d86e93c4..13b54d76 100644 --- a/debug_gym/gym/envs/env.py +++ b/debug_gym/gym/envs/env.py @@ -201,38 +201,27 @@ class RepoEnv(TooledEnv): def __init__( self, - path: str | None = None, + task_data: dict, entrypoint: str = "python -m pytest -sq .", debug_entrypoint: str | None = None, max_score: int | None = None, - readonly_patterns: list[str] | None = None, # TODO: remove run_timeout: int | None = None, terminal: Terminal | None = None, logger: DebugGymLogger | None = None, - problems: str | list[str] | None = None, **kwargs, ): super().__init__() - self.path = path + self.task_data = task_data self.max_score = max_score self.run_timeout = run_timeout - self.terminal = terminal or LocalTerminal() # TODO: default to DockerTerminal + self.terminal = terminal self._entrypoint = entrypoint self._debug_entrypoint = debug_entrypoint self.logger = logger or DebugGymLogger("debug-gym") self.infos: EnvInfo | None = None self.rng = None self.additional_kwargs = kwargs - self.task_name: str | None = None - self.options: dict = {} - - if "auto_eval_on_rewrite" in kwargs: - raise ValueError( - "The 'auto_eval_on_rewrite' parameter is no longer supported. " - "Please remove it from your initialization arguments." - "Instead, set 'auto_eval_on_rewrite' in the EvalTool instance." - ) self.workspace = Workspace(self.terminal, logger=self.logger) self.set_entrypoints(self._entrypoint, self._debug_entrypoint) @@ -289,44 +278,39 @@ def working_dir(self) -> Path: def instructions(self) -> str: """Instructions for the current task. Override in subclasses for different behavior.""" - return "" + raise NotImplementedError( + "Subclasses must implement the instructions property." + ) - def setup_task(self, options: dict = None) -> None: + @property + def task_name(self) -> str: + raise NotImplementedError("Subclasses must implement the task_name property.") + + def setup_task(self) -> None: """Setup the task information. Override in subclasses for different behavior. Called once at reset.""" - pass + raise NotImplementedError("Subclasses must implement setup_task method.") def setup_workspace(self) -> None: """Setup the workspace. Override in subclasses for different behavior. Called once at reset.""" - self.workspace.reset() - self.workspace.copy_content(self.path) - self.workspace.setup_file_filters() + raise NotImplementedError("Subclasses must implement setup_workspace method.") def setup_terminal(self) -> None: """Setup the terminal. Override in subclasses for different behavior. Called once at reset.""" - - self.logger.debug(f"Configuring {self.terminal}...") - - self.terminal.run("git init -b main") - self.terminal.run("git config user.name 'debug-gym'") - self.terminal.run("git config user.email '<>'") - - self.terminal.run("git add *") - self.terminal.run("git commit -am 'Init'") - - self.terminal.run("git add .debugignore .debugreadonly") - self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'") + raise NotImplementedError("Subclasses must implement setup_terminal method.") def reset(self, *, options: dict = None): """Resets the environment and returns eval as the initial observation.""" - self.options = options if options is not None else self.options + options = options if options is not None else {} self.logger.debug("Resetting environment") - self.close() # Clean up previous workspace and terminal. - self.setup_task(options=self.options) - self.setup_workspace() - self.setup_terminal() + if options.get("reset_runtime", True): + self.close() # Clean up previous workspace and terminal. + self.setup_task() + self.setup_workspace() + self.setup_terminal() + self._reset_env_state() # Notify all tools that the environment is reset and get their observations diff --git a/debug_gym/gym/envs/local.py b/debug_gym/gym/envs/local.py new file mode 100644 index 00000000..c3b8d54e --- /dev/null +++ b/debug_gym/gym/envs/local.py @@ -0,0 +1,52 @@ +from debug_gym.gym.envs.env import RepoEnv + + +class LocalEnv(RepoEnv): + + def __init__( + self, + path: str, + entrypoint: str = "python -m pytest -sq .", + debug_entrypoint: str | None = None, + **kwargs, + ): + task_data = {"path": path} + super().__init__( + task_data=task_data, + entrypoint=entrypoint, + debug_entrypoint=debug_entrypoint, + **kwargs, + ) + + @property + def instruction(self) -> str: + return f"Debug the local codebase at {self.path}. Investigate the repository, figure out the root cause, then rewrite the code to fix the issue." + + @property + def task(self) -> str: + return self.task_data["path"].split("/")[-1] + + def setup_task(self) -> None: + """Setup the task information. Called once at reset.""" + self.path = self.task_data["path"] + + def setup_workspace(self) -> None: + """Setup the workspace. Called once at reset.""" + self.workspace.reset() + self.workspace.copy_content(self.path) + self.workspace.setup_file_filters() + + def setup_terminal(self) -> None: + """Setup the terminal. Called once at reset.""" + + self.logger.debug(f"Configuring {self.terminal}...") + + self.terminal.run("git init -b main") + self.terminal.run("git config user.name 'debug-gym'") + self.terminal.run("git config user.email '<>'") + + self.terminal.run("git add *") + self.terminal.run("git commit -am 'Init'") + + self.terminal.run("git add .debugignore .debugreadonly") + self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'") diff --git a/debug_gym/gym/envs/mini_nightmare.py b/debug_gym/gym/envs/mini_nightmare.py index 32937bfb..2c59213b 100644 --- a/debug_gym/gym/envs/mini_nightmare.py +++ b/debug_gym/gym/envs/mini_nightmare.py @@ -111,7 +111,7 @@ def eval(self, **kwargs) -> EvalOutput: self.last_eval = EvalOutput(success, output) return self.last_eval - def setup_task(self, options: dict = None): + def setup_task(self): pass def setup_workspace(self): diff --git a/debug_gym/gym/envs/r2egym.py b/debug_gym/gym/envs/r2egym.py index 9746c537..17cd5ac7 100644 --- a/debug_gym/gym/envs/r2egym.py +++ b/debug_gym/gym/envs/r2egym.py @@ -75,24 +75,24 @@ def __init__( "R2EGymEnv only supports DockerTerminal and KubernetesTerminal." ) - self.task_data = task_data - self.setup_task() - + super().__init__(task_data=task_data, terminal=terminal, **kwargs) self.session_commands = [] - super().__init__(terminal=terminal, **kwargs) + + @property + def task_name(self) -> str: + return self.task_data["instance_id"] @property def instructions(self) -> str: # try getting the content inside of [ISSUE] [/ISSUE] using regex tags for ds['problem_statement'] else return ds['problem_statement'] # ref: https://github.com/R2E-Gym/R2E-Gym/blob/main/src/r2egym/agenthub/runtime/docker.py#L592 try: - content = self.ds_row["problem_statement"] + content = self.task_data["problem_statement"] return re.search(r"\[ISSUE\](.*)\[/ISSUE\]", content, re.DOTALL).group(1) except Exception as e: return self.task_data["problem_statement"] - def setup_task(self, options: dict = None): - self.task_name = self.task_data["instance_id"] + def setup_task(self): self.base_image = self.task_data["docker_image"] self.package_name = self.task_data["repo_name"] self.expected_output = json.loads(self.task_data["expected_output_json"]) diff --git a/debug_gym/gym/envs/swe_bench.py b/debug_gym/gym/envs/swe_bench.py index 6c43437b..1f7d8a41 100644 --- a/debug_gym/gym/envs/swe_bench.py +++ b/debug_gym/gym/envs/swe_bench.py @@ -31,17 +31,18 @@ def __init__( f"{self.__class__.__name__} only supports DockerTerminal and KubernetesTerminal." ) - self.task_data = task_data - self.setup_task() self.test_directives = [] - super().__init__(terminal=terminal, **kwargs) + super().__init__(task_data=task_data, terminal=terminal, **kwargs) @property def instructions(self) -> str: return self.task_data["problem_statement"] - def setup_task(self, options: dict = None): - self.task_name = self.task_data["instance_id"] + @property + def task_name(self) -> str: + return self.task_data["instance_id"] + + def setup_task(self): self.repo = self.task_data["repo"] self.package_name = self.repo.split("/")[1] self.version = self.task_data["version"] diff --git a/debug_gym/gym/envs/swe_smith.py b/debug_gym/gym/envs/swe_smith.py index 1f8ce79c..8511a1a7 100644 --- a/debug_gym/gym/envs/swe_smith.py +++ b/debug_gym/gym/envs/swe_smith.py @@ -5,9 +5,12 @@ import docker import yaml from datasets import load_dataset, load_from_disk -from swebench.harness.constants import TestStatus -from swebench.harness.grading import MAP_REPO_TO_PARSER -from swebench.harness.log_parsers.python import parse_log_pytest +from swesmith.build_repo.download_images import DOCKER_ORG, TAG +from swesmith.constants import MAP_REPO_TO_SPECS +from swesmith.harness.grading import TestStatus +from swesmith.harness.log_parsers import MAP_REPO_TO_PARSER, parse_log_pytest +from swesmith.harness.utils import get_test_command +from swesmith.utils import get_repo_commit_from_image_name from debug_gym.constants import DEBUG_GYM_CACHE_DIR from debug_gym.gym.entities import EvalOutput @@ -25,20 +28,7 @@ class SWESmithEnv(SWEBenchEnv): importlib_files("debug_gym") / "gym" / "envs" / "configs" / "swe_smith.yaml" ) - def __init__( - self, - task_data: dict, - terminal: Terminal | None = None, - **kwargs, - ): - super().__init__( - task_data=task_data, - terminal=terminal, - **kwargs, - ) - - def setup_task(self, options: dict = None): - self.task_name = self.task_data["instance_id"] + def setup_task(self): self.base_commit = ( self.task_data["base_commit"] if "base_commit" in self.task_data else "main" ) diff --git a/scripts/config.yaml b/scripts/config.yaml index ee3952c5..62e0f075 100644 --- a/scripts/config.yaml +++ b/scripts/config.yaml @@ -1,16 +1,14 @@ base: # Environment configs output_path: "exps/pytorch" - env_kwargs: { - "path": "data/pytorch", - "entrypoint": "python -m pytest -sv test.py", - "debug_entrypoint": "python -m pdb -m pytest -s test.py", - "run_timeout": 10, - } - tools: ["pdb", "view", "rewrite"] - terminal: { - type: "docker", # "local", "docker", or "kubernetes" - } + env: + type: "local" + path: "data/pytorch" + entrypoint: "python -m pytest -sv test.py" + debug_entrypoint: "python -m pdb -m pytest -s test.py" + run_timeout: 10 + terminal: + type: "docker" # "local", "docker", or "kubernetes" # LLM configs llm_name: "gpt-4o" diff --git a/scripts/config_aider.yaml b/scripts/config_aider.yaml index 88dd68fb..09f25411 100644 --- a/scripts/config_aider.yaml +++ b/scripts/config_aider.yaml @@ -3,12 +3,11 @@ base: output_path: "exps/aider" benchmark: "aider" problems: "all" # list of problems, e.g., ["wordy"], or "all" - env_kwargs: { - "run_timeout": 20, - } - terminal: { - type: "docker", # "docker", "kubernetes", or "local" - } + env: + type: "aider" + run_timeout: 20 + terminal: + type: "docker" # "docker", "kubernetes", or "local" # LLM configs llm_name: "gpt-4o" diff --git a/scripts/config_mini_nightmare.yaml b/scripts/config_mini_nightmare.yaml index 88fbc08a..e97b4345 100644 --- a/scripts/config_mini_nightmare.yaml +++ b/scripts/config_mini_nightmare.yaml @@ -3,14 +3,11 @@ base: output_path: "exps/mini_nightmare" benchmark: "mini_nightmare" problems: "all" # list of problems, e.g., ["config"], or "all" - env_kwargs: { - "run_timeout": 30, - # shortcut features - } - - terminal: { - type: "docker", # "docker", "kubernetes", or "local" - } + env: + type: "mini_nightmare" + run_timeout: 30 + terminal: + type: "docker" # "docker", "kubernetes", or "local" # LLM configs llm_name: "gpt-4o" diff --git a/scripts/config_r2egym.yaml b/scripts/config_r2egym.yaml index 8d14b79e..57829fd9 100644 --- a/scripts/config_r2egym.yaml +++ b/scripts/config_r2egym.yaml @@ -3,14 +3,13 @@ base: output_path: "exps/re2gym" benchmark: "r2egym" problems: "all" # list of problems, e.g., ["astropy__astropy-12907"], or strings like "test-125" (defined in gym/envs/configs), or "all", - env_kwargs: { - "run_timeout": 300, - dataset_id: "R2E-Gym/R2E-Gym-Lite", + env: + type: "r2egym" + run_timeout: 300 + dataset_id: "R2E-Gym/R2E-Gym-Lite" dataset_revision: "8d3163011f01f9393bb3dc7700497a79a8686ae5" - } - terminal: { - type: "docker", # "docker", "kubernetes" - } + terminal: + type: "docker" # "docker", "kubernetes" # LLM configs llm_name: "gpt-4o" diff --git a/scripts/config_swebench.yaml b/scripts/config_swebench.yaml index 8bc0ba55..b19b5b36 100644 --- a/scripts/config_swebench.yaml +++ b/scripts/config_swebench.yaml @@ -3,14 +3,13 @@ base: output_path: "exps/swebench-verified" benchmark: "swebench-debug" problems: "all" # list of problems, e.g., ["astropy__astropy-12907"], or "all" - env_kwargs: { - "run_timeout": 300, - "dataset_id": "SWE-bench/SWE-bench_Verified", - "dataset_revision": "99450355ca8c611021187a57ffac304b66666738", - } - terminal: { - type: "docker", # "docker", "kubernetes" - } + env: + type: "swebench-debug" + run_timeout: 300 + dataset_id: "SWE-bench/SWE-bench_Verified" + dataset_revision: "99450355ca8c611021187a57ffac304b66666738" + terminal: + type: "docker" # "docker", "kubernetes" # LLM configs llm_name: "gpt-4o" @@ -66,6 +65,8 @@ solution_agent: swe_agent: benchmark: "swebench" + env: + type: "swebench" max_steps: 100 max_rewrite_steps: 20 tools: diff --git a/scripts/config_swesmith.yaml b/scripts/config_swesmith.yaml index 5862e240..26bb08da 100644 --- a/scripts/config_swesmith.yaml +++ b/scripts/config_swesmith.yaml @@ -1,15 +1,15 @@ base: # Environment configs output_path: "exps/swesmith" - benchmark: "swesmith" - problems: "all" # list of problems, e.g., ["astropy__astropy-12907"], or strings like "test-125" (defined in gym/envs/configs), or "all", - env_kwargs: { - "run_timeout": 300, - "dataset_id": "SWE-bench/SWE-smith" - } - terminal: { - type: "docker", # "docker", "kubernetes" - } + env: + # type: "swesmith" # Not needed Will be inferred from dataset. + run_timeout: 300 + terminal: + type: "docker" # "docker", "kubernetes" + dataset: + type: "swesmith" + dataset_id: "SWE-bench/SWE-smith" + problems: "all" # list of problems, e.g., ["astropy__astropy-12907"], or strings like "test-125" (defined in gym/envs/configs), or "all", # LLM configs llm_name: "gpt-4o" diff --git a/scripts/run.py b/scripts/run.py index bc339b3c..a2aac006 100644 --- a/scripts/run.py +++ b/scripts/run.py @@ -186,7 +186,7 @@ def create_env(config: dict, task_data: dict, logger: DebugGymLogger): task_data=task_data, terminal=terminal, logger=logger, - **config["env_kwargs"], + **config.get("env", {}), ) return env @@ -252,10 +252,10 @@ def main(): # Create the environment to get the list of problems to run. dataset_info = { - "dataset_id": config.env_kwargs.get("dataset_id"), - "dataset_revision": config.env_kwargs.get("dataset_revision"), + "dataset_id": config.get("env", {}).get("dataset_id"), + "dataset_revision": config.get("env", {}).get("dataset_revision"), "problems": config.get("problems", "all"), - "prepull_images": config.env_kwargs.get("prepull_images", False), + "prepull_images": config.get("env", {}).get("prepull_images", False), } dataset = select_env(config.get("benchmark")).load_dataset(**dataset_info) problems = sorted(dataset) diff --git a/tests/gym/envs/test_r2egym.py b/tests/gym/envs/test_r2egym.py index 6fda7e9b..3aaa663a 100644 --- a/tests/gym/envs/test_r2egym.py +++ b/tests/gym/envs/test_r2egym.py @@ -1,4 +1,3 @@ -from pathlib import Path from unittest.mock import MagicMock, patch import pyarrow as pa @@ -8,7 +7,6 @@ from debug_gym.agents.solution_agent import AgentSolution from debug_gym.gym.entities import Observation from debug_gym.gym.envs.r2egym import R2EGymEnv -from debug_gym.gym.terminals.docker import DockerTerminal from debug_gym.gym.tools.tool import ToolCall from debug_gym.gym.tools.toolbox import Toolbox @@ -16,14 +14,19 @@ @pytest.if_docker_running def test_load_dataset(get_r2egym_env): env = get_r2egym_env() - assert env.dataset_id == "R2E-Gym/R2E-Gym-Lite" - # check if the dataset contains features that R2EGymEnv expects - assert sorted(env.ds.features.keys()) == sorted( + + dataset = env.load_dataset() + task_name = "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324" + assert task_name in dataset + + task_data = next(iter(dataset.values())) + assert sorted(task_data.keys()) == sorted( [ "commit_hash", "docker_image", "execution_result_content", "expected_output_json", + "instance_id", "modified_entity_summaries", "modified_files", "num_non_test_files", @@ -38,20 +41,15 @@ def test_load_dataset(get_r2egym_env): ) -@patch("docker.from_env") -def test_load_dataset_from_parquet(mock_docker_from_env, tmp_path): +def test_load_dataset_from_parquet(tmp_path): """Test loading R2EGym dataset from a local Parquet file.""" - # Mock Docker client to avoid trying to pull images - mock_docker_client = MagicMock() - mock_docker_client.images.list.return_value = [] - mock_docker_from_env.return_value = mock_docker_client # Create a minimal test Parquet file with expected schema parquet_file = tmp_path / "test_dataset.parquet" - + docker_image = "test_repo:test_hash_123" data = { "commit_hash": ["test_hash_123"], - "docker_image": ["test_repo:test_hash_123"], + "docker_image": [docker_image], "execution_result_content": ["test execution result"], "expected_output_json": ['{"test": "output"}'], "modified_entity_summaries": ["test summaries"], @@ -96,25 +94,25 @@ def test_load_dataset_from_parquet(mock_docker_from_env, tmp_path): # Verify the dataset has the expected data assert len(dataset) == 1 - assert dataset[0]["docker_image"] == "test_repo:test_hash_123" - assert dataset[0]["commit_hash"] == "test_hash_123" - assert "Test problem statement" in dataset[0]["problem_statement"] + task_name = docker_image # For R2EGym, we use docker_image as instance_id + assert docker_image in dataset + assert dataset[task_name]["docker_image"] == "test_repo:test_hash_123" + assert dataset[task_name]["commit_hash"] == "test_hash_123" + assert "Test problem statement" in dataset[task_name]["problem_statement"] @pytest.if_docker_running def test_instructions(get_r2egym_env): env = get_r2egym_env() - env.setup_task("aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324") # Instructions might be wrapped by [ISSUE] [/ISSUE] - assert env.instructions in env.ds_row["problem_statement"] + assert env.instructions in env.task_data["problem_statement"] @pytest.if_docker_running def test_setup_task(get_r2egym_env): env = get_r2egym_env() - task_name = "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324" - env.setup_task(task_name) - assert env.task_name == task_name + assert env.task_name == "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324" + env.setup_task() assert ( env.base_image == "namanjain12/aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324" @@ -127,8 +125,7 @@ def test_setup_task(get_r2egym_env): @pytest.if_docker_running def test_setup_terminal(get_r2egym_env): env = get_r2egym_env() - task_name = "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324" - env.reset(options={"task_name": task_name}) + env.reset() _, output = env.terminal.run(f"ls -a") assert ".git" in output assert "r2e_tests" in output @@ -139,9 +136,7 @@ def test_setup_terminal(get_r2egym_env): def test_reset_and_step(get_r2egym_env): env = get_r2egym_env() env.add_tool(Toolbox.get_tool("eval")) - env_info = env.reset( - options={"task_name": "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"} - ) + env_info = env.reset() assert env.instructions == env_info.step_observation.observation assert "short test summary info" in env_info.eval_observation.observation @@ -196,9 +191,7 @@ def test_reset_and_step(get_r2egym_env): @pytest.if_docker_running def test_readonly_file(get_r2egym_env): env = get_r2egym_env() - env_info = env.reset( - options={"task_name": "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"} - ) + env_info = env.reset() assert env.workspace._is_readonly_func("/testbed/r2e_tests/test_1.py") env.add_tool(Toolbox.get_tool("view")) @@ -228,10 +221,7 @@ def test_readonly_file(get_r2egym_env): def test_apply_gold_patch(get_r2egym_env): env = get_r2egym_env() env.add_tool(Toolbox.get_tool("eval")) - env_info = env.reset( - options={"task_name": "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"} - ) - + env_info = env.reset() assert not env_info.terminated assert not env_info.resolved assert env_info.score == env.score == 0 @@ -246,19 +236,17 @@ def test_apply_gold_patch(get_r2egym_env): def test_running_solution_agent(get_r2egym_env, tmp_path): """End-to-end SolutionAgent run for R2E-Gym environment, asserting successful resolution after gold patch.""" env = get_r2egym_env() - task_name = "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324" config = { "output_path": str(tmp_path), "random_seed": 0, "memory_size": 8, "max_steps": 1, "max_rewrite_steps": 1, - "env_kwargs": {}, } for tool_name in ["pdb", "eval", "submit"]: env.add_tool(Toolbox.get_tool(tool_name)) agent = AgentSolution(agent_args=config, llm=None, logger=env.logger) - env.reset(options={"task_name": task_name}) + env.reset() success = agent.run(env) assert success @@ -267,9 +255,7 @@ def test_running_solution_agent(get_r2egym_env, tmp_path): def test_debug_entrypoint_contains_pdb(get_r2egym_env): """Ensure the environment's debug_entrypoint includes '-m pdb' for interactive debugging.""" env = get_r2egym_env() - env.reset( - options={"task_name": "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"} - ) + env.reset() assert ( "python -m pdb" in env.debug_entrypoint ), f"Expected '-m pdb' in debug_entrypoint, got: {env.debug_entrypoint}" diff --git a/tests/gym/envs/test_swe_bench.py b/tests/gym/envs/test_swe_bench.py index c8f86cb4..c198751e 100644 --- a/tests/gym/envs/test_swe_bench.py +++ b/tests/gym/envs/test_swe_bench.py @@ -10,16 +10,14 @@ @pytest.if_docker_running def test_instructions(get_swe_bench_env): env = get_swe_bench_env() - env.ds_row = {"problem_statement": "Test problem statement"} - expected_instructions = "Test problem statement" - assert env.instructions == expected_instructions + assert env.instructions == env.task_data["problem_statement"] @pytest.if_docker_running def test_reset_and_step(get_swe_bench_env): env = get_swe_bench_env() env.add_tool(Toolbox.get_tool("eval")) - env_info = env.reset(options={"task_name": "astropy__astropy-14096"}) + env_info = env.reset() assert env.instructions == env_info.step_observation.observation assert "short test summary info" in env_info.eval_observation.observation @@ -99,46 +97,51 @@ def test_readonly_file(get_swe_bench_env): assert "|-- test_sky_coord.py (read-only)" in env_info.step_observation.observation -@pytest.if_docker_running def test_load_dataset(get_swe_bench_env): env = get_swe_bench_env() - assert env.dataset_id == "SWE-bench/SWE-bench_Verified" + + dataset = env.load_dataset() task_name = "astropy__astropy-14096" - assert task_name in env.dataset.keys() - assert list(env.ds.features.keys()) == [ - "repo", - "instance_id", - "base_commit", - "patch", - "test_patch", - "problem_statement", - "hints_text", - "created_at", - "version", - "FAIL_TO_PASS", - "PASS_TO_PASS", - "environment_setup_commit", - "difficulty", - ] + assert task_name in dataset + + task_data = next(iter(dataset.values())) + assert sorted(task_data.keys()) == sorted( + [ + "repo", + "instance_id", + "base_commit", + "patch", + "test_patch", + "problem_statement", + "hints_text", + "created_at", + "version", + "FAIL_TO_PASS", + "PASS_TO_PASS", + "environment_setup_commit", + "difficulty", + ] + ) -@pytest.if_docker_running def test_setup_task(get_swe_bench_env): env = get_swe_bench_env() task_name = "astropy__astropy-14096" - env.setup_task(task_name) assert env.task_name == task_name - assert env.ds_row["repo"] == "astropy/astropy" - assert env.ds_row["version"] == "5.1" - assert isinstance(env.ds_row, dict) - assert isinstance(env.install_configs, dict) + env.setup_task() + assert env.repo == "astropy/astropy" + assert env.version == "5.1" + assert env.package_name == "astropy" + assert ( + env.base_image == "swebench/sweb.eval.x86_64.astropy_1776_astropy-14096:latest" + ) @pytest.if_docker_running def test_setup_terminal(get_swe_bench_env): env = get_swe_bench_env() task_name = "astropy__astropy-14096" - env.reset(options={"task_name": task_name}) + env.reset() _, git_logs = env.terminal.run("git log -n 4") assert env.base_commit in git_logs assert f"Applying test patch for {task_name}" not in git_logs @@ -167,7 +170,7 @@ def test_patch_property(tmp_path, get_swe_bench_env): env = get_swe_bench_env() # Reset with a task to set up the environment - env.reset(options={"task_name": "astropy__astropy-14096"}) + env.reset() # Initially, there should be no changes (empty patch) initial_patch = env.patch @@ -218,7 +221,7 @@ def new_function(): def test_apply_gold_patch(get_swe_bench_env): env = get_swe_bench_env() env.add_tool(Toolbox.get_tool("eval")) - env_info = env.reset(options={"task_name": "astropy__astropy-14096"}) + env_info = env.reset() assert not env_info.terminated assert not env_info.resolved @@ -242,12 +245,11 @@ def test_running_solution_agent(get_swe_bench_env, tmp_path): # Optional values that BaseAgent.run would use; harmless to include here. "max_steps": 1, "max_rewrite_steps": 1, - "env_kwargs": {}, } for tool_name in ["pdb", "submit"]: env.add_tool(Toolbox.get_tool(tool_name)) agent = AgentSolution(agent_args=config, llm=None, logger=env.logger) - env.reset(options={"task_name": "astropy__astropy-14096"}) + env.reset() success = agent.run(env) assert success @@ -256,7 +258,7 @@ def test_running_solution_agent(get_swe_bench_env, tmp_path): def test_debug_entrypoint_contains_pdb(get_swe_bench_env): """Ensure the environment's debug_entrypoint includes '-m pdb' for interactive debugging.""" env = get_swe_bench_env() - env.reset(options={"task_name": "astropy__astropy-14096"}) + env.reset() assert ( "python -m pdb" in env.debug_entrypoint ), f"Expected '-m pdb' in debug_entrypoint, got: {env.debug_entrypoint}" @@ -266,7 +268,7 @@ def test_debug_entrypoint_contains_pdb(get_swe_bench_env): def test_setup_terminal_debug_mode(get_swe_bench_debug_env): env = get_swe_bench_debug_env() task_name = "astropy__astropy-14096" - env.reset(options={"task_name": task_name}) + env.reset() _, git_logs = env.terminal.run("git log -n 4") assert env.base_commit in git_logs assert f"Applying test patch for {task_name}" in git_logs @@ -287,11 +289,10 @@ def test_running_solution_agent_in_debug_mode(get_swe_bench_debug_env, tmp_path) # Optional values that BaseAgent.run would use; harmless to include here. "max_steps": 1, "max_rewrite_steps": 1, - "env_kwargs": {}, } for tool_name in ["pdb", "eval", "submit"]: env.add_tool(Toolbox.get_tool(tool_name)) agent = AgentSolution(agent_args=config, llm=None, logger=env.logger) - env.reset(options={"task_name": "astropy__astropy-14096"}) + env.reset() success = agent.run(env) assert success diff --git a/tests/gym/envs/test_swe_smith.py b/tests/gym/envs/test_swe_smith.py index 8c46befc..26b02c9f 100644 --- a/tests/gym/envs/test_swe_smith.py +++ b/tests/gym/envs/test_swe_smith.py @@ -1,11 +1,13 @@ from pathlib import Path import datasets +import pyarrow as pa +import pyarrow.parquet as pq import pytest from debug_gym.agents.solution_agent import AgentSolution from debug_gym.gym.entities import Observation -from debug_gym.gym.envs import SWESmithEnv +from debug_gym.gym.envs.swe_smith import SWESmithEnv from debug_gym.gym.tools.tool import ToolCall from debug_gym.gym.tools.toolbox import Toolbox @@ -13,9 +15,14 @@ @pytest.if_docker_running def test_load_dataset(get_swe_smith_env): env = get_swe_smith_env() - assert env.dataset_id == "SWE-bench/SWE-smith" + + dataset = env.load_dataset() + task_name = "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" + assert task_name in dataset + # check if the dataset contains features that SWESmithEnv expects - assert sorted(env.ds.features.keys()) == sorted( + task_data = next(iter(dataset.values())) + assert sorted(task_data.keys()) == sorted( [ "instance_id", "repo", @@ -32,8 +39,9 @@ def test_load_dataset(get_swe_smith_env): def test_load_dataset_from_parquet(tmp_path): """Test that loading from a local Parquet file works correctly.""" + # Create a sample parquet file with the required features - sample_data = { + data = { "instance_id": ["test-instance-1", "test-instance-2"], "repo": ["test/repo1", "test/repo2"], "patch": ["diff --git a/file.py", "diff --git b/file2.py"], @@ -44,18 +52,17 @@ def test_load_dataset_from_parquet(tmp_path): "base_commit": ["abc123", "def456"], "problem_statement": ["Problem 1", "Problem 2"], } + parquet_file = tmp_path / "test_dataset.parquet" - # Create a dataset and save as parquet - ds = datasets.Dataset.from_dict(sample_data) - parquet_path = tmp_path / "test_dataset.parquet" - ds.to_parquet(str(parquet_path)) + table = pa.table(data) + pq.write_table(table, str(parquet_file)) - # Test that the parquet file can be loaded using datasets library - # mimicking what SWESmithEnv.load_dataset() does for parquet files - loaded_ds = datasets.load_dataset("parquet", data_files=str(parquet_path))["train"] + # Load the dataset from the Parquet file + dataset = SWESmithEnv.load_dataset(dataset_id=str(parquet_file), split="train") + dataset_entry = next(iter(dataset.values())) # Verify that the dataset was loaded correctly with expected features - assert sorted(loaded_ds.features.keys()) == sorted( + assert sorted(dataset_entry.keys()) == sorted( [ "instance_id", "repo", @@ -69,25 +76,20 @@ def test_load_dataset_from_parquet(tmp_path): ] ) # Verify that the data is accessible - assert len(loaded_ds) == 2 - assert loaded_ds[0]["instance_id"] == "test-instance-1" - assert loaded_ds[1]["instance_id"] == "test-instance-2" + assert len(dataset) == 2 + assert sorted(dataset.keys()) == ["test-instance-1", "test-instance-2"] -@pytest.if_docker_running def test_instructions(get_swe_smith_env): env = get_swe_smith_env() - env.ds_row = {"problem_statement": "Test problem statement"} - expected_instructions = "Test problem statement" - assert env.instructions == expected_instructions + assert env.instructions == env.task_data["problem_statement"] -@pytest.if_docker_running def test_setup_task(get_swe_smith_env): env = get_swe_smith_env() task_name = "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" - env.setup_task(task_name) assert env.task_name == task_name + env.setup_task() assert env.repo == "john-kurkowski/tldextract" assert env.branch_name == task_name assert env.package_name == "tldextract" @@ -97,7 +99,7 @@ def test_setup_task(get_swe_smith_env): def test_setup_terminal(get_swe_smith_env): env = get_swe_smith_env() task_name = "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" - env.reset(options={"task_name": task_name}) + env.reset() _, git_logs = env.terminal.run("git log -n 4") # For SWE-Smith the base commit is found in the branch associated to the # instance id and is different from the one in the main branch. @@ -112,11 +114,7 @@ def test_setup_terminal(get_swe_smith_env): def test_reset_and_step(get_swe_smith_env): env = get_swe_smith_env() env.add_tool(Toolbox.get_tool("eval")) - env_info = env.reset( - options={ - "task_name": "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" - } - ) + env_info = env.reset() assert env.instructions == env_info.step_observation.observation assert "short test summary info" in env_info.eval_observation.observation @@ -156,11 +154,7 @@ def test_reset_and_step(get_swe_smith_env): @pytest.if_docker_running def test_readonly_file(get_swe_smith_env): env = get_swe_smith_env() - env_info = env.reset( - options={ - "task_name": "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" - } - ) + env_info = env.reset() env.add_tool(Toolbox.get_tool("view")) env.add_tool(Toolbox.get_tool("listdir")) @@ -199,11 +193,7 @@ def test_readonly_file(get_swe_smith_env): def test_apply_gold_patch(get_swe_smith_env): env = get_swe_smith_env() env.add_tool(Toolbox.get_tool("eval")) - env_info = env.reset( - options={ - "task_name": "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" - } - ) + env_info = env.reset() assert not env_info.terminated assert not env_info.resolved @@ -220,8 +210,7 @@ def test_calculate_score_with_pytest_error(get_swe_smith_env): """Test that the indentation error in pytest is handled correctly.""" env = get_swe_smith_env() env.add_tool(Toolbox.get_tool("eval")) - task_name = "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" - env.reset(options={"task_name": task_name}) + env.reset() # Modify 'tldextract/tldextract.py' in the working_dir to introduce an indentation error. content = env.workspace.read_file("tldextract/tldextract.py").split("\n") @@ -253,19 +242,17 @@ def test_calculate_score_with_pytest_error(get_swe_smith_env): def test_running_solution_agent(get_swe_smith_env, tmp_path): """Analogous to SWE Bench solution agent test: run SolutionAgent end-to-end and assert success.""" env = get_swe_smith_env() - task_name = "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" config = { "output_path": str(tmp_path), "random_seed": 0, "memory_size": 8, "max_steps": 1, "max_rewrite_steps": 1, - "env_kwargs": {}, } for tool_name in ["pdb", "eval", "submit"]: env.add_tool(Toolbox.get_tool(tool_name)) agent = AgentSolution(agent_args=config, llm=None, logger=env.logger) - env.reset(options={"task_name": task_name}) + env.reset() success = agent.run(env) assert success @@ -274,11 +261,7 @@ def test_running_solution_agent(get_swe_smith_env, tmp_path): def test_debug_entrypoint_contains_pdb(get_swe_smith_env): """Ensure the environment's debug_entrypoint includes '-m pdb' for interactive debugging.""" env = get_swe_smith_env() - env.reset( - options={ - "task_name": "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" - } - ) + env.reset() assert ( "python -m pdb" in env.debug_entrypoint ), f"Expected '-m pdb' in debug_entrypoint, got: {env.debug_entrypoint}" From e6fcd586c464e826ea52dc76b7014e51786ee810 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= Date: Fri, 28 Nov 2025 13:11:17 -0800 Subject: [PATCH 24/31] Adding back swesmith --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index a4cfd455..b81e53e8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ transformers==4.51.3 tiktoken docker==7.1.0 swebench==4.0.3 +swesmith==0.0.4 prompt_toolkit==3.0.51 anthropic==0.51.0 jinja2==3.1.6 From c80e6d87a50d069a8246b7cb5446fb2b28789582 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= Date: Mon, 1 Dec 2025 08:43:51 -0800 Subject: [PATCH 25/31] Fixing tests. --- debug_gym/gym/envs/aider.py | 22 +++++++++---- debug_gym/gym/envs/local.py | 7 +++- debug_gym/gym/envs/mini_nightmare.py | 21 ++++++++---- tests/gym/envs/test_aider.py | 12 ++++--- tests/gym/envs/test_env.py | 46 +++++++++++++-------------- tests/gym/envs/test_mini_nightmare.py | 18 ++++++----- tests/gym/test_utils.py | 8 ++--- tests/gym/tools/test_bash.py | 4 +-- tests/gym/tools/test_eval.py | 4 +-- tests/gym/tools/test_grep.py | 12 +++---- tests/gym/tools/test_listdir.py | 4 +-- tests/gym/tools/test_pdb.py | 37 ++++++++------------- tests/gym/tools/test_rewrite.py | 4 +-- tests/gym/tools/test_tool.py | 29 +++++++++-------- tests/gym/tools/test_view.py | 4 +-- 15 files changed, 124 insertions(+), 108 deletions(-) diff --git a/debug_gym/gym/envs/aider.py b/debug_gym/gym/envs/aider.py index 98421e57..d80927ec 100644 --- a/debug_gym/gym/envs/aider.py +++ b/debug_gym/gym/envs/aider.py @@ -1,23 +1,27 @@ +import logging import os import subprocess import tempfile from pathlib import Path -from typing import List import debug_gym.gym.utils as utils from debug_gym.constants import DEBUG_GYM_CACHE_DIR from debug_gym.gym.entities import EvalOutput from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.terminals.docker import DockerTerminal from debug_gym.gym.terminals.terminal import Terminal +from debug_gym.logger import DebugGymLogger DOCKER_AIDER_IMAGE_NAME = "debug-gym:aider" -def build_docker_image(logger): +def build_docker_image(logger: logging.Logger | None = None): """ Build a Docker image for the Mini Nightmare environment. """ + logger = logger or DebugGymLogger("debug-gym") + # Check if Docker image is built. import docker @@ -75,8 +79,13 @@ def __init__( if hasattr(terminal, "base_image") and terminal.base_image is None: terminal.base_image = DOCKER_AIDER_IMAGE_NAME - self.task_data = task_data - super().__init__(entrypoint=entrypoint, terminal=terminal, **kwargs) + super().__init__( + task_data=task_data, entrypoint=entrypoint, terminal=terminal, **kwargs + ) + + @property + def task_name(self) -> str: + return self.current_task["task_name"] @property def instructions(self) -> str: @@ -95,7 +104,7 @@ def eval(self, **kwargs) -> EvalOutput: return self.last_eval def setup_task(self): - pass + self.current_task = self.task_data def setup_workspace(self): self.workspace.reset() @@ -127,7 +136,7 @@ def setup_terminal(self): def load_dataset( cls, problems: str | list[str] | None = None, - build_image: bool = False, + build_image: bool = True, logger: object = None, ) -> dict: if build_image: @@ -167,6 +176,7 @@ def load_dataset( ) dataset[task_name] = { + "task_name": task_name, "codebase": directory, "instructions": instructions, "filename": task_name + ".py", diff --git a/debug_gym/gym/envs/local.py b/debug_gym/gym/envs/local.py index c3b8d54e..e2134014 100644 --- a/debug_gym/gym/envs/local.py +++ b/debug_gym/gym/envs/local.py @@ -1,4 +1,6 @@ from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.terminals.local import LocalTerminal +from debug_gym.gym.terminals.terminal import Terminal class LocalEnv(RepoEnv): @@ -6,20 +8,23 @@ class LocalEnv(RepoEnv): def __init__( self, path: str, + terminal: Terminal | None = None, entrypoint: str = "python -m pytest -sq .", debug_entrypoint: str | None = None, **kwargs, ): task_data = {"path": path} + terminal = terminal or LocalTerminal() super().__init__( task_data=task_data, + terminal=terminal, entrypoint=entrypoint, debug_entrypoint=debug_entrypoint, **kwargs, ) @property - def instruction(self) -> str: + def instructions(self) -> str: return f"Debug the local codebase at {self.path}. Investigate the repository, figure out the root cause, then rewrite the code to fix the issue." @property diff --git a/debug_gym/gym/envs/mini_nightmare.py b/debug_gym/gym/envs/mini_nightmare.py index 2c59213b..83c128be 100644 --- a/debug_gym/gym/envs/mini_nightmare.py +++ b/debug_gym/gym/envs/mini_nightmare.py @@ -1,3 +1,4 @@ +import logging import tempfile from pathlib import Path @@ -7,14 +8,16 @@ from debug_gym.gym.envs.env import RepoEnv from debug_gym.gym.terminals.docker import DockerTerminal from debug_gym.gym.terminals.terminal import Terminal +from debug_gym.logger import DebugGymLogger DOCKER_MINI_NIGHTMARE_IMAGE_NAME = "debug-gym:mini-nightmare" -def build_docker_image(logger): +def build_docker_image(logger: logging.Logger | None = None): """ Build a Docker image for the Mini Nightmare environment. """ + logger = logger or DebugGymLogger("debug-gym") # Check if Docker image is built. import docker @@ -86,10 +89,9 @@ def __init__( if hasattr(terminal, "base_image") and terminal.base_image is None: terminal.base_image = DOCKER_MINI_NIGHTMARE_IMAGE_NAME - self.task_data = task_data - self.task_name = task_data["task_name"] - - super().__init__(entrypoint=entrypoint, terminal=terminal, **kwargs) + super().__init__( + task_data=task_data, entrypoint=entrypoint, terminal=terminal, **kwargs + ) @property def instructions(self) -> str: @@ -99,6 +101,10 @@ def instructions(self) -> str: " Beaware that the bug may not be in the code you initially see." ) + @property + def task_name(self) -> str: + return self.current_task["task_name"] + def calculate_max_score(self, eval_output: EvalOutput) -> int: return utils.extract_max_score_from_pytest_output(eval_output.output) @@ -112,7 +118,7 @@ def eval(self, **kwargs) -> EvalOutput: return self.last_eval def setup_task(self): - pass + self.current_task = self.task_data def setup_workspace(self): self.workspace.reset() @@ -144,7 +150,7 @@ def setup_terminal(self): def load_dataset( cls, problems: str | list[str] | None = None, - build_image: bool = False, + build_image: bool = True, logger: object = None, ) -> dict: if build_image: @@ -167,6 +173,7 @@ def load_dataset( assert (task_path / ".debugreadonly").exists() dataset[task_name] = { + "task_name": task_name, "codebase": task_path, "filename": task_name + "_code.py", } diff --git a/tests/gym/envs/test_aider.py b/tests/gym/envs/test_aider.py index 8786e291..ed2a2ac6 100644 --- a/tests/gym/envs/test_aider.py +++ b/tests/gym/envs/test_aider.py @@ -37,8 +37,10 @@ def setup_aider_repo(tmp_path_factory): @pytest.fixture def env(setup_aider_repo): terminal = LocalTerminal() - env = AiderBenchmarkEnv(terminal=terminal) - env.reset(options={"task_name": "clock"}) + dataset = AiderBenchmarkEnv.load_dataset() + task_data = dataset["clock"] + env = AiderBenchmarkEnv(task_data=task_data, terminal=terminal) + env.reset() return env @@ -103,13 +105,15 @@ def test_instructions(env): @patch("debug_gym.gym.envs.aider.build_docker_image") def test_build_docker_image(mock_build_docker_image): - AiderBenchmarkEnv() + dataset = AiderBenchmarkEnv.load_dataset() mock_build_docker_image.assert_called_once() @pytest.if_docker_running def test_reset_with_docker_terminal(setup_aider_repo): - env = AiderBenchmarkEnv() + dataset = AiderBenchmarkEnv.load_dataset() + task_data = dataset["clock"] + env = AiderBenchmarkEnv(task_data=task_data) env.add_tool(Toolbox.get_tool("eval")) assert isinstance(env.terminal, DockerTerminal) diff --git a/tests/gym/envs/test_env.py b/tests/gym/envs/test_env.py index 1a77a1ed..6a036893 100644 --- a/tests/gym/envs/test_env.py +++ b/tests/gym/envs/test_env.py @@ -6,13 +6,14 @@ from debug_gym.gym.entities import EvalOutput, Event, Observation from debug_gym.gym.envs.env import EnvInfo, EventHooks, RepoEnv, TooledEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.tools.tool import ToolCall from debug_gym.gym.tools.toolbox import Toolbox @pytest.fixture -def env_mock(): - env = RepoEnv() +def env_mock(tmp_path): + env = LocalEnv(path=tmp_path) return env @@ -109,7 +110,7 @@ def test_tool_names(env_mock): assert env_mock.tool_names == "tool1, tool2" -def test_env_tools(): +def test_env_tools(env_mock): tool1 = MagicMock() tool1.name = "tool1" tool1.description = "instructions1" @@ -129,11 +130,10 @@ def test_env_tools(): }, } - env = RepoEnv() - env.add_tool(tool1) - env.add_tool(tool2) + env_mock.add_tool(tool1) + env_mock.add_tool(tool2) - assert env.tools == [tool1, tool2] + assert env_mock.tools == [tool1, tool2] @pytest.fixture @@ -147,7 +147,7 @@ def env(tmp_path): (repo_path / "file2.txt").touch() (subdir_path / "subfile1.txt").touch() - env = RepoEnv(path=repo_path) + env = LocalEnv(path=repo_path) return env @@ -186,7 +186,7 @@ def test_step( mock_pdb_tool.current_frame_file = "file.py" mock_get_tool.return_value = None - env = RepoEnv(path=tmp_path) + env = LocalEnv(path=tmp_path) env.reset() env.last_eval = EvalOutput(success=False, output="1 failed, 0 passed") tool_call = ToolCall(id="123", name="pdb", arguments={"command": "b 10"}) @@ -210,7 +210,7 @@ def test_reset(tmp_path): (tmp_path / "test.py").write_text("def test_1():\n assert False\n") (tmp_path / ".debugignore").write_text("__pycache__/\n.git/\n.pytest_cache/\n") - env = RepoEnv(path=tmp_path, entrypoint="pytest test.py") + env = LocalEnv(path=tmp_path, entrypoint="pytest test.py") infos = env.reset() assert env.last_eval is None @@ -224,7 +224,7 @@ def test_reset(tmp_path): action_reasoning=None, action_content=None, action_tool_call=None, - instructions="", + instructions=env.instructions, score=0, max_score=None, terminated=False, @@ -276,7 +276,7 @@ def test_eval(tmp_path): (tmp_path / "test.py").write_text("def test_1():\n assert False\n") (tmp_path / ".debugignore").write_text("__pycache__/\n.git/\n.pytest_cache/\n") - env = RepoEnv(path=tmp_path, entrypoint="pytest test.py") + env = LocalEnv(path=tmp_path, entrypoint="pytest test.py") env.reset() env.eval() assert "FAILED test.py::test_1 - assert False" in env.last_eval.output @@ -287,7 +287,7 @@ def test_eval_success(tmp_path): # create a dummy file with open(tmp_path / "file.py", "w") as f: f.write("print('Hello, World!')") - env = RepoEnv(path=working_dir, entrypoint="python file.py") + env = LocalEnv(path=working_dir, entrypoint="python file.py") env.reset() output = env.eval() assert output == EvalOutput(success=True, output="Hello, World!") @@ -298,7 +298,7 @@ def test_eval_timeout(tmp_path): # runs for longer than the timeout with open(tmp_path / "file.py", "w") as f: f.write("import time; time.sleep(5)") - env = RepoEnv(path=working_dir, entrypoint="python file.py", run_timeout=1) + env = LocalEnv(path=working_dir, entrypoint="python file.py", run_timeout=1) env.reset() output = env.eval() assert output == EvalOutput(success=False, output="Timeout expired.") @@ -371,22 +371,20 @@ def test_event_hooks_notify(): subscriber.on_env_start.assert_called_once() -def test_current_breakpoints_no_breakpoints(): - env = RepoEnv() - env.current_breakpoints_state = {} - result = env.current_breakpoints() +def test_current_breakpoints_no_breakpoints(env_mock): + env_mock.current_breakpoints_state = {} + result = env_mock.current_breakpoints() assert result == "No breakpoints are set." -def test_current_breakpoints_with_breakpoints(tmp_path): - env = RepoEnv() - env.current_breakpoints_state = { +def test_current_breakpoints_with_breakpoints(tmp_path, env_mock): + env_mock.current_breakpoints_state = { "file1.py|||10": "b file1.py:10", "file1.py|||20": "b file1.py:20", "file1.py|||30": "b file1.py:30", "file2.py|||15": "b file2.py:15", } - result = env.current_breakpoints() + result = env_mock.current_breakpoints() expected_result = ( "line 10 in file1.py\n" "line 20 in file1.py\n" @@ -424,7 +422,7 @@ def test_queue_and_process_events(): def test_has_breakpoint_true_and_false(tmp_path): - env = RepoEnv(path=tmp_path) + env = LocalEnv(path=tmp_path) env.reset() file_path = env.working_dir / "test.py" file_path.write_text("print('hello')") @@ -438,7 +436,7 @@ def test_has_breakpoint_true_and_false(tmp_path): def test_has_breakpoint_relative_path(tmp_path): - env = RepoEnv(path=tmp_path) + env = LocalEnv(path=tmp_path) env.reset() file_path = env.working_dir / "foo.py" file_path.write_text("print('foo')") diff --git a/tests/gym/envs/test_mini_nightmare.py b/tests/gym/envs/test_mini_nightmare.py index eee46ee4..0a8590c5 100644 --- a/tests/gym/envs/test_mini_nightmare.py +++ b/tests/gym/envs/test_mini_nightmare.py @@ -12,23 +12,23 @@ def mini_nightmare_env(): # Initialize the MiniNightmareEnv with LocalTerminal terminal = LocalTerminal() - env = MiniNightmareEnv(terminal=terminal) + dataset = MiniNightmareEnv.load_dataset() + task_data = dataset["config"] + env = MiniNightmareEnv(task_data=task_data, terminal=terminal) env.add_tool(Toolbox.get_tool("eval")) return env def test_load_dataset(mini_nightmare_env): - dataset = mini_nightmare_env.load_dataset() - assert mini_nightmare_env.dataset == dataset - + dataset = MiniNightmareEnv.load_dataset() subproblems = list(dataset.keys())[::2] - subset = mini_nightmare_env.load_dataset(problems=subproblems) + subset = MiniNightmareEnv.load_dataset(problems=subproblems) assert list(subset.keys()) == subproblems @patch("debug_gym.gym.envs.mini_nightmare.build_docker_image") def test_build_docker_image(mock_build_docker_image): - MiniNightmareEnv() + dataset = MiniNightmareEnv.load_dataset() mock_build_docker_image.assert_called_once() @@ -53,11 +53,13 @@ def test_reset(mini_nightmare_env): @pytest.if_docker_running def test_reset_with_docker_terminal(): - env = MiniNightmareEnv() + dataset = MiniNightmareEnv.load_dataset() + task_data = dataset["config"] + env = MiniNightmareEnv(task_data=task_data) env.add_tool(Toolbox.get_tool("eval")) assert isinstance(env.terminal, DockerTerminal) - infos = env.reset(options={"task_name": "config"}) + infos = env.reset() assert env.instructions == infos.step_observation.observation assert "2 failed" in infos.eval_observation.observation assert infos.max_score == 2 diff --git a/tests/gym/test_utils.py b/tests/gym/test_utils.py index 6a51b583..47f50335 100644 --- a/tests/gym/test_utils.py +++ b/tests/gym/test_utils.py @@ -2,7 +2,7 @@ import pytest -from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.utils import ( _walk, cleanup_pytest_output, @@ -45,7 +45,7 @@ def test_show_line_number_no_code_path_no_breakpoints(): def test_show_line_number_with_code_path(tmp_path): - env = RepoEnv(path=tmp_path) + env = LocalEnv(path=tmp_path) env.reset() code_path = f"{env.working_dir}/code.py" breakpoints_state = {f"{code_path}|||2": "b 2"} @@ -65,7 +65,7 @@ def test_show_line_number_with_code_path(tmp_path): def test_show_line_number_multiple_breakpoints(tmp_path): - env = RepoEnv(path=tmp_path) + env = LocalEnv(path=tmp_path) env.reset() code_path = f"{env.working_dir}/code.py" breakpoints_state = { @@ -92,7 +92,7 @@ def test_show_line_number_multiple_breakpoints(tmp_path): def test_show_line_number_multiple_breakpoints_with_start_index(tmp_path): - env = RepoEnv(path=tmp_path) + env = LocalEnv(path=tmp_path) env.reset() code_path = f"{env.working_dir}/code.py" breakpoints_state = { diff --git a/tests/gym/tools/test_bash.py b/tests/gym/tools/test_bash.py index 5e7d860e..5644066a 100644 --- a/tests/gym/tools/test_bash.py +++ b/tests/gym/tools/test_bash.py @@ -4,7 +4,7 @@ import pytest from debug_gym.gym.entities import Observation -from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.tools.bash import BashTool from debug_gym.gym.tools.tool import ToolCall from debug_gym.gym.tools.toolbox import Toolbox @@ -30,7 +30,7 @@ def env(tmp_path): with open(subdir / "nested.txt", "w") as f: f.write("nested file content") - env = RepoEnv(path=repo_path) + env = LocalEnv(path=repo_path) bash_tool = Toolbox.get_tool("bash") env.add_tool(bash_tool) env.reset() diff --git a/tests/gym/tools/test_eval.py b/tests/gym/tools/test_eval.py index 7279de81..4bae1026 100644 --- a/tests/gym/tools/test_eval.py +++ b/tests/gym/tools/test_eval.py @@ -2,7 +2,7 @@ import pytest -from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.tools.tool import ToolCall from debug_gym.gym.tools.toolbox import Toolbox @@ -15,7 +15,7 @@ def env(tmp_path): with open(repo_path / "test_1.py", "w") as f: f.write("def test_1():\n assert False\n") - env = RepoEnv(path=repo_path) + env = LocalEnv(path=repo_path) env.reset() return env diff --git a/tests/gym/tools/test_grep.py b/tests/gym/tools/test_grep.py index 9d3e7b4e..b594bd6f 100644 --- a/tests/gym/tools/test_grep.py +++ b/tests/gym/tools/test_grep.py @@ -1,10 +1,6 @@ -import os -import tempfile -from pathlib import Path - import pytest -from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.tools.grep import GrepTool @@ -35,7 +31,7 @@ def hello_world(): class TestClass: def __init__(self): self.value = 42 - + def method_with_bug(self): # TODO: Fix this bug return self.value / 0 # This will cause a division by zero error @@ -62,7 +58,7 @@ def load_config(filename): class EmailValidator: def __init__(self): self.pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$' - + def validate(self, email): return re.match(self.pattern, email) is not None """ @@ -209,7 +205,7 @@ def _setup_grep_repo_env(base_dir, ignore_patterns=None, readonly_patterns=None) with (test_repo / ".debugreadonly").open("w") as f: f.write("\n".join(readonly_patterns)) - env = RepoEnv(path=str(test_repo)) + env = LocalEnv(path=str(test_repo)) grep_tool = GrepTool() env.reset() return grep_tool, env diff --git a/tests/gym/tools/test_listdir.py b/tests/gym/tools/test_listdir.py index 4198266a..c405ae05 100644 --- a/tests/gym/tools/test_listdir.py +++ b/tests/gym/tools/test_listdir.py @@ -1,6 +1,6 @@ import pytest -from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.tools.listdir import ListdirTool @@ -8,7 +8,7 @@ def setup_listdir_repo_env(setup_test_repo): def _setup_listdir_repo_env(base_dir): test_repo = setup_test_repo(base_dir) - env = RepoEnv(path=str(test_repo)) + env = LocalEnv(path=str(test_repo)) listdir_tool = ListdirTool() listdir_tool.register(env) env.reset() diff --git a/tests/gym/tools/test_pdb.py b/tests/gym/tools/test_pdb.py index 23232ce9..0b6caf13 100644 --- a/tests/gym/tools/test_pdb.py +++ b/tests/gym/tools/test_pdb.py @@ -7,10 +7,8 @@ import pytest from debug_gym.gym.entities import Event -from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.terminals.docker import DockerTerminal -from debug_gym.gym.terminals.local import LocalTerminal -from debug_gym.gym.terminals.shell_session import ProcessNotRunningError from debug_gym.gym.tools.pdb import PDBTool @@ -60,7 +58,7 @@ def _breakpoints_state(working_dir): def setup_pdb_repo_env(setup_test_repo, setup_breakpoints_state): def _setup_pdb_repo_env(base_dir): test_repo = setup_test_repo(base_dir) - env = RepoEnv(path=str(test_repo)) + env = LocalEnv(path=str(test_repo)) pdb_tool = PDBTool(persistent_breakpoints=True, auto_list=True) pdb_tool.register(env) env.reset() @@ -75,10 +73,8 @@ def _setup_pdb_repo_env(base_dir): def test_pdb_use(tmp_path, setup_test_repo): # Test PDBTool with LocalTerminal, verbose pytest tests_path = str(setup_test_repo(tmp_path)) - terminal = LocalTerminal() - env = RepoEnv( + env = LocalEnv( path=tests_path, - terminal=terminal, debug_entrypoint="python -m pdb -m pytest -sv .", ) env.reset() @@ -103,10 +99,8 @@ def test_pdb_use(tmp_path, setup_test_repo): def test_pdb_use_empty_command(tmp_path, setup_test_repo): # Test PDBTool with LocalTerminal, verbose pytest tests_path = str(setup_test_repo(tmp_path)) - terminal = LocalTerminal() - env = RepoEnv( + env = LocalEnv( path=tests_path, - terminal=terminal, debug_entrypoint="python -m pdb -m pytest -sv .", ) env.reset() @@ -120,10 +114,8 @@ def test_pdb_use_empty_command(tmp_path, setup_test_repo): def test_pdb_b_fail_blank_or_comment(tmp_path, setup_test_repo): # Test PDBTool with LocalTerminal, verbose pytest tests_path = str(setup_test_repo(tmp_path)) - terminal = LocalTerminal() - env = RepoEnv( + env = LocalEnv( path=tests_path, - terminal=terminal, debug_entrypoint="python -m pdb -m pytest -sv .", ) env.reset() @@ -141,10 +133,8 @@ def test_pdb_b_fail_blank_or_comment(tmp_path, setup_test_repo): def test_pdb_pass_empty_path_if_in_session(tmp_path, setup_test_repo): # Test PDBTool with LocalTerminal, verbose pytest tests_path = str(setup_test_repo(tmp_path)) - terminal = LocalTerminal() - env = RepoEnv( + env = LocalEnv( path=tests_path, - terminal=terminal, debug_entrypoint="python -m pdb -m pytest -sv .", ) env.reset() @@ -164,8 +154,7 @@ def test_pdb_pass_empty_path_if_in_session(tmp_path, setup_test_repo): def test_pdb_use_default_env_entrypoint(tmp_path, setup_test_repo): # Test PDBTool with default env entrypoint, quiet pytest tests_path = str(setup_test_repo(tmp_path)) - terminal = LocalTerminal() - env = RepoEnv(path=tests_path, terminal=terminal) + env = LocalEnv(path=tests_path) env.reset() pdb = PDBTool() initial_output = pdb.start_pdb(env) # "python -m pdb -m pytest -sq ." @@ -202,7 +191,9 @@ def test_pdb_use_docker_terminal(tmp_path, setup_test_repo): ) # no:cacheprovider to avoid .pytest_cache, --tb=short to reduce output debug_entrypoint = "python -m pdb -m pytest -p no:cacheprovider --color=no -sv ." - env = RepoEnv(path=tests_path, terminal=terminal, debug_entrypoint=debug_entrypoint) + env = LocalEnv( + path=tests_path, terminal=terminal, debug_entrypoint=debug_entrypoint + ) env.reset() pdb = PDBTool() pdb.start_pdb(env) @@ -228,8 +219,8 @@ def test_initialization(): assert pdb_tool._session is None -def test_register(): - env = RepoEnv() +def test_register(tmp_path): + env = LocalEnv(path=tmp_path) pdb_tool = PDBTool() pdb_tool.register(env) # every tool listen to ENV_RESET event to track history @@ -369,7 +360,7 @@ def test_pdb_crashing(tmp_path, setup_test_repo): with open(tests_path / "test_fail.py", "w") as f: f.write("def test_fail():\nassert False") # IndentationError - env = RepoEnv( + env = LocalEnv( path=tests_path, entrypoint="python -m pytest -s test.py", debug_entrypoint="python -m pdb -m pytest -s test_fail.py", @@ -390,7 +381,7 @@ def test_pdb_timeout(tmp_path, setup_test_repo): "def test_fail():\n print('Sleeping...'); import time; time.sleep(10)" ) # IndentationError - env = RepoEnv( + env = LocalEnv( path=tests_path, entrypoint="python -m pytest -s test.py", debug_entrypoint="python -m pdb -m pytest -sv test_fail.py", diff --git a/tests/gym/tools/test_rewrite.py b/tests/gym/tools/test_rewrite.py index e8ad0772..003f31e6 100644 --- a/tests/gym/tools/test_rewrite.py +++ b/tests/gym/tools/test_rewrite.py @@ -2,7 +2,7 @@ import pytest -from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.tools.rewrite import RewriteTool @@ -23,7 +23,7 @@ def env(tmp_path): with open(repo_path / "test.py", "w") as f: f.write(file_content) - env = RepoEnv(path=repo_path) + env = LocalEnv(path=repo_path) rewrite_tool = RewriteTool() env.add_tool(rewrite_tool) diff --git a/tests/gym/tools/test_tool.py b/tests/gym/tools/test_tool.py index 010526cd..a724befe 100644 --- a/tests/gym/tools/test_tool.py +++ b/tests/gym/tools/test_tool.py @@ -1,7 +1,10 @@ +from pathlib import Path + import pytest from debug_gym.gym.entities import Observation -from debug_gym.gym.envs.env import Event, RepoEnv +from debug_gym.gym.envs.env import Event +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.tools.tool import EnvironmentTool, Record from debug_gym.gym.tools.toolbox import Toolbox @@ -13,9 +16,14 @@ def use(self, env, action): return Observation("FakeTool", action) -def test_register_valid_environment(): +@pytest.fixture +def env(tmp_path): + env = LocalEnv(path=tmp_path) + return env + + +def test_register_valid_environment(env): tool = FakeTool() - env = RepoEnv() tool.register(env) # every tool listen to ENV_RESET event to track history assert tool in env.event_hooks.event_listeners[Event.ENV_RESET] @@ -46,7 +54,7 @@ class CompletelyFakeTool(EnvironmentTool): tool = CompletelyFakeTool() -def test_auto_subscribe(monkeypatch): +def test_auto_subscribe(monkeypatch, env): @Toolbox.register() class ToolWithHandler(FakeTool): @@ -55,7 +63,6 @@ def on_env_reset(self, **kwargs): tool = ToolWithHandler() - env = RepoEnv() env.add_tool(tool) assert tool in env.event_hooks.event_listeners[Event.ENV_RESET] @@ -65,9 +72,8 @@ def on_env_reset(self, **kwargs): assert tool not in env.event_hooks.event_listeners[channel] -def test_track_history(): +def test_track_history(env): tool = FakeTool() - env = RepoEnv() assert hasattr(tool, "history") assert isinstance(tool.history, list) @@ -90,18 +96,16 @@ def test_track_history(): ) -def test_unknown_args(): +def test_unknown_args(env): tool = FakeTool() - env = RepoEnv() obs = tool(env, unknown_arg="unknown_value") assert obs == Observation( "FakeTool", "FakeTool.use() got an unexpected keyword argument 'unknown_arg'" ) -def test_unregister(): +def test_unregister(env): tool = FakeTool() - env = RepoEnv() tool.register(env) # Verify tool is registered @@ -120,7 +124,7 @@ def test_unregister_invalid_environment(): tool.unregister(object()) -def test_unregister_with_multiple_handlers(): +def test_unregister_with_multiple_handlers(env): class ToolWithMultipleHandlers(FakeTool): def on_env_reset(self, environment, **kwargs): return "Handler for Event.ENV_RESET" @@ -129,7 +133,6 @@ def on_env_step(self, environment, **kwargs): return "Handler for Event.ENV_STEP" tool = ToolWithMultipleHandlers() - env = RepoEnv() tool.register(env) # Verify tool is registered for both events diff --git a/tests/gym/tools/test_view.py b/tests/gym/tools/test_view.py index ec2742bb..5d9f5e10 100644 --- a/tests/gym/tools/test_view.py +++ b/tests/gym/tools/test_view.py @@ -3,7 +3,7 @@ import pytest from debug_gym.gym.entities import Observation -from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.tools.tool import ToolCall from debug_gym.gym.tools.toolbox import Toolbox @@ -29,7 +29,7 @@ def env(tmp_path): (repo_path / "empty.py").touch() # Create an empty file - env = RepoEnv(path=repo_path) + env = LocalEnv(path=repo_path) view_tool = Toolbox.get_tool("view") env.add_tool(view_tool) env.reset() From 7d8268ebc64b116511bc08d24d07df2fecf00544 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= Date: Mon, 1 Dec 2025 09:53:58 -0800 Subject: [PATCH 26/31] Print disk space after installing library. --- .github/actions/test-if-changes/action.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/actions/test-if-changes/action.yml b/.github/actions/test-if-changes/action.yml index a2c872cf..dcda5618 100644 --- a/.github/actions/test-if-changes/action.yml +++ b/.github/actions/test-if-changes/action.yml @@ -39,6 +39,7 @@ runs: else pip install "debug-gym[dev]==${{ inputs.version }}" fi + df -h - name: Run tests env: DEBUG_GYM_DEBUG: 1 From 866ac3955166841fbe46014ead3897a754d8cbfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= Date: Mon, 1 Dec 2025 11:08:21 -0800 Subject: [PATCH 27/31] When creating ficture env, reset the env in master thread first --- tests/gym/envs/conftest.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/gym/envs/conftest.py b/tests/gym/envs/conftest.py index e79af5ba..660cc1f5 100644 --- a/tests/gym/envs/conftest.py +++ b/tests/gym/envs/conftest.py @@ -31,13 +31,16 @@ def make_env_factory(env_name, worker_id, tmp_path_factory): env_class = kwargs.pop("env_class") def _make_env(): - dataset = env_class.load_dataset(problems=kwargs["problems"]) + dataset = env_class.load_dataset( + problems=kwargs["problems"], prepull_images=True + ) task_data = next(iter(dataset.values())) return env_class(task_data=task_data) if worker_id == "master": # Not running with pytest-xdist or we are in the master process - _make_env() + env = _make_env() + env.reset() else: # When running with pytest-xdist, synchronize between workers using a lock root_tmp_dir = tmp_path_factory.getbasetemp().parent From 1f4661a1c56a3ac682075439c933356271c0979a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= Date: Mon, 1 Dec 2025 11:53:15 -0800 Subject: [PATCH 28/31] Disabling async pytests --- .github/actions/test-if-changes/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/test-if-changes/action.yml b/.github/actions/test-if-changes/action.yml index dcda5618..a7f96449 100644 --- a/.github/actions/test-if-changes/action.yml +++ b/.github/actions/test-if-changes/action.yml @@ -45,7 +45,7 @@ runs: DEBUG_GYM_DEBUG: 1 shell: bash run: | - pytest ${{ inputs.test-files }} -vv -n 16 --timeout=600 --cov=debug_gym --cov-report=term-missing + pytest ${{ inputs.test-files }} -vv --timeout=600 --cov=debug_gym --cov-report=term-missing - name: Store coverage report uses: actions/upload-artifact@v4 with: From 424b3dd8d16a2fbba7328845b58878af1829719b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= Date: Mon, 1 Dec 2025 12:11:47 -0800 Subject: [PATCH 29/31] Reenable async pytests + make sure to provide specific problem to load_dataset --- .github/actions/test-if-changes/action.yml | 2 +- tests/gym/envs/conftest.py | 3 +-- tests/gym/envs/test_r2egym.py | 2 +- tests/gym/envs/test_swe_smith.py | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/actions/test-if-changes/action.yml b/.github/actions/test-if-changes/action.yml index a7f96449..dcda5618 100644 --- a/.github/actions/test-if-changes/action.yml +++ b/.github/actions/test-if-changes/action.yml @@ -45,7 +45,7 @@ runs: DEBUG_GYM_DEBUG: 1 shell: bash run: | - pytest ${{ inputs.test-files }} -vv --timeout=600 --cov=debug_gym --cov-report=term-missing + pytest ${{ inputs.test-files }} -vv -n 16 --timeout=600 --cov=debug_gym --cov-report=term-missing - name: Store coverage report uses: actions/upload-artifact@v4 with: diff --git a/tests/gym/envs/conftest.py b/tests/gym/envs/conftest.py index 660cc1f5..db78b192 100644 --- a/tests/gym/envs/conftest.py +++ b/tests/gym/envs/conftest.py @@ -39,8 +39,7 @@ def _make_env(): if worker_id == "master": # Not running with pytest-xdist or we are in the master process - env = _make_env() - env.reset() + _make_env() else: # When running with pytest-xdist, synchronize between workers using a lock root_tmp_dir = tmp_path_factory.getbasetemp().parent diff --git a/tests/gym/envs/test_r2egym.py b/tests/gym/envs/test_r2egym.py index 3aaa663a..dd31c776 100644 --- a/tests/gym/envs/test_r2egym.py +++ b/tests/gym/envs/test_r2egym.py @@ -15,8 +15,8 @@ def test_load_dataset(get_r2egym_env): env = get_r2egym_env() - dataset = env.load_dataset() task_name = "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324" + dataset = env.load_dataset(problems=[task_name]) assert task_name in dataset task_data = next(iter(dataset.values())) diff --git a/tests/gym/envs/test_swe_smith.py b/tests/gym/envs/test_swe_smith.py index 26b02c9f..65c9e906 100644 --- a/tests/gym/envs/test_swe_smith.py +++ b/tests/gym/envs/test_swe_smith.py @@ -16,8 +16,8 @@ def test_load_dataset(get_swe_smith_env): env = get_swe_smith_env() - dataset = env.load_dataset() task_name = "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" + dataset = env.load_dataset(problems=[task_name]) assert task_name in dataset # check if the dataset contains features that SWESmithEnv expects From e0263a225a87b7aef0e721305329cd4bc0212b8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= Date: Mon, 1 Dec 2025 12:31:45 -0800 Subject: [PATCH 30/31] Fixing load_dataset --- debug_gym/gym/envs/r2egym.py | 20 ++++++++------------ debug_gym/gym/envs/swe_smith.py | 18 ++++++++---------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/debug_gym/gym/envs/r2egym.py b/debug_gym/gym/envs/r2egym.py index 17cd5ac7..ee9bfa12 100644 --- a/debug_gym/gym/envs/r2egym.py +++ b/debug_gym/gym/envs/r2egym.py @@ -262,6 +262,7 @@ def load_dataset( prepull_images: bool = False, logger: DebugGymLogger | None = None, ) -> dict: + logger = logger or DebugGymLogger("debug_gym") data_path = Path(dataset_id) if data_path.is_file(): # Loading from local file. @@ -299,10 +300,9 @@ def extract_instance_id(docker_image: str) -> str: dataset = {pid: dataset[pid] for pid in problems} image_names = set(example["docker_image"] for example in dataset.values()) - if logger is not None: - logger.debug( - f"Loaded {len(dataset)} tasks across {len(image_names)} Docker images from {dataset_id}." - ) + logger.debug( + f"Loaded {len(dataset)} tasks across {len(image_names)} Docker images from {dataset_id}." + ) if prepull_images: # Download all images needed for R2E-Gym. @@ -313,14 +313,10 @@ def extract_instance_id(docker_image: str) -> str: ) missing_images = image_names - existing_images if missing_images: - if logger is not None: + logger.warning(f"Found {len(missing_images)} missing Docker images.") + for i, image_name in enumerate(missing_images): logger.warning( - f"Found {len(missing_images)} missing Docker images." + f"Pulling Docker image {i + 1}/{len(missing_images)} `{image_name}`." ) - for i, image_name in enumerate(missing_images): - if logger is not None: - logger.warning( - f"Pulling Docker image {i + 1}/{len(missing_images)} `{image_name}`." - ) - client.images.pull(image_name) + client.images.pull(image_name) return dataset diff --git a/debug_gym/gym/envs/swe_smith.py b/debug_gym/gym/envs/swe_smith.py index 8511a1a7..fc507032 100644 --- a/debug_gym/gym/envs/swe_smith.py +++ b/debug_gym/gym/envs/swe_smith.py @@ -154,6 +154,7 @@ def load_dataset( prepull_images: bool = False, logger: DebugGymLogger | None = None, ) -> dict: + logger = logger or DebugGymLogger("debug_gym") data_path = Path(dataset_id) if data_path.is_file(): # Loading from local file. @@ -181,10 +182,9 @@ def load_dataset( dataset = {pid: dataset[pid] for pid in problems} image_names = set([problem["image_name"] for problem in dataset.values()]) - if logger is not None: - logger.debug( - f"Loaded {len(dataset)} tasks across {len(image_names)} Docker images from {dataset_id}." - ) + logger.debug( + f"Loaded {len(dataset)} tasks across {len(image_names)} Docker images from {dataset_id}." + ) if prepull_images: # Download all images needed for SWE-Smith. @@ -198,15 +198,13 @@ def load_dataset( ) missing_images = tagged_image_names - existing_images if missing_images: - if logger is not None: - logger.info(f"Found {len(missing_images)} missing Docker images.") + logger.info(f"Found {len(missing_images)} missing Docker images.") for image_name in missing_images: docker_hub_image = image_name.replace("__", "_1776_") - if logger is not None: - logger.info( - f"Pulling Docker image `{docker_hub_image}` to `{image_name}`." - ) + logger.info( + f"Pulling Docker image `{docker_hub_image}` to `{image_name}`." + ) client.images.pull(docker_hub_image) # Rename images via tagging client.images.get(docker_hub_image).tag(image_name) From e5cda5a3a782dba9e416904d8198b39692f462ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= Date: Mon, 1 Dec 2025 12:59:35 -0800 Subject: [PATCH 31/31] Limiting workers for async pytest --- .github/actions/test-if-changes/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/test-if-changes/action.yml b/.github/actions/test-if-changes/action.yml index dcda5618..3d19d261 100644 --- a/.github/actions/test-if-changes/action.yml +++ b/.github/actions/test-if-changes/action.yml @@ -45,7 +45,7 @@ runs: DEBUG_GYM_DEBUG: 1 shell: bash run: | - pytest ${{ inputs.test-files }} -vv -n 16 --timeout=600 --cov=debug_gym --cov-report=term-missing + pytest ${{ inputs.test-files }} -vv -n 4 --timeout=600 --cov=debug_gym --cov-report=term-missing - name: Store coverage report uses: actions/upload-artifact@v4 with: