diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 884968fb5..1eafcb1fc 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -32,11 +32,6 @@ jobs: - name: Run tests with coverage run: uv run pytest --cov=src/bcbench --cov-report=term-missing - get-entries: - uses: ./.github/workflows/get-entries.yml - with: - test-run: true - select-category: runs-on: ubuntu-latest outputs: @@ -50,6 +45,13 @@ jobs: $selected = $categories | Get-Random echo "category=$selected" >> $env:GITHUB_OUTPUT + get-entries: + needs: select-category + uses: ./.github/workflows/get-entries.yml + with: + test-run: true + category: ${{ needs.select-category.outputs.category }} + mock-evaluation: runs-on: ubuntu-latest needs: [get-entries, select-category] @@ -79,7 +81,7 @@ jobs: retention-days: 1 summarize-results: - needs: mock-evaluation + needs: [mock-evaluation, select-category] uses: ./.github/workflows/summarize-results.yml permissions: contents: write @@ -89,5 +91,5 @@ jobs: model: ${{ github.run_id }} agent: "mock-agent" mock: true - category: "delete" + category: ${{ needs.select-category.outputs.category }} secrets: inherit diff --git a/.github/workflows/claude-evaluation.yml b/.github/workflows/claude-evaluation.yml index c523be844..69542c85e 100644 --- a/.github/workflows/claude-evaluation.yml +++ b/.github/workflows/claude-evaluation.yml @@ -57,6 +57,7 @@ jobs: uses: ./.github/workflows/get-entries.yml with: test-run: ${{ inputs.test-run }} + category: ${{ inputs.category }} evaluate-with-claude-code: runs-on: [self-hosted, 1ES.Pool=GitHub-BCBench] diff --git a/.github/workflows/copilot-evaluation.yml b/.github/workflows/copilot-evaluation.yml index d82914c12..3c0ef9162 100644 --- a/.github/workflows/copilot-evaluation.yml +++ b/.github/workflows/copilot-evaluation.yml @@ -65,6 +65,7 @@ jobs: uses: ./.github/workflows/get-entries.yml with: test-run: ${{ inputs.test-run }} + category: ${{ inputs.category }} evaluate-with-copilot-cli: runs-on: [self-hosted, 1ES.Pool=GitHub-BCBench] diff --git a/.github/workflows/dataset-validation.yml b/.github/workflows/dataset-validation.yml index d2d0900a1..180b1b53b 100644 --- a/.github/workflows/dataset-validation.yml +++ b/.github/workflows/dataset-validation.yml @@ -11,7 +11,7 @@ on: default: true type: boolean schedule: - - cron: "0 0 * * 0" # Weekly on Sundays at midnight + - cron: "0 0 * * 0" jobs: get-entries: @@ -19,6 +19,7 @@ jobs: with: modified-only: false test-run: ${{ inputs.test-run || false }} + category: "bug-fix" verify-build-and-tests: runs-on: [self-hosted, 1ES.Pool=GitHub-BCBench] diff --git a/.github/workflows/get-entries.yml b/.github/workflows/get-entries.yml index 10a44edf1..31c952d87 100644 --- a/.github/workflows/get-entries.yml +++ b/.github/workflows/get-entries.yml @@ -15,6 +15,11 @@ on: required: false type: boolean default: false + category: + description: Evaluation category + required: true + type: string + default: "bug-fix" outputs: entries: description: JSON array of dataset entries @@ -37,7 +42,7 @@ jobs: - name: Get entries for matrix id: get-entries run: | - cmd="uv run bcbench dataset list --github-output entries" + cmd="uv run bcbench dataset list --category ${{ inputs.category }} --github-output entries" if [[ "${{ inputs.modified-only }}" == "true" ]]; then cmd="$cmd --modified-only" diff --git a/.github/workflows/mini-evaluation.yml b/.github/workflows/mini-evaluation.yml index 29430f0cc..8149947e3 100644 --- a/.github/workflows/mini-evaluation.yml +++ b/.github/workflows/mini-evaluation.yml @@ -38,6 +38,7 @@ jobs: uses: ./.github/workflows/get-entries.yml with: test-run: ${{ inputs.test-run }} + category: ${{ inputs.category }} evaluate-with-mini-agent: runs-on: [self-hosted, 1ES.Pool=GitHub-BCBench] diff --git a/.github/workflows/summarize-results.yml b/.github/workflows/summarize-results.yml index bd65b480a..a492f7a86 100644 --- a/.github/workflows/summarize-results.yml +++ b/.github/workflows/summarize-results.yml @@ -53,7 +53,7 @@ jobs: merge-multiple: true - name: Summarize evaluation results - run: uv run bcbench result summarize --result-dir "${{ inputs.results-dir }}" --bceval-output "${{ env.BCEVAL_RESULT_FILE }}" --summary-output "${{ env.SUMMARY_OUTPUT_FILE }}" + run: uv run bcbench result summarize --category "${{ inputs.category }}" --result-dir "${{ inputs.results-dir }}" --bceval-output "${{ env.BCEVAL_RESULT_FILE }}" --summary-output "${{ env.SUMMARY_OUTPUT_FILE }}" - name: Upload evaluation summary to artifacts uses: actions/upload-artifact@v6 diff --git a/src/bcbench/agent/claude/agent.py b/src/bcbench/agent/claude/agent.py index ab71fa200..b07c48c57 100644 --- a/src/bcbench/agent/claude/agent.py +++ b/src/bcbench/agent/claude/agent.py @@ -8,7 +8,7 @@ from bcbench.agent.claude.metrics import parse_metrics from bcbench.agent.shared import build_mcp_config, build_prompt from bcbench.config import get_config -from bcbench.dataset import DatasetEntry +from bcbench.dataset import BaseDatasetEntry from bcbench.exceptions import AgentError, AgentTimeoutError from bcbench.logger import get_logger from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_instructions_from_config @@ -19,7 +19,7 @@ def run_claude_code( - entry: DatasetEntry, model: str, category: EvaluationCategory, repo_path: Path, output_dir: Path, al_mcp: bool = False, container_name: str = "bcbench" + entry: BaseDatasetEntry, model: str, category: EvaluationCategory, repo_path: Path, output_dir: Path, al_mcp: bool = False, container_name: str = "bcbench" ) -> tuple[AgentMetrics | None, ExperimentConfiguration]: """Run Claude Code on a single dataset entry. diff --git a/src/bcbench/agent/copilot/agent.py b/src/bcbench/agent/copilot/agent.py index 2fdc98078..ab2f1a79e 100644 --- a/src/bcbench/agent/copilot/agent.py +++ b/src/bcbench/agent/copilot/agent.py @@ -10,7 +10,7 @@ from bcbench.agent.copilot.metrics import parse_metrics from bcbench.agent.shared import build_mcp_config, build_prompt from bcbench.config import get_config -from bcbench.dataset import DatasetEntry +from bcbench.dataset import BaseDatasetEntry from bcbench.exceptions import AgentError, AgentTimeoutError from bcbench.logger import get_logger from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_instructions_from_config @@ -21,7 +21,7 @@ def run_copilot_agent( - entry: DatasetEntry, model: str, category: EvaluationCategory, repo_path: Path, output_dir: Path, al_mcp: bool = False, container_name: str = "bcbench" + entry: BaseDatasetEntry, model: str, category: EvaluationCategory, repo_path: Path, output_dir: Path, al_mcp: bool = False, container_name: str = "bcbench" ) -> tuple[AgentMetrics | None, ExperimentConfiguration]: """Run GitHub Copilot CLI agent on a single dataset entry. diff --git a/src/bcbench/agent/mini/agent.py b/src/bcbench/agent/mini/agent.py index 9f82ab64c..fec150331 100644 --- a/src/bcbench/agent/mini/agent.py +++ b/src/bcbench/agent/mini/agent.py @@ -8,7 +8,7 @@ import yaml from bcbench.config import get_config -from bcbench.dataset import DatasetEntry +from bcbench.dataset import BaseDatasetEntry from bcbench.exceptions import ConfigurationError from bcbench.logger import get_logger from bcbench.types import AgentMetrics, EvaluationCategory, ExperimentConfiguration @@ -52,7 +52,7 @@ def parse_action(self, response: dict) -> dict: def run_mini_agent( - entry: DatasetEntry, + entry: BaseDatasetEntry, repo_path: Path, model: str, category: EvaluationCategory, diff --git a/src/bcbench/agent/shared/mcp.py b/src/bcbench/agent/shared/mcp.py index 85046d6e0..822df035a 100644 --- a/src/bcbench/agent/shared/mcp.py +++ b/src/bcbench/agent/shared/mcp.py @@ -6,7 +6,7 @@ from jinja2 import Template from packaging.version import Version -from bcbench.dataset import DatasetEntry +from bcbench.dataset import BaseDatasetEntry from bcbench.exceptions import AgentError from bcbench.logger import get_logger @@ -144,7 +144,7 @@ def _build_server_entry(server: dict[str, Any], template_context: dict[str, Any] raise AgentError(f"Unsupported MCP server type: {server_type}") -def build_mcp_config(config: dict[str, Any], entry: DatasetEntry, repo_path: Path, al_mcp: bool = False, container_name: str = "bcbench") -> tuple[str | None, list[str] | None]: +def build_mcp_config(config: dict[str, Any], entry: BaseDatasetEntry, repo_path: Path, al_mcp: bool = False, container_name: str = "bcbench") -> tuple[str | None, list[str] | None]: mcp_servers: list[dict[str, Any]] = config.get("mcp", {}).get("servers", []) if not al_mcp: diff --git a/src/bcbench/agent/shared/prompt.py b/src/bcbench/agent/shared/prompt.py index b819ada9f..474105993 100644 --- a/src/bcbench/agent/shared/prompt.py +++ b/src/bcbench/agent/shared/prompt.py @@ -2,11 +2,11 @@ from jinja2 import Template -from bcbench.dataset import DatasetEntry +from bcbench.dataset import BaseDatasetEntry from bcbench.types import EvaluationCategory -def build_prompt(entry: DatasetEntry, repo_path: Path, config: dict, category: EvaluationCategory, al_mcp: bool = False) -> str: +def build_prompt(entry: BaseDatasetEntry, repo_path: Path, config: dict, category: EvaluationCategory, al_mcp: bool = False) -> str: prompt_config = config.get("prompt", {}) template_str = prompt_config.get(f"{category.value}-template") include_project_paths = prompt_config.get("include_project_paths") diff --git a/src/bcbench/cli_options.py b/src/bcbench/cli_options.py index 923e376c1..d8221f27f 100644 --- a/src/bcbench/cli_options.py +++ b/src/bcbench/cli_options.py @@ -9,12 +9,8 @@ # Type aliases for cleaner command signatures # Note: Defaults are provided in function signatures, not here -DatasetPath = Annotated[Path, typer.Option(help="Path to dataset file")] - RepoPath = Annotated[Path, typer.Option(help="Path to repository")] -SchemaPath = Annotated[Path, typer.Option(help="Path to schema file")] - OutputDir = Annotated[Path, typer.Option(help="Directory to save evaluation results")] RunId = Annotated[str, typer.Option(envvar="GITHUB_RUN_ID", help="Unique identifier for this evaluation run")] diff --git a/src/bcbench/collection/build_entry.py b/src/bcbench/collection/build_entry.py index 05a6ec5c2..88e340e67 100644 --- a/src/bcbench/collection/build_entry.py +++ b/src/bcbench/collection/build_entry.py @@ -1,4 +1,4 @@ -"""Builder functions for creating DatasetEntry from ADO sources.""" +"""Builder functions for creating BugFixEntry from ADO sources.""" from pathlib import Path from typing import Any @@ -7,7 +7,7 @@ from bcbench.collection.patch_utils import extract_file_paths_from_patch, extract_patches, find_project_paths_from_diff from bcbench.collection.version_resolver import determine_environment_setup_version from bcbench.config import get_config -from bcbench.dataset import DatasetEntry +from bcbench.dataset import BugFixEntry from bcbench.operations.git_operations import checkout_commit from bcbench.operations.test_operations import extract_tests_from_patch @@ -51,7 +51,7 @@ def build_dataset_entry_from_ado( base_commit: str, commit: str, diff_path: list[str] | None = None, -) -> DatasetEntry: +) -> BugFixEntry: created_at = extract_creation_date(pr_data) patch, patch_fix, patch_test = extract_patches(repo_path, base_commit, commit, diff_path=diff_path) problem_statement, hints = extract_problem_statement(work_item_data) @@ -73,7 +73,7 @@ def build_dataset_entry_from_ado( hints=hints, ) - return DatasetEntry( + return BugFixEntry( instance_id=instance_id, base_commit=base_commit, created_at=created_at, diff --git a/src/bcbench/collection/collect_gh.py b/src/bcbench/collection/collect_gh.py index 4a9046a99..651b61dd0 100644 --- a/src/bcbench/collection/collect_gh.py +++ b/src/bcbench/collection/collect_gh.py @@ -9,7 +9,7 @@ from bcbench.collection.gh_client import GHClient from bcbench.collection.patch_utils import extract_file_paths_from_patch, find_project_paths_from_diff, separate_patches from bcbench.config import get_config -from bcbench.dataset import DatasetEntry +from bcbench.dataset import BugFixEntry from bcbench.exceptions import CollectionError from bcbench.logger import get_logger from bcbench.operations.test_operations import extract_tests_from_patch @@ -58,7 +58,7 @@ def collect_gh_entry(pr_number: int, output: Path, repo: str = "microsoft/BCApps save_problem_statement(instance_id=instance_id, problem_statement=problem_statement) - entry = DatasetEntry( + entry = BugFixEntry( repo=repo, instance_id=instance_id, base_commit=base_commit, diff --git a/src/bcbench/collection/collect_nav.py b/src/bcbench/collection/collect_nav.py index ed4a1050b..3c8dd7694 100644 --- a/src/bcbench/collection/collect_nav.py +++ b/src/bcbench/collection/collect_nav.py @@ -6,7 +6,7 @@ from bcbench.collection.ado_client import ADOClient from bcbench.collection.build_entry import build_dataset_entry_from_ado from bcbench.config import get_config -from bcbench.dataset import DatasetEntry +from bcbench.dataset import BugFixEntry from bcbench.exceptions import CollectionError from bcbench.logger import get_logger @@ -35,7 +35,7 @@ def collect_nav_entry( if len(parents) != 1: raise CollectionError("Commit has multiple parents, cannot determine base commit.") - entry: DatasetEntry = build_dataset_entry_from_ado( + entry: BugFixEntry = build_dataset_entry_from_ado( pr_number=pr_number, repo_path=repo_path, pr_data=pr_data, diff --git a/src/bcbench/commands/collect.py b/src/bcbench/commands/collect.py index 18eac9cf8..2df9a03d9 100644 --- a/src/bcbench/commands/collect.py +++ b/src/bcbench/commands/collect.py @@ -1,9 +1,11 @@ """CLI commands for collecting dataset entries.""" +from pathlib import Path + import typer from typing_extensions import Annotated -from bcbench.cli_options import DatasetPath, RepoPath +from bcbench.cli_options import RepoPath from bcbench.collection import collect_gh_entry, collect_nav_entry from bcbench.config import get_config @@ -15,7 +17,7 @@ @collect_app.command("nav") def collect_nav( pr_number: Annotated[int, typer.Argument(help="Pull request number to collect")], - output: DatasetPath = _config.paths.dataset_path, + output: Annotated[Path, typer.Option(help="Path to output dataset file")] = _config.paths.dataset_dir / "bcbench.jsonl", repo_path: RepoPath = _config.paths.testbed_path, diff_path: Annotated[ list[str] | None, @@ -35,7 +37,7 @@ def collect_nav( @collect_app.command("gh") def collect_gh( pr_number: Annotated[int, typer.Argument(help="Pull request number to collect")], - output: DatasetPath = _config.paths.dataset_path, + output: Annotated[Path, typer.Option(help="Path to output dataset file")] = _config.paths.dataset_dir / "bcbench.jsonl", repo: Annotated[str, typer.Option(help="GitHub repository in OWNER/REPO format")] = "microsoft/BCApps", ): """ diff --git a/src/bcbench/commands/dataset.py b/src/bcbench/commands/dataset.py index 4ba0d9dda..732faee7a 100644 --- a/src/bcbench/commands/dataset.py +++ b/src/bcbench/commands/dataset.py @@ -1,52 +1,34 @@ """CLI commands for dataset operations.""" import json -from pathlib import Path import typer from typing_extensions import Annotated -from bcbench.cli_options import DatasetPath +from bcbench.cli_options import EvaluationCategoryOption from bcbench.config import get_config -from bcbench.dataset import DatasetEntry -from bcbench.dataset.dataset_loader import load_dataset_entries -from bcbench.dataset.reviewer import run_dataset_reviewer +from bcbench.dataset import BaseDatasetEntry +from bcbench.dataset.dataset_entry import _BugFixTestGenBase from bcbench.exceptions import ConfigurationError from bcbench.logger import get_logger +from bcbench.types import EvaluationCategory logger = get_logger(__name__) -_config = get_config() dataset_app = typer.Typer(help="Query and analyze dataset") -@dataset_app.command("review") -def review_dataset( - dataset_path: Annotated[Path, typer.Argument(help="Path to dataset JSONL file")] = _config.paths.dataset_path, - results_dir: Annotated[ - Path | None, typer.Option("--results-dir", "-r", help="Directory containing result JSONL files to show resolution stats", exists=True, file_okay=False, dir_okay=True) - ] = None, -): - """ - Review dataset entries using a TUI. - - Opens a split-pane view showing entry information and problem statement. - Use arrow keys to navigate between entries. - - If --results-dir is provided, shows resolution stats (e.g., "2/5 resolved") - for each entry based on the results in that directory. - """ - run_dataset_reviewer(dataset_path, results_dir) - - @dataset_app.command("list") def list_entries( - dataset_path: DatasetPath = _config.paths.dataset_path, + category: EvaluationCategoryOption = EvaluationCategory.BUG_FIX, github_output: Annotated[str | None, typer.Option(help="Write JSON output to GITHUB_OUTPUT with this key name")] = None, modified_only: Annotated[bool, typer.Option(help="Only list entries that have been modified in git diff")] = False, test_run: Annotated[bool, typer.Option(help="Indicate this is a test run (with 2 entries)")] = False, ): """List dataset entry IDs.""" + entry_cls = category.entry_class + resolved_path = category.dataset_path + if modified_only: import subprocess @@ -59,19 +41,19 @@ def list_entries( "--no-color", "--diff-filter=AM", "--", - str(dataset_path), + str(resolved_path), ], capture_output=True, text=True, encoding="utf-8", check=True, - cwd=dataset_path.parent, + cwd=resolved_path.parent, ) diff_output: str = result.stdout entry_ids: list[str] = _modified_instance_ids_from_diff(diff_output) else: - dataset_entries: list[DatasetEntry] = load_dataset_entries(dataset_path, random=2 if test_run else None) - entry_ids: list[str] = [e.instance_id for e in dataset_entries] + entries: list[BaseDatasetEntry] = entry_cls.load(resolved_path, random=2 if test_run else None) + entry_ids: list[str] = [e.instance_id for e in entries] print(f"Found {len(entry_ids)} entry(ies){' (modified only)' if modified_only else ''}:") for entry_id in entry_ids: @@ -84,7 +66,7 @@ def list_entries( @dataset_app.command("view") def view_entry( entry_id: Annotated[str, typer.Argument(help="Entry ID to view")], - dataset_path: DatasetPath = _config.paths.dataset_path, + category: EvaluationCategoryOption = EvaluationCategory.BUG_FIX, show_patch: Annotated[bool, typer.Option(help="Show patch in output")] = False, ): """View a specific dataset entry with rich formatting.""" @@ -92,7 +74,7 @@ def view_entry( from rich.panel import Panel from rich.table import Table - entry = load_dataset_entries(dataset_path, entry_id=entry_id)[0] + entry: BaseDatasetEntry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0] console = Console() info_table = Table(show_header=False, box=None) @@ -109,7 +91,6 @@ def view_entry( "\n".join(entry.project_paths) if entry.project_paths else "N/A", ) - # Add metadata fields dynamically metadata_dict = entry.metadata.model_dump() for field_name, field_value in metadata_dict.items(): display_name = field_name.replace("_", " ").title() @@ -123,33 +104,35 @@ def view_entry( if show_patch: console.print("\n[bold cyan]Patch:[/bold cyan]") console.print(Panel(entry.patch or "[dim]Empty[/dim]", border_style="magenta")) - console.print("\n[bold cyan]Test Patch:[/bold cyan]") - console.print(Panel(entry.test_patch or "[dim]Empty[/dim]", border_style="magenta")) - - console.print("\n[bold cyan]FAIL_TO_PASS Tests:[/bold cyan]") - if entry.fail_to_pass: - test_table = Table() - test_table.add_column("Codeunit ID", style="magenta") - test_table.add_column("Functions", style="yellow") - for test in entry.fail_to_pass: - test_table.add_row(str(test.codeunitID), ", ".join(test.functionName)) - console.print(test_table) - else: - console.print("[dim]No FAIL_TO_PASS tests[/dim]") - - console.print("\n[bold cyan]PASS_TO_PASS Tests:[/bold cyan]") - if entry.pass_to_pass: - test_table = Table() - test_table.add_column("Codeunit ID", style="magenta") - test_table.add_column("Functions", style="yellow") - for test in entry.pass_to_pass: - test_table.add_row( - str(test.codeunitID), - ", ".join(test.functionName), - ) - console.print(test_table) - else: - console.print("[dim]No PASS_TO_PASS tests[/dim]") + + # Display category-specific fields + if isinstance(entry, _BugFixTestGenBase): + bugfix_entry = entry + if show_patch: + console.print("\n[bold cyan]Test Patch:[/bold cyan]") + console.print(Panel(bugfix_entry.test_patch or "[dim]Empty[/dim]", border_style="magenta")) + + console.print("\n[bold cyan]FAIL_TO_PASS Tests:[/bold cyan]") + if bugfix_entry.fail_to_pass: + test_table = Table() + test_table.add_column("Codeunit ID", style="magenta") + test_table.add_column("Functions", style="yellow") + for test in bugfix_entry.fail_to_pass: + test_table.add_row(str(test.codeunitID), ", ".join(test.functionName)) + console.print(test_table) + else: + console.print("[dim]No FAIL_TO_PASS tests[/dim]") + + console.print("\n[bold cyan]PASS_TO_PASS Tests:[/bold cyan]") + if bugfix_entry.pass_to_pass: + test_table = Table() + test_table.add_column("Codeunit ID", style="magenta") + test_table.add_column("Functions", style="yellow") + for test in bugfix_entry.pass_to_pass: + test_table.add_row(str(test.codeunitID), ", ".join(test.functionName)) + console.print(test_table) + else: + console.print("[dim]No PASS_TO_PASS tests[/dim]") def _modified_instance_ids_from_diff(diff_output: str) -> list[str]: diff --git a/src/bcbench/commands/evaluate.py b/src/bcbench/commands/evaluate.py index 6e41b2f92..ffef5d745 100644 --- a/src/bcbench/commands/evaluate.py +++ b/src/bcbench/commands/evaluate.py @@ -13,7 +13,6 @@ ContainerPassword, ContainerUsername, CopilotModel, - DatasetPath, EvaluationCategoryOption, FoundryModel, OutputDir, @@ -21,11 +20,11 @@ RunId, ) from bcbench.config import get_config -from bcbench.dataset import DatasetEntry, load_dataset_entries -from bcbench.evaluate import EvaluationPipeline, create_pipeline +from bcbench.dataset import BaseDatasetEntry +from bcbench.evaluate import EvaluationPipeline from bcbench.logger import get_logger from bcbench.results import BaseEvaluationResult -from bcbench.types import AgentMetrics, EvaluationContext, ExperimentConfiguration +from bcbench.types import AgentMetrics, ContainerConfig, EvaluationContext, ExperimentConfiguration logger = get_logger(__name__) _config = get_config() @@ -33,6 +32,14 @@ evaluate_app = typer.Typer(help="Evaluate agents on benchmark datasets") +def _prepare_run_dir(output_dir: Path, run_id: str) -> Path: + run_dir = output_dir / run_id + if run_dir.exists(): + shutil.rmtree(run_dir) + run_dir.mkdir(parents=True) + return run_dir + + @evaluate_app.command("mini") def evaluate_mini( entry_id: Annotated[str, typer.Argument(help="Entry ID to run")], @@ -41,7 +48,6 @@ def evaluate_mini( password: ContainerPassword, category: EvaluationCategoryOption, model: FoundryModel = "gpt-5.1-codex-mini", - dataset_path: DatasetPath = _config.paths.dataset_path, repo_path: RepoPath = _config.paths.testbed_path, output_dir: OutputDir = _config.paths.evaluation_results_path, run_id: RunId = "mini_test_run", @@ -51,14 +57,8 @@ def evaluate_mini( To only run the agent to generate a patch without building/testing, use 'bcbench run mini' instead. """ - entries: list[DatasetEntry] = load_dataset_entries(dataset_path, entry_id=entry_id) - entry: DatasetEntry = entries[0] - logger.info(f"Loaded {entry_id} entry from dataset") - - run_dir: Path = output_dir / run_id - if run_dir.exists(): - shutil.rmtree(run_dir) - run_dir.mkdir(parents=True) + entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0] + run_dir = _prepare_run_dir(output_dir, run_id) logger.info(f"Running evaluation on entry {entry_id} with mini-bc-agent") @@ -66,15 +66,13 @@ def evaluate_mini( entry=entry, repo_path=repo_path, result_dir=run_dir, - container_name=container_name, - username=username, - password=password, + container=ContainerConfig(name=container_name, username=username, password=password), model=model, agent_name="mini-bc-agent", category=category, ) - pipeline = create_pipeline(category) + pipeline = category.pipeline pipeline.execute( context, lambda ctx: run_mini_agent( @@ -98,7 +96,6 @@ def evaluate_copilot( password: ContainerPassword, category: EvaluationCategoryOption, model: CopilotModel = "claude-haiku-4.5", - dataset_path: DatasetPath = _config.paths.dataset_path, repo_path: RepoPath = _config.paths.testbed_path, output_dir: OutputDir = _config.paths.evaluation_results_path, run_id: RunId = "copilot_test_run", @@ -109,14 +106,8 @@ def evaluate_copilot( To only run the agent to generate a patch without building/testing, use 'bcbench run copilot' instead. """ - entries: list[DatasetEntry] = load_dataset_entries(dataset_path, entry_id=entry_id) - entry: DatasetEntry = entries[0] - logger.info(f"Loaded {entry_id} entry from dataset") - - run_dir: Path = output_dir / run_id - if run_dir.exists(): - shutil.rmtree(run_dir) - run_dir.mkdir(parents=True) + entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0] + run_dir = _prepare_run_dir(output_dir, run_id) logger.info(f"Running evaluation on entry {entry_id} with GitHub Copilot CLI") @@ -124,15 +115,13 @@ def evaluate_copilot( entry=entry, repo_path=repo_path, result_dir=run_dir, - container_name=container_name, - username=username, - password=password, + container=ContainerConfig(name=container_name, username=username, password=password), model=model, agent_name="GitHub Copilot", category=category, ) - pipeline = create_pipeline(category) + pipeline = category.pipeline pipeline.execute( context, lambda ctx: run_copilot_agent( @@ -142,7 +131,7 @@ def evaluate_copilot( model=ctx.model, output_dir=ctx.result_dir, al_mcp=al_mcp, - container_name=ctx.container_name, + container_name=ctx.get_container().name, ), ) @@ -158,7 +147,6 @@ def evaluate_claude_code( password: ContainerPassword, category: EvaluationCategoryOption, model: ClaudeCodeModel = "claude-haiku-4-5", - dataset_path: DatasetPath = _config.paths.dataset_path, repo_path: RepoPath = _config.paths.testbed_path, output_dir: OutputDir = _config.paths.evaluation_results_path, run_id: RunId = "claude_code_test_run", @@ -169,14 +157,8 @@ def evaluate_claude_code( To only run the agent to generate a patch without building/testing, use 'bcbench run claude' instead. """ - entries: list[DatasetEntry] = load_dataset_entries(dataset_path, entry_id=entry_id) - entry: DatasetEntry = entries[0] - logger.info(f"Loaded {entry_id} entry from dataset") - - run_dir: Path = output_dir / run_id - if run_dir.exists(): - shutil.rmtree(run_dir) - run_dir.mkdir(parents=True) + entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0] + run_dir = _prepare_run_dir(output_dir, run_id) logger.info(f"Running evaluation on entry {entry_id} with Claude Code") @@ -184,15 +166,13 @@ def evaluate_claude_code( entry=entry, repo_path=repo_path, result_dir=run_dir, - container_name=container_name, - username=username, - password=password, + container=ContainerConfig(name=container_name, username=username, password=password), model=model, agent_name="Claude Code", category=category, ) - pipeline = create_pipeline(category) + pipeline = category.pipeline pipeline.execute( context, lambda ctx: run_claude_code( @@ -202,7 +182,7 @@ def evaluate_claude_code( model=ctx.model, output_dir=ctx.result_dir, al_mcp=al_mcp, - container_name=ctx.container_name, + container_name=ctx.get_container().name, ), ) @@ -214,21 +194,14 @@ def evaluate_claude_code( def evaluate_mock( entry_id: Annotated[str, typer.Argument(help="Entry ID to run")], category: EvaluationCategoryOption, - dataset_path: DatasetPath = _config.paths.dataset_path, output_dir: OutputDir = _config.paths.evaluation_results_path, run_id: RunId = "mock_run", ): """ Evaluate mock agent on single dataset entry for testing purposes. """ - entries: list[DatasetEntry] = load_dataset_entries(dataset_path, entry_id=entry_id) - entry: DatasetEntry = entries[0] - logger.info(f"Loaded {entry_id} entry from dataset") - - run_dir: Path = output_dir / run_id - if run_dir.exists(): - shutil.rmtree(run_dir) - run_dir.mkdir(parents=True) + entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0] + run_dir = _prepare_run_dir(output_dir, run_id) logger.info(f"Running evaluation on entry {entry_id} with mock agent") @@ -236,9 +209,6 @@ def evaluate_mock( entry=entry, repo_path=Path(), result_dir=run_dir, - container_name="no container", - username="", - password="", model="mock-model", agent_name="mock-agent", category=category, @@ -251,17 +221,17 @@ def evaluate_mock( logger.info(f"Results saved to: {run_dir}") -class MockEvaluationPipeline(EvaluationPipeline): +class MockEvaluationPipeline(EvaluationPipeline[BaseDatasetEntry]): """Mock pipeline for testing evaluation infrastructure. This pipeline simulates agent execution without requiring actual BC container setup. It randomly generates different scenarios to test result handling and serialization. """ - def setup(self, context: EvaluationContext) -> None: + def setup(self, context: EvaluationContext[BaseDatasetEntry]) -> None: logger.info("Mock pipeline: Skipping setup") - def run_agent(self, context: EvaluationContext, agent_runner: Callable) -> None: + def run_agent(self, context: EvaluationContext[BaseDatasetEntry], agent_runner: Callable) -> None: """Generate random agent metrics and experiment configuration.""" logger.info("Mock pipeline: Generating random metrics and experiment configuration") @@ -290,7 +260,7 @@ def run_agent(self, context: EvaluationContext, agent_runner: Callable) -> None: logger.info(f"Using agent metrics: {context.metrics}") logger.info(f"Using experiment configuration: {context.experiment}") - def evaluate(self, context: EvaluationContext) -> None: + def evaluate(self, context: EvaluationContext[BaseDatasetEntry]) -> None: """Create random evaluation result to test different outcome scenarios.""" logger.info("Mock pipeline: Generating random evaluation result") diff --git a/src/bcbench/commands/result.py b/src/bcbench/commands/result.py index eb51534c7..40cb6d375 100644 --- a/src/bcbench/commands/result.py +++ b/src/bcbench/commands/result.py @@ -6,7 +6,7 @@ import typer from typing_extensions import Annotated -from bcbench.cli_options import DatasetPath, OutputDir, RunId +from bcbench.cli_options import EvaluationCategoryOption, OutputDir, RunId from bcbench.config import get_config from bcbench.logger import get_logger from bcbench.results import ( @@ -19,8 +19,6 @@ create_result_from_json, write_bceval_results, ) -from bcbench.results.reviewer import run_instance_reviewer, run_reviewer -from bcbench.types import EvaluationCategory logger = get_logger(__name__) @@ -30,45 +28,12 @@ result_app = typer.Typer(help="Process and display evaluation results") -@result_app.command("review") -def result_review( - results_file: Annotated[Path, typer.Argument(help="Path to results JSONL file to review (or directory if --instance-id is used)", exists=True, file_okay=True, dir_okay=True)], - category: Annotated[EvaluationCategory, typer.Option("--category", "-c", help="Evaluation category: bug-fix (compare patch) or test-generation (compare test_patch)")], - dataset_path: DatasetPath = _config.paths.dataset_path, - instance_id: Annotated[str | None, typer.Option("--instance-id", "-i", help="Review a single instance across all runs in a directory")] = None, - include_resolved: Annotated[bool, typer.Option("--include-resolved", help="Include resolved instances in the review")] = False, -): - """ - Review evaluation results and annotate failure categories using a TUI. - - Opens a split-pane view showing expected (gold patch) vs actual (agent output). - Use j/k or arrows to navigate, 1-7 to select failure category. - Categories are auto-saved on navigate and quit. - - Two modes: - - Default: Review all unresolved results in a single JSONL file - - With --instance-id: Review one instance across all runs in a directory - - Use --category to switch between bug-fix (compares patch) and test-generation (compares test_patch). - """ - if instance_id: - if not results_file.is_dir(): - logger.error(f"When using --instance-id, the path must be a directory containing JSONL files: {results_file}") - raise typer.Exit(code=1) - run_instance_reviewer(results_file, instance_id, dataset_path, category) - else: - if not results_file.is_file(): - logger.error(f"Expected a JSONL file, got directory: {results_file}. Use --instance-id to review across runs.") - raise typer.Exit(code=1) - run_reviewer(results_file, dataset_path, category, include_resolved) - - @result_app.command("summarize") def result_summarize( run_id: RunId, + category: EvaluationCategoryOption, result_dir: OutputDir = _config.paths.evaluation_results_path, result_pattern: Annotated[str, typer.Option(help="Pattern for the per instances result files")] = f"*{_config.file_patterns.result_pattern}", - dataset_path: DatasetPath = _config.paths.dataset_path, summary_output: Annotated[str, typer.Option(help="Output filename for summary JSON")] = "evaluation_summary.json", bceval_output: Annotated[str, typer.Option(help="Output filename for bceval results")] = "bceval_results.jsonl", ): @@ -106,7 +71,7 @@ def result_summarize( logger.error("No results found in the result files") raise typer.Exit(code=1) - write_bceval_results(results, run_dir, run_id, dataset_path, bceval_output) + write_bceval_results(results, run_dir, run_id, bceval_output, category) if _config.env.github_actions: create_github_job_summary(results) diff --git a/src/bcbench/commands/run.py b/src/bcbench/commands/run.py index a026d1435..38832a1b8 100644 --- a/src/bcbench/commands/run.py +++ b/src/bcbench/commands/run.py @@ -13,14 +13,13 @@ ClaudeCodeModel, ContainerName, CopilotModel, - DatasetPath, EvaluationCategoryOption, FoundryModel, OutputDir, RepoPath, ) from bcbench.config import get_config -from bcbench.dataset import DatasetEntry, load_dataset_entries +from bcbench.dataset.dataset_entry import _BugFixTestGenBase from bcbench.logger import get_logger from bcbench.operations import setup_repo_postbuild, setup_repo_prebuild @@ -35,7 +34,6 @@ def run_mini( entry_id: Annotated[str, typer.Argument(help="Entry ID to run")], category: EvaluationCategoryOption, model: FoundryModel = "gpt-5.1-codex-mini", - dataset_path: DatasetPath = _config.paths.dataset_path, repo_path: RepoPath = _config.paths.testbed_path, output_dir: OutputDir = _config.paths.evaluation_results_path, ): @@ -47,10 +45,10 @@ def run_mini( Example: uv run bcbench run mini microsoft__BCApps-5633 --step-limit 5 --category bug-fix """ - entry: DatasetEntry = load_dataset_entries(dataset_path, entry_id=entry_id)[0] - + entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0] setup_repo_prebuild(entry, repo_path) - setup_repo_postbuild(entry, repo_path, category) + if isinstance(entry, _BugFixTestGenBase): + setup_repo_postbuild(entry, repo_path, category) run_mini_agent( entry=entry, @@ -67,7 +65,6 @@ def run_copilot( category: EvaluationCategoryOption, container_name: ContainerName, model: CopilotModel = "claude-haiku-4.5", - dataset_path: DatasetPath = _config.paths.dataset_path, repo_path: RepoPath = _config.paths.testbed_path, output_dir: OutputDir = _config.paths.evaluation_results_path, al_mcp: Annotated[bool, typer.Option("--al-mcp", help="Enable AL MCP server")] = False, @@ -80,10 +77,10 @@ def run_copilot( Example: uv run bcbench run copilot microsoft__BCApps-5633 --category bug-fix --repo-path /path/to/BCApps """ - entry: DatasetEntry = load_dataset_entries(dataset_path, entry_id=entry_id)[0] - + entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0] setup_repo_prebuild(entry, repo_path) - setup_repo_postbuild(entry, repo_path, category) + if isinstance(entry, _BugFixTestGenBase): + setup_repo_postbuild(entry, repo_path, category) run_copilot_agent(entry=entry, repo_path=repo_path, model=model, category=category, output_dir=output_dir, al_mcp=al_mcp, container_name=container_name) @@ -94,7 +91,6 @@ def run_claude( category: EvaluationCategoryOption, container_name: ContainerName, model: ClaudeCodeModel = "claude-haiku-4-5", - dataset_path: DatasetPath = _config.paths.dataset_path, repo_path: RepoPath = _config.paths.testbed_path, output_dir: OutputDir = _config.paths.evaluation_results_path, al_mcp: Annotated[bool, typer.Option("--al-mcp", help="Enable AL MCP server")] = False, @@ -107,10 +103,10 @@ def run_claude( Example: uv run bcbench run claude microsoft__BCApps-5633 --category bug-fix --repo-path /path/to/BCApps """ - entry: DatasetEntry = load_dataset_entries(dataset_path, entry_id=entry_id)[0] - + entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0] setup_repo_prebuild(entry, repo_path) - setup_repo_postbuild(entry, repo_path, category) + if isinstance(entry, _BugFixTestGenBase): + setup_repo_postbuild(entry, repo_path, category) run_claude_code(entry=entry, repo_path=repo_path, model=model, category=category, output_dir=output_dir, al_mcp=al_mcp, container_name=container_name) diff --git a/src/bcbench/config.py b/src/bcbench/config.py index 06fdffd11..c9a35df69 100644 --- a/src/bcbench/config.py +++ b/src/bcbench/config.py @@ -35,14 +35,12 @@ class PathConfig: """File and directory paths.""" bc_bench_root: Path - dataset_path: Path dataset_dir: Path problem_statement_dir: Path testbed_path: Path ps_script_path: Path evaluation_results_path: Path leaderboard_dir: Path - copilot_dir: Path agent_share_dir: Path @classmethod @@ -51,13 +49,11 @@ def from_root(cls, root: Path) -> PathConfig: return cls( bc_bench_root=root, dataset_dir=root / "dataset", - dataset_path=root / "dataset" / "bcbench.jsonl", problem_statement_dir=root / "dataset" / "problemstatement", testbed_path=root.parent / "NAV", ps_script_path=root / "scripts", evaluation_results_path=root / "evaluation_results", leaderboard_dir=root / "docs" / "_data", - copilot_dir=root / "src" / "bcbench" / "agent" / "copilot", agent_share_dir=root / "src" / "bcbench" / "agent" / "shared", ) diff --git a/src/bcbench/dataset/__init__.py b/src/bcbench/dataset/__init__.py index 6af72372f..4e6e205fa 100644 --- a/src/bcbench/dataset/__init__.py +++ b/src/bcbench/dataset/__init__.py @@ -1,12 +1,10 @@ """Dataset module for querying, validating and analyze dataset entries.""" -from bcbench.dataset.dataset_entry import DatasetEntry, TestEntry -from bcbench.dataset.dataset_loader import load_dataset_entries -from bcbench.dataset.reviewer import run_dataset_reviewer +from bcbench.dataset.dataset_entry import BaseDatasetEntry, BugFixEntry, TestEntry, TestGenEntry __all__ = [ - "DatasetEntry", + "BaseDatasetEntry", + "BugFixEntry", "TestEntry", - "load_dataset_entries", - "run_dataset_reviewer", + "TestGenEntry", ] diff --git a/src/bcbench/dataset/dataset_entry.py b/src/bcbench/dataset/dataset_entry.py index 76a1361a0..2d073a156 100644 --- a/src/bcbench/dataset/dataset_entry.py +++ b/src/bcbench/dataset/dataset_entry.py @@ -2,16 +2,18 @@ import json import re +from abc import abstractmethod from pathlib import Path -from typing import Annotated +from typing import Annotated, Self from pydantic import BaseModel, ConfigDict, Field, model_validator from bcbench.config import get_config +from bcbench.exceptions import EntryNotFoundError _config = get_config() -__all__ = ["DatasetEntry", "TestEntry"] +__all__ = ["BaseDatasetEntry", "BugFixEntry", "TestEntry", "TestGenEntry"] class TestEntry(BaseModel): @@ -28,7 +30,9 @@ class EntryMetadata(BaseModel): image_count: int | None = None -class DatasetEntry(BaseModel): +class BaseDatasetEntry(BaseModel): + """Base class for all dataset entries. Contains common properties shared across categories.""" + model_config = ConfigDict(frozen=True, populate_by_name=True) metadata: EntryMetadata = Field(default_factory=EntryMetadata) @@ -39,90 +43,116 @@ class DatasetEntry(BaseModel): created_at: Annotated[str, Field(min_length=1)] environment_setup_version: str = Field(pattern=r"^[0-9]{2}\.[0-9]{1}$") project_paths: Annotated[list[str], Field(min_length=2)] - fail_to_pass: Annotated[list[TestEntry], Field(alias="FAIL_TO_PASS", min_length=1)] - pass_to_pass: Annotated[list[TestEntry], Field(alias="PASS_TO_PASS")] = [] - test_patch: Annotated[str, Field(min_length=1)] patch: Annotated[str, Field(min_length=1)] - @model_validator(mode="after") - def validate_baseapp_patches_are_w1_only(self) -> DatasetEntry: - """Validate that patches only modify files in the expected layer (currently W1). + @classmethod + def load(cls, dataset_path: Path, entry_id: str | None = None, random: int | None = None) -> list[Self]: + if not dataset_path.exists(): + raise FileNotFoundError(f"Dataset file not found: {dataset_path}") - Only applicable to BaseApp, patches should only modify W1 layer files, not country-specific layers (IT, DE, etc.). - """ - if self.extract_project_name() != "BaseApp": - return self + entries: list[Self] = [] - # Check both patch and test_patch - for patch in (self.patch, self.test_patch): - patch_paths = re.findall(r"^diff --git a/(.+?) b/", patch, re.MULTILINE) + with open(dataset_path, encoding="utf-8") as file: + for line in file: + stripped_line: str = line.strip() + if not stripped_line: + continue - for patch_path in patch_paths: - match = re.match(r"App/Layers/([^/]+)/", patch_path) - if match: - layer = match.group(1) - if layer != "W1": - raise ValueError(f"Patch modifies non-W1 layer '{layer}': {patch_path}") + entry = cls.model_validate_json(stripped_line) - return self + if entry_id: + if entry.instance_id == entry_id: + return [entry] + continue - @property - def problem_statement_dir(self) -> Path: - return _config.paths.problem_statement_dir / self.instance_id + entries.append(entry) + + if entry_id: + raise EntryNotFoundError(entry_id) + + if random is not None and random > 0: + import random as random_module + + return random_module.sample(entries, min(random, len(entries))) + + return entries def save_to_file(self, filepath: Path | str) -> None: path = Path(filepath) path.parent.mkdir(parents=True, exist_ok=True) with path.open("a", encoding="utf-8") as handle: - # For JSONL format, always write compact JSON without indentation json.dump(self.model_dump(by_alias=True, mode="json"), handle, ensure_ascii=False) handle.write("\n") + @abstractmethod def get_task(self, transform_image_paths: bool = False) -> str: - """Get the task description. + pass - problem_statment and hints_text stored in the README.md file under the problem statement directory. + @abstractmethod + def get_expected_output(self) -> str: + pass - Args: - transform_image_paths: Whether to transform relative image paths to include the problem statement directory. Needed when passing to Agents. - """ - readme_path = self.problem_statement_dir / _config.file_patterns.problem_statement_readme + def extract_project_name(self) -> str: + if not self.project_paths: + return "" + + path = self.project_paths[0] + parts = path.replace("\\", "/").split("/") + + if len(parts) >= 4: + return parts[-2] if parts[-1].lower() in ("app", "test") else parts[-1] + + return parts[-1] if parts else "" + +class _BugFixTestGenBase(BaseDatasetEntry): + """Shared schema for bug-fix and test-generation entries (same JSONL, different semantics).""" + + fail_to_pass: Annotated[list[TestEntry], Field(alias="FAIL_TO_PASS", min_length=1)] + pass_to_pass: Annotated[list[TestEntry], Field(alias="PASS_TO_PASS")] = [] + test_patch: Annotated[str, Field(min_length=1)] + + @property + def problem_statement_dir(self) -> Path: + return _config.paths.problem_statement_dir / self.instance_id + + def get_task(self, transform_image_paths: bool = False) -> str: + readme_path = self.problem_statement_dir / _config.file_patterns.problem_statement_readme content: str = readme_path.read_text(encoding="utf-8") if not transform_image_paths: return content - # Transform relative image paths: ![alt text](./diagram.png) -> ![alt text](problem/diagram.png) dest_dir = _config.file_patterns.problem_statement_dest_dir return re.sub(r"!\[([^\]]*)\]\(\./([^)]+)\)", rf"![\1]({dest_dir}/\2)", content) - def extract_project_name(self) -> str: - """Extract the project name from the first project path. + @model_validator(mode="after") + def validate_baseapp_patches_are_w1_only(self) -> Self: + if self.extract_project_name() != "BaseApp": + return self - Examples: - App\\Apps\\W1\\Sustainability\\app -> Sustainability - App\\Layers\\W1\\BaseApp -> BaseApp - src\\Apps\\W1\\Shopify\\App -> Shopify + for patch in (self.patch, self.test_patch): + patch_paths = re.findall(r"^diff --git a/(.+?) b/", patch, re.MULTILINE) - Returns: - The extracted project name, or empty string if no project paths. - """ - if not self.project_paths: - return "" + for patch_path in patch_paths: + match = re.match(r"App/Layers/([^/]+)/", patch_path) + if match: + layer = match.group(1) + if layer != "W1": + raise ValueError(f"Patch modifies non-W1 layer '{layer}': {patch_path}") - # Take the first project path - path = self.project_paths[0] + return self - # Split by backslash or forward slash - parts = path.replace("\\", "/").split("/") - # Look for the meaningful project name - # Pattern: App\Apps\W1\\app or App\Layers\W1\ - if len(parts) >= 4: - # For paths like App\Apps\W1\Sustainability\app, return "Sustainability" - # For paths like App\Layers\W1\BaseApp, return "BaseApp" - return parts[-2] if parts[-1].lower() in ("app", "test") else parts[-1] +class BugFixEntry(_BugFixTestGenBase): + """Dataset entry for the bug-fix category.""" - # Fallback to the last meaningful part - return parts[-1] if parts else "" + def get_expected_output(self) -> str: + return self.patch + + +class TestGenEntry(_BugFixTestGenBase): + """Dataset entry for the test-generation category.""" + + def get_expected_output(self) -> str: + return self.test_patch diff --git a/src/bcbench/dataset/dataset_loader.py b/src/bcbench/dataset/dataset_loader.py deleted file mode 100644 index 8eb1b3f12..000000000 --- a/src/bcbench/dataset/dataset_loader.py +++ /dev/null @@ -1,51 +0,0 @@ -"""Utilities for loading dataset entries from JSONL files.""" - -from pathlib import Path - -from bcbench.dataset.dataset_entry import DatasetEntry -from bcbench.exceptions import EntryNotFoundError - -__all__ = ["load_dataset_entries"] - - -def load_dataset_entries(dataset_path: Path, entry_id: str | None = None, random: int | None = None) -> list[DatasetEntry]: - """ - Load dataset entries from a JSONL file. - - Examples: - # Load a single entry by ID - entries = load_dataset_entries(path, entry_id="NAV_12345") - - # Load 2 random entries - entries = load_dataset_entries(path, random=2) - """ - if not dataset_path.exists(): - raise FileNotFoundError(f"Dataset file not found: {dataset_path}") - - entries: list[DatasetEntry] = [] - - with open(dataset_path, encoding="utf-8") as file: - for line in file: - stripped_line: str = line.strip() - if not stripped_line: - continue - - entry = DatasetEntry.model_validate_json(stripped_line) - - # If searching for specific entry_id, return immediately when found - if entry_id: - if entry.instance_id == entry_id: - return [entry] - continue - - entries.append(entry) - - if entry_id: - raise EntryNotFoundError(entry_id) - - if random is not None and random > 0: - import random as random_module - - return random_module.sample(entries, min(random, len(entries))) - - return entries diff --git a/src/bcbench/dataset/reviewer.py b/src/bcbench/dataset/reviewer.py deleted file mode 100644 index 18dfc0123..000000000 --- a/src/bcbench/dataset/reviewer.py +++ /dev/null @@ -1,234 +0,0 @@ -""" -Dataset Review TUI for browsing dataset entries. - -Shows dataset entry info and problem statement. Use arrow keys to navigate. -If results directory is provided, shows resolution stats across runs. - -Copilot wrote all of this code, seems to work, haven't read it. -""" - -import json -from pathlib import Path - -from textual.app import App, ComposeResult -from textual.binding import Binding -from textual.containers import Horizontal, Vertical, VerticalScroll -from textual.widgets import Footer, Header, Label, Static - -from bcbench.dataset import DatasetEntry, load_dataset_entries - - -class DatasetReviewer(App): - """TUI for reviewing dataset entries.""" - - TITLE = "Dataset Review" - - CSS = """ - #content-container { - height: 1fr; - } - .content-panel { - height: 1fr; - border: solid blue; - } - #info-panel { - height: auto; - max-height: 12; - } - #info-columns { - height: auto; - } - .info-column { - width: 1fr; - height: auto; - } - #problem-panel { - border: solid green; - } - """ - - BINDINGS = [ # noqa: RUF012 - Binding("right", "next_item", "Next →"), - Binding("left", "prev_item", "← Prev"), - Binding("escape", "quit", "Quit"), - ] - - def __init__(self, dataset_path: Path, results_dir: Path | None = None): - super().__init__() - self.dataset_path = dataset_path - self.results_dir = results_dir - self.entries: list[DatasetEntry] = [] - self.current_index = 0 - self.resolution_cache: dict[str, tuple[int, int, list[str]]] = {} # instance_id -> (resolved, total, failure_categories) - - def compose(self) -> ComposeResult: - yield Header() - yield Label("", id="progress") - with Vertical(id="content-container"): - with VerticalScroll(id="info-panel", classes="content-panel"), Horizontal(id="info-columns"): - yield Static("", id="info-left", classes="info-column") - yield Static("", id="info-right", classes="info-column") - with VerticalScroll(id="problem-panel", classes="content-panel"): - yield Static("", id="problem-content") - yield Footer() - - def on_mount(self) -> None: - self._load_data() - self._update_display() - - def _load_data(self) -> None: - self.entries = load_dataset_entries(self.dataset_path) - if self.results_dir: - self._load_resolution_stats() - - def _load_resolution_stats(self) -> None: - if not self.results_dir or not self.results_dir.exists(): - return - - for jsonl_file in self.results_dir.glob("*.jsonl"): - with open(jsonl_file, encoding="utf-8") as f: - for line in f: - if not line.strip(): - continue - try: - result = json.loads(line) - instance_id = result.get("instance_id") or result.get("InstanceID", "") - if not instance_id: - continue - - scores = result.get("scores", {}) - resolved = scores.get("ResolutionRate", 0) == 1 - - if instance_id not in self.resolution_cache: - self.resolution_cache[instance_id] = (0, 0, []) - - current_resolved, current_total, failure_cats = self.resolution_cache[instance_id] - - # Collect failure category if not resolved - if not resolved: - failure_cat = result.get("review", {}).get("failure_category") - if failure_cat: - failure_cats = [*failure_cats, failure_cat] - - self.resolution_cache[instance_id] = ( - current_resolved + (1 if resolved else 0), - current_total + 1, - failure_cats, - ) - except json.JSONDecodeError: - continue - - def _get_current_entry(self) -> DatasetEntry | None: - if not self.entries or self.current_index >= len(self.entries): - return None - return self.entries[self.current_index] - - def _get_resolution_text(self, instance_id: str) -> str: - if not self.results_dir: - return "" - if instance_id not in self.resolution_cache: - return " | [dim]No runs found[/dim]" - resolved, total, failure_cats = self.resolution_cache[instance_id] - if resolved == total: - return f" | [green]{resolved}/{total} resolved[/green]" - - # Build failure category summary - failure_text = "" - if failure_cats: - from collections import Counter - - counts = Counter(failure_cats) - failure_text = " | " + ", ".join(f"{cat}: {cnt}" for cat, cnt in counts.most_common()) - - if resolved == 0: - return f" | [red]{resolved}/{total} resolved[/red]{failure_text}" - return f" | [yellow]{resolved}/{total} resolved[/yellow]{failure_text}" - - def _format_entry_info_left(self, entry: DatasetEntry) -> str: - lines = [ - f"[cyan bold]Repo:[/cyan bold] {entry.repo}", - f"[cyan bold]Instance ID:[/cyan bold] {entry.instance_id}", - f"[cyan bold]Base Commit:[/cyan bold] {entry.base_commit}", - f"[cyan bold]Created At:[/cyan bold] {entry.created_at}", - f"[cyan bold]Version:[/cyan bold] {entry.environment_setup_version}", - ] - - # Project Paths - lines.append("") - lines.append("[cyan bold]Project Paths:[/cyan bold]") - for path in entry.project_paths: - lines.append(f" {path}") - - return "\n".join(lines) - - def _format_entry_info_right(self, entry: DatasetEntry) -> str: - lines = [] - - # Metadata - metadata = entry.metadata.model_dump() - if any(v is not None for v in metadata.values()): - lines.append("[bold]Metadata:[/bold]") - for field_name, field_value in metadata.items(): - if field_value is not None: - display_name = field_name.replace("_", " ").title() - lines.append(f" [dim]{display_name}:[/dim] {field_value}") - lines.append("") - - # FAIL_TO_PASS tests - lines.append("[bold]FAIL_TO_PASS:[/bold]") - if entry.fail_to_pass: - for test in entry.fail_to_pass: - lines.append(f" [magenta]{test.codeunitID}[/magenta]: {', '.join(test.functionName)}") - else: - lines.append(" [dim]None[/dim]") - - # PASS_TO_PASS tests - lines.append("") - lines.append("[bold]PASS_TO_PASS:[/bold]") - if entry.pass_to_pass: - for test in entry.pass_to_pass: - lines.append(f" [magenta]{test.codeunitID}[/magenta]: {', '.join(test.functionName)}") - else: - lines.append(" [dim]None[/dim]") - - return "\n".join(lines) - - def _update_display(self) -> None: - entry = self._get_current_entry() - if not entry: - self.query_one("#progress", Label).update("No entries in dataset") - return - - resolution_text = self._get_resolution_text(entry.instance_id) - progress_text = f"[{self.current_index + 1}/{len(self.entries)}]{resolution_text} | {entry.instance_id}" - self.query_one("#progress", Label).update(progress_text) - - info_panel = self.query_one("#info-panel", VerticalScroll) - problem_panel = self.query_one("#problem-panel", VerticalScroll) - info_panel.border_title = "Entry Information" - problem_panel.border_title = "Problem Statement" - - self.query_one("#info-left", Static).update(self._format_entry_info_left(entry)) - self.query_one("#info-right", Static).update(self._format_entry_info_right(entry)) - self.query_one("#problem-content", Static).update(entry.get_task()) - - info_panel.scroll_home(animate=False) - problem_panel.scroll_home(animate=False) - - def action_next_item(self) -> None: - if self.current_index < len(self.entries) - 1: - self.current_index += 1 - self._update_display() - - def action_prev_item(self) -> None: - if self.current_index > 0: - self.current_index -= 1 - self._update_display() - - def action_quit(self) -> None: - self.exit() - - -def run_dataset_reviewer(dataset_path: Path, results_dir: Path | None = None) -> None: - app = DatasetReviewer(dataset_path, results_dir) - app.run() diff --git a/src/bcbench/evaluate/__init__.py b/src/bcbench/evaluate/__init__.py index 79c109648..9a29b5f09 100644 --- a/src/bcbench/evaluate/__init__.py +++ b/src/bcbench/evaluate/__init__.py @@ -1,5 +1,7 @@ """Evaluation module for running pipelines and creating results.""" -from bcbench.evaluate.base import EvaluationPipeline, create_pipeline +from bcbench.evaluate.base import EvaluationPipeline +from bcbench.evaluate.bugfix import BugFixPipeline +from bcbench.evaluate.testgeneration import TestGenerationPipeline -__all__ = ["EvaluationPipeline", "create_pipeline"] +__all__ = ["BugFixPipeline", "EvaluationPipeline", "TestGenerationPipeline"] diff --git a/src/bcbench/evaluate/base.py b/src/bcbench/evaluate/base.py index 87ec251b2..8c2dbb2d6 100644 --- a/src/bcbench/evaluate/base.py +++ b/src/bcbench/evaluate/base.py @@ -1,19 +1,22 @@ +from __future__ import annotations + from abc import ABC, abstractmethod from collections.abc import Callable from bcbench.config import get_config +from bcbench.dataset import BaseDatasetEntry from bcbench.exceptions import AgentTimeoutError from bcbench.logger import get_logger from bcbench.results import BaseEvaluationResult -from bcbench.types import AgentMetrics, EvaluationCategory, EvaluationContext, ExperimentConfiguration +from bcbench.types import AgentMetrics, EvaluationContext, ExperimentConfiguration logger = get_logger(__name__) _config = get_config() -__all__ = ["EvaluationPipeline", "create_pipeline"] +__all__ = ["EvaluationPipeline"] -class EvaluationPipeline(ABC): +class EvaluationPipeline[E: BaseDatasetEntry](ABC): """Abstract base class for evaluation pipelines. Subclasses implement category-specific setup, agent execution, and validation logic. @@ -21,7 +24,7 @@ class EvaluationPipeline(ABC): """ @abstractmethod - def setup(self, context: EvaluationContext) -> None: + def setup(self, context: EvaluationContext[E]) -> None: """Setup environment: e.g. clean repo, checkout base commit, initial build. Args: @@ -33,7 +36,7 @@ def setup(self, context: EvaluationContext) -> None: raise NotImplementedError() @abstractmethod - def run_agent(self, context: EvaluationContext, agent_runner: Callable) -> None: + def run_agent(self, context: EvaluationContext[E], agent_runner: Callable) -> None: """Run the agent and capture metrics. Args: @@ -46,7 +49,7 @@ def run_agent(self, context: EvaluationContext, agent_runner: Callable) -> None: raise NotImplementedError() @abstractmethod - def evaluate(self, context: EvaluationContext) -> None: + def evaluate(self, context: EvaluationContext[E]) -> None: """Evaluate results: e.g. apply patches, build, run tests. Implementation should raise category-specific exceptions on failure. @@ -61,8 +64,8 @@ def evaluate(self, context: EvaluationContext) -> None: def execute( self, - context: EvaluationContext, - agent_runner: Callable[[EvaluationContext], tuple[AgentMetrics | None, ExperimentConfiguration | None]], + context: EvaluationContext[E], + agent_runner: Callable[[EvaluationContext[E]], tuple[AgentMetrics | None, ExperimentConfiguration | None]], ) -> None: """Template method orchestrating the evaluation flow. @@ -90,7 +93,7 @@ def execute( self.evaluate(context) - def save_result(self, context: EvaluationContext, result: BaseEvaluationResult) -> None: + def save_result(self, context: EvaluationContext[E], result: BaseEvaluationResult) -> None: """Save result directly using result object. Args: @@ -99,20 +102,3 @@ def save_result(self, context: EvaluationContext, result: BaseEvaluationResult) """ result.save(context.result_dir, f"{context.entry.instance_id}{_config.file_patterns.result_pattern}") - - -def create_pipeline(category: EvaluationCategory) -> EvaluationPipeline: - """Factory function to create evaluation pipeline based on category.""" - from bcbench.evaluate.bugfix import BugFixPipeline - from bcbench.evaluate.testgeneration import TestGenerationPipeline - - match category: - case EvaluationCategory.BUG_FIX: - logger.info(f"Using BugFixPipeline for category: {category}") - return BugFixPipeline() - case EvaluationCategory.TEST_GENERATION: - logger.info(f"Using TestGenerationPipeline for category: {category}") - return TestGenerationPipeline() - case _: - raise ValueError(f"Unknown evaluation category: {category}") - raise RuntimeError("Unreachable: no pipeline returned") diff --git a/src/bcbench/evaluate/bugfix.py b/src/bcbench/evaluate/bugfix.py index 310d455d1..9df7eee67 100644 --- a/src/bcbench/evaluate/bugfix.py +++ b/src/bcbench/evaluate/bugfix.py @@ -1,5 +1,6 @@ from collections.abc import Callable +from bcbench.dataset import BugFixEntry from bcbench.evaluate.base import EvaluationPipeline from bcbench.exceptions import BuildError, TestExecutionError from bcbench.logger import get_logger, github_log_group @@ -21,42 +22,30 @@ __all__ = ["BugFixPipeline"] -class BugFixPipeline(EvaluationPipeline): - """Pipeline for bug-fix evaluation category. +class BugFixPipeline(EvaluationPipeline[BugFixEntry]): + """Pipeline for bug-fix evaluation category.""" - Workflow: - 1. Setup: clean repo, checkout base commit, copy problem statement, build - 2. Run agent: execute agent to generate fix patch - 3. evaluate: apply test patch, build, run tests - """ - - def setup(self, context: EvaluationContext) -> None: + def setup(self, context: EvaluationContext[BugFixEntry]) -> None: setup_repo_prebuild(context.entry, context.repo_path) build_and_publish_projects( context.repo_path, context.entry.project_paths, - context.container_name, - context.username, - context.password, + context.get_container(), context.entry.environment_setup_version, ) setup_repo_postbuild(context.entry, context.repo_path, context.category) - def run_agent(self, context: EvaluationContext, agent_runner: Callable) -> None: + def run_agent(self, context: EvaluationContext[BugFixEntry], agent_runner: Callable) -> None: with github_log_group(f"{context.agent_name} -- Entry: {context.entry.instance_id}"): context.metrics, context.experiment = agent_runner(context) - def evaluate(self, context: EvaluationContext) -> None: - """Apply test patch, build, and run tests. - - Creates and saves appropriate result based on validation outcome. - """ + def evaluate(self, context: EvaluationContext[BugFixEntry]) -> None: + container = context.get_container() test_projects, _app_projects = categorize_projects(context.entry.project_paths) # Clean test projects to revert any unintended agent changes before capturing diff - # Evaluation focuses on valid changes (app code), treating unintended modifications as out-of-scope noise clean_project_paths(context.repo_path, test_projects) generated_patch = stage_and_get_diff(context.repo_path) @@ -67,12 +56,10 @@ def evaluate(self, context: EvaluationContext) -> None: build_and_publish_projects( context.repo_path, context.entry.project_paths, - context.container_name, - context.username, - context.password, + container, context.entry.environment_setup_version, ) - run_tests(context.entry, context.container_name, context.username, context.password) + run_tests(context.entry, container) result = BugFixResult.create_success(context, generated_patch) logger.info(f"Successfully completed {context.entry.instance_id}") diff --git a/src/bcbench/evaluate/testgeneration.py b/src/bcbench/evaluate/testgeneration.py index 0a8758ab2..11642dee1 100644 --- a/src/bcbench/evaluate/testgeneration.py +++ b/src/bcbench/evaluate/testgeneration.py @@ -1,7 +1,7 @@ from collections.abc import Callable from bcbench.collection.patch_utils import extract_file_paths_from_patch -from bcbench.dataset import TestEntry +from bcbench.dataset import TestEntry, TestGenEntry from bcbench.evaluate.base import EvaluationPipeline from bcbench.exceptions import BuildError, NoTestsExtractedError, TestExecutionError from bcbench.logger import get_logger, github_log_group @@ -24,42 +24,27 @@ __all__ = ["TestGenerationPipeline"] -class TestGenerationPipeline(EvaluationPipeline): - """Pipeline for test-generation evaluation category. +class TestGenerationPipeline(EvaluationPipeline[TestGenEntry]): + """Pipeline for test-generation evaluation category.""" - Workflow: - 1. Setup: clean repo, checkout base commit, build, then apply gold patch/problem statement - 2. Run agent: execute agent to generate test code - 3. Evaluate: build, run tests with expected failures, then apply original patch, build, run tests with expected passes - - Input modes (configured in copilot/config.yaml): - - problem-statement: Agent receives bug description, generates tests from problem statement - - gold-patch: Agent sees the fixed code, generates tests that verify the fix works - - Note: For test-generation, we build BEFORE applying the gold patch so the published app - doesn't include the fix. The agent sees the fix in source but tests run against unfixed app. - """ - - def setup(self, context: EvaluationContext) -> None: + def setup(self, context: EvaluationContext[TestGenEntry]) -> None: setup_repo_prebuild(context.entry, context.repo_path) build_and_publish_projects( context.repo_path, context.entry.project_paths, - context.container_name, - context.username, - context.password, + context.get_container(), context.entry.environment_setup_version, ) - # Apply gold patch / problem statement AFTER build so fix isn't included in published app setup_repo_postbuild(context.entry, context.repo_path, context.category) - def run_agent(self, context: EvaluationContext, agent_runner: Callable) -> None: + def run_agent(self, context: EvaluationContext[TestGenEntry], agent_runner: Callable) -> None: with github_log_group(f"{context.agent_name} -- Entry: {context.entry.instance_id}"): context.metrics, context.experiment = agent_runner(context) - def evaluate(self, context: EvaluationContext) -> None: + def evaluate(self, context: EvaluationContext[TestGenEntry]) -> None: + container = context.get_container() test_projects, app_projects = categorize_projects(context.entry.project_paths) # Clean app projects to revert any unintended agent changes before capturing diff @@ -83,24 +68,20 @@ def evaluate(self, context: EvaluationContext) -> None: build_and_publish_projects( context.repo_path, test_projects, - context.container_name, - context.username, - context.password, + container, context.entry.environment_setup_version, ) - run_test_suite(generated_tests, "Fail", context.container_name, context.username, context.password) + run_test_suite(generated_tests, "Fail", container) apply_patch(context.repo_path, context.entry.patch, f"{context.entry.instance_id} patch") build_and_publish_projects( context.repo_path, app_projects, - context.container_name, - context.username, - context.password, + container, context.entry.environment_setup_version, ) - run_test_suite(generated_tests, "Pass", context.container_name, context.username, context.password) + run_test_suite(generated_tests, "Pass", container) result = TestGenerationResult.create_success(context, generated_patch, pre_patch_failed=True, post_patch_passed=True) logger.info(f"Successfully completed {context.entry.instance_id}") diff --git a/src/bcbench/operations/bc_operations.py b/src/bcbench/operations/bc_operations.py index 76baa631b..cca41e654 100644 --- a/src/bcbench/operations/bc_operations.py +++ b/src/bcbench/operations/bc_operations.py @@ -8,9 +8,11 @@ from pydantic import TypeAdapter from bcbench.config import get_config -from bcbench.dataset import DatasetEntry, TestEntry +from bcbench.dataset import TestEntry +from bcbench.dataset.dataset_entry import _BugFixTestGenBase from bcbench.exceptions import BuildError, BuildTimeoutExpired, TestExecutionError, TestExecutionTimeoutExpired from bcbench.logger import get_logger +from bcbench.types import ContainerConfig logger = get_logger(__name__) _config = get_config() @@ -116,7 +118,7 @@ def build_ps_dataset_tests_script(container_name: str, username: str, password: ) -def build_and_publish_projects(repo_path: Path, project_paths: list[str], container_name: str, username: str, password: str, version: str) -> None: +def build_and_publish_projects(repo_path: Path, project_paths: list[str], container: ContainerConfig, version: str) -> None: """Build and publish all projects.""" logger.info(f"Building and publishing {len(project_paths)} projects") @@ -125,9 +127,9 @@ def build_and_publish_projects(repo_path: Path, project_paths: list[str], contai logger.info(f"Building project: {full_project_path}") ps_script = build_ps_app_build_and_publish( - container_name=container_name, - username=username, - password=password, + container_name=container.name, + username=container.username, + password=container.password, project_path=full_project_path, version=version, ) @@ -157,26 +159,26 @@ def build_and_publish_projects(repo_path: Path, project_paths: list[str], contai logger.info("All projects built and published") -def run_tests(entry: DatasetEntry, container_name: str, username: str, password: str) -> None: +def run_tests(entry: _BugFixTestGenBase, container: ContainerConfig) -> None: if entry.fail_to_pass: logger.info(f"Running {len(entry.fail_to_pass)} fail-to-pass tests") - run_test_suite(entry.fail_to_pass, "Pass", container_name, username, password) + run_test_suite(entry.fail_to_pass, "Pass", container) if entry.pass_to_pass: logger.info(f"Running {len(entry.pass_to_pass)} pass-to-pass tests") - run_test_suite(entry.pass_to_pass, "Pass", container_name, username, password) + run_test_suite(entry.pass_to_pass, "Pass", container) logger.info("All tests completed") -def run_test_suite(test_entries: list[TestEntry], expectation: Literal["Pass", "Fail"], container_name: str, username: str, password: str) -> None: +def run_test_suite(test_entries: list[TestEntry], expectation: Literal["Pass", "Fail"], container: ContainerConfig) -> None: """Run a suite of tests.""" test_entries_json: str = TypeAdapter(list[TestEntry]).dump_json(test_entries).decode() ps_script = build_ps_dataset_tests_script( - container_name=container_name, - username=username, - password=password, + container_name=container.name, + username=container.username, + password=container.password, test_entries_json=test_entries_json, expectation=expectation, ) diff --git a/src/bcbench/operations/instruction_operations.py b/src/bcbench/operations/instruction_operations.py index 761f28c02..a7120267f 100644 --- a/src/bcbench/operations/instruction_operations.py +++ b/src/bcbench/operations/instruction_operations.py @@ -2,7 +2,8 @@ from shutil import copytree, rmtree from bcbench.config import get_config -from bcbench.dataset import DatasetEntry +from bcbench.dataset import BaseDatasetEntry +from bcbench.dataset.dataset_entry import _BugFixTestGenBase from bcbench.logger import get_logger from bcbench.types import AgentType @@ -10,7 +11,7 @@ _config = get_config() -def setup_instructions_from_config(agent_config: dict, entry: DatasetEntry, repo_path: Path, agent_type: AgentType) -> bool: +def setup_instructions_from_config(agent_config: dict, entry: BaseDatasetEntry, repo_path: Path, agent_type: AgentType) -> bool: """ Setup custom instructions from config if enabled. @@ -47,7 +48,7 @@ def setup_instructions_from_config(agent_config: dict, entry: DatasetEntry, repo return instructions_enabled -def setup_custom_agent(agent_config: dict, entry: DatasetEntry, repo_path: Path, agent_type: AgentType) -> str | None: +def setup_custom_agent(agent_config: dict, entry: BaseDatasetEntry, repo_path: Path, agent_type: AgentType) -> str | None: """ Setup custom agents in the repository if available. """ @@ -83,7 +84,7 @@ def _get_source_instructions_path(repo_name: str) -> Path: return instructions_path -def copy_problem_statement_folder(entry: DatasetEntry, repo_path: Path) -> None: +def copy_problem_statement_folder(entry: _BugFixTestGenBase, repo_path: Path) -> None: """ Copy problem statement folder to the testbed repository root. diff --git a/src/bcbench/operations/setup_operations.py b/src/bcbench/operations/setup_operations.py index f2924ddf2..bc502ff9f 100644 --- a/src/bcbench/operations/setup_operations.py +++ b/src/bcbench/operations/setup_operations.py @@ -5,7 +5,7 @@ import yaml from bcbench.config import get_config -from bcbench.dataset import DatasetEntry +from bcbench.dataset.dataset_entry import BaseDatasetEntry, _BugFixTestGenBase from bcbench.logger import get_logger from bcbench.operations.git_operations import apply_patch, checkout_commit, clean_repo from bcbench.operations.instruction_operations import copy_problem_statement_folder @@ -37,7 +37,7 @@ def _get_test_generation_input_mode() -> str: return input_mode -def setup_repo_prebuild(entry: DatasetEntry, repo_path: Path) -> None: +def setup_repo_prebuild(entry: BaseDatasetEntry, repo_path: Path) -> None: """Setup repository before building - clean and checkout base commit. This is the first phase of repo setup that should be called BEFORE build_and_publish_projects. @@ -51,17 +51,14 @@ def setup_repo_prebuild(entry: DatasetEntry, repo_path: Path) -> None: checkout_commit(repo_path, entry.base_commit) -def setup_repo_postbuild(entry: DatasetEntry, repo_path: Path, category: EvaluationCategory) -> None: - """Setup repository after building - apply patches and copy problem statements. +def setup_repo_postbuild(entry: _BugFixTestGenBase, repo_path: Path, category: EvaluationCategory) -> None: + """Setup repository after building for bug-fix and test-generation categories. This is the second phase of repo setup that should be called AFTER build_and_publish_projects. For test-generation, this ensures the gold patch is applied only after the base code is built, so the agent sees the fixed code but tests run against the unfixed published app. - Args: - entry: Dataset entry with instance metadata - repo_path: Path to the repository - category: Evaluation category (bug-fix or test-generation) + Note: Other categories should implement their own postbuild setup. """ if category == EvaluationCategory.TEST_GENERATION: input_mode: str = _get_test_generation_input_mode() diff --git a/src/bcbench/operations/skills_operations.py b/src/bcbench/operations/skills_operations.py index da327eaa8..67f25d219 100644 --- a/src/bcbench/operations/skills_operations.py +++ b/src/bcbench/operations/skills_operations.py @@ -1,7 +1,7 @@ from pathlib import Path from shutil import copytree, rmtree -from bcbench.dataset.dataset_entry import DatasetEntry +from bcbench.dataset.dataset_entry import BaseDatasetEntry from bcbench.logger import get_logger from bcbench.operations.instruction_operations import _get_source_instructions_path from bcbench.types import AgentType @@ -9,7 +9,7 @@ logger = get_logger(__name__) -def setup_agent_skills(agent_config: dict, entry: DatasetEntry, repo_path: Path, agent_type: AgentType) -> bool: +def setup_agent_skills(agent_config: dict, entry: BaseDatasetEntry, repo_path: Path, agent_type: AgentType) -> bool: """ Setup skills in the repository if available. diff --git a/src/bcbench/results/base.py b/src/bcbench/results/base.py index 8e9783727..fffe9beba 100644 --- a/src/bcbench/results/base.py +++ b/src/bcbench/results/base.py @@ -2,7 +2,7 @@ import json from pathlib import Path -from typing import Any, TypeVar +from typing import Any, Self from pydantic import BaseModel @@ -11,9 +11,6 @@ logger = get_logger(__name__) -# Type variable for proper return type hints in class methods -T = TypeVar("T", bound="BaseEvaluationResult") - class BaseEvaluationResult(BaseModel): """Base class for all evaluation results with shared metrics across categories.""" @@ -36,14 +33,14 @@ class BaseEvaluationResult(BaseModel): @classmethod def _create_from_context( - cls: type[T], + cls, context: "EvaluationContext", resolved: bool, build: bool, error_message: str | None = None, generated_patch: str = "", **kwargs: Any, - ) -> T: + ) -> Self: """Create result from EvaluationContext with validation and metric extraction. Args: @@ -80,19 +77,19 @@ def _create_from_context( ) @classmethod - def create_success(cls: type[T], context: "EvaluationContext", generated_patch: str, **kwargs: Any) -> T: + def create_success(cls, context: "EvaluationContext", generated_patch: str, **kwargs: Any) -> Self: return cls._create_from_context(context, resolved=True, build=True, generated_patch=generated_patch, **kwargs) @classmethod - def create_build_failure(cls: type[T], context: "EvaluationContext", generated_patch: str, error_msg: str, **kwargs: Any) -> T: + def create_build_failure(cls, context: "EvaluationContext", generated_patch: str, error_msg: str, **kwargs: Any) -> Self: return cls._create_from_context(context, resolved=False, build=False, error_message=error_msg, generated_patch=generated_patch, **kwargs) @classmethod - def create_test_failure(cls: type[T], context: "EvaluationContext", generated_patch: str, error_msg: str = "Tests failed", **kwargs: Any) -> T: + def create_test_failure(cls, context: "EvaluationContext", generated_patch: str, error_msg: str = "Tests failed", **kwargs: Any) -> Self: return cls._create_from_context(context, resolved=False, build=True, error_message=error_msg, generated_patch=generated_patch, **kwargs) @classmethod - def create_agent_timeout_failure(cls: type[T], context: "EvaluationContext", **kwargs: Any) -> T: + def create_agent_timeout_failure(cls, context: "EvaluationContext", **kwargs: Any) -> Self: return cls._create_from_context(context, resolved=False, build=False, timeout=True, error_message="Agent timed out", **kwargs) def save(self, output_dir: Path, result_file: str) -> None: diff --git a/src/bcbench/results/bceval_export.py b/src/bcbench/results/bceval_export.py index 043e0f84d..eadb79fb0 100644 --- a/src/bcbench/results/bceval_export.py +++ b/src/bcbench/results/bceval_export.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import Any -from bcbench.dataset import DatasetEntry, load_dataset_entries +from bcbench.dataset import BaseDatasetEntry from bcbench.logger import get_logger from bcbench.results.base import BaseEvaluationResult from bcbench.results.testgeneration import TestGenerationResult @@ -15,9 +15,10 @@ logger = get_logger(__name__) -def write_bceval_results(results: list[BaseEvaluationResult], out_dir: Path, run_id: str, dataset_path: Path, output_filename: str) -> None: +def write_bceval_results(results: list[BaseEvaluationResult], out_dir: Path, run_id: str, output_filename: str, category: EvaluationCategory) -> None: """Write results into a JSONL file for bceval consumption.""" - dataset_entries: list[DatasetEntry] = load_dataset_entries(dataset_path) + entry_cls = category.entry_class + dataset_entries: list[BaseDatasetEntry] = entry_cls.load(category.dataset_path) output_file = out_dir / output_filename with open(output_file, "w") as f: @@ -28,7 +29,8 @@ def write_bceval_results(results: list[BaseEvaluationResult], out_dir: Path, run logger.error(f"No matching dataset entry found for instance_id: {result.instance_id}") continue - input, expected = get_info_from_dataset_entry(matching_entries[0], result.category) + matched_entry = matching_entries[0] + input, expected = matched_entry.get_task(), matched_entry.get_expected_output() metadata: dict[str, Any] = { "model": result.model, @@ -61,22 +63,3 @@ def write_bceval_results(results: list[BaseEvaluationResult], out_dir: Path, run f.write(json.dumps(bceval_result) + "\n") logger.info(f"Wrote bceval results to: {output_file}") - - -def get_info_from_dataset_entry(entry: DatasetEntry, category: EvaluationCategory) -> tuple[str, str]: - """ - Extract relevant info from DatasetEntry for bceval results. - - Args: - entry: The DatasetEntry instance - category: The evaluation category - Returns: - A tuple of (input, expected output) - """ - match category: - case EvaluationCategory.BUG_FIX: - return entry.get_task(), entry.patch - case EvaluationCategory.TEST_GENERATION: - return entry.get_task(), entry.test_patch - case _: - raise ValueError(f"Unsupported evaluation category: {category}") diff --git a/src/bcbench/results/reviewer.py b/src/bcbench/results/reviewer.py deleted file mode 100644 index 4b5d760da..000000000 --- a/src/bcbench/results/reviewer.py +++ /dev/null @@ -1,376 +0,0 @@ -""" -Failure Mode Analysis TUI for reviewing evaluation results. - -Shows expected vs actual diffs side-by-side. Use arrow keys to navigate, -number keys to select failure category. Press Escape to quit. - -Copilot wrote most of this code, seems to work, haven't read it all carefully. -""" - -import json -from abc import abstractmethod -from pathlib import Path - -from textual.app import App, ComposeResult -from textual.binding import Binding -from textual.containers import Horizontal, VerticalScroll -from textual.widgets import Footer, Header, Label, Static - -from bcbench.types import EvaluationCategory - - -def _get_patch_field(category: EvaluationCategory) -> str: - return "patch" if category == EvaluationCategory.BUG_FIX else "test_patch" - - -def _get_expected_panel_title(category: EvaluationCategory) -> str: - return "Expected (Gold)" if category == EvaluationCategory.BUG_FIX else "Expected (Test Patch)" - - -# Failure categories (name, description) -FAILURE_CATEGORIES = [ - ("Wrong Solution", "Incorrect approach/logic"), - ("Syntax Error", "AL syntax issues"), - ("Incorrect File", "Wrong file edited"), - ("Missing Using", "Missing using statement for namespace"), - ("Timeout", "Agent timed out and made no changes"), - ("Other", "Doesn't fit above"), - ("Flag For Review", "Needs further review"), -] - - -class BaseReviewer(App): - """Base class for failure mode review TUIs.""" - - CSS = """ - #diff-container { - height: 1fr; - } - .diff-panel { - width: 1fr; - border: solid green; - } - #actual-panel { - border: solid red; - } - """ - - BINDINGS = [ # noqa: RUF012 - Binding("right", "next_item", "Next →"), - Binding("left", "prev_item", "← Prev"), - Binding("1", "select_category(0)", "Wrong Solution"), - Binding("2", "select_category(1)", "Syntax Error"), - Binding("3", "select_category(2)", "Incorrect File"), - Binding("4", "select_category(3)", "Missing Using"), - Binding("5", "select_category(4)", "Timeout"), - Binding("6", "select_category(5)", "Other"), - Binding("7", "select_category(6)", "Flag For Review"), - Binding("escape", "quit", "Quit"), - ] - - def __init__(self, dataset_path: Path, category: EvaluationCategory = EvaluationCategory.BUG_FIX): - super().__init__() - self.dataset_path = dataset_path - self.category = category - self.current_index = 0 - - def compose(self) -> ComposeResult: - yield Header() - yield Label("", id="progress") - with Horizontal(id="diff-container"): - with VerticalScroll(id="expected-panel", classes="diff-panel"): - yield Static("", id="expected-content") - with VerticalScroll(id="actual-panel", classes="diff-panel"): - yield Static("", id="actual-content") - yield Footer() - - def on_mount(self) -> None: - self._load_data() - self._update_display() - - @abstractmethod - def _load_data(self) -> None: - pass - - @abstractmethod - def _get_current_result(self) -> dict | None: - pass - - @abstractmethod - def _get_expected_patch(self, result: dict) -> str: - pass - - @abstractmethod - def _get_progress_text(self, result: dict) -> str: - pass - - @abstractmethod - def _get_actual_panel_title(self) -> str: - pass - - @abstractmethod - def _save(self) -> None: - pass - - @abstractmethod - def _get_item_count(self) -> int: - pass - - @abstractmethod - def _get_reviewed_count(self) -> int: - pass - - def _update_display(self) -> None: - result = self._get_current_result() - if not result: - self.query_one("#progress", Label).update(self._get_empty_message()) - return - - self.query_one("#progress", Label).update(self._get_progress_text(result)) - - expected = self._get_expected_patch(result) - actual = result.get("output", "No output") - - exp_panel = self.query_one("#expected-panel", VerticalScroll) - act_panel = self.query_one("#actual-panel", VerticalScroll) - exp_panel.border_title = _get_expected_panel_title(self.category) - act_panel.border_title = self._get_actual_panel_title() - self.query_one("#expected-content", Static).update(self._format_diff(expected)) - self.query_one("#actual-content", Static).update(self._format_diff(actual)) - exp_panel.scroll_home(animate=False) - act_panel.scroll_home(animate=False) - - def _get_empty_message(self) -> str: - return "No results to review" - - def _format_diff(self, text: str) -> str: - if not text: - return "(empty)" - lines = [] - for line in text.split("\n"): - escaped = line.replace("[", r"\[") - if line.startswith(("+++", "---", "@@")): - lines.append(f"[cyan]{escaped}[/cyan]") - elif line.startswith("+"): - lines.append(f"[green]{escaped}[/green]") - elif line.startswith("-"): - lines.append(f"[red]{escaped}[/red]") - else: - lines.append(escaped) - return "\n".join(lines) - - def _format_status(self, result: dict) -> str: - scores = result.get("scores", {}) - build = scores.get("BuildRate", 0) == 1 - resolved = scores.get("ResolutionRate", 0) == 1 - build_str = "[green]✓[/green]" if build else "[red]✗[/red]" - resolved_str = "[green]✓[/green]" if resolved else "[red]✗[/red]" - current_category = result.get("review", {}).get("failure_category") or "[dim]None[/dim]" - parts: list[str] = [f"Build: {build_str}"] - if self.category == EvaluationCategory.TEST_GENERATION: - post_passed = scores.get("PostPatchPassedRate", 0) == 1 - pre_failed = scores.get("PrePatchFailedRate", 0) == 1 - post_str = "[green]✓[/green]" if post_passed else "[red]✗[/red]" - pre_str = "[green]✓[/green]" if pre_failed else "[red]✗[/red]" - parts.append(f"PrePatchFailed: {pre_str}") - parts.append(f"PostPatchPassed: {post_str}") - parts.append(f"Resolved: {resolved_str}") - parts.append(f"Category: {current_category}") - return " | ".join(parts) - - def _set_category(self, idx: int) -> None: - result = self._get_current_result() - if not result: - return - if "review" not in result: - result["review"] = {} - result["review"]["failure_category"] = FAILURE_CATEGORIES[idx][0] - self._save() - self.notify(f"Set: {FAILURE_CATEGORIES[idx][0]}") - self._update_display() - - def action_next_item(self) -> None: - if self.current_index < self._get_item_count() - 1: - self.current_index += 1 - self._update_display() - - def action_prev_item(self) -> None: - if self.current_index > 0: - self.current_index -= 1 - self._update_display() - - def action_select_category(self, index: int) -> None: - if 0 <= index < len(FAILURE_CATEGORIES): - self._set_category(index) - - def action_quit(self) -> None: - self._save() - self.exit() - - -class InstanceAcrossRunsReviewer(BaseReviewer): - """Review a single instance across all runs in a directory.""" - - TITLE = "Instance Across Runs Review" - - def __init__(self, results_dir: Path, instance_id: str, dataset_path: Path, category: EvaluationCategory = EvaluationCategory.BUG_FIX): - super().__init__(dataset_path, category) - self.results_dir = results_dir - self.instance_id = instance_id - self.run_results: list[tuple[str, dict, Path]] = [] # (run_id, result_dict, file_path) - self.expected_patch: str = "" - - def _load_data(self) -> None: - with open(self.dataset_path, encoding="utf-8") as f: - for line in f: - if line.strip(): - entry = json.loads(line) - if entry["instance_id"] == self.instance_id: - self.expected_patch = entry.get(_get_patch_field(self.category), "") - break - - for jsonl_file in sorted(self.results_dir.glob("*.jsonl")): - run_id = jsonl_file.stem - with open(jsonl_file, encoding="utf-8") as f: - for line in f: - if not line.strip(): - continue - result = json.loads(line) - result_instance_id = result.get("instance_id") or result.get("InstanceID", "") - if result_instance_id == self.instance_id: - self.run_results.append((run_id, result, jsonl_file)) - break - - def _get_current_result(self) -> dict | None: - if not self.run_results or self.current_index >= len(self.run_results): - return None - return self.run_results[self.current_index][1] - - def _get_expected_patch(self, result: dict) -> str: - return self.expected_patch or "Not found in dataset" - - def _get_progress_text(self, result: dict) -> str: - run_id = self.run_results[self.current_index][0] - status = self._format_status(result) - return f"[{self.current_index + 1}/{len(self.run_results)} runs] (reviewed: {self._get_reviewed_count()}) | Run: {run_id} | {status} | {self.instance_id}" - - def _get_actual_panel_title(self) -> str: - run_id = self.run_results[self.current_index][0] - return f"Actual (Run: {run_id})" - - def _get_empty_message(self) -> str: - return f"No results found for instance: {self.instance_id}" - - def _get_item_count(self) -> int: - return len(self.run_results) - - def _get_reviewed_count(self) -> int: - return sum(1 for _, r, _ in self.run_results if r.get("review", {}).get("failure_category")) - - def _save(self) -> None: - if not self.run_results or self.current_index >= len(self.run_results): - return - _, result, file_path = self.run_results[self.current_index] - - all_results = [] - with open(file_path, encoding="utf-8") as f: - for line in f: - if line.strip(): - all_results.append(json.loads(line)) - - for i, r in enumerate(all_results): - r_instance_id = r.get("instance_id") or r.get("InstanceID", "") - if r_instance_id == self.instance_id: - all_results[i] = result - break - - self._write_results(file_path, all_results) - - @staticmethod - def _write_results(file_path: Path, results: list[dict]) -> None: - with open(file_path, "w", encoding="utf-8") as f: - for r in results: - if "review" in r: - ordered = {"review": r["review"]} - ordered.update({k: v for k, v in r.items() if k != "review"}) - f.write(json.dumps(ordered) + "\n") - else: - f.write(json.dumps(r) + "\n") - - -class FailureModeAnalysis(BaseReviewer): - """Review and categorize agent failures in a single results file.""" - - TITLE = "Failure Mode Analysis" - - def __init__(self, results_path: Path, dataset_path: Path, category: EvaluationCategory = EvaluationCategory.BUG_FIX, include_resolved: bool = False): - super().__init__(dataset_path, category) - self.results_path = results_path - self.results: list[dict] = [] - self.unresolved_indices: list[int] = [] - self.dataset_lookup: dict[str, str] = {} - self.include_resolved = include_resolved - - def _load_data(self) -> None: - # Build dataset lookup and track order - dataset_order: dict[str, int] = {} - with open(self.dataset_path, encoding="utf-8") as f: - for idx, line in enumerate(f): - if line.strip(): - entry = json.loads(line) - instance_id = entry["instance_id"] - self.dataset_lookup[instance_id] = entry.get(_get_patch_field(self.category), "") - dataset_order[instance_id] = idx - - with open(self.results_path, encoding="utf-8") as f: - for line in f: - if line.strip(): - self.results.append(json.loads(line)) - - if self.include_resolved: - # Sort indices by dataset order to match dataset review iteration - self.unresolved_indices = sorted( - range(len(self.results)), - key=lambda i: dataset_order.get(self.results[i].get("instance_id") or self.results[i].get("InstanceID", ""), float("inf")), - ) - else: - self.unresolved_indices = [i for i, r in enumerate(self.results) if not r.get("resolved", False) and r.get("scores", {}).get("ResolutionRate", 0) == 0] - - def _get_current_result(self) -> dict | None: - if not self.unresolved_indices or self.current_index >= len(self.unresolved_indices): - return None - return self.results[self.unresolved_indices[self.current_index]] - - def _get_expected_patch(self, result: dict) -> str: - instance_id = result.get("instance_id") or result.get("InstanceID", "unknown") - return self.dataset_lookup.get(instance_id, "Not found in dataset") - - def _get_progress_text(self, result: dict) -> str: - instance_id = result.get("instance_id") or result.get("InstanceID", "unknown") - status = self._format_status(result) - return f"[{self.current_index + 1}/{len(self.unresolved_indices)}] (reviewed: {self._get_reviewed_count()}) | {status} | {instance_id}" - - def _get_actual_panel_title(self) -> str: - return "Actual (Agent)" - - def _get_empty_message(self) -> str: - return "No results to review" if self.include_resolved else "No unresolved results" - - def _get_item_count(self) -> int: - return len(self.unresolved_indices) - - def _get_reviewed_count(self) -> int: - return sum(1 for i in self.unresolved_indices if self.results[i].get("review", {}).get("failure_category")) - - def _save(self) -> None: - InstanceAcrossRunsReviewer._write_results(self.results_path, self.results) - - -def run_reviewer(results_path: Path, dataset_path: Path, category: EvaluationCategory = EvaluationCategory.BUG_FIX, include_resolved: bool = False) -> None: - app = FailureModeAnalysis(results_path, dataset_path, category, include_resolved) - app.run() - - -def run_instance_reviewer(results_dir: Path, instance_id: str, dataset_path: Path, category: EvaluationCategory = EvaluationCategory.BUG_FIX) -> None: - app = InstanceAcrossRunsReviewer(results_dir, instance_id, dataset_path, category) - app.run() diff --git a/src/bcbench/results/testgeneration.py b/src/bcbench/results/testgeneration.py index 01516ee69..fb80d0e05 100644 --- a/src/bcbench/results/testgeneration.py +++ b/src/bcbench/results/testgeneration.py @@ -1,4 +1,6 @@ -from bcbench.results.base import BaseEvaluationResult, T +from typing import Self + +from bcbench.results.base import BaseEvaluationResult from bcbench.types import EvaluationContext @@ -13,5 +15,5 @@ class TestGenerationResult(BaseEvaluationResult): post_patch_passed: bool = False @classmethod - def create_no_tests_extracted(cls: type[T], context: "EvaluationContext", generated_patch: str, error_message: str) -> T: + def create_no_tests_extracted(cls, context: "EvaluationContext", generated_patch: str, error_message: str) -> Self: return cls._create_from_context(context, resolved=False, build=False, generated_patch=generated_patch, error_message=error_message) diff --git a/src/bcbench/types.py b/src/bcbench/types.py index 6489414ae..c2177a8a6 100644 --- a/src/bcbench/types.py +++ b/src/bcbench/types.py @@ -12,9 +12,10 @@ from bcbench.logger import get_logger if TYPE_CHECKING: - from bcbench.dataset import DatasetEntry + from bcbench.dataset import BaseDatasetEntry + from bcbench.evaluate.base import EvaluationPipeline -__all__ = ["AgentMetrics", "AgentType", "EvaluationCategory", "EvaluationContext", "ExperimentConfiguration"] +__all__ = ["AgentMetrics", "AgentType", "ContainerConfig", "EvaluationCategory", "EvaluationContext", "ExperimentConfiguration"] logger = get_logger(__name__) @@ -101,9 +102,52 @@ class EvaluationCategory(str, Enum): # CODE_REVIEW = "code-review" # EVENT_REQUEST = "event-request" + @property + def dataset_path(self) -> Path: + from bcbench.config import get_config + + match self: + case EvaluationCategory.BUG_FIX: + return get_config().paths.dataset_dir / "bcbench.jsonl" + case EvaluationCategory.TEST_GENERATION: + return get_config().paths.dataset_dir / "bcbench.jsonl" + + raise ValueError(f"Unknown evaluation category: {self}") + + @property + def entry_class(self) -> type[BaseDatasetEntry]: + from bcbench.dataset import BugFixEntry, TestGenEntry + + match self: + case EvaluationCategory.BUG_FIX: + return BugFixEntry + case EvaluationCategory.TEST_GENERATION: + return TestGenEntry + + raise ValueError(f"Unknown evaluation category: {self}") + + @property + def pipeline(self) -> EvaluationPipeline: + from bcbench.evaluate import BugFixPipeline, TestGenerationPipeline + + match self: + case EvaluationCategory.BUG_FIX: + return BugFixPipeline() + case EvaluationCategory.TEST_GENERATION: + return TestGenerationPipeline() + + raise ValueError(f"Unknown evaluation category: {self}") + + +@dataclass(frozen=True) +class ContainerConfig: + name: str + username: str + password: str + @dataclass -class EvaluationContext: +class EvaluationContext[E: BaseDatasetEntry]: """Context object containing all configuration for evaluation pipeline. This bundles related configuration together to avoid long parameter lists @@ -111,15 +155,10 @@ class EvaluationContext: """ # Core configuration - entry: DatasetEntry + entry: E repo_path: Path result_dir: Path - # BC Container configuration - container_name: str - password: str - username: str - # Agent metadata agent_name: str model: str @@ -127,8 +166,16 @@ class EvaluationContext: # Evaluation category category: EvaluationCategory + # BC Container configuration (optional — not all categories require a container) + container: ContainerConfig | None = None + # Agent execution metrics metrics: AgentMetrics | None = None # Experiment configuration experiment: ExperimentConfiguration | None = None + + def get_container(self) -> ContainerConfig: + if self.container is None: + raise ValueError(f"Container configuration is required for {self.category.value} evaluation") + return self.container diff --git a/tests/conftest.py b/tests/conftest.py index a1ee29531..6c6ab4922 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,7 @@ """ Shared test fixtures for BC-Bench tests. -This module provides reusable fixtures that create valid DatasetEntry objects +This module provides reusable fixtures that create valid BugFixEntry objects meeting all Pydantic validation requirements. """ @@ -12,12 +12,13 @@ import pytest -from bcbench.dataset import DatasetEntry, TestEntry +from bcbench.dataset import BugFixEntry, TestEntry +from bcbench.dataset.dataset_entry import _BugFixTestGenBase from bcbench.results.bugfix import BugFixResult from bcbench.results.testgeneration import TestGenerationResult -from bcbench.types import AgentMetrics, EvaluationCategory, EvaluationContext +from bcbench.types import AgentMetrics, ContainerConfig, EvaluationCategory, EvaluationContext -# Valid test data that passes all DatasetEntry validation rules +# Valid test data that passes all BugFixEntry validation rules VALID_INSTANCE_ID = "microsoftInternal__NAV-123456" VALID_REPO = "microsoftInternal/NAV" VALID_BASE_COMMIT = "a" * 40 # 40-character hex string @@ -49,7 +50,7 @@ def create_dataset_entry( created_at: str = VALID_CREATED_AT, fail_to_pass: list[TestEntry] | None = None, pass_to_pass: list[TestEntry] | None = None, -) -> DatasetEntry: +) -> BugFixEntry: if project_paths is None: project_paths = VALID_PROJECT_PATHS.copy() if fail_to_pass is None: @@ -57,7 +58,7 @@ def create_dataset_entry( if pass_to_pass is None: pass_to_pass = [] - return DatasetEntry( + return BugFixEntry( instance_id=instance_id, repo=repo, base_commit=base_commit, @@ -73,24 +74,22 @@ def create_dataset_entry( def create_evaluation_context( tmp_path: Path, - entry: DatasetEntry | None = None, + entry: BugFixEntry | None = None, agent_name: str = "test-agent", model: str = "test-model", category: EvaluationCategory = EvaluationCategory.BUG_FIX, container_name: str = "test-container", password: str = "test-password", username: str = "test-user", -) -> EvaluationContext: +) -> EvaluationContext[BugFixEntry]: if entry is None: entry = create_dataset_entry() - return EvaluationContext( + return EvaluationContext[BugFixEntry]( entry=entry, repo_path=tmp_path / "repo", result_dir=tmp_path / "results", - container_name=container_name, - password=password, - username=username, + container=ContainerConfig(name=container_name, username=username, password=password), agent_name=agent_name, model=model, category=category, @@ -151,7 +150,7 @@ def create_testgen_result( ) -def create_dataset_file(tmp_path: Path, entries: list[DatasetEntry] | None = None) -> Path: +def create_dataset_file(tmp_path: Path, entries: list[BugFixEntry] | None = None) -> Path: if entries is None: entries = [create_dataset_entry()] @@ -188,12 +187,12 @@ def sample_test_entry() -> TestEntry: @pytest.fixture -def sample_dataset_entry() -> DatasetEntry: +def sample_dataset_entry() -> BugFixEntry: return create_dataset_entry() @pytest.fixture -def sample_evaluation_context(tmp_path: Path) -> EvaluationContext: +def sample_evaluation_context(tmp_path: Path) -> EvaluationContext[BugFixEntry]: return create_evaluation_context(tmp_path) @@ -225,10 +224,9 @@ def sample_bugfix_result_with_metrics() -> BugFixResult: @pytest.fixture -def sample_dataset_entry_with_problem_statement(tmp_path: Path) -> Generator[DatasetEntry]: +def sample_dataset_entry_with_problem_statement(tmp_path: Path) -> Generator[BugFixEntry]: problem_dir = create_problem_statement_dir(tmp_path) entry = create_dataset_entry() - # Patch the problem_statement_dir property to return our temp directory - with patch.object(type(entry), "problem_statement_dir", property(lambda self: problem_dir)): + with patch.object(_BugFixTestGenBase, "problem_statement_dir", property(lambda self: problem_dir)): yield entry diff --git a/tests/test_agent_skills.py b/tests/test_agent_skills.py index c49028c74..4025d72f2 100644 --- a/tests/test_agent_skills.py +++ b/tests/test_agent_skills.py @@ -6,7 +6,7 @@ from tempfile import TemporaryDirectory from unittest.mock import MagicMock -from bcbench.dataset import DatasetEntry +from bcbench.dataset import BaseDatasetEntry from bcbench.operations import setup_agent_skills from bcbench.operations.instruction_operations import _get_source_instructions_path from bcbench.types import AgentType @@ -24,7 +24,7 @@ def test_setup_agent_skills(): with TemporaryDirectory() as tmpdir: repo_path = Path(tmpdir) - entry = MagicMock(spec=DatasetEntry) + entry = MagicMock(spec=BaseDatasetEntry) entry.repo = "microsoftInternal/NAV" config = {"skills": {"enabled": True}} @@ -69,7 +69,7 @@ def test_nonexistent_skills(): """Test that setup_agent_skills raises FileNotFoundError for nonexistent repo.""" with TemporaryDirectory() as tmpdir: repo_path = Path(tmpdir) - entry = MagicMock(spec=DatasetEntry) + entry = MagicMock(spec=BaseDatasetEntry) entry.repo = "nonexistent/repo" config = {"skills": {"enabled": True}} @@ -92,7 +92,7 @@ def test_overwrite_skill_folder_files(): with TemporaryDirectory() as tmpdir: repo_path = Path(tmpdir) - entry = MagicMock(spec=DatasetEntry) + entry = MagicMock(spec=BaseDatasetEntry) entry.repo = "microsoftInternal/NAV" config = {"skills": {"enabled": True}} @@ -122,7 +122,7 @@ def test_overwrite_skill_folder_files(): def test_path_specific_skills_copied(): with TemporaryDirectory() as tmpdir: repo_path = Path(tmpdir) - entry = MagicMock(spec=DatasetEntry) + entry = MagicMock(spec=BaseDatasetEntry) entry.repo = "microsoftInternal/NAV" config = {"skills": {"enabled": True}} @@ -141,7 +141,7 @@ def test_path_specific_skills_copied(): def test_path_specific_skills_removed_before_copy(): with TemporaryDirectory() as tmpdir: repo_path = Path(tmpdir) - entry = MagicMock(spec=DatasetEntry) + entry = MagicMock(spec=BaseDatasetEntry) entry.repo = "microsoftInternal/NAV" config = {"skills": {"enabled": True}} @@ -166,7 +166,7 @@ def test_skills_disabled(): """When skills disabled, should return False and not create directory.""" with TemporaryDirectory() as tmpdir: repo_path = Path(tmpdir) - entry = MagicMock(spec=DatasetEntry) + entry = MagicMock(spec=BaseDatasetEntry) entry.repo = "microsoftInternal/NAV" config = {"skills": {"enabled": False}} diff --git a/tests/test_cli_commands.py b/tests/test_cli_commands.py index eb5c99dd4..2290cff5c 100644 --- a/tests/test_cli_commands.py +++ b/tests/test_cli_commands.py @@ -1,14 +1,14 @@ """Integration tests for CLI commands using Typer's CliRunner.""" import json -from unittest.mock import patch +from unittest.mock import PropertyMock, patch import pytest from typer.testing import CliRunner from bcbench.cli import app -from bcbench.dataset import DatasetEntry -from bcbench.types import AgentMetrics +from bcbench.dataset.dataset_entry import _BugFixTestGenBase +from bcbench.types import AgentMetrics, EvaluationCategory from tests.conftest import ( create_bugfix_result, create_dataset_entry, @@ -95,18 +95,21 @@ def test_result_summarize_creates_all_outputs(sample_results_directory, problem_ base_path, run_id, dataset_path = sample_results_directory results_dir = base_path / run_id - with patch.object(DatasetEntry, "problem_statement_dir", property(lambda self: problem_statement_dir)): + with ( + patch.object(_BugFixTestGenBase, "problem_statement_dir", property(lambda self: problem_statement_dir)), + patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=dataset_path), + ): result = runner.invoke( app, [ "result", "summarize", + "--category", + "bug-fix", "--run-id", run_id, "--result-dir", str(base_path), - "--dataset-path", - str(dataset_path), ], ) @@ -126,18 +129,21 @@ def test_result_summarize_verifies_summary_calculations(sample_results_directory base_path, run_id, dataset_path = sample_results_directory results_dir = base_path / run_id - with patch.object(DatasetEntry, "problem_statement_dir", property(lambda self: problem_statement_dir)): + with ( + patch.object(_BugFixTestGenBase, "problem_statement_dir", property(lambda self: problem_statement_dir)), + patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=dataset_path), + ): result = runner.invoke( app, [ "result", "summarize", + "--category", + "bug-fix", "--run-id", run_id, "--result-dir", str(base_path), - "--dataset-path", - str(dataset_path), ], ) @@ -162,6 +168,8 @@ def test_result_summarize_missing_directory_fails_gracefully(tmp_path): [ "result", "summarize", + "--category", + "bug-fix", "--run-id", "nonexistent_run", "--result-dir", @@ -187,6 +195,8 @@ def test_result_summarize_no_matching_files_fails_gracefully(tmp_path): [ "result", "summarize", + "--category", + "bug-fix", "--run-id", run_id, "--result-dir", @@ -201,18 +211,21 @@ def test_result_summarize_no_matching_files_fails_gracefully(tmp_path): def test_result_summarize_with_custom_pattern(sample_results_directory, problem_statement_dir): base_path, run_id, dataset_path = sample_results_directory - with patch.object(DatasetEntry, "problem_statement_dir", property(lambda self: problem_statement_dir)): + with ( + patch.object(_BugFixTestGenBase, "problem_statement_dir", property(lambda self: problem_statement_dir)), + patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=dataset_path), + ): result = runner.invoke( app, [ "result", "summarize", + "--category", + "bug-fix", "--run-id", run_id, "--result-dir", str(base_path), - "--dataset-path", - str(dataset_path), "--result-pattern", "*.jsonl", ], @@ -223,15 +236,14 @@ def test_result_summarize_with_custom_pattern(sample_results_directory, problem_ @pytest.mark.integration def test_dataset_list_displays_all_entries(sample_dataset_file_for_cli): - result = runner.invoke( - app, - [ - "dataset", - "list", - "--dataset-path", - str(sample_dataset_file_for_cli), - ], - ) + with patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=sample_dataset_file_for_cli): + result = runner.invoke( + app, + [ + "dataset", + "list", + ], + ) assert result.exit_code == 0 assert "microsoftInternal__NAV-1" in result.stdout @@ -244,15 +256,14 @@ def test_dataset_list_displays_all_entries(sample_dataset_file_for_cli): def test_dataset_list_missing_file_fails_gracefully(tmp_path): nonexistent_path = tmp_path / "nonexistent.jsonl" - result = runner.invoke( - app, - [ - "dataset", - "list", - "--dataset-path", - str(nonexistent_path), - ], - ) + with patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=nonexistent_path): + result = runner.invoke( + app, + [ + "dataset", + "list", + ], + ) assert result.exit_code != 0 @@ -262,15 +273,14 @@ def test_dataset_list_empty_file_shows_zero_entries(tmp_path): empty_dataset = tmp_path / "empty.jsonl" empty_dataset.write_text("") - result = runner.invoke( - app, - [ - "dataset", - "list", - "--dataset-path", - str(empty_dataset), - ], - ) + with patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=empty_dataset): + result = runner.invoke( + app, + [ + "dataset", + "list", + ], + ) assert result.exit_code == 0 assert "Found 0 entry(ies)" in result.stdout @@ -281,15 +291,14 @@ def test_dataset_list_single_entry(tmp_path): entry = create_dataset_entry(instance_id="microsoftInternal__NAV-100") dataset_path = create_dataset_file(tmp_path, [entry]) - result = runner.invoke( - app, - [ - "dataset", - "list", - "--dataset-path", - str(dataset_path), - ], - ) + with patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=dataset_path): + result = runner.invoke( + app, + [ + "dataset", + "list", + ], + ) assert result.exit_code == 0 assert "microsoftInternal__NAV-100" in result.stdout @@ -298,15 +307,14 @@ def test_dataset_list_single_entry(tmp_path): @pytest.mark.integration def test_dataset_list_verifies_entry_format(sample_dataset_file_for_cli): - result = runner.invoke( - app, - [ - "dataset", - "list", - "--dataset-path", - str(sample_dataset_file_for_cli), - ], - ) + with patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=sample_dataset_file_for_cli): + result = runner.invoke( + app, + [ + "dataset", + "list", + ], + ) assert result.exit_code == 0 # Should contain instance_id of first entry diff --git a/tests/test_custom_instructions.py b/tests/test_custom_instructions.py index 1a3eb54a2..caf7a56f4 100644 --- a/tests/test_custom_instructions.py +++ b/tests/test_custom_instructions.py @@ -8,7 +8,7 @@ from unittest.mock import MagicMock from bcbench.config import get_config -from bcbench.dataset import DatasetEntry +from bcbench.dataset import BaseDatasetEntry from bcbench.operations.instruction_operations import ( _get_source_instructions_path, setup_instructions_from_config, @@ -30,7 +30,7 @@ def test_setup_custom_instructions(): with TemporaryDirectory() as tmpdir: repo_path = Path(tmpdir) - entry = MagicMock(spec=DatasetEntry) + entry = MagicMock(spec=BaseDatasetEntry) entry.repo = "microsoftInternal/NAV" config = {"instructions": {"enabled": True}} @@ -85,7 +85,7 @@ def test_overwrite_existing_instructions(): with TemporaryDirectory() as tmpdir: repo_path = Path(tmpdir) - entry = MagicMock(spec=DatasetEntry) + entry = MagicMock(spec=BaseDatasetEntry) entry.repo = "microsoftInternal/NAV" config = {"instructions": {"enabled": True}} @@ -110,7 +110,7 @@ def test_overwrite_existing_instructions(): def test_path_specific_instructions_removed_before_copy(): with TemporaryDirectory() as tmpdir: repo_path = Path(tmpdir) - entry = MagicMock(spec=DatasetEntry) + entry = MagicMock(spec=BaseDatasetEntry) entry.repo = "microsoftInternal/NAV" config = {"instructions": {"enabled": True}} @@ -133,7 +133,7 @@ def test_path_specific_instructions_removed_before_copy(): def test_no_path_specific_instructions_warning(): with TemporaryDirectory() as tmpdir: repo_path = Path(tmpdir) - entry = MagicMock(spec=DatasetEntry) + entry = MagicMock(spec=BaseDatasetEntry) entry.repo = "microsoftInternal/NAV" config = {"instructions": {"enabled": True}} @@ -149,7 +149,7 @@ def test_no_path_specific_instructions_warning(): def test_empty_instructions_folder_warning(): with TemporaryDirectory() as tmpdir: repo_path = Path(tmpdir) - entry = MagicMock(spec=DatasetEntry) + entry = MagicMock(spec=BaseDatasetEntry) entry.repo = "microsoftInternal/NAV" config = {"instructions": {"enabled": True}} @@ -167,7 +167,7 @@ def test_claude_instructions_renamed(): with TemporaryDirectory() as tmpdir: repo_path = Path(tmpdir) - entry = MagicMock(spec=DatasetEntry) + entry = MagicMock(spec=BaseDatasetEntry) entry.repo = "microsoftInternal/NAV" config = {"instructions": {"enabled": True}} diff --git a/tests/test_ps_templates.py b/tests/test_ps_templates.py index 82cb7e983..7cc0bb658 100644 --- a/tests/test_ps_templates.py +++ b/tests/test_ps_templates.py @@ -5,6 +5,7 @@ from bcbench.config import get_config from bcbench.dataset import TestEntry from bcbench.operations import bc_operations +from bcbench.types import ContainerConfig _config = get_config() @@ -225,9 +226,7 @@ def test_test_entries_serialized_as_json(self, mock_subprocess): bc_operations.run_test_suite( test_entries=test_entries, expectation="Pass", - container_name="bcserver", - username="admin", - password="Test123", + container=ContainerConfig(name="bcserver", username="admin", password="Test123"), ) assert len(mock_subprocess) == 1 @@ -248,9 +247,7 @@ def test_multiple_test_entries_serialized_as_json(self, mock_subprocess): bc_operations.run_test_suite( test_entries=test_entries, expectation="Pass", - container_name="bcserver", - username="admin", - password="Test123", + container=ContainerConfig(name="bcserver", username="admin", password="Test123"), ) assert len(mock_subprocess) == 1 diff --git a/tests/test_result_writer.py b/tests/test_result_writer.py index 97138b203..4133f8714 100644 --- a/tests/test_result_writer.py +++ b/tests/test_result_writer.py @@ -1,9 +1,9 @@ import json -from unittest.mock import patch +from unittest.mock import PropertyMock, patch -from bcbench.dataset import DatasetEntry +from bcbench.dataset.dataset_entry import _BugFixTestGenBase from bcbench.results.bceval_export import write_bceval_results -from bcbench.types import AgentMetrics +from bcbench.types import AgentMetrics, EvaluationCategory from tests.conftest import VALID_INSTANCE_ID, create_bugfix_result @@ -12,13 +12,16 @@ def test_writes_bceval_results_with_all_fields(self, tmp_path, sample_dataset_fi output_dir = tmp_path / "output" output_dir.mkdir() - with patch.object(DatasetEntry, "problem_statement_dir", property(lambda self: problem_statement_dir)): + with ( + patch.object(_BugFixTestGenBase, "problem_statement_dir", property(lambda self: problem_statement_dir)), + patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=sample_dataset_file), + ): write_bceval_results( results=[sample_bugfix_result_with_metrics], out_dir=output_dir, run_id="test_run_123", - dataset_path=sample_dataset_file, output_filename="results.jsonl", + category=EvaluationCategory.BUG_FIX, ) output_file = output_dir / "results.jsonl" @@ -43,13 +46,16 @@ def test_handles_none_prompt_tokens(self, tmp_path, sample_dataset_file, sample_ output_dir = tmp_path / "output" output_dir.mkdir() - with patch.object(DatasetEntry, "problem_statement_dir", property(lambda self: problem_statement_dir)): + with ( + patch.object(_BugFixTestGenBase, "problem_statement_dir", property(lambda self: problem_statement_dir)), + patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=sample_dataset_file), + ): write_bceval_results( results=[sample_testgen_result], out_dir=output_dir, run_id="test_run_456", - dataset_path=sample_dataset_file, output_filename="results.jsonl", + category=EvaluationCategory.TEST_GENERATION, ) output_file = output_dir / "results.jsonl" @@ -65,13 +71,16 @@ def test_handles_mixed_results(self, tmp_path, sample_dataset_file, sample_bugfi output_dir = tmp_path / "output" output_dir.mkdir() - with patch.object(DatasetEntry, "problem_statement_dir", property(lambda self: problem_statement_dir)): + with ( + patch.object(_BugFixTestGenBase, "problem_statement_dir", property(lambda self: problem_statement_dir)), + patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=sample_dataset_file), + ): write_bceval_results( results=[sample_bugfix_result_with_metrics, sample_testgen_result], out_dir=output_dir, run_id="test_run_789", - dataset_path=sample_dataset_file, output_filename="results.jsonl", + category=EvaluationCategory.BUG_FIX, ) output_file = output_dir / "results.jsonl" @@ -94,13 +103,16 @@ def test_includes_expected_fields_in_bceval_format(self, tmp_path, sample_datase output_dir = tmp_path / "output" output_dir.mkdir() - with patch.object(DatasetEntry, "problem_statement_dir", property(lambda self: problem_statement_dir)): + with ( + patch.object(_BugFixTestGenBase, "problem_statement_dir", property(lambda self: problem_statement_dir)), + patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=sample_dataset_file), + ): write_bceval_results( results=[sample_bugfix_result_with_metrics], out_dir=output_dir, run_id="test_run_abc", - dataset_path=sample_dataset_file, output_filename="results.jsonl", + category=EvaluationCategory.BUG_FIX, ) output_file = output_dir / "results.jsonl" @@ -127,13 +139,14 @@ def test_skips_results_without_matching_dataset_entry(self, tmp_path, sample_dat metrics=AgentMetrics(prompt_tokens=1000, completion_tokens=200), ) - write_bceval_results( - results=[non_matching_result], - out_dir=output_dir, - run_id="test_run_xyz", - dataset_path=sample_dataset_file, - output_filename="results.jsonl", - ) + with patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=sample_dataset_file): + write_bceval_results( + results=[non_matching_result], + out_dir=output_dir, + run_id="test_run_xyz", + output_filename="results.jsonl", + category=EvaluationCategory.BUG_FIX, + ) output_file = output_dir / "results.jsonl" with open(output_file) as f: @@ -148,13 +161,16 @@ def test_handles_partial_none_metrics(self, tmp_path, sample_dataset_file, probl result = create_bugfix_result(metrics=AgentMetrics(execution_time=100.0, prompt_tokens=None, completion_tokens=1500)) - with patch.object(DatasetEntry, "problem_statement_dir", property(lambda self: problem_statement_dir)): + with ( + patch.object(_BugFixTestGenBase, "problem_statement_dir", property(lambda self: problem_statement_dir)), + patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=sample_dataset_file), + ): write_bceval_results( results=[result], out_dir=output_dir, run_id="test_run_partial", - dataset_path=sample_dataset_file, output_filename="results.jsonl", + category=EvaluationCategory.BUG_FIX, ) output_file = output_dir / "results.jsonl" diff --git a/tests/test_type_exhaustiveness.py b/tests/test_type_exhaustiveness.py index a229628b8..3da3844f7 100644 --- a/tests/test_type_exhaustiveness.py +++ b/tests/test_type_exhaustiveness.py @@ -1,8 +1,6 @@ from pathlib import Path -from bcbench.dataset import DatasetEntry -from bcbench.evaluate import create_pipeline -from bcbench.results.bceval_export import get_info_from_dataset_entry +from bcbench.dataset import BugFixEntry from bcbench.types import AgentType, EvaluationCategory @@ -23,13 +21,23 @@ def test_all_agent_types_have_instruction_filename(): def test_all_categories_have_pipelines(): for category in EvaluationCategory: - pipeline = create_pipeline(category) + pipeline = category.pipeline assert pipeline is not None -def test_all_categories_handled_in_get_info_from_dataset_entry(sample_dataset_entry_with_problem_statement: DatasetEntry): +def test_all_categories_have_entry_classes(): for category in EvaluationCategory: - input_text, expected_output = get_info_from_dataset_entry(sample_dataset_entry_with_problem_statement, category) + entry_cls = category.entry_class + assert entry_cls is not None + + +def test_all_categories_handled_in_get_expected_output(sample_dataset_entry_with_problem_statement: BugFixEntry): + for category in EvaluationCategory: + entry_cls = category.entry_class + # Reconstruct entry as the category-specific type so get_expected_output() works + entry = entry_cls.model_validate(sample_dataset_entry_with_problem_statement.model_dump(by_alias=True)) + input_text = entry.get_task() + expected_output = entry.get_expected_output() assert isinstance(input_text, str) assert isinstance(expected_output, str) assert len(expected_output) > 0