diff --git a/.github/actions/setup-bc-container/action.yml b/.github/actions/setup-bc-container/action.yml index 700cc58c9..3b71c1457 100644 --- a/.github/actions/setup-bc-container/action.yml +++ b/.github/actions/setup-bc-container/action.yml @@ -14,6 +14,10 @@ inputs: github-token: description: GitHub token for accessing public repositories required: true + skip-container: + description: Skip BC container setup (only clone repository) + required: false + default: "false" outputs: repo_path: @@ -24,6 +28,7 @@ runs: using: composite steps: - name: Generate BC container name and credentials + if: inputs.skip-container != 'true' run: | # Generate a 32-character random password using Get-Random # The password is short-lived and only used for the duration of the workflow @@ -38,6 +43,7 @@ runs: shell: pwsh - name: Install BcContainerHelper module + if: inputs.skip-container != 'true' run: Install-Module -Name BcContainerHelper -Force -AllowClobber -AllowPrerelease shell: pwsh @@ -59,5 +65,5 @@ runs: $env:ADO_TOKEN = az account get-access-token --resource "499b84ac-1321-427f-aa17-267ca6975798" --query accessToken -o tsv Write-Output "::add-mask::$env:ADO_TOKEN" - .\scripts\Setup-ContainerAndRepository.ps1 -InstanceId "${{ inputs.instance-id }}" + .\scripts\Setup-ContainerAndRepository.ps1 -InstanceId "${{ inputs.instance-id }}" ${{ inputs.skip-container == 'true' && '-SkipContainer' || '' }} shell: pwsh diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 98156aef5..76b60e234 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -14,6 +14,9 @@ This is a benchmark for evaluating coding agents on real-world Business Central - Uses `uv` for dependency management: e.g. `uv add ` to add packages, `uv run ` to run commands - Uses `pre-commit` for code quality checks (ruff linting/formatting, trailing whitespace, etc.) +## Categories +BC-Bench is category-based and designed to grow over time. It currently has two categories, `bug-fix` and `test-generation`. They share the same dataset tasks and execution-based setup, but use different prompts, expected outputs, and evaluation pipelines. Future categories such as `code-review` can be added within the same overall benchmark structure, though they may require different inputs, setup, or evaluation methods. + ## Coding Patterns and Guidelines - Prefer strong typing and type hints diff --git a/notebooks/bug-fix/overview.ipynb b/notebooks/bug-fix/overview.ipynb index 49211055d..8e31e3804 100644 --- a/notebooks/bug-fix/overview.ipynb +++ b/notebooks/bug-fix/overview.ipynb @@ -269,7 +269,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "8b5bb1be", "metadata": {}, "outputs": [ @@ -291,7 +291,7 @@ "merged_df[\"image_bin\"] = pd.cut(merged_df[\"image_count\"], bins=bins, labels=labels)\n", "\n", "# Add problem statement char count\n", - "ps_chars = {entry.instance_id: len(entry.get_task(transform_image_paths=False)) for entry in bcbench_dataset}\n", + "ps_chars = {entry.instance_id: len(entry.get_task()) for entry in bcbench_dataset}\n", "merged_df[\"ps_chars\"] = merged_df[\"instance_id\"].map(ps_chars)\n", "\n", "instance_df = (\n", diff --git a/scripts/Setup-ContainerAndRepository.ps1 b/scripts/Setup-ContainerAndRepository.ps1 index 36e665ed3..77f6d8d4b 100644 --- a/scripts/Setup-ContainerAndRepository.ps1 +++ b/scripts/Setup-ContainerAndRepository.ps1 @@ -25,7 +25,10 @@ param( [SecureString]$Password, [Parameter(Mandatory = $false)] - [string]$RepoPath + [string]$RepoPath, + + [Parameter(Mandatory = $false)] + [switch]$SkipContainer ) [DatasetEntry[]] $entries = Get-DatasetEntries -DatasetPath $DatasetPath -Version $Version -InstanceId $InstanceId @@ -37,9 +40,7 @@ else { Write-Log "Found $($entries.Count) dataset entries to process." -Level Info } -Write-Log "Setting up BC container and repository for version $Version, Dataset Path: $DatasetPath" -Level Info - -[PSCredential]$credential = Get-BCCredential -Username $Username -Password $Password +Write-Log "Setting up repository for version $Version, Dataset Path: $DatasetPath" -Level Info if (-not $RepoPath) { $RepoPath = Join-Path -Path $env:GITHUB_WORKSPACE -ChildPath "testbed" @@ -56,27 +57,34 @@ if (Test-Path $RepoPath) { Write-Log "Cloning repository $($entries[0].repo) to $RepoPath" -Level Info Invoke-GitCloneWithRetry -RepoUrl $cloneInfo.Url -Token $cloneInfo.Token -ClonePath $RepoPath -CommitSha $commitSha -SparseCheckoutPaths $cloneInfo.SparseCheckoutPaths -Import-Module BcContainerHelper -Force -DisableNameChecking +if (-not $SkipContainer) { + [PSCredential]$credential = Get-BCCredential -Username $Username -Password $Password -Write-Log "Container name: $ContainerName" -Level Info + Import-Module BcContainerHelper -Force -DisableNameChecking -if (Test-ContainerExists -containerName $ContainerName) { - throw "Container $ContainerName already exists. This indicates the machine was not properly cleaned up from a previous run." -} + Write-Log "Container name: $ContainerName" -Level Info -Write-Log "Creating container $ContainerName for version $Version..." -Level Info + if (Test-ContainerExists -containerName $ContainerName) { + throw "Container $ContainerName already exists. This indicates the machine was not properly cleaned up from a previous run." + } -# Get BC artifact URL -[string] $url = Get-BCArtifactUrl -version $Version -Country $Country -Write-Log "Retrieved artifact URL: $url" -Level Info + Write-Log "Creating container $ContainerName for version $Version..." -Level Info -# Create container synchronously with NAV folder shared -New-BCContainerSync -ContainerName $ContainerName -Version $Version -ArtifactUrl $url -Credential $credential -AdditionalFolders @($RepoPath) + # Get BC artifact URL + [string] $url = Get-BCArtifactUrl -version $Version -Country $Country + Write-Log "Retrieved artifact URL: $url" -Level Info -# Create compiler folder synchronously -New-BCCompilerFolderSync -ContainerName $ContainerName -ArtifactUrl $url + # Create container synchronously with NAV folder shared + New-BCContainerSync -ContainerName $ContainerName -Version $Version -ArtifactUrl $url -Credential $credential -AdditionalFolders @($RepoPath) -Initialize-ContainerForDevelopment -ContainerName $ContainerName -RepoVersion ([System.Version]$Version) + # Create compiler folder synchronously + New-BCCompilerFolderSync -ContainerName $ContainerName -ArtifactUrl $url + + Initialize-ContainerForDevelopment -ContainerName $ContainerName -RepoVersion ([System.Version]$Version) +} +else { + Write-Log "Skipping BC container setup (SkipContainer flag set)" -Level Info +} # Set output for GitHub Actions or return path if ($env:GITHUB_OUTPUT) { diff --git a/src/bcbench/agent/mini/agent.py b/src/bcbench/agent/mini/agent.py index fec150331..0fbba7088 100644 --- a/src/bcbench/agent/mini/agent.py +++ b/src/bcbench/agent/mini/agent.py @@ -73,7 +73,9 @@ def run_mini_agent( logger.info(f"Running mini-bc-agent on: {entry.instance_id}") - task: str = entry.get_task(transform_image_paths=True) + from bcbench.agent.shared.prompt import _transform_image_paths + + task: str = _transform_image_paths(entry.get_task()) # Lazy import and create agent from minisweagent.models.litellm_model import LitellmModel diff --git a/src/bcbench/agent/shared/prompt.py b/src/bcbench/agent/shared/prompt.py index 474105993..1dd36d360 100644 --- a/src/bcbench/agent/shared/prompt.py +++ b/src/bcbench/agent/shared/prompt.py @@ -1,10 +1,19 @@ +import re from pathlib import Path from jinja2 import Template +from bcbench.config import get_config from bcbench.dataset import BaseDatasetEntry from bcbench.types import EvaluationCategory +_config = get_config() + + +def _transform_image_paths(content: str) -> str: + dest_dir = _config.file_patterns.problem_statement_dest_dir + return re.sub(r"!\[([^\]]*)\]\(\./([^)]+)\)", rf"![\1]({dest_dir}/\2)", content) + def build_prompt(entry: BaseDatasetEntry, repo_path: Path, config: dict, category: EvaluationCategory, al_mcp: bool = False) -> str: prompt_config = config.get("prompt", {}) @@ -15,10 +24,12 @@ def build_prompt(entry: BaseDatasetEntry, repo_path: Path, config: dict, categor is_gold_patch: bool = category == EvaluationCategory.TEST_GENERATION and test_gen_input in ("gold-patch", "both") is_problem_statement: bool = category == EvaluationCategory.TEST_GENERATION and test_gen_input in ("problem-statement", "both") + task = _transform_image_paths(entry.get_task()) + template = Template(template_str) return template.render( repo_path=repo_path, - task=entry.get_task(transform_image_paths=True), + task=task, project_paths=", ".join(entry.project_paths), include_project_paths=include_project_paths, is_gold_patch=is_gold_patch, # only relevant for test-generation diff --git a/src/bcbench/commands/dataset.py b/src/bcbench/commands/dataset.py index 732faee7a..678c9502e 100644 --- a/src/bcbench/commands/dataset.py +++ b/src/bcbench/commands/dataset.py @@ -93,8 +93,9 @@ def view_entry( metadata_dict = entry.metadata.model_dump() for field_name, field_value in metadata_dict.items(): - display_name = field_name.replace("_", " ").title() - info_table.add_row(f"[dim]Metadata:[/dim] {display_name}", str(field_value) if field_value else "N/A") + if field_value is not None: + display_name = field_name.replace("_", " ").title() + info_table.add_row(f"[dim]Metadata:[/dim] {display_name}", str(field_value)) console.print(Panel(info_table, title="[bold]Entry Information[/bold]", border_style="blue")) diff --git a/src/bcbench/commands/evaluate.py b/src/bcbench/commands/evaluate.py index ffef5d745..1322ac473 100644 --- a/src/bcbench/commands/evaluate.py +++ b/src/bcbench/commands/evaluate.py @@ -23,7 +23,7 @@ from bcbench.dataset import BaseDatasetEntry from bcbench.evaluate import EvaluationPipeline from bcbench.logger import get_logger -from bcbench.results import BaseEvaluationResult +from bcbench.results import BaseEvaluationResult, ExecutionBasedEvaluationResult from bcbench.types import AgentMetrics, ContainerConfig, EvaluationContext, ExperimentConfiguration logger = get_logger(__name__) @@ -228,6 +228,9 @@ class MockEvaluationPipeline(EvaluationPipeline[BaseDatasetEntry]): It randomly generates different scenarios to test result handling and serialization. """ + def setup_workspace(self, entry: BaseDatasetEntry, repo_path: Path) -> None: + logger.info("Mock pipeline: Skipping workspace setup") + def setup(self, context: EvaluationContext[BaseDatasetEntry]) -> None: logger.info("Mock pipeline: Skipping setup") @@ -271,11 +274,11 @@ def evaluate(self, context: EvaluationContext[BaseDatasetEntry]) -> None: result: BaseEvaluationResult match scenario: case "success": - result = BaseEvaluationResult.create_success(context, "MOCK_PATCH_CONTENT") + result = ExecutionBasedEvaluationResult.create_success(context, "MOCK_PATCH_CONTENT") case "build-fail": - result = BaseEvaluationResult.create_build_failure(context, "MOCK_PATCH_CONTENT", "Mock build failure") + result = ExecutionBasedEvaluationResult.create_build_failure(context, "MOCK_PATCH_CONTENT", "Mock build failure") case "test-fail": - result = BaseEvaluationResult.create_test_failure(context, "MOCK_PATCH_CONTENT", "Mock test failure") + result = ExecutionBasedEvaluationResult.create_test_failure(context, "MOCK_PATCH_CONTENT", "Mock test failure") case _: raise ValueError("Invalid mock scenario, this should not happen") diff --git a/src/bcbench/commands/result.py b/src/bcbench/commands/result.py index 40cb6d375..178dd2749 100644 --- a/src/bcbench/commands/result.py +++ b/src/bcbench/commands/result.py @@ -12,11 +12,11 @@ from bcbench.results import ( BaseEvaluationResult, EvaluationResultSummary, + ExecutionBasedEvaluationResultSummary, Leaderboard, LeaderboardAggregate, create_console_summary, create_github_job_summary, - create_result_from_json, write_bceval_results, ) @@ -65,7 +65,7 @@ def result_summarize( for results_path in result_files: logger.info(f"Reading results from: {results_path}") with open(results_path) as f: - results.extend(create_result_from_json(json.loads(line)) for line in f if line.strip()) + results.extend(BaseEvaluationResult.from_json(json.loads(line)) for line in f if line.strip()) if not results: logger.error("No results found in the result files") @@ -73,13 +73,13 @@ def result_summarize( write_bceval_results(results, run_dir, run_id, bceval_output, category) + summary = EvaluationResultSummary.from_results(results, run_id=run_id) + if _config.env.github_actions: - create_github_job_summary(results) + create_github_job_summary(results, summary) else: - create_console_summary(results) + create_console_summary(results, summary) - # Save summary JSON - summary = EvaluationResultSummary.from_results(results, run_id=run_id) summary.save(run_dir, summary_output) @@ -90,8 +90,8 @@ def _get_combination_key(result: EvaluationResultSummary) -> tuple[str, str, str return (result.agent_name, result.model, exp_key, result.benchmark_version) -def _rebuild_aggregates(runs: list[EvaluationResultSummary]) -> list[LeaderboardAggregate]: - grouped: defaultdict[tuple[str, str, str | None, str], list[EvaluationResultSummary]] = defaultdict(list) +def _rebuild_aggregates(runs: list[ExecutionBasedEvaluationResultSummary]) -> list[LeaderboardAggregate]: + grouped: defaultdict[tuple[str, str, str | None, str], list[ExecutionBasedEvaluationResultSummary]] = defaultdict(list) for run in runs: grouped[_get_combination_key(run)].append(run) return [LeaderboardAggregate.from_runs(group) for group in grouped.values()] @@ -111,7 +111,7 @@ def result_update( """ logger.info(f"Loading evaluation summary from: {evaluation_summary}") with open(evaluation_summary, encoding="utf-8") as f: - new_result = EvaluationResultSummary.model_validate_json(f.read()) + new_result = ExecutionBasedEvaluationResultSummary.model_validate_json(f.read()) logger.info(f"Processing result for agent '{new_result.agent_name}' with model '{new_result.model}' in category '{new_result.category.value}'") @@ -120,13 +120,13 @@ def result_update( # Load existing leaderboard leaderboard: Leaderboard = Leaderboard.load(leaderboard_path) - runs: list[EvaluationResultSummary] = list(leaderboard.runs) + runs: list[ExecutionBasedEvaluationResultSummary] = list(leaderboard.runs) logger.info(f"Loaded {len(runs)} existing runs") # Find runs matching this combination new_result_key = _get_combination_key(new_result) - matching_runs: list[EvaluationResultSummary] = [r for r in runs if _get_combination_key(r) == new_result_key] - other_runs: list[EvaluationResultSummary] = [r for r in runs if _get_combination_key(r) != new_result_key] + matching_runs: list[ExecutionBasedEvaluationResultSummary] = [r for r in runs if _get_combination_key(r) == new_result_key] + other_runs: list[ExecutionBasedEvaluationResultSummary] = [r for r in runs if _get_combination_key(r) != new_result_key] if len(matching_runs) < n: logger.info(f"Adding run ({len(matching_runs) + 1}/{n}) for '{new_result.agent_name}' + '{new_result.model}'") @@ -137,7 +137,7 @@ def result_update( matching_runs = [*matching_runs[1:], new_result] # Combine and rebuild aggregates - all_runs: list[EvaluationResultSummary] = other_runs + matching_runs + all_runs: list[ExecutionBasedEvaluationResultSummary] = other_runs + matching_runs aggregates = _rebuild_aggregates(all_runs) # Write back @@ -171,7 +171,7 @@ def result_refresh( logger.info(f"Refreshing: {leaderboard_path.name}") leaderboard: Leaderboard = Leaderboard.load(leaderboard_path) - runs: list[EvaluationResultSummary] = list(leaderboard.runs) + runs: list[ExecutionBasedEvaluationResultSummary] = list(leaderboard.runs) if not runs: logger.warning(f"No runs found in {leaderboard_path.name}, skipping") diff --git a/src/bcbench/commands/run.py b/src/bcbench/commands/run.py index 38832a1b8..4b069155d 100644 --- a/src/bcbench/commands/run.py +++ b/src/bcbench/commands/run.py @@ -19,9 +19,7 @@ RepoPath, ) from bcbench.config import get_config -from bcbench.dataset.dataset_entry import _BugFixTestGenBase from bcbench.logger import get_logger -from bcbench.operations import setup_repo_postbuild, setup_repo_prebuild logger = get_logger(__name__) _config = get_config() @@ -46,9 +44,7 @@ def run_mini( uv run bcbench run mini microsoft__BCApps-5633 --step-limit 5 --category bug-fix """ entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0] - setup_repo_prebuild(entry, repo_path) - if isinstance(entry, _BugFixTestGenBase): - setup_repo_postbuild(entry, repo_path, category) + category.pipeline.setup_workspace(entry, repo_path) run_mini_agent( entry=entry, @@ -78,9 +74,7 @@ def run_copilot( uv run bcbench run copilot microsoft__BCApps-5633 --category bug-fix --repo-path /path/to/BCApps """ entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0] - setup_repo_prebuild(entry, repo_path) - if isinstance(entry, _BugFixTestGenBase): - setup_repo_postbuild(entry, repo_path, category) + category.pipeline.setup_workspace(entry, repo_path) run_copilot_agent(entry=entry, repo_path=repo_path, model=model, category=category, output_dir=output_dir, al_mcp=al_mcp, container_name=container_name) @@ -104,9 +98,7 @@ def run_claude( uv run bcbench run claude microsoft__BCApps-5633 --category bug-fix --repo-path /path/to/BCApps """ entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0] - setup_repo_prebuild(entry, repo_path) - if isinstance(entry, _BugFixTestGenBase): - setup_repo_postbuild(entry, repo_path, category) + category.pipeline.setup_workspace(entry, repo_path) run_claude_code(entry=entry, repo_path=repo_path, model=model, category=category, output_dir=output_dir, al_mcp=al_mcp, container_name=container_name) diff --git a/src/bcbench/dataset/dataset_entry.py b/src/bcbench/dataset/dataset_entry.py index 2d073a156..c2620caa0 100644 --- a/src/bcbench/dataset/dataset_entry.py +++ b/src/bcbench/dataset/dataset_entry.py @@ -37,12 +37,12 @@ class BaseDatasetEntry(BaseModel): metadata: EntryMetadata = Field(default_factory=EntryMetadata) - repo: str = Field(default="microsoftInternal/NAV", pattern=r"^[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+$") + repo: str = Field(default="microsoft/BCApps", pattern=r"^[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+$") instance_id: str = Field(pattern=_config.file_patterns.instance_pattern) base_commit: str = Field(pattern=r"^[a-fA-F0-9]{40}$") created_at: Annotated[str, Field(min_length=1)] environment_setup_version: str = Field(pattern=r"^[0-9]{2}\.[0-9]{1}$") - project_paths: Annotated[list[str], Field(min_length=2)] + project_paths: list[str] = [] patch: Annotated[str, Field(min_length=1)] @classmethod @@ -85,7 +85,7 @@ def save_to_file(self, filepath: Path | str) -> None: handle.write("\n") @abstractmethod - def get_task(self, transform_image_paths: bool = False) -> str: + def get_task(self) -> str: pass @abstractmethod @@ -116,15 +116,9 @@ class _BugFixTestGenBase(BaseDatasetEntry): def problem_statement_dir(self) -> Path: return _config.paths.problem_statement_dir / self.instance_id - def get_task(self, transform_image_paths: bool = False) -> str: + def get_task(self) -> str: readme_path = self.problem_statement_dir / _config.file_patterns.problem_statement_readme - content: str = readme_path.read_text(encoding="utf-8") - - if not transform_image_paths: - return content - - dest_dir = _config.file_patterns.problem_statement_dest_dir - return re.sub(r"!\[([^\]]*)\]\(\./([^)]+)\)", rf"![\1]({dest_dir}/\2)", content) + return readme_path.read_text(encoding="utf-8") @model_validator(mode="after") def validate_baseapp_patches_are_w1_only(self) -> Self: diff --git a/src/bcbench/evaluate/base.py b/src/bcbench/evaluate/base.py index 8c2dbb2d6..fd7850354 100644 --- a/src/bcbench/evaluate/base.py +++ b/src/bcbench/evaluate/base.py @@ -2,6 +2,7 @@ from abc import ABC, abstractmethod from collections.abc import Callable +from pathlib import Path from bcbench.config import get_config from bcbench.dataset import BaseDatasetEntry @@ -23,6 +24,14 @@ class EvaluationPipeline[E: BaseDatasetEntry](ABC): The execute() method provides a template orchestrating the overall evaluation flow. """ + @abstractmethod + def setup_workspace(self, entry: E, repo_path: Path) -> None: + """Prepare the workspace for agent execution (no build). + + Used by the `run` command to set up the repo without building. + """ + raise NotImplementedError() + @abstractmethod def setup(self, context: EvaluationContext[E]) -> None: """Setup environment: e.g. clean repo, checkout base commit, initial build. diff --git a/src/bcbench/evaluate/bugfix.py b/src/bcbench/evaluate/bugfix.py index 9df7eee67..b575407fe 100644 --- a/src/bcbench/evaluate/bugfix.py +++ b/src/bcbench/evaluate/bugfix.py @@ -1,4 +1,5 @@ from collections.abc import Callable +from pathlib import Path from bcbench.dataset import BugFixEntry from bcbench.evaluate.base import EvaluationPipeline @@ -9,8 +10,8 @@ build_and_publish_projects, categorize_projects, clean_project_paths, + copy_problem_statement_folder, run_tests, - setup_repo_postbuild, setup_repo_prebuild, stage_and_get_diff, ) @@ -25,6 +26,10 @@ class BugFixPipeline(EvaluationPipeline[BugFixEntry]): """Pipeline for bug-fix evaluation category.""" + def setup_workspace(self, entry: BugFixEntry, repo_path: Path) -> None: + setup_repo_prebuild(entry, repo_path) + copy_problem_statement_folder(entry, repo_path) + def setup(self, context: EvaluationContext[BugFixEntry]) -> None: setup_repo_prebuild(context.entry, context.repo_path) @@ -35,7 +40,7 @@ def setup(self, context: EvaluationContext[BugFixEntry]) -> None: context.entry.environment_setup_version, ) - setup_repo_postbuild(context.entry, context.repo_path, context.category) + copy_problem_statement_folder(context.entry, context.repo_path) def run_agent(self, context: EvaluationContext[BugFixEntry], agent_runner: Callable) -> None: with github_log_group(f"{context.agent_name} -- Entry: {context.entry.instance_id}"): diff --git a/src/bcbench/evaluate/testgeneration.py b/src/bcbench/evaluate/testgeneration.py index 11642dee1..f0e3848ee 100644 --- a/src/bcbench/evaluate/testgeneration.py +++ b/src/bcbench/evaluate/testgeneration.py @@ -1,6 +1,10 @@ from collections.abc import Callable +from pathlib import Path + +import yaml from bcbench.collection.patch_utils import extract_file_paths_from_patch +from bcbench.config import get_config from bcbench.dataset import TestEntry, TestGenEntry from bcbench.evaluate.base import EvaluationPipeline from bcbench.exceptions import BuildError, NoTestsExtractedError, TestExecutionError @@ -10,8 +14,8 @@ build_and_publish_projects, categorize_projects, clean_project_paths, + copy_problem_statement_folder, extract_tests_from_patch, - setup_repo_postbuild, setup_repo_prebuild, stage_and_get_diff, ) @@ -20,13 +24,44 @@ from bcbench.types import EvaluationContext logger = get_logger(__name__) +_config = get_config() + +__all__ = ["TestGenerationPipeline", "_get_test_generation_input_mode"] + -__all__ = ["TestGenerationPipeline"] +def _get_test_generation_input_mode() -> str: + config_file: Path = _config.paths.agent_share_dir / "config.yaml" + shared_config = yaml.safe_load(config_file.read_text()) + input_mode: str = shared_config.get("prompt", {}).get("test-generation-input", "problem-statement") + + valid_modes: set[str] = {"gold-patch", "problem-statement", "both"} + if input_mode not in valid_modes: + raise ValueError(f"Invalid test-generation-input mode: '{input_mode}'. Must be one of {valid_modes}. Note: Use hyphens, not underscores (e.g., 'gold-patch' not 'gold_patch')") + + return input_mode class TestGenerationPipeline(EvaluationPipeline[TestGenEntry]): """Pipeline for test-generation evaluation category.""" + def _apply_input_postbuild(self, entry: TestGenEntry, repo_path: Path) -> None: + input_mode = _get_test_generation_input_mode() + logger.info(f"Test generation input mode: {input_mode}") + match input_mode: + case "gold-patch": + apply_patch(repo_path, entry.patch, f"{entry.instance_id} gold patch") + case "both": + apply_patch(repo_path, entry.patch, f"{entry.instance_id} gold patch") + copy_problem_statement_folder(entry, repo_path) + case "problem-statement": + copy_problem_statement_folder(entry, repo_path) + case _: + raise ValueError(f"Unhandled test generation input mode: {input_mode}") + + def setup_workspace(self, entry: TestGenEntry, repo_path: Path) -> None: + setup_repo_prebuild(entry, repo_path) + self._apply_input_postbuild(entry, repo_path) + def setup(self, context: EvaluationContext[TestGenEntry]) -> None: setup_repo_prebuild(context.entry, context.repo_path) @@ -37,7 +72,7 @@ def setup(self, context: EvaluationContext[TestGenEntry]) -> None: context.entry.environment_setup_version, ) - setup_repo_postbuild(context.entry, context.repo_path, context.category) + self._apply_input_postbuild(context.entry, context.repo_path) def run_agent(self, context: EvaluationContext[TestGenEntry], agent_runner: Callable) -> None: with github_log_group(f"{context.agent_name} -- Entry: {context.entry.instance_id}"): diff --git a/src/bcbench/operations/__init__.py b/src/bcbench/operations/__init__.py index 05fd52171..45d7dcf5e 100644 --- a/src/bcbench/operations/__init__.py +++ b/src/bcbench/operations/__init__.py @@ -17,7 +17,7 @@ ) from bcbench.operations.instruction_operations import copy_problem_statement_folder, setup_custom_agent, setup_instructions_from_config from bcbench.operations.project_operations import categorize_projects -from bcbench.operations.setup_operations import setup_repo_postbuild, setup_repo_prebuild +from bcbench.operations.setup_operations import setup_repo_prebuild from bcbench.operations.skills_operations import setup_agent_skills from bcbench.operations.test_operations import extract_tests_from_patch @@ -38,7 +38,6 @@ "setup_agent_skills", "setup_custom_agent", "setup_instructions_from_config", - "setup_repo_postbuild", "setup_repo_prebuild", "stage_and_get_diff", ] diff --git a/src/bcbench/operations/setup_operations.py b/src/bcbench/operations/setup_operations.py index bc502ff9f..4682ed721 100644 --- a/src/bcbench/operations/setup_operations.py +++ b/src/bcbench/operations/setup_operations.py @@ -2,39 +2,15 @@ from pathlib import Path -import yaml - from bcbench.config import get_config -from bcbench.dataset.dataset_entry import BaseDatasetEntry, _BugFixTestGenBase +from bcbench.dataset.dataset_entry import BaseDatasetEntry from bcbench.logger import get_logger -from bcbench.operations.git_operations import apply_patch, checkout_commit, clean_repo -from bcbench.operations.instruction_operations import copy_problem_statement_folder -from bcbench.types import EvaluationCategory +from bcbench.operations.git_operations import checkout_commit, clean_repo logger = get_logger(__name__) _config = get_config() -__all__ = ["_get_test_generation_input_mode", "setup_repo_postbuild", "setup_repo_prebuild"] - - -def _get_test_generation_input_mode() -> str: - """Read test-generation input mode from shared agent config. - - Returns: - str: The validated input mode: "gold-patch", "problem-statement", or "both" - - Raises: - ValueError: If the input mode is not one of the valid values - """ - config_file: Path = _config.paths.agent_share_dir / "config.yaml" - shared_config = yaml.safe_load(config_file.read_text()) - input_mode: str = shared_config.get("prompt", {}).get("test-generation-input", "problem-statement") - - valid_modes: set[str] = {"gold-patch", "problem-statement", "both"} - if input_mode not in valid_modes: - raise ValueError(f"Invalid test-generation-input mode: '{input_mode}'. Must be one of {valid_modes}. Note: Use hyphens, not underscores (e.g., 'gold-patch' not 'gold_patch')") - - return input_mode +__all__ = ["setup_repo_prebuild"] def setup_repo_prebuild(entry: BaseDatasetEntry, repo_path: Path) -> None: @@ -42,36 +18,15 @@ def setup_repo_prebuild(entry: BaseDatasetEntry, repo_path: Path) -> None: This is the first phase of repo setup that should be called BEFORE build_and_publish_projects. It prepares a clean slate at the base commit without any patches or problem statements. + Skips for entries without a base_commit (e.g. categories that start from a blank project). Args: entry: Dataset entry with instance metadata repo_path: Path to the repository """ + if not entry.base_commit: + logger.info(f"Skipping prebuild setup for {entry.instance_id} (no base_commit)") + return + clean_repo(repo_path) checkout_commit(repo_path, entry.base_commit) - - -def setup_repo_postbuild(entry: _BugFixTestGenBase, repo_path: Path, category: EvaluationCategory) -> None: - """Setup repository after building for bug-fix and test-generation categories. - - This is the second phase of repo setup that should be called AFTER build_and_publish_projects. - For test-generation, this ensures the gold patch is applied only after the base code is built, - so the agent sees the fixed code but tests run against the unfixed published app. - - Note: Other categories should implement their own postbuild setup. - """ - if category == EvaluationCategory.TEST_GENERATION: - input_mode: str = _get_test_generation_input_mode() - logger.info(f"Test generation input mode: {input_mode}") - match input_mode: - case "gold-patch": - apply_patch(repo_path, entry.patch, f"{entry.instance_id} gold patch") - case "both": - apply_patch(repo_path, entry.patch, f"{entry.instance_id} gold patch") - copy_problem_statement_folder(entry, repo_path) - case "problem-statement": - copy_problem_statement_folder(entry, repo_path) - case _: - raise ValueError(f"Unhandled test generation input mode: {input_mode}") - else: - copy_problem_statement_folder(entry, repo_path) diff --git a/src/bcbench/results/__init__.py b/src/bcbench/results/__init__.py index f769c58af..162f2f678 100644 --- a/src/bcbench/results/__init__.py +++ b/src/bcbench/results/__init__.py @@ -1,23 +1,25 @@ -from bcbench.results.base import create_result_from_json +from bcbench.results.base import ExecutionBasedEvaluationResult from bcbench.results.bceval_export import write_bceval_results from bcbench.results.display import create_console_summary, create_github_job_summary -from bcbench.results.evaluation_result import ( +from bcbench.results.metrics import bootstrap_ci, pass_at_k, pass_hat_k +from bcbench.results.summary import ( BaseEvaluationResult, EvaluationResultSummary, + ExecutionBasedEvaluationResultSummary, Leaderboard, LeaderboardAggregate, ) -from bcbench.results.metrics import bootstrap_ci, pass_at_k, pass_hat_k __all__ = [ "BaseEvaluationResult", "EvaluationResultSummary", + "ExecutionBasedEvaluationResult", + "ExecutionBasedEvaluationResultSummary", "Leaderboard", "LeaderboardAggregate", "bootstrap_ci", "create_console_summary", "create_github_job_summary", - "create_result_from_json", "pass_at_k", "pass_hat_k", "write_bceval_results", diff --git a/src/bcbench/results/base.py b/src/bcbench/results/base.py index fffe9beba..495eaaf27 100644 --- a/src/bcbench/results/base.py +++ b/src/bcbench/results/base.py @@ -16,16 +16,14 @@ class BaseEvaluationResult(BaseModel): """Base class for all evaluation results with shared metrics across categories.""" instance_id: str - project: str # TODO: move to category-specific subclasses? + project: str model: str agent_name: str category: EvaluationCategory - resolved: bool - build: bool timeout: bool = False - generated_patch: str = "" + output: str = "" error_message: str | None = None metrics: AgentMetrics | None = None @@ -35,26 +33,10 @@ class BaseEvaluationResult(BaseModel): def _create_from_context( cls, context: "EvaluationContext", - resolved: bool, - build: bool, error_message: str | None = None, - generated_patch: str = "", + output: str = "", **kwargs: Any, ) -> Self: - """Create result from EvaluationContext with validation and metric extraction. - - Args: - context: Evaluation context with configuration - resolved: Whether the evaluation was successful - build: Whether the build succeeded - error_message: Optional error message if evaluation failed - generated_patch: The generated patch content - **kwargs: Additional category-specific fields - - Returns: - Result instance (base or category-specific subclass) - """ - # Warn about missing metrics if they are not present if not context.metrics: logger.warning(f"Creating result for {context.entry.instance_id} with no agent metrics - performance data will be unavailable") elif missing_metrics := [name for name in AgentMetrics.model_fields if getattr(context.metrics, name) is None]: @@ -64,64 +46,86 @@ def _create_from_context( return cls( instance_id=context.entry.instance_id, project=project, - resolved=resolved, - build=build, model=context.model.replace(".", "-"), category=context.category, agent_name=context.agent_name, - generated_patch=generated_patch, + output=output, error_message=error_message, metrics=context.metrics, experiment=context.experiment, **kwargs, ) - @classmethod - def create_success(cls, context: "EvaluationContext", generated_patch: str, **kwargs: Any) -> Self: - return cls._create_from_context(context, resolved=True, build=True, generated_patch=generated_patch, **kwargs) - - @classmethod - def create_build_failure(cls, context: "EvaluationContext", generated_patch: str, error_msg: str, **kwargs: Any) -> Self: - return cls._create_from_context(context, resolved=False, build=False, error_message=error_msg, generated_patch=generated_patch, **kwargs) - - @classmethod - def create_test_failure(cls, context: "EvaluationContext", generated_patch: str, error_msg: str = "Tests failed", **kwargs: Any) -> Self: - return cls._create_from_context(context, resolved=False, build=True, error_message=error_msg, generated_patch=generated_patch, **kwargs) - @classmethod def create_agent_timeout_failure(cls, context: "EvaluationContext", **kwargs: Any) -> Self: - return cls._create_from_context(context, resolved=False, build=False, timeout=True, error_message="Agent timed out", **kwargs) + return cls._create_from_context(context, timeout=True, error_message="Agent timed out", **kwargs) def save(self, output_dir: Path, result_file: str) -> None: output_file = output_dir / result_file with open(output_file, "a", encoding="utf-8") as f: result_dict = self.model_dump(mode="json") - result_dict["category"] = self.category.value # Per-instance JSONL result files are uploaded as workflow artifacts and are the only inputs required by the summarize-results workflow. f.write(json.dumps(result_dict) + "\n") logger.info(f"Saved evaluation result for {self.instance_id} to {output_file}") + @property + def status_label(self) -> str: + """Short human-readable label for the result status shown in tables (e.g. 'Completed', 'Timeout').""" + if self.timeout: + return "Timeout" + if self.error_message: + return "Error" + return "Completed" + + @property + def category_metrics(self) -> dict[str, int | float | bool]: + """Category-specific metrics included in bceval export metadata. + + Keys become metadata fields; values must be JSON-serializable scalars. + Subclasses override to add metrics like 'resolved', 'build', etc. + """ + return {} -def create_result_from_json(payload: dict[str, Any]) -> BaseEvaluationResult: - """Create appropriate result instance from JSON payload based on category. + @property + def display_row(self) -> dict[str, str]: + """Extra columns for per-instance detail tables. - Args: - payload: Dictionary containing result data + Keys are column headers; values are the cell text for this result. + Subclasses override to surface category-specific per-instance info. + """ + return {} - Returns: - BugFixResult or TestGenerationResult instance based on category - """ - # Import here to avoid circular dependencies - from bcbench.results.bugfix import BugFixResult - from bcbench.results.testgeneration import TestGenerationResult + @classmethod + def from_json(cls, payload: dict[str, Any]) -> "BaseEvaluationResult": + category = EvaluationCategory(payload["category"]) + return category.result_class.model_validate(payload) + + +class ExecutionBasedEvaluationResult(BaseEvaluationResult): + """Result for categories that involve building/compiling AL code and have binary pass/fail outcomes.""" + + resolved: bool = False + build: bool = False - category = EvaluationCategory(payload["category"]) + @classmethod + def create_success(cls, context: "EvaluationContext", output: str, **kwargs: Any) -> Self: + return cls._create_from_context(context, output=output, resolved=True, build=True, **kwargs) + + @classmethod + def create_build_failure(cls, context: "EvaluationContext", output: str, error_msg: str, **kwargs: Any) -> Self: + return cls._create_from_context(context, output=output, error_message=error_msg, resolved=False, build=False, **kwargs) - match category: - case EvaluationCategory.BUG_FIX: - return BugFixResult.model_validate(payload) - case EvaluationCategory.TEST_GENERATION: - return TestGenerationResult.model_validate(payload) - case _: - raise ValueError(f"Unknown evaluation category: {category}") + @classmethod + def create_test_failure(cls, context: "EvaluationContext", output: str, error_msg: str = "Tests failed", **kwargs: Any) -> Self: + return cls._create_from_context(context, output=output, error_message=error_msg, resolved=False, build=True, **kwargs) + + @property + def status_label(self) -> str: + if self.timeout: + return "Timeout" + return "Success" if self.resolved else "Failed" + + @property + def category_metrics(self) -> dict[str, int | float | bool]: + return {"resolved": self.resolved, "build": self.build} diff --git a/src/bcbench/results/bceval_export.py b/src/bcbench/results/bceval_export.py index eadb79fb0..ad28f4727 100644 --- a/src/bcbench/results/bceval_export.py +++ b/src/bcbench/results/bceval_export.py @@ -9,7 +9,6 @@ from bcbench.dataset import BaseDatasetEntry from bcbench.logger import get_logger from bcbench.results.base import BaseEvaluationResult -from bcbench.results.testgeneration import TestGenerationResult from bcbench.types import EvaluationCategory logger = get_logger(__name__) @@ -39,23 +38,18 @@ def write_bceval_results(results: list[BaseEvaluationResult], out_dir: Path, run "llm_duration": (result.metrics.llm_duration if result.metrics else None) or 0, "latency": (result.metrics.execution_time if result.metrics else None) or 0, "turn_count": (result.metrics.turn_count if result.metrics else None) or 0, - "resolved": result.resolved, - "build": result.build, + **result.category_metrics, "run_id": run_id, "project": result.project, "error_message": result.error_message, "tool_usage": (result.metrics.tool_usage if result.metrics and result.metrics.tool_usage else None) or 0, } - if isinstance(result, TestGenerationResult): - metadata["pre_patch_failed"] = result.pre_patch_failed - metadata["post_patch_passed"] = result.post_patch_passed - bceval_result = { "id": result.instance_id, "input": input, "expected": expected, - "output": result.generated_patch, + "output": result.output, "context": "", "metadata": metadata, "tags": [], diff --git a/src/bcbench/results/bugfix.py b/src/bcbench/results/bugfix.py index 505de5236..bc55dbe82 100644 --- a/src/bcbench/results/bugfix.py +++ b/src/bcbench/results/bugfix.py @@ -1,9 +1,5 @@ -from bcbench.results.base import BaseEvaluationResult +from bcbench.results.base import ExecutionBasedEvaluationResult -class BugFixResult(BaseEvaluationResult): - """Result class for bug-fix evaluation category. - - Inherits all shared metrics from BaseEvaluationResult. - Category-specific fields can be added here as needed. - """ +class BugFixResult(ExecutionBasedEvaluationResult): + """Result class for bug-fix evaluation category.""" diff --git a/src/bcbench/results/display.py b/src/bcbench/results/display.py index bec719bc7..f0354f3a2 100644 --- a/src/bcbench/results/display.py +++ b/src/bcbench/results/display.py @@ -1,30 +1,31 @@ +from collections.abc import Sequence + from rich.console import Console from rich.table import Table from bcbench.config import get_config from bcbench.logger import get_logger from bcbench.results.base import BaseEvaluationResult -from bcbench.results.evaluation_result import _calculate_average_tool_usage +from bcbench.results.summary import EvaluationResultSummary, calculate_average_tool_usage logger = get_logger(__name__) console = Console() -def create_console_summary(results: list[BaseEvaluationResult]) -> None: +def create_console_summary(results: Sequence[BaseEvaluationResult], summary: EvaluationResultSummary) -> None: total = len(results) - resolved = sum(r.resolved for r in results) - failed = total - resolved + display_metrics: dict[str, int | float | bool] = summary.display_summary() console.print("\n[bold cyan]Evaluation Results Summary[/bold cyan]") console.print(f"Total Processed: [bold]{total}[/bold], using [bold]{results[0].agent_name}({results[0].model})[/bold]") console.print(f"Category: [bold]{results[0].category.value}[/bold]") - console.print(f"Resolved: [bold green]{resolved}[/bold green]") - console.print(f"Failed: [bold red]{failed}[/bold red]") + for key, value in display_metrics.items(): + console.print(f"{key.replace('_', ' ').title()}: [bold]{value}[/bold]") # Display average tool usage if available tool_usages = [r.metrics.tool_usage for r in results if r.metrics and r.metrics.tool_usage is not None] if tool_usages: - avg_usage = _calculate_average_tool_usage(tool_usages) + avg_usage = calculate_average_tool_usage(tool_usages) if avg_usage: console.print("\n[bold cyan]Average Tool Usage[/bold cyan]") sorted_tools = sorted(avg_usage.items(), key=lambda x: x[1], reverse=True) @@ -35,6 +36,12 @@ def create_console_summary(results: list[BaseEvaluationResult]) -> None: table.add_column("Instance ID", style="cyan", no_wrap=True) table.add_column("Project", style="magenta", no_wrap=True) table.add_column("Status", justify="center") + + # Dynamic columns from display_row() + extra_columns = list(results[0].display_row.keys()) if results else [] + for col_name in extra_columns: + table.add_column(col_name, style="yellow") + table.add_column("MCP Servers", style="yellow") table.add_column("Custom Instructions", style="yellow") table.add_column("Skills", style="yellow") @@ -42,12 +49,14 @@ def create_console_summary(results: list[BaseEvaluationResult]) -> None: table.add_column("Error Message", style="dim") for result in results: - status = "[green]Success[/green]" if result.resolved else "[red]Failed[/red]" + has_error = result.error_message is not None or result.timeout + status = f"[red]{result.status_label}[/red]" if has_error else f"[green]{result.status_label}[/green]" mcp_servers = ", ".join(result.experiment.mcp_servers) if result.experiment and result.experiment.mcp_servers else "N/A" custom_instructions = "Yes" if result.experiment and result.experiment.custom_instructions else "No" skills = "Yes" if result.experiment and result.experiment.skills_enabled else "No" custom_agent = result.experiment.custom_agent if result.experiment and result.experiment.custom_agent else "N/A" - table.add_row(result.instance_id, result.project, status, mcp_servers, custom_instructions, skills, custom_agent, result.error_message or "") + extra_values = list(result.display_row.values()) + table.add_row(result.instance_id, result.project, status, *extra_values, mcp_servers, custom_instructions, skills, custom_agent, result.error_message or "") console.print(table) console.print() @@ -61,12 +70,12 @@ def _get_short_error_message(error_message: str | None) -> str: return first_line.replace("|", "\\|") -def create_github_job_summary(results: list[BaseEvaluationResult]) -> None: +def create_github_job_summary(results: Sequence[BaseEvaluationResult], summary: EvaluationResultSummary) -> None: total = len(results) - resolved = sum(r.resolved for r in results) - failed = total - resolved + display_metrics: dict[str, int | float | bool] = summary.display_summary() + errors = sum(1 for r in results if r.error_message or r.timeout) - success_icon = ":white_check_mark:" if failed == 0 else ":x:" + success_icon = ":white_check_mark:" if errors == 0 else ":x:" mcp_servers = ", ".join(results[0].experiment.mcp_servers) if results[0].experiment and results[0].experiment.mcp_servers else "None" custom_instructions = "Yes" if results[0].experiment and results[0].experiment.custom_instructions else "No" @@ -77,31 +86,50 @@ def create_github_job_summary(results: list[BaseEvaluationResult]) -> None: tool_usage_section = "" tool_usages = [r.metrics.tool_usage for r in results if r.metrics and r.metrics.tool_usage is not None] if tool_usages: - avg_usage = _calculate_average_tool_usage(tool_usages) + avg_usage = calculate_average_tool_usage(tool_usages) if avg_usage: sorted_tools = sorted(avg_usage.items(), key=lambda x: x[1], reverse=True) tool_lines = [f" - `{tool}`: {count}" for tool, count in sorted_tools] tool_usage_section = "\n\n## Average Tool Usage\n" + "\n".join(tool_lines) + # Build category-specific summary lines + display_lines = "\n".join(f"- {key.replace('_', ' ').title()}: {value}" for key, value in display_metrics.items()) + markdown_summary = f"""Total entries processed: {total}, using **{results[0].agent_name} ({results[0].model})** - Category: `{results[0].category.value}` - MCP Servers used: {mcp_servers} - Custom Instructions: {custom_instructions} - Skills: {skills} - Custom Agent: {custom_agent} -- Successful evaluations: {resolved} :white_check_mark: -- Failed evaluations: {failed} {success_icon}{tool_usage_section} +{display_lines} +- Errors: {errors} {success_icon}{tool_usage_section} ## Detailed Results -| Instance ID | Project | Status | Error Message | -|-------------|---------|--------|---------------| """ + + # Dynamic columns from display_row() + extra_columns = list(results[0].display_row.keys()) if results else [] + extra_headers = " | ".join(extra_columns) + extra_separator = " | ".join("------" for _ in extra_columns) + + if extra_columns: + markdown_summary += f"| Instance ID | Project | Status | {extra_headers} | Error Message |\n" + markdown_summary += f"|-------------|---------|--------|{extra_separator}|---------------|\n" + else: + markdown_summary += "| Instance ID | Project | Status | Error Message |\n" + markdown_summary += "|-------------|---------|--------|---------------|\n" + for result in results: - status_icon = ":white_check_mark:" if result.resolved else ":x:" - status_text = f"{status_icon} {'Success' if result.resolved else 'Failed'}" + has_error = result.error_message is not None or result.timeout + status_icon = ":x:" if has_error else ":white_check_mark:" + status_text = f"{status_icon} {result.status_label}" error_msg = _get_short_error_message(result.error_message) - markdown_summary += f"| `{result.instance_id}` | `{result.project}` | {status_text} | {error_msg} |\n" + extra_values = " | ".join(result.display_row.values()) + if extra_columns: + markdown_summary += f"| `{result.instance_id}` | `{result.project}` | {status_text} | {extra_values} | {error_msg} |\n" + else: + markdown_summary += f"| `{result.instance_id}` | `{result.project}` | {status_text} | {error_msg} |\n" _write_github_step_summary(markdown_summary) diff --git a/src/bcbench/results/evaluation_result.py b/src/bcbench/results/summary.py similarity index 59% rename from src/bcbench/results/evaluation_result.py rename to src/bcbench/results/summary.py index 79aae3e39..858b19acd 100644 --- a/src/bcbench/results/evaluation_result.py +++ b/src/bcbench/results/summary.py @@ -1,11 +1,13 @@ import json import tomllib +from abc import ABC, abstractmethod from collections import Counter +from collections.abc import Sequence from datetime import date from pathlib import Path -from typing import Any, Sequence +from typing import Any -from pydantic import BaseModel +from pydantic import BaseModel, Field from bcbench.logger import get_logger from bcbench.results.base import BaseEvaluationResult @@ -28,12 +30,14 @@ def _get_benchmark_version() -> str: return tomllib.load(f).get("project", {}).get("version", "unknown") -class EvaluationResultSummary(BaseModel): +class EvaluationResultSummary(BaseModel, ABC): + """Base summary for a single evaluation run across all instances. + + Contains agent metrics common to every category (tokens, duration, tool usage). + Category-specific metrics (resolved, build, etc.) live on subclasses. + """ + total: int - resolved: int - failed: int - build: int - percentage: float date: date @@ -50,39 +54,38 @@ class EvaluationResultSummary(BaseModel): github_run_id: str | None = None experiment: ExperimentConfiguration | None = None - # Per-instance results for aggregate metrics calculation: instance_id -> resolved - instance_results: dict[str, bool] | None = None - - # Benchmark version from pyproject.toml at evaluation time benchmark_version: str + @abstractmethod + def display_summary(self) -> dict[str, int | float]: + """Return category-specific metrics for console/GitHub summary display. + + Subclasses must override. Keys become display labels (underscores replaced + with spaces and title-cased). Values are shown as-is. + """ + @classmethod def from_results(cls, results: Sequence[BaseEvaluationResult], run_id: str) -> "EvaluationResultSummary": - total = len(results) - resolved = sum(r.resolved for r in results) + """Create a summary from a list of per-instance results. + + When called on the base class, dispatches to the correct subclass. + Subclasses override, call super().from_results(), and extend via model_copy(). + """ + if cls is EvaluationResultSummary: + summary_cls = results[0].category.summary_class + return summary_cls.from_results(results, run_id) durations = [r.metrics.execution_time for r in results if r.metrics and r.metrics.execution_time is not None] prompt_tokens = [r.metrics.prompt_tokens for r in results if r.metrics and r.metrics.prompt_tokens is not None] completion_tokens = [r.metrics.completion_tokens for r in results if r.metrics and r.metrics.completion_tokens is not None] llm_durations = [r.metrics.llm_duration for r in results if r.metrics and r.metrics.llm_duration is not None] - - # Calculate average tool usage across all results tool_usages = [r.metrics.tool_usage for r in results if r.metrics and r.metrics.tool_usage is not None] - average_tool_usage = _calculate_average_tool_usage(tool_usages) if tool_usages else None - # Extract experiment configuration from first result (all should be same in a run) first_result = results[0] experiment = first_result.experiment if first_result.experiment and not first_result.experiment.is_empty() else None - # Create per-instance results for aggregate metrics calculation - instance_results = {r.instance_id: r.resolved for r in results} - return cls( - total=total, - resolved=resolved, - percentage=round(resolved / total * 100, 1), - failed=total - resolved, - build=sum(r.build for r in results), + total=len(results), date=date.today(), category=first_result.category, model=first_result.model, @@ -91,16 +94,19 @@ def from_results(cls, results: Sequence[BaseEvaluationResult], run_id: str) -> " average_prompt_tokens=sum(prompt_tokens) / len(prompt_tokens) if prompt_tokens else 0.0, average_completion_tokens=sum(completion_tokens) / len(completion_tokens) if completion_tokens else 0.0, average_llm_duration=sum(llm_durations) / len(llm_durations) if llm_durations else 0.0, - average_tool_usage=average_tool_usage, + average_tool_usage=calculate_average_tool_usage(tool_usages) if tool_usages else None, github_run_id=run_id, experiment=experiment, - instance_results=instance_results, benchmark_version=_get_benchmark_version(), ) + @classmethod + def from_json(cls, payload: dict[str, Any]) -> "EvaluationResultSummary": + category = EvaluationCategory(payload["category"]) + return category.summary_class.model_validate(payload) + def to_dict(self) -> dict[str, Any]: data = self.model_dump(mode="json") - # Round numeric values for readability data["average_duration"] = round(data["average_duration"], 1) data["average_prompt_tokens"] = round(data["average_prompt_tokens"], 1) data["average_completion_tokens"] = round(data["average_completion_tokens"], 1) @@ -115,15 +121,65 @@ def save(self, output_dir: Path, summary_file: str) -> None: logger.info(f"Saved evaluation summary to {output_file}") +class ExecutionBasedEvaluationResultSummary(EvaluationResultSummary): + """Summary for categories with binary pass/fail outcomes (bug-fix, test-generation). + + Fields match the original flat layout in the leaderboard JSON files. + """ + + resolved: int = 0 + failed: int = 0 + build: int = 0 + percentage: float = 0.0 + + # Per-instance pass/fail for aggregate metrics (pass^k, CI) + instance_results: dict[str, bool] = Field(default_factory=dict) + + def display_summary(self) -> dict[str, int | float]: + return { + "resolved": self.resolved, + "failed": self.failed, + "build": self.build, + "percentage": self.percentage, + } + + @classmethod + def from_results(cls, results: Sequence[BaseEvaluationResult], run_id: str) -> "ExecutionBasedEvaluationResultSummary": + from bcbench.results.base import ExecutionBasedEvaluationResult + + summary = super().from_results(results, run_id) + assert isinstance(summary, ExecutionBasedEvaluationResultSummary) + total = summary.total + + resolved = sum(1 for r in results if isinstance(r, ExecutionBasedEvaluationResult) and r.resolved) + build = sum(1 for r in results if isinstance(r, ExecutionBasedEvaluationResult) and r.build) + instance_results = {r.instance_id: (isinstance(r, ExecutionBasedEvaluationResult) and r.resolved) for r in results} + + return summary.model_copy( + update={ + "resolved": resolved, + "failed": total - resolved, + "build": build, + "percentage": round(resolved / total * 100, 1) if total else 0.0, + "instance_results": instance_results, + } + ) + + +# --------------------------------------------------------------------------- +# Leaderboard aggregation (execution-based categories only) +# --------------------------------------------------------------------------- + + class LeaderboardAggregate(BaseModel): + """Aggregate metrics across multiple runs. Execution-based categories only for now.""" + model: str agent_name: str category: EvaluationCategory experiment: ExperimentConfiguration | None = None - # Total instances in benchmark total: int - # Number of runs aggregated num_runs: int average: float | None = None @@ -131,68 +187,43 @@ class LeaderboardAggregate(BaseModel): ci_high: float | None = None pass_hat_5: float | None = None - # Averaged metrics across runs average_duration: float | None = None - # Benchmark version(s) from aggregated runs benchmark_version: str @classmethod - def from_runs(cls, runs: Sequence[EvaluationResultSummary]) -> "LeaderboardAggregate": + def from_runs(cls, runs: Sequence[ExecutionBasedEvaluationResultSummary]) -> "LeaderboardAggregate": if not runs: raise ValueError("Cannot create aggregate from empty runs list") - first_run: EvaluationResultSummary = runs[0] - total: int = first_run.total - num_runs: int = len(runs) - - # All runs should have the same benchmark_version (enforced by _get_combination_key grouping) - benchmark_version: str = first_run.benchmark_version + first_run = runs[0] + total = first_run.total + num_runs = len(runs) + benchmark_version = first_run.benchmark_version - # Warn if runs have different instance counts unique_totals = {r.total for r in runs} if len(unique_totals) > 1: logger.warning(f"Aggregating runs with different instance counts for '{first_run.agent_name}' + '{first_run.model}': {sorted(unique_totals)}. pass^k metrics may be misleading.") # Average duration across runs - durations: list[float] = [r.average_duration for r in runs if r.average_duration] - average_duration: float | None = sum(durations) / len(durations) if durations else None - - # Legacy single run without instance_results: use simple pass rate - if num_runs == 1 and not first_run.instance_results: - pass_rate = first_run.resolved / first_run.total if first_run.total > 0 else 0.0 - return cls( - model=first_run.model, - agent_name=first_run.agent_name, - category=first_run.category, - experiment=first_run.experiment, - total=total, - num_runs=num_runs, - average=round(pass_rate, 3), - ci_low=None, - ci_high=None, - pass_hat_5=None, - average_duration=round(average_duration, 1) if average_duration else None, - benchmark_version=benchmark_version, - ) + durations = [r.average_duration for r in runs if r.average_duration] + average_duration = sum(durations) / len(durations) if durations else None # Collect per-instance results across runs for pass^5 instance_resolved: dict[str, list[bool]] = {} for run in runs: - if run.instance_results: - for instance_id, resolved in run.instance_results.items(): - if instance_id not in instance_resolved: - instance_resolved[instance_id] = [] - instance_resolved[instance_id].append(resolved) - - # Calculate per-run pass rates for average and CI - per_run_rates = [run.resolved / run.total for run in runs if run.total > 0] + for instance_id, outcome in run.instance_results.items(): + if instance_id not in instance_resolved: + instance_resolved[instance_id] = [] + instance_resolved[instance_id].append(bool(outcome)) + + # Per-run scores for average and CI + per_run_rates = [run.percentage / 100.0 for run in runs] avg = round(sum(per_run_rates) / len(per_run_rates), 3) if per_run_rates else None ci_result = bootstrap_ci(per_run_rates) ci_low = round(ci_result["ci_low"], 3) if ci_result["ci_low"] is not None else None ci_high = round(ci_result["ci_high"], 3) if ci_result["ci_high"] is not None else None - # Calculate pass^5 pass_hat_5_val = _calculate_pass_hat_k(instance_resolved, 5, num_runs) if num_runs >= 5 else None return cls( @@ -212,7 +243,13 @@ def from_runs(cls, runs: Sequence[EvaluationResultSummary]) -> "LeaderboardAggre class Leaderboard(BaseModel): - runs: list[EvaluationResultSummary] + """Leaderboard for execution-based categories only. + + Non-execution-based categories (e.g. code-review) will need a different + leaderboard model once they are introduced. + """ + + runs: list[ExecutionBasedEvaluationResultSummary] aggregate: list[LeaderboardAggregate] @classmethod @@ -221,7 +258,6 @@ def load(cls, path: Path) -> "Leaderboard": return cls(runs=[], aggregate=[]) with open(path, encoding="utf-8") as f: data = json.load(f) - # Handle empty arrays or invalid structures if not data or not isinstance(data, dict): return cls(runs=[], aggregate=[]) return cls.model_validate(data) @@ -233,6 +269,11 @@ def to_dict(self) -> dict[str, Any]: } +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + def _calculate_pass_hat_k(instance_resolved: dict[str, list[bool]], k: int, num_trials: int) -> float: if num_trials < k: return 0.0 @@ -245,16 +286,10 @@ def _calculate_pass_hat_k(instance_resolved: dict[str, list[bool]], k: int, num_ return round(total_pass_hat_k / len(instance_resolved), 3) -def _calculate_average_tool_usage(tool_usages: list[dict[str, int]]) -> dict[str, float]: - """Calculate average tool usage across multiple results. - - Sums up all tool counts and divides by the number of results to get average. - """ +def calculate_average_tool_usage(tool_usages: list[dict[str, int]]) -> dict[str, float]: if not tool_usages: return {} aggregated = sum((Counter(usage) for usage in tool_usages), Counter()) - - # Calculate average (rounded to 2 decimal places) num_results = len(tool_usages) return {tool: round(count / num_results, 2) for tool, count in aggregated.items()} diff --git a/src/bcbench/results/testgeneration.py b/src/bcbench/results/testgeneration.py index fb80d0e05..0393f84ae 100644 --- a/src/bcbench/results/testgeneration.py +++ b/src/bcbench/results/testgeneration.py @@ -1,19 +1,26 @@ from typing import Self -from bcbench.results.base import BaseEvaluationResult +from bcbench.results.base import ExecutionBasedEvaluationResult from bcbench.types import EvaluationContext -class TestGenerationResult(BaseEvaluationResult): - """Result class for test-generation evaluation category. - - Inherits all shared metrics from BaseEvaluationResult. - Tracks whether generated tests failed before patch and passed after patch. - """ +class TestGenerationResult(ExecutionBasedEvaluationResult): + """Result class for test-generation evaluation category.""" pre_patch_failed: bool = False post_patch_passed: bool = False + @property + def category_metrics(self) -> dict[str, int | float | bool]: + return {**super().category_metrics, "pre_patch_failed": self.pre_patch_failed, "post_patch_passed": self.post_patch_passed} + + @property + def display_row(self) -> dict[str, str]: + return { + "Pre-Patch Failed": "Yes" if self.pre_patch_failed else "No", + "Post-Patch Passed": "Yes" if self.post_patch_passed else "No", + } + @classmethod - def create_no_tests_extracted(cls, context: "EvaluationContext", generated_patch: str, error_message: str) -> Self: - return cls._create_from_context(context, resolved=False, build=False, generated_patch=generated_patch, error_message=error_message) + def create_no_tests_extracted(cls, context: "EvaluationContext", output: str, error_message: str) -> Self: + return cls._create_from_context(context, resolved=False, build=False, output=output, error_message=error_message) diff --git a/src/bcbench/types.py b/src/bcbench/types.py index c2177a8a6..80731d2e7 100644 --- a/src/bcbench/types.py +++ b/src/bcbench/types.py @@ -14,6 +14,8 @@ if TYPE_CHECKING: from bcbench.dataset import BaseDatasetEntry from bcbench.evaluate.base import EvaluationPipeline + from bcbench.results.base import BaseEvaluationResult + from bcbench.results.summary import EvaluationResultSummary __all__ = ["AgentMetrics", "AgentType", "ContainerConfig", "EvaluationCategory", "EvaluationContext", "ExperimentConfiguration"] @@ -126,6 +128,32 @@ def entry_class(self) -> type[BaseDatasetEntry]: raise ValueError(f"Unknown evaluation category: {self}") + @property + def result_class(self) -> type[BaseEvaluationResult]: + from bcbench.results.bugfix import BugFixResult + from bcbench.results.testgeneration import TestGenerationResult + + match self: + case EvaluationCategory.BUG_FIX: + return BugFixResult + case EvaluationCategory.TEST_GENERATION: + return TestGenerationResult + + raise ValueError(f"Unknown evaluation category: {self}") + + @property + def summary_class(self) -> type[EvaluationResultSummary]: + """Returns the EvaluationResultSummary subclass for this category.""" + from bcbench.results.summary import ExecutionBasedEvaluationResultSummary + + match self: + case EvaluationCategory.BUG_FIX: + return ExecutionBasedEvaluationResultSummary + case EvaluationCategory.TEST_GENERATION: + return ExecutionBasedEvaluationResultSummary + + raise ValueError(f"Unknown evaluation category: {self}") + @property def pipeline(self) -> EvaluationPipeline: from bcbench.evaluate import BugFixPipeline, TestGenerationPipeline diff --git a/tests/conftest.py b/tests/conftest.py index 6c6ab4922..ce2dfaf34 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -103,7 +103,7 @@ def create_bugfix_result( agent_name: str = "copilot-cli", resolved: bool = True, build: bool = True, - generated_patch: str = "diff --git a/test.al b/test.al\n+fixed", + output: str = "diff --git a/test.al b/test.al\n+fixed", error_message: str | None = None, metrics: AgentMetrics | None = None, ) -> BugFixResult: @@ -115,7 +115,7 @@ def create_bugfix_result( category=EvaluationCategory.BUG_FIX, resolved=resolved, build=build, - generated_patch=generated_patch, + output=output, error_message=error_message, metrics=metrics, ) @@ -128,7 +128,7 @@ def create_testgen_result( agent_name: str = "copilot-cli", resolved: bool = False, build: bool = True, - generated_patch: str = "diff --git a/test.al b/test.al\n+test", + output: str = "diff --git a/test.al b/test.al\n+test", error_message: str | None = None, metrics: AgentMetrics | None = None, pre_patch_failed: bool = False, @@ -142,7 +142,7 @@ def create_testgen_result( category=EvaluationCategory.TEST_GENERATION, resolved=resolved, build=build, - generated_patch=generated_patch, + output=output, error_message=error_message, metrics=metrics, pre_patch_failed=pre_patch_failed, diff --git a/tests/test_cli_commands.py b/tests/test_cli_commands.py index 2290cff5c..12f678ecf 100644 --- a/tests/test_cli_commands.py +++ b/tests/test_cli_commands.py @@ -342,6 +342,7 @@ def sample_leaderboard_and_summary(tmp_path): "failed": 4, "build": 9, "percentage": 60.0, + "instance_results": copilot_instance_results, "date": "2025-01-10", "model": "gpt-4o", "category": "bug-fix", @@ -356,7 +357,6 @@ def sample_leaderboard_and_summary(tmp_path): "custom_instructions": True, "custom_agent": None, }, - "instance_results": copilot_instance_results, "benchmark_version": "0.1.0", }, { @@ -365,6 +365,7 @@ def sample_leaderboard_and_summary(tmp_path): "failed": 3, "build": 10, "percentage": 70.0, + "instance_results": mini_instance_results, "date": "2025-01-12", "model": "gpt-4o", "category": "bug-fix", @@ -379,7 +380,6 @@ def sample_leaderboard_and_summary(tmp_path): "custom_instructions": False, "custom_agent": None, }, - "instance_results": mini_instance_results, "benchmark_version": "0.1.0", }, ], @@ -430,6 +430,7 @@ def sample_leaderboard_and_summary(tmp_path): "failed": 5, "build": 8, "percentage": 50.0, + "instance_results": testgen_instance_results, "date": "2025-01-11", "model": "gpt-4-turbo", "category": "test-generation", @@ -444,7 +445,6 @@ def sample_leaderboard_and_summary(tmp_path): "custom_instructions": False, "custom_agent": None, }, - "instance_results": testgen_instance_results, "benchmark_version": "0.1.0", }, ], @@ -477,10 +477,11 @@ def sample_leaderboard_and_summary(tmp_path): new_summary = { "total": 10, - "resolved": 8, # Improved from 6 to 8 + "resolved": 8, "failed": 2, - "build": 10, # Improved from 9 to 10 + "build": 10, "percentage": 80.0, + "instance_results": new_summary_instance_results, "date": "2025-01-15", "model": "gpt-4o", "category": "bug-fix", @@ -495,7 +496,6 @@ def sample_leaderboard_and_summary(tmp_path): "custom_instructions": True, "custom_agent": None, }, - "instance_results": new_summary_instance_results, "benchmark_version": "0.1.0", } @@ -561,6 +561,7 @@ def test_result_update_adds_new_entry(sample_leaderboard_and_summary): "failed": 1, "build": 10, "percentage": 90.0, + "instance_results": new_agent_instance_results, "date": "2025-01-16", "model": "gpt-4o", "category": "test-generation", @@ -575,7 +576,6 @@ def test_result_update_adds_new_entry(sample_leaderboard_and_summary): "custom_instructions": False, "custom_agent": None, }, - "instance_results": new_agent_instance_results, "benchmark_version": "0.1.0", } @@ -629,6 +629,7 @@ def test_result_update_distinguishes_by_mcp_servers(sample_leaderboard_and_summa "failed": 3, "build": 9, "percentage": 70.0, + "instance_results": diff_mcp_instance_results, "date": "2025-01-17", "model": "gpt-4o", "category": "bug-fix", @@ -643,7 +644,6 @@ def test_result_update_distinguishes_by_mcp_servers(sample_leaderboard_and_summa "custom_instructions": False, # Different from existing True "custom_agent": None, }, - "instance_results": diff_mcp_instance_results, "benchmark_version": "0.1.0", } @@ -771,6 +771,7 @@ def test_result_update_stores_multiple_results_with_default_n(sample_leaderboard "failed": 2, "build": 10, "percentage": 80.0, + "instance_results": multi_results_instance, "date": "2025-01-15", "model": "gpt-4o", "category": "bug-fix", @@ -785,7 +786,6 @@ def test_result_update_stores_multiple_results_with_default_n(sample_leaderboard "custom_instructions": True, "custom_agent": None, }, - "instance_results": multi_results_instance, "benchmark_version": "0.1.0", } @@ -824,6 +824,7 @@ def test_result_update_replaces_oldest_when_exceeding_n(sample_leaderboard_and_s "failed": 3, "build": 9, "percentage": 70.0, + "instance_results": oldest_instance_results, "model": "gpt-4o", "category": "bug-fix", "agent_name": "copilot", @@ -836,7 +837,6 @@ def test_result_update_replaces_oldest_when_exceeding_n(sample_leaderboard_and_s "custom_instructions": True, "custom_agent": None, }, - "instance_results": oldest_instance_results, "benchmark_version": "0.1.0", } @@ -858,7 +858,16 @@ def test_result_update_replaces_oldest_when_exceeding_n(sample_leaderboard_and_s # Now add a 6th result - should replace oldest (2025-01-10) newest_instance_results = {f"test__inst_{i}": (i < 9) for i in range(10)} # 9 resolved - summary_new = {**base_summary, "date": "2025-01-20", "github_run_id": "run_sixth", "resolved": 9, "instance_results": newest_instance_results} + summary_new = { + **base_summary, + "date": "2025-01-20", + "github_run_id": "run_sixth", + "resolved": 9, + "failed": 1, + "build": 10, + "percentage": 90.0, + "instance_results": newest_instance_results, + } with open(summary_path, "w") as f: json.dump(summary_new, f, indent=2) @@ -927,17 +936,13 @@ def test_result_refresh_handles_empty_leaderboard(tmp_path): @pytest.mark.integration def test_result_refresh_handles_legacy_runs_without_instance_results(tmp_path): - """Test that refresh handles legacy runs that don't have instance_results.""" + """Test that refresh handles runs without instance_results.""" leaderboard_path = tmp_path / "bug-fix.json" legacy_data = { "runs": [ { "total": 10, - "resolved": 6, - "failed": 4, - "build": 9, - "percentage": 60.0, "date": "2025-01-10", "model": "gpt-4o", "category": "bug-fix", @@ -948,8 +953,11 @@ def test_result_refresh_handles_legacy_runs_without_instance_results(tmp_path): "average_llm_duration": 70.0, "github_run_id": "run_legacy", "experiment": None, - "instance_results": None, # Legacy: no instance_results "benchmark_version": "0.1.0", + "resolved": 6, + "failed": 4, + "build": 9, + "percentage": 60.0, }, ], "aggregate": [ @@ -996,6 +1004,7 @@ def test_result_refresh_separates_runs_by_benchmark_version(tmp_path): "failed": 4, "build": 10, "percentage": 60.0, + "instance_results": {f"test__inst_{i}": (i < 6) for i in range(10)}, "date": "2025-01-10", "model": "gpt-4o", "category": "bug-fix", @@ -1006,7 +1015,6 @@ def test_result_refresh_separates_runs_by_benchmark_version(tmp_path): "average_llm_duration": 70.0, "github_run_id": "run_v1", "experiment": None, - "instance_results": {f"test__inst_{i}": (i < 6) for i in range(10)}, "benchmark_version": "0.1.0", }, { @@ -1015,6 +1023,7 @@ def test_result_refresh_separates_runs_by_benchmark_version(tmp_path): "failed": 2, "build": 10, "percentage": 80.0, + "instance_results": {f"test__inst_{i}": (i < 8) for i in range(10)}, "date": "2025-01-15", "model": "gpt-4o", "category": "bug-fix", @@ -1025,7 +1034,6 @@ def test_result_refresh_separates_runs_by_benchmark_version(tmp_path): "average_llm_duration": 65.0, "github_run_id": "run_v2", "experiment": None, - "instance_results": {f"test__inst_{i}": (i < 8) for i in range(10)}, "benchmark_version": "0.2.0", }, ], @@ -1070,6 +1078,7 @@ def test_result_update_groups_by_benchmark_version(tmp_path): "failed": 5, "build": 10, "percentage": 50.0, + "instance_results": {f"test__inst_{i}": (i < 5) for i in range(10)}, "date": "2025-01-10", "model": "gpt-4o", "category": "bug-fix", @@ -1080,7 +1089,6 @@ def test_result_update_groups_by_benchmark_version(tmp_path): "average_llm_duration": 70.0, "github_run_id": "run_v1", "experiment": None, - "instance_results": {f"test__inst_{i}": (i < 5) for i in range(10)}, "benchmark_version": "0.1.0", }, ], @@ -1113,6 +1121,7 @@ def test_result_update_groups_by_benchmark_version(tmp_path): "failed": 3, "build": 10, "percentage": 70.0, + "instance_results": {f"test__inst_{i}": (i < 7) for i in range(10)}, "date": "2025-01-15", "model": "gpt-4o", "category": "bug-fix", @@ -1123,7 +1132,6 @@ def test_result_update_groups_by_benchmark_version(tmp_path): "average_llm_duration": 65.0, "github_run_id": "run_v2", "experiment": None, - "instance_results": {f"test__inst_{i}": (i < 7) for i in range(10)}, "benchmark_version": "0.2.0", } diff --git a/tests/test_evaluation_summary.py b/tests/test_evaluation_summary.py index a0c9586bf..7c3c97690 100644 --- a/tests/test_evaluation_summary.py +++ b/tests/test_evaluation_summary.py @@ -4,7 +4,7 @@ import pytest from bcbench.config import get_config -from bcbench.results.evaluation_result import EvaluationResultSummary +from bcbench.results.summary import EvaluationResultSummary, ExecutionBasedEvaluationResultSummary from bcbench.types import AgentMetrics, EvaluationCategory, ExperimentConfiguration from tests.conftest import create_bugfix_result, create_testgen_result @@ -13,7 +13,7 @@ class TestEvaluationResultSummary: def test_summary_save_creates_json_file(self, tmp_path): - summary = EvaluationResultSummary( + summary = ExecutionBasedEvaluationResultSummary( total=10, resolved=8, failed=2, @@ -43,6 +43,7 @@ def test_summary_save_creates_json_file(self, tmp_path): assert data["resolved"] == 8 assert data["failed"] == 2 assert data["build"] == 9 + assert data["instance_results"] == {} assert data["date"] == "2025-01-15" assert data["model"] == "gpt-4o" assert data["agent_name"] == "copilot-cli" @@ -51,7 +52,7 @@ def test_summary_save_creates_json_file(self, tmp_path): assert data["average_completion_tokens"] == 1200.0 def test_summary_save_with_custom_filename(self, tmp_path): - summary = EvaluationResultSummary( + summary = ExecutionBasedEvaluationResultSummary( total=5, resolved=4, failed=1, @@ -74,7 +75,7 @@ def test_summary_save_with_custom_filename(self, tmp_path): assert output_file.exists() def test_loading_existing_results(self): - from bcbench.results.evaluation_result import Leaderboard + from bcbench.results.summary import Leaderboard for category in EvaluationCategory: leaderboard_path = _config.paths.leaderboard_dir / f"{category.value}.json" @@ -87,7 +88,7 @@ def test_loading_existing_results(self): else: # Old format: array of items for item in data: - EvaluationResultSummary.model_validate(item) + ExecutionBasedEvaluationResultSummary.model_validate(item) class TestFromResults: @@ -255,7 +256,7 @@ def test_summary_with_experiment_configuration(self): custom_instructions=True, custom_agent="custom-bc-agent", ) - summary = EvaluationResultSummary( + summary = ExecutionBasedEvaluationResultSummary( total=5, resolved=3, failed=2, @@ -279,7 +280,7 @@ def test_summary_with_experiment_configuration(self): assert summary.experiment.custom_agent == "custom-bc-agent" def test_summary_without_experiment_configuration(self): - summary = EvaluationResultSummary( + summary = ExecutionBasedEvaluationResultSummary( total=5, resolved=3, failed=2, @@ -303,7 +304,7 @@ def test_summary_save_includes_experiment_in_json(self, tmp_path): mcp_servers=["pylance"], custom_instructions=True, ) - summary = EvaluationResultSummary( + summary = ExecutionBasedEvaluationResultSummary( total=10, resolved=8, failed=2, @@ -332,7 +333,7 @@ def test_summary_save_includes_experiment_in_json(self, tmp_path): assert data["experiment"]["custom_agent"] is None def test_summary_save_with_none_experiment(self, tmp_path): - summary = EvaluationResultSummary( + summary = ExecutionBasedEvaluationResultSummary( total=5, resolved=3, failed=2, @@ -465,16 +466,16 @@ def test_from_results_creates_instance_results(self): summary = EvaluationResultSummary.from_results(results, run_id="test_run") - assert summary.instance_results is not None - assert len(summary.instance_results) == 3 - assert summary.instance_results["test__1"] is True - assert summary.instance_results["test__2"] is False - assert summary.instance_results["test__3"] is True + instance_results = summary.instance_results + assert len(instance_results) == 3 + assert instance_results["test__1"] is True + assert instance_results["test__2"] is False + assert instance_results["test__3"] is True class TestLeaderboardAggregate: def test_from_single_run_calculates_average(self): - from bcbench.results.evaluation_result import LeaderboardAggregate + from bcbench.results.summary import LeaderboardAggregate summary = EvaluationResultSummary.from_results( [ @@ -496,7 +497,7 @@ def test_from_single_run_calculates_average(self): assert agg.pass_hat_5 is None # Not enough runs def test_from_multiple_runs_calculates_average_and_ci_bounds(self): - from bcbench.results.evaluation_result import LeaderboardAggregate + from bcbench.results.summary import LeaderboardAggregate run1 = EvaluationResultSummary.from_results( [ @@ -535,7 +536,7 @@ def test_from_multiple_runs_calculates_average_and_ci_bounds(self): assert agg.pass_hat_5 is None # Not enough runs def test_average_and_ci_bounds_with_varying_results(self): - from bcbench.results.evaluation_result import LeaderboardAggregate + from bcbench.results.summary import LeaderboardAggregate # Create 3 runs where: # - run1: 3/3 resolved (100%) @@ -579,7 +580,7 @@ def test_average_and_ci_bounds_with_varying_results(self): assert agg.pass_hat_5 is None # Not enough runs def test_consistent_results_have_zero_ci(self): - from bcbench.results.evaluation_result import LeaderboardAggregate + from bcbench.results.summary import LeaderboardAggregate # All instances pass all runs run1 = EvaluationResultSummary.from_results( @@ -615,7 +616,7 @@ def test_consistent_results_have_zero_ci(self): class TestLeaderboard: def test_aggregate_from_runs(self): - from bcbench.results.evaluation_result import LeaderboardAggregate + from bcbench.results.summary import LeaderboardAggregate run1 = EvaluationResultSummary.from_results( [ @@ -632,7 +633,7 @@ def test_aggregate_from_runs(self): assert agg.average == 0.5 def test_leaderboard_to_dict(self): - from bcbench.results.evaluation_result import Leaderboard, LeaderboardAggregate + from bcbench.results.summary import Leaderboard, LeaderboardAggregate run1 = EvaluationResultSummary.from_results( [create_bugfix_result(instance_id="test__1", resolved=True)], @@ -649,11 +650,10 @@ def test_leaderboard_to_dict(self): assert data["aggregate"][0]["average"] == 1.0 def test_aggregate_from_legacy_runs_without_instance_results(self): - """Test that a single legacy run without instance_results uses pass rate ratio.""" - from bcbench.results.evaluation_result import LeaderboardAggregate + """Test that a single run without instance_results uses pass rate from percentage.""" + from bcbench.results.summary import LeaderboardAggregate - # Create a summary without instance_results (simulates legacy data) - legacy_run = EvaluationResultSummary( + legacy_run = ExecutionBasedEvaluationResultSummary( total=10, resolved=6, failed=4, @@ -666,7 +666,6 @@ def test_aggregate_from_legacy_runs_without_instance_results(self): average_duration=100.0, average_prompt_tokens=1000.0, average_completion_tokens=500.0, - instance_results=None, # Legacy: no instance_results benchmark_version="0.1.0", ) @@ -674,14 +673,14 @@ def test_aggregate_from_legacy_runs_without_instance_results(self): assert agg.num_runs == 1 assert agg.total == 10 - # Should fall back to pass rate (resolved/total) from the run - assert agg.average == 0.6 # 6/10 = 0.6 + # Uses percentage / 100 as the run's pass rate + assert agg.average == 0.6 # 60.0% -> 0.6 assert agg.ci_low is None assert agg.ci_high is None assert agg.pass_hat_5 is None def test_aggregate_includes_benchmark_version_from_runs(self): - from bcbench.results.evaluation_result import LeaderboardAggregate + from bcbench.results.summary import LeaderboardAggregate run1 = EvaluationResultSummary.from_results( [create_bugfix_result(instance_id="test__1", resolved=True)], @@ -695,14 +694,15 @@ def test_aggregate_includes_benchmark_version_from_runs(self): assert agg.benchmark_version is not None def test_aggregate_allows_same_benchmark_versions(self): - from bcbench.results.evaluation_result import LeaderboardAggregate + from bcbench.results.summary import LeaderboardAggregate - run1 = EvaluationResultSummary( + run1 = ExecutionBasedEvaluationResultSummary( total=3, resolved=2, failed=1, build=3, percentage=66.7, + instance_results={"test__1": True, "test__2": True, "test__3": False}, date=date.today(), model="gpt-4o", agent_name="copilot", @@ -710,15 +710,15 @@ def test_aggregate_allows_same_benchmark_versions(self): average_duration=100.0, average_prompt_tokens=1000.0, average_completion_tokens=500.0, - instance_results={"test__1": True, "test__2": True, "test__3": False}, benchmark_version="0.1.0", ) - run2 = EvaluationResultSummary( + run2 = ExecutionBasedEvaluationResultSummary( total=3, resolved=1, failed=2, build=3, percentage=33.3, + instance_results={"test__1": False, "test__2": True, "test__3": False}, date=date.today(), model="gpt-4o", agent_name="copilot", @@ -726,7 +726,6 @@ def test_aggregate_allows_same_benchmark_versions(self): average_duration=100.0, average_prompt_tokens=1000.0, average_completion_tokens=500.0, - instance_results={"test__1": False, "test__2": True, "test__3": False}, benchmark_version="0.1.0", # Same version ) @@ -735,7 +734,7 @@ def test_aggregate_allows_same_benchmark_versions(self): assert agg.benchmark_version == "0.1.0" def test_load_empty_leaderboard_file(self, tmp_path): - from bcbench.results.evaluation_result import Leaderboard + from bcbench.results.summary import Leaderboard empty_file = tmp_path / "empty.json" empty_file.write_text("[]") @@ -746,7 +745,7 @@ def test_load_empty_leaderboard_file(self, tmp_path): assert leaderboard.aggregate == [] def test_load_empty_object_leaderboard_file(self, tmp_path): - from bcbench.results.evaluation_result import Leaderboard + from bcbench.results.summary import Leaderboard empty_file = tmp_path / "empty.json" empty_file.write_text("{}") diff --git a/tests/test_get_task.py b/tests/test_get_task.py index 783cc9418..64a2f97df 100644 --- a/tests/test_get_task.py +++ b/tests/test_get_task.py @@ -14,109 +14,3 @@ def test_returns_readme_content(self, tmp_path: Path): result = entry.get_task() assert result == content - - def test_transform_image_paths_false_preserves_relative_paths(self, tmp_path: Path): - content = "# Task\n\n![diagram](./diagram.png)\n\nSome text." - problem_dir = create_problem_statement_dir(tmp_path, content) - entry = create_dataset_entry() - - with patch.object(type(entry), "problem_statement_dir", property(lambda self: problem_dir)): - result = entry.get_task(transform_image_paths=False) - - assert "![diagram](./diagram.png)" in result - - def test_transform_image_paths_true_converts_to_problem_directory(self, tmp_path: Path): - content = "# Task\n\n![diagram](./diagram.png)\n\nSome text." - problem_dir = create_problem_statement_dir(tmp_path, content) - entry = create_dataset_entry() - - with patch.object(type(entry), "problem_statement_dir", property(lambda self: problem_dir)): - result = entry.get_task(transform_image_paths=True) - - assert "![diagram](problem/diagram.png)" in result - assert "./diagram.png" not in result - - def test_transform_image_paths_handles_multiple_images(self, tmp_path: Path): - content = "# Task\n\n![first](./img1.png)\n\nText\n\n![second](./img2.png)" - problem_dir = create_problem_statement_dir(tmp_path, content) - entry = create_dataset_entry() - - with patch.object(type(entry), "problem_statement_dir", property(lambda self: problem_dir)): - result = entry.get_task(transform_image_paths=True) - - assert "![first](problem/img1.png)" in result - assert "![second](problem/img2.png)" in result - - def test_transform_image_paths_preserves_alt_text(self, tmp_path: Path): - content = "![Complex Alt Text with spaces](./image.png)" - problem_dir = create_problem_statement_dir(tmp_path, content) - entry = create_dataset_entry() - - with patch.object(type(entry), "problem_statement_dir", property(lambda self: problem_dir)): - result = entry.get_task(transform_image_paths=True) - - assert "![Complex Alt Text with spaces](problem/image.png)" in result - - def test_transform_image_paths_handles_empty_alt_text(self, tmp_path: Path): - content = "![](./image.png)" - problem_dir = create_problem_statement_dir(tmp_path, content) - entry = create_dataset_entry() - - with patch.object(type(entry), "problem_statement_dir", property(lambda self: problem_dir)): - result = entry.get_task(transform_image_paths=True) - - assert "![](problem/image.png)" in result - - def test_transform_image_paths_handles_nested_paths(self, tmp_path: Path): - content = "![diagram](./images/subdir/diagram.png)" - problem_dir = create_problem_statement_dir(tmp_path, content) - entry = create_dataset_entry() - - with patch.object(type(entry), "problem_statement_dir", property(lambda self: problem_dir)): - result = entry.get_task(transform_image_paths=True) - - assert "![diagram](problem/images/subdir/diagram.png)" in result - - def test_transform_image_paths_ignores_absolute_urls(self, tmp_path: Path): - content = "![external](https://example.com/image.png)" - problem_dir = create_problem_statement_dir(tmp_path, content) - entry = create_dataset_entry() - - with patch.object(type(entry), "problem_statement_dir", property(lambda self: problem_dir)): - result = entry.get_task(transform_image_paths=True) - - assert "![external](https://example.com/image.png)" in result - - def test_transform_image_paths_ignores_non_relative_paths(self, tmp_path: Path): - content = "![other](images/diagram.png)" - problem_dir = create_problem_statement_dir(tmp_path, content) - entry = create_dataset_entry() - - with patch.object(type(entry), "problem_statement_dir", property(lambda self: problem_dir)): - result = entry.get_task(transform_image_paths=True) - - # Paths without ./ prefix should not be transformed - assert "![other](images/diagram.png)" in result - - def test_transform_image_paths_handles_mixed_content(self, tmp_path: Path): - content = """# Problem - -![local](./diagram.png) - -Some text with [a link](./doc.md) that is not an image. - -![external](https://example.com/img.png) - -![another local](./screenshot.jpg) -""" - problem_dir = create_problem_statement_dir(tmp_path, content) - entry = create_dataset_entry() - - with patch.object(type(entry), "problem_statement_dir", property(lambda self: problem_dir)): - result = entry.get_task(transform_image_paths=True) - - assert "![local](problem/diagram.png)" in result - assert "![another local](problem/screenshot.jpg)" in result - assert "![external](https://example.com/img.png)" in result - # Regular links should be preserved (not images) - assert "[a link](./doc.md)" in result diff --git a/tests/test_result_hierarchy.py b/tests/test_result_hierarchy.py new file mode 100644 index 000000000..8420a61a0 --- /dev/null +++ b/tests/test_result_hierarchy.py @@ -0,0 +1,393 @@ +"""Tests for the result and summary class hierarchies after the category refactor. + +Covers: +- BaseEvaluationResult vs ExecutionBasedEvaluationResult field separation +- status_label, category_metrics, display_row polymorphism +- from_json dispatch to correct subclass +- EvaluationResultSummary.from_results dispatch and super() chain +- ExecutionBasedEvaluationResultSummary category-specific aggregation +- display_summary on summaries +- display.py console/GitHub summary rendering +""" + +from datetime import date + +import pytest + +from bcbench.results.base import BaseEvaluationResult, ExecutionBasedEvaluationResult +from bcbench.results.bugfix import BugFixResult +from bcbench.results.display import create_console_summary, create_github_job_summary +from bcbench.results.summary import ( + EvaluationResultSummary, + ExecutionBasedEvaluationResultSummary, +) +from bcbench.results.testgeneration import TestGenerationResult +from bcbench.types import AgentMetrics, EvaluationCategory +from tests.conftest import create_bugfix_result, create_evaluation_context, create_testgen_result + + +def _make_config_with_summary(summary_path: str): + """Create a config mock with github_step_summary set.""" + from bcbench.config import get_config + + config = get_config() + # Return a shallow copy-like object that overrides env.github_step_summary + from unittest.mock import MagicMock + + mock = MagicMock(wraps=config) + mock.env.github_step_summary = summary_path + return mock + + +# --------------------------------------------------------------------------- +# BaseEvaluationResult +# --------------------------------------------------------------------------- + + +class TestBaseEvaluationResult: + def test_base_has_no_resolved_or_build(self): + assert "resolved" not in BaseEvaluationResult.model_fields + assert "build" not in BaseEvaluationResult.model_fields + + def test_execution_based_has_resolved_and_build(self): + assert "resolved" in ExecutionBasedEvaluationResult.model_fields + assert "build" in ExecutionBasedEvaluationResult.model_fields + + def test_bugfix_inherits_execution_based(self): + assert issubclass(BugFixResult, ExecutionBasedEvaluationResult) + + def test_testgen_inherits_execution_based(self): + assert issubclass(TestGenerationResult, ExecutionBasedEvaluationResult) + + +# --------------------------------------------------------------------------- +# status_label +# --------------------------------------------------------------------------- + + +class TestStatusLabel: + def test_base_completed(self): + result = create_bugfix_result(resolved=True) + assert result.status_label == "Success" + + def test_base_timeout(self): + result = create_bugfix_result(resolved=False, build=False) + result.timeout = True + assert result.status_label == "Timeout" + + def test_execution_based_success(self): + result = create_bugfix_result(resolved=True, build=True) + assert result.status_label == "Success" + + def test_execution_based_failed(self): + result = create_bugfix_result(resolved=False, build=True, error_message="Tests failed") + assert result.status_label == "Failed" + + +# --------------------------------------------------------------------------- +# category_metrics +# --------------------------------------------------------------------------- + + +class TestCategoryMetrics: + def test_bugfix_category_metrics(self): + result = create_bugfix_result(resolved=True, build=True) + assert result.category_metrics == {"resolved": True, "build": True} + + def test_bugfix_failed_category_metrics(self): + result = create_bugfix_result(resolved=False, build=False) + assert result.category_metrics == {"resolved": False, "build": False} + + def test_testgen_category_metrics_includes_extra_fields(self): + result = create_testgen_result(resolved=True, build=True, pre_patch_failed=True, post_patch_passed=True) + metrics = result.category_metrics + assert metrics["resolved"] is True + assert metrics["build"] is True + assert metrics["pre_patch_failed"] is True + assert metrics["post_patch_passed"] is True + + def test_testgen_category_metrics_defaults(self): + result = create_testgen_result() + metrics = result.category_metrics + assert metrics["pre_patch_failed"] is False + assert metrics["post_patch_passed"] is False + + +# --------------------------------------------------------------------------- +# display_row +# --------------------------------------------------------------------------- + + +class TestDisplayRow: + def test_bugfix_display_row_is_empty(self): + result = create_bugfix_result() + assert result.display_row == {} + + def test_testgen_display_row_has_columns(self): + result = create_testgen_result(pre_patch_failed=True, post_patch_passed=False) + row = result.display_row + assert row["Pre-Patch Failed"] == "Yes" + assert row["Post-Patch Passed"] == "No" + + def test_testgen_display_row_no_flags(self): + result = create_testgen_result(pre_patch_failed=False, post_patch_passed=False) + row = result.display_row + assert row["Pre-Patch Failed"] == "No" + assert row["Post-Patch Passed"] == "No" + + +# --------------------------------------------------------------------------- +# from_json dispatch +# --------------------------------------------------------------------------- + + +class TestFromJsonDispatch: + def test_from_json_returns_bugfix_result(self): + payload = create_bugfix_result().model_dump(mode="json") + loaded = BaseEvaluationResult.from_json(payload) + assert isinstance(loaded, BugFixResult) + + def test_from_json_returns_testgen_result(self): + payload = create_testgen_result(pre_patch_failed=True).model_dump(mode="json") + loaded = BaseEvaluationResult.from_json(payload) + assert isinstance(loaded, TestGenerationResult) + assert loaded.pre_patch_failed is True + + def test_from_json_preserves_all_fields(self): + original = create_bugfix_result( + instance_id="test__round-trip", + resolved=True, + build=True, + output="patch content", + error_message=None, + ) + loaded = BaseEvaluationResult.from_json(original.model_dump(mode="json")) + assert loaded.instance_id == original.instance_id + assert loaded.output == original.output + + def test_from_json_unknown_category_raises(self): + payload = create_bugfix_result().model_dump(mode="json") + payload["category"] = "nonexistent" + with pytest.raises(ValueError, match="nonexistent"): + BaseEvaluationResult.from_json(payload) + + +# --------------------------------------------------------------------------- +# create_agent_timeout_failure +# --------------------------------------------------------------------------- + + +class TestCreateAgentTimeout: + def test_timeout_sets_fields(self, tmp_path): + ctx = create_evaluation_context(tmp_path) + result = BugFixResult.create_agent_timeout_failure(ctx) + assert result.timeout is True + assert result.error_message == "Agent timed out" + assert result.status_label == "Timeout" + + +# --------------------------------------------------------------------------- +# EvaluationResultSummary.from_results — dispatch + super() chain +# --------------------------------------------------------------------------- + + +class TestSummaryFromResults: + def test_base_dispatches_to_execution_based_for_bugfix(self): + results = [create_bugfix_result(instance_id="test__1", resolved=True)] + summary = EvaluationResultSummary.from_results(results, run_id="run1") + assert isinstance(summary, ExecutionBasedEvaluationResultSummary) + + def test_base_dispatches_to_execution_based_for_testgen(self): + results = [create_testgen_result(instance_id="test__1")] + summary = EvaluationResultSummary.from_results(results, run_id="run1") + assert isinstance(summary, ExecutionBasedEvaluationResultSummary) + + def test_subclass_direct_call_also_works(self): + results = [create_bugfix_result(instance_id="test__1", resolved=True)] + summary = ExecutionBasedEvaluationResultSummary.from_results(results, run_id="run1") + assert isinstance(summary, ExecutionBasedEvaluationResultSummary) + assert summary.resolved == 1 + + def test_common_fields_computed(self): + results = [ + create_bugfix_result( + instance_id="test__1", + resolved=True, + metrics=AgentMetrics(execution_time=100.0, prompt_tokens=1000, completion_tokens=500), + ), + create_bugfix_result( + instance_id="test__2", + resolved=False, + metrics=AgentMetrics(execution_time=200.0, prompt_tokens=3000, completion_tokens=1500), + ), + ] + summary = EvaluationResultSummary.from_results(results, run_id="run1") + + assert summary.total == 2 + assert summary.model == "gpt-4o" + assert summary.agent_name == "copilot-cli" + assert summary.average_duration == pytest.approx(150.0) + assert summary.average_prompt_tokens == pytest.approx(2000.0) + assert summary.average_completion_tokens == pytest.approx(1000.0) + assert summary.date == date.today() + + def test_category_specific_fields_computed(self): + results = [ + create_bugfix_result(instance_id="test__1", resolved=True, build=True), + create_bugfix_result(instance_id="test__2", resolved=False, build=True), + create_bugfix_result(instance_id="test__3", resolved=False, build=False), + ] + summary = EvaluationResultSummary.from_results(results, run_id="run1") + + assert isinstance(summary, ExecutionBasedEvaluationResultSummary) + assert summary.resolved == 1 + assert summary.failed == 2 + assert summary.build == 2 + assert summary.percentage == pytest.approx(33.3) + + def test_instance_results_populated(self): + results = [ + create_bugfix_result(instance_id="test__a", resolved=True), + create_bugfix_result(instance_id="test__b", resolved=False), + ] + summary = EvaluationResultSummary.from_results(results, run_id="run1") + + assert isinstance(summary, ExecutionBasedEvaluationResultSummary) + assert summary.instance_results == {"test__a": True, "test__b": False} + + +# --------------------------------------------------------------------------- +# display_summary +# --------------------------------------------------------------------------- + + +class TestDisplaySummary: + def test_execution_based_display_summary(self): + summary = ExecutionBasedEvaluationResultSummary( + total=10, + resolved=7, + failed=3, + build=9, + percentage=70.0, + date=date.today(), + model="gpt-4o", + agent_name="copilot", + category=EvaluationCategory.BUG_FIX, + average_duration=100.0, + average_prompt_tokens=1000.0, + average_completion_tokens=500.0, + benchmark_version="0.1.0", + ) + display = summary.display_summary() + assert display == {"resolved": 7, "failed": 3, "build": 9, "percentage": 70.0} + + +# --------------------------------------------------------------------------- +# Summary from_json dispatch +# --------------------------------------------------------------------------- + + +class TestSummaryFromJson: + def test_from_json_returns_execution_based_for_bugfix(self): + payload = { + "total": 5, + "resolved": 3, + "failed": 2, + "build": 4, + "percentage": 60.0, + "date": "2025-01-15", + "model": "gpt-4o", + "category": "bug-fix", + "agent_name": "copilot", + "average_duration": 100.0, + "average_prompt_tokens": 1000.0, + "average_completion_tokens": 500.0, + "benchmark_version": "0.1.0", + } + summary = EvaluationResultSummary.from_json(payload) + assert isinstance(summary, ExecutionBasedEvaluationResultSummary) + assert summary.resolved == 3 + + def test_from_json_unknown_category_raises(self): + payload = { + "total": 5, + "date": "2025-01-15", + "model": "gpt-4o", + "category": "nonexistent", + "agent_name": "copilot", + "average_duration": 100.0, + "average_prompt_tokens": 1000.0, + "average_completion_tokens": 500.0, + "benchmark_version": "0.1.0", + } + with pytest.raises(ValueError, match="nonexistent"): + EvaluationResultSummary.from_json(payload) + + +# --------------------------------------------------------------------------- +# display.py — console and GitHub summary +# --------------------------------------------------------------------------- + + +class TestConsoleSummary: + def test_console_summary_renders(self, capsys): + results = [ + create_bugfix_result(instance_id="test__1", resolved=True), + create_bugfix_result(instance_id="test__2", resolved=False, error_message="Build failed"), + ] + create_console_summary(results, EvaluationResultSummary.from_results(results, run_id="")) + captured = capsys.readouterr() + assert "test__1" in captured.out + assert "test__2" in captured.out + assert "Evaluation Results Summary" in captured.out + + def test_console_summary_shows_testgen_data_values(self, capsys): + results = [ + create_testgen_result(instance_id="test__1", resolved=True, pre_patch_failed=True, post_patch_passed=True), + ] + create_console_summary(results, EvaluationResultSummary.from_results(results, run_id="")) + captured = capsys.readouterr() + # Rich truncates column headers, but data values "Yes" should appear + assert "Yes" in captured.out + assert "test__1" in captured.out + + +class TestGitHubJobSummary: + def test_github_summary_renders_markdown(self, tmp_path, monkeypatch): + summary_file = tmp_path / "summary.md" + monkeypatch.setattr("bcbench.results.display.get_config", lambda: _make_config_with_summary(str(summary_file))) + results = [ + create_bugfix_result(instance_id="test__1", resolved=True), + create_bugfix_result(instance_id="test__2", resolved=False, error_message="Build failed"), + ] + create_github_job_summary(results, EvaluationResultSummary.from_results(results, run_id="")) + content = summary_file.read_text() + assert "test__1" in content + assert "test__2" in content + assert "bug-fix" in content + + def test_github_summary_includes_testgen_columns(self, tmp_path, monkeypatch): + summary_file = tmp_path / "summary.md" + monkeypatch.setattr("bcbench.results.display.get_config", lambda: _make_config_with_summary(str(summary_file))) + results = [ + create_testgen_result(instance_id="test__1", resolved=True, pre_patch_failed=True, post_patch_passed=True), + ] + create_github_job_summary(results, EvaluationResultSummary.from_results(results, run_id="")) + content = summary_file.read_text() + assert "Pre-Patch Failed" in content + assert "Post-Patch Passed" in content + + def test_github_summary_includes_tool_usage(self, tmp_path, monkeypatch): + summary_file = tmp_path / "summary.md" + monkeypatch.setattr("bcbench.results.display.get_config", lambda: _make_config_with_summary(str(summary_file))) + results = [ + create_bugfix_result( + instance_id="test__1", + resolved=True, + metrics=AgentMetrics(execution_time=100.0, tool_usage={"bash": 5, "view": 3}), + ), + ] + create_github_job_summary(results, EvaluationResultSummary.from_results(results, run_id="")) + content = summary_file.read_text() + assert "Tool Usage" in content + assert "bash" in content diff --git a/tests/test_result_serialization.py b/tests/test_result_serialization.py index c8e5487b3..7644fc8a8 100644 --- a/tests/test_result_serialization.py +++ b/tests/test_result_serialization.py @@ -2,8 +2,8 @@ import pytest -from bcbench.results.base import create_result_from_json -from bcbench.results.evaluation_result import EvaluationResultSummary +from bcbench.results.base import BaseEvaluationResult +from bcbench.results.summary import EvaluationResultSummary from bcbench.types import AgentMetrics, EvaluationCategory, ExperimentConfiguration from tests.conftest import create_bugfix_result, create_testgen_result @@ -55,10 +55,10 @@ def test_bug_fix_category_loads_from_string(self): "category": "bug-fix", "resolved": True, "build": True, - "generated_patch": "patch", + "output": "patch", } - result = create_result_from_json(payload) + result = BaseEvaluationResult.from_json(payload) assert result.category == EvaluationCategory.BUG_FIX @@ -71,10 +71,10 @@ def test_test_generation_category_loads_from_string(self): "category": "test-generation", "resolved": False, "build": True, - "generated_patch": "test patch", + "output": "test patch", } - result = create_result_from_json(payload) + result = BaseEvaluationResult.from_json(payload) assert result.category == EvaluationCategory.TEST_GENERATION @@ -88,7 +88,7 @@ def test_round_trip_bug_fix(self, tmp_path): with open(tmp_path / "test.jsonl") as f: data = json.loads(f.readline()) - loaded = create_result_from_json(data) + loaded = BaseEvaluationResult.from_json(data) assert loaded.category == original.category assert loaded.category == EvaluationCategory.BUG_FIX @@ -103,7 +103,7 @@ def test_round_trip_test_generation(self, tmp_path): with open(tmp_path / "test.jsonl") as f: data = json.loads(f.readline()) - loaded = create_result_from_json(data) + loaded = BaseEvaluationResult.from_json(data) assert loaded.category == original.category assert loaded.category == EvaluationCategory.TEST_GENERATION @@ -136,7 +136,9 @@ def test_summary_category_loads_from_string(self): "benchmark_version": "0.1.0", } - summary = EvaluationResultSummary.model_validate(payload) + from bcbench.results.summary import EvaluationResultSummary + + summary = EvaluationResultSummary.from_json(payload) # Pydantic handles the enum conversion automatically assert summary.category == EvaluationCategory.TEST_GENERATION @@ -210,7 +212,7 @@ def test_tool_usage_loads_from_json(self): "category": "bug-fix", "resolved": True, "build": True, - "generated_patch": "patch", + "output": "patch", "metrics": { "execution_time": 100.0, "prompt_tokens": 5000, @@ -219,7 +221,7 @@ def test_tool_usage_loads_from_json(self): }, } - result = create_result_from_json(payload) + result = BaseEvaluationResult.from_json(payload) assert result.metrics is not None assert result.metrics.tool_usage is not None @@ -243,10 +245,22 @@ def test_tool_usage_round_trip(self, tmp_path): with open(tmp_path / "test.jsonl") as f: data = json.loads(f.readline()) - loaded = create_result_from_json(data) + loaded = BaseEvaluationResult.from_json(data) assert loaded.metrics is not None assert loaded.metrics.tool_usage is not None assert original.metrics is not None assert original.metrics.tool_usage is not None assert loaded.metrics.tool_usage == original.metrics.tool_usage + + def test_model_dump_json_serializes_category_as_string_value(self): + bug_fix = create_bugfix_result() + test_gen = create_testgen_result() + + bug_fix_dump = bug_fix.model_dump(mode="json") + test_gen_dump = test_gen.model_dump(mode="json") + + assert bug_fix_dump["category"] == "bug-fix" + assert test_gen_dump["category"] == "test-generation" + assert isinstance(bug_fix_dump["category"], str) + assert isinstance(test_gen_dump["category"], str) diff --git a/tests/test_testgeneration_validation.py b/tests/test_testgeneration_validation.py index 074d60de0..f29253ef6 100644 --- a/tests/test_testgeneration_validation.py +++ b/tests/test_testgeneration_validation.py @@ -3,7 +3,7 @@ import pytest import yaml -from bcbench.operations.setup_operations import _get_test_generation_input_mode +from bcbench.evaluate.testgeneration import _get_test_generation_input_mode def test_get_test_generation_input_mode_valid_gold_patch(): diff --git a/tests/test_version.py b/tests/test_version.py index 654f4ff41..5082fc4a7 100644 --- a/tests/test_version.py +++ b/tests/test_version.py @@ -1,6 +1,6 @@ """Tests for version utility.""" -from bcbench.results.evaluation_result import _get_benchmark_version +from bcbench.results.summary import _get_benchmark_version def test_get_benchmark_version_returns_string():