diff --git a/.github/actions/setup-bc-container/action.yml b/.github/actions/setup-bc-container/action.yml
index 700cc58c9..3b71c1457 100644
--- a/.github/actions/setup-bc-container/action.yml
+++ b/.github/actions/setup-bc-container/action.yml
@@ -14,6 +14,10 @@ inputs:
   github-token:
     description: GitHub token for accessing public repositories
     required: true
+  skip-container:
+    description: Skip BC container setup (only clone repository)
+    required: false
+    default: "false"
 
 outputs:
   repo_path:
@@ -24,6 +28,7 @@ runs:
   using: composite
   steps:
     - name: Generate BC container name and credentials
+      if: inputs.skip-container != 'true'
       run: |
         # Generate a 32-character random password using Get-Random
         # The password is short-lived and only used for the duration of the workflow
@@ -38,6 +43,7 @@ runs:
       shell: pwsh
 
     - name: Install BcContainerHelper module
+      if: inputs.skip-container != 'true'
       run: Install-Module -Name BcContainerHelper -Force -AllowClobber -AllowPrerelease
       shell: pwsh
 
@@ -59,5 +65,5 @@ runs:
         $env:ADO_TOKEN = az account get-access-token --resource "499b84ac-1321-427f-aa17-267ca6975798" --query accessToken -o tsv
         Write-Output "::add-mask::$env:ADO_TOKEN"
 
-        .\scripts\Setup-ContainerAndRepository.ps1 -InstanceId "${{ inputs.instance-id }}"
+        .\scripts\Setup-ContainerAndRepository.ps1 -InstanceId "${{ inputs.instance-id }}" ${{ inputs.skip-container == 'true' && '-SkipContainer' || '' }}
       shell: pwsh
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
index 98156aef5..76b60e234 100644
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -14,6 +14,9 @@ This is a benchmark for evaluating coding agents on real-world Business Central
 - Uses `uv` for dependency management: e.g. `uv add <package>` to add packages, `uv run <command>` to run commands
 - Uses `pre-commit` for code quality checks (ruff linting/formatting, trailing whitespace, etc.)
 
+## Categories
+BC-Bench is category-based and designed to grow over time. It currently has two categories, `bug-fix` and `test-generation`. They share the same dataset tasks and execution-based setup, but use different prompts, expected outputs, and evaluation pipelines. Future categories such as `code-review` can be added within the same overall benchmark structure, though they may require different inputs, setup, or evaluation methods.
+
 ## Coding Patterns and Guidelines
 
 - Prefer strong typing and type hints
diff --git a/notebooks/bug-fix/overview.ipynb b/notebooks/bug-fix/overview.ipynb
index 49211055d..8e31e3804 100644
--- a/notebooks/bug-fix/overview.ipynb
+++ b/notebooks/bug-fix/overview.ipynb
@@ -269,7 +269,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "id": "8b5bb1be",
    "metadata": {},
    "outputs": [
@@ -291,7 +291,7 @@
     "merged_df[\"image_bin\"] = pd.cut(merged_df[\"image_count\"], bins=bins, labels=labels)\n",
     "\n",
     "# Add problem statement char count\n",
-    "ps_chars = {entry.instance_id: len(entry.get_task(transform_image_paths=False)) for entry in bcbench_dataset}\n",
+    "ps_chars = {entry.instance_id: len(entry.get_task()) for entry in bcbench_dataset}\n",
     "merged_df[\"ps_chars\"] = merged_df[\"instance_id\"].map(ps_chars)\n",
     "\n",
     "instance_df = (\n",
diff --git a/scripts/Setup-ContainerAndRepository.ps1 b/scripts/Setup-ContainerAndRepository.ps1
index 36e665ed3..77f6d8d4b 100644
--- a/scripts/Setup-ContainerAndRepository.ps1
+++ b/scripts/Setup-ContainerAndRepository.ps1
@@ -25,7 +25,10 @@ param(
     [SecureString]$Password,
 
     [Parameter(Mandatory = $false)]
-    [string]$RepoPath
+    [string]$RepoPath,
+
+    [Parameter(Mandatory = $false)]
+    [switch]$SkipContainer
 )
 
 [DatasetEntry[]] $entries = Get-DatasetEntries -DatasetPath $DatasetPath -Version $Version -InstanceId $InstanceId
@@ -37,9 +40,7 @@ else {
     Write-Log "Found $($entries.Count) dataset entries to process." -Level Info
 }
 
-Write-Log "Setting up BC container and repository for version $Version, Dataset Path: $DatasetPath" -Level Info
-
-[PSCredential]$credential = Get-BCCredential -Username $Username -Password $Password
+Write-Log "Setting up repository for version $Version, Dataset Path: $DatasetPath" -Level Info
 
 if (-not $RepoPath) {
     $RepoPath = Join-Path -Path $env:GITHUB_WORKSPACE -ChildPath "testbed"
@@ -56,27 +57,34 @@ if (Test-Path $RepoPath) {
 Write-Log "Cloning repository $($entries[0].repo) to $RepoPath" -Level Info
 Invoke-GitCloneWithRetry -RepoUrl $cloneInfo.Url -Token $cloneInfo.Token -ClonePath $RepoPath -CommitSha $commitSha -SparseCheckoutPaths $cloneInfo.SparseCheckoutPaths
 
-Import-Module BcContainerHelper -Force -DisableNameChecking
+if (-not $SkipContainer) {
+    [PSCredential]$credential = Get-BCCredential -Username $Username -Password $Password
 
-Write-Log "Container name: $ContainerName" -Level Info
+    Import-Module BcContainerHelper -Force -DisableNameChecking
 
-if (Test-ContainerExists -containerName $ContainerName) {
-    throw "Container $ContainerName already exists. This indicates the machine was not properly cleaned up from a previous run."
-}
+    Write-Log "Container name: $ContainerName" -Level Info
 
-Write-Log "Creating container $ContainerName for version $Version..." -Level Info
+    if (Test-ContainerExists -containerName $ContainerName) {
+        throw "Container $ContainerName already exists. This indicates the machine was not properly cleaned up from a previous run."
+    }
 
-# Get BC artifact URL
-[string] $url = Get-BCArtifactUrl -version $Version -Country $Country
-Write-Log "Retrieved artifact URL: $url" -Level Info
+    Write-Log "Creating container $ContainerName for version $Version..." -Level Info
 
-# Create container synchronously with NAV folder shared
-New-BCContainerSync -ContainerName $ContainerName -Version $Version -ArtifactUrl $url -Credential $credential -AdditionalFolders @($RepoPath)
+    # Get BC artifact URL
+    [string] $url = Get-BCArtifactUrl -version $Version -Country $Country
+    Write-Log "Retrieved artifact URL: $url" -Level Info
 
-# Create compiler folder synchronously
-New-BCCompilerFolderSync -ContainerName $ContainerName -ArtifactUrl $url
+    # Create container synchronously with NAV folder shared
+    New-BCContainerSync -ContainerName $ContainerName -Version $Version -ArtifactUrl $url -Credential $credential -AdditionalFolders @($RepoPath)
 
-Initialize-ContainerForDevelopment -ContainerName $ContainerName -RepoVersion ([System.Version]$Version)
+    # Create compiler folder synchronously
+    New-BCCompilerFolderSync -ContainerName $ContainerName -ArtifactUrl $url
+
+    Initialize-ContainerForDevelopment -ContainerName $ContainerName -RepoVersion ([System.Version]$Version)
+}
+else {
+    Write-Log "Skipping BC container setup (SkipContainer flag set)" -Level Info
+}
 
 # Set output for GitHub Actions or return path
 if ($env:GITHUB_OUTPUT) {
diff --git a/src/bcbench/agent/mini/agent.py b/src/bcbench/agent/mini/agent.py
index fec150331..0fbba7088 100644
--- a/src/bcbench/agent/mini/agent.py
+++ b/src/bcbench/agent/mini/agent.py
@@ -73,7 +73,9 @@ def run_mini_agent(
 
     logger.info(f"Running mini-bc-agent on: {entry.instance_id}")
 
-    task: str = entry.get_task(transform_image_paths=True)
+    from bcbench.agent.shared.prompt import _transform_image_paths
+
+    task: str = _transform_image_paths(entry.get_task())
 
     # Lazy import and create agent
     from minisweagent.models.litellm_model import LitellmModel
diff --git a/src/bcbench/agent/shared/prompt.py b/src/bcbench/agent/shared/prompt.py
index 474105993..1dd36d360 100644
--- a/src/bcbench/agent/shared/prompt.py
+++ b/src/bcbench/agent/shared/prompt.py
@@ -1,10 +1,19 @@
+import re
 from pathlib import Path
 
 from jinja2 import Template
 
+from bcbench.config import get_config
 from bcbench.dataset import BaseDatasetEntry
 from bcbench.types import EvaluationCategory
 
+_config = get_config()
+
+
+def _transform_image_paths(content: str) -> str:  # rewrite markdown images ![alt](./file) to ![alt](<dest_dir>/file)
+    dest_dir = _config.file_patterns.problem_statement_dest_dir
+    return re.sub(r"!\[([^\]]*)\]\(\./([^)]+)\)", lambda m: f"![{m.group(1)}]({dest_dir}/{m.group(2)})", content)  # callable repl: dest_dir is inserted literally, never parsed as a replacement template
+
 
 def build_prompt(entry: BaseDatasetEntry, repo_path: Path, config: dict, category: EvaluationCategory, al_mcp: bool = False) -> str:
     prompt_config = config.get("prompt", {})
@@ -15,10 +24,12 @@ def build_prompt(entry: BaseDatasetEntry, repo_path: Path, config: dict, categor
     is_gold_patch: bool = category == EvaluationCategory.TEST_GENERATION and test_gen_input in ("gold-patch", "both")
     is_problem_statement: bool = category == EvaluationCategory.TEST_GENERATION and test_gen_input in ("problem-statement", "both")
 
+    task = _transform_image_paths(entry.get_task())
+
     template = Template(template_str)
     return template.render(
         repo_path=repo_path,
-        task=entry.get_task(transform_image_paths=True),
+        task=task,
         project_paths=", ".join(entry.project_paths),
         include_project_paths=include_project_paths,
         is_gold_patch=is_gold_patch,  # only relevant for test-generation
diff --git a/src/bcbench/commands/dataset.py b/src/bcbench/commands/dataset.py
index 732faee7a..678c9502e 100644
--- a/src/bcbench/commands/dataset.py
+++ b/src/bcbench/commands/dataset.py
@@ -93,8 +93,9 @@ def view_entry(
 
     metadata_dict = entry.metadata.model_dump()
     for field_name, field_value in metadata_dict.items():
-        display_name = field_name.replace("_", " ").title()
-        info_table.add_row(f"[dim]Metadata:[/dim] {display_name}", str(field_value) if field_value else "N/A")
+        if field_value is not None:
+            display_name = field_name.replace("_", " ").title()
+            info_table.add_row(f"[dim]Metadata:[/dim] {display_name}", str(field_value))
 
     console.print(Panel(info_table, title="[bold]Entry Information[/bold]", border_style="blue"))
 
diff --git a/src/bcbench/commands/evaluate.py b/src/bcbench/commands/evaluate.py
index ffef5d745..1322ac473 100644
--- a/src/bcbench/commands/evaluate.py
+++ b/src/bcbench/commands/evaluate.py
@@ -23,7 +23,7 @@
 from bcbench.dataset import BaseDatasetEntry
 from bcbench.evaluate import EvaluationPipeline
 from bcbench.logger import get_logger
-from bcbench.results import BaseEvaluationResult
+from bcbench.results import BaseEvaluationResult, ExecutionBasedEvaluationResult
 from bcbench.types import AgentMetrics, ContainerConfig, EvaluationContext, ExperimentConfiguration
 
 logger = get_logger(__name__)
@@ -228,6 +228,9 @@ class MockEvaluationPipeline(EvaluationPipeline[BaseDatasetEntry]):
     It randomly generates different scenarios to test result handling and serialization.
     """
 
+    def setup_workspace(self, entry: BaseDatasetEntry, repo_path: Path) -> None:
+        logger.info("Mock pipeline: Skipping workspace setup")
+
     def setup(self, context: EvaluationContext[BaseDatasetEntry]) -> None:
         logger.info("Mock pipeline: Skipping setup")
 
@@ -271,11 +274,11 @@ def evaluate(self, context: EvaluationContext[BaseDatasetEntry]) -> None:
         result: BaseEvaluationResult
         match scenario:
             case "success":
-                result = BaseEvaluationResult.create_success(context, "MOCK_PATCH_CONTENT")
+                result = ExecutionBasedEvaluationResult.create_success(context, "MOCK_PATCH_CONTENT")
             case "build-fail":
-                result = BaseEvaluationResult.create_build_failure(context, "MOCK_PATCH_CONTENT", "Mock build failure")
+                result = ExecutionBasedEvaluationResult.create_build_failure(context, "MOCK_PATCH_CONTENT", "Mock build failure")
             case "test-fail":
-                result = BaseEvaluationResult.create_test_failure(context, "MOCK_PATCH_CONTENT", "Mock test failure")
+                result = ExecutionBasedEvaluationResult.create_test_failure(context, "MOCK_PATCH_CONTENT", "Mock test failure")
             case _:
                 raise ValueError("Invalid mock scenario, this should not happen")
 
diff --git a/src/bcbench/commands/result.py b/src/bcbench/commands/result.py
index 40cb6d375..178dd2749 100644
--- a/src/bcbench/commands/result.py
+++ b/src/bcbench/commands/result.py
@@ -12,11 +12,11 @@
 from bcbench.results import (
     BaseEvaluationResult,
     EvaluationResultSummary,
+    ExecutionBasedEvaluationResultSummary,
     Leaderboard,
     LeaderboardAggregate,
     create_console_summary,
     create_github_job_summary,
-    create_result_from_json,
     write_bceval_results,
 )
 
@@ -65,7 +65,7 @@ def result_summarize(
     for results_path in result_files:
         logger.info(f"Reading results from: {results_path}")
         with open(results_path) as f:
-            results.extend(create_result_from_json(json.loads(line)) for line in f if line.strip())
+            results.extend(BaseEvaluationResult.from_json(json.loads(line)) for line in f if line.strip())
 
     if not results:
         logger.error("No results found in the result files")
@@ -73,13 +73,13 @@ def result_summarize(
 
     write_bceval_results(results, run_dir, run_id, bceval_output, category)
 
+    summary = EvaluationResultSummary.from_results(results, run_id=run_id)
+
     if _config.env.github_actions:
-        create_github_job_summary(results)
+        create_github_job_summary(results, summary)
     else:
-        create_console_summary(results)
+        create_console_summary(results, summary)
 
-    # Save summary JSON
-    summary = EvaluationResultSummary.from_results(results, run_id=run_id)
     summary.save(run_dir, summary_output)
 
 
@@ -90,8 +90,8 @@ def _get_combination_key(result: EvaluationResultSummary) -> tuple[str, str, str
     return (result.agent_name, result.model, exp_key, result.benchmark_version)
 
 
-def _rebuild_aggregates(runs: list[EvaluationResultSummary]) -> list[LeaderboardAggregate]:
-    grouped: defaultdict[tuple[str, str, str | None, str], list[EvaluationResultSummary]] = defaultdict(list)
+def _rebuild_aggregates(runs: list[ExecutionBasedEvaluationResultSummary]) -> list[LeaderboardAggregate]:
+    grouped: defaultdict[tuple[str, str, str | None, str], list[ExecutionBasedEvaluationResultSummary]] = defaultdict(list)
     for run in runs:
         grouped[_get_combination_key(run)].append(run)
     return [LeaderboardAggregate.from_runs(group) for group in grouped.values()]
@@ -111,7 +111,7 @@ def result_update(
     """
     logger.info(f"Loading evaluation summary from: {evaluation_summary}")
     with open(evaluation_summary, encoding="utf-8") as f:
-        new_result = EvaluationResultSummary.model_validate_json(f.read())
+        new_result = ExecutionBasedEvaluationResultSummary.model_validate_json(f.read())
 
     logger.info(f"Processing result for agent '{new_result.agent_name}' with model '{new_result.model}' in category '{new_result.category.value}'")
 
@@ -120,13 +120,13 @@ def result_update(
 
     # Load existing leaderboard
     leaderboard: Leaderboard = Leaderboard.load(leaderboard_path)
-    runs: list[EvaluationResultSummary] = list(leaderboard.runs)
+    runs: list[ExecutionBasedEvaluationResultSummary] = list(leaderboard.runs)
     logger.info(f"Loaded {len(runs)} existing runs")
 
     # Find runs matching this combination
     new_result_key = _get_combination_key(new_result)
-    matching_runs: list[EvaluationResultSummary] = [r for r in runs if _get_combination_key(r) == new_result_key]
-    other_runs: list[EvaluationResultSummary] = [r for r in runs if _get_combination_key(r) != new_result_key]
+    matching_runs: list[ExecutionBasedEvaluationResultSummary] = [r for r in runs if _get_combination_key(r) == new_result_key]
+    other_runs: list[ExecutionBasedEvaluationResultSummary] = [r for r in runs if _get_combination_key(r) != new_result_key]
 
     if len(matching_runs) < n:
         logger.info(f"Adding run ({len(matching_runs) + 1}/{n}) for '{new_result.agent_name}' + '{new_result.model}'")
@@ -137,7 +137,7 @@ def result_update(
         matching_runs = [*matching_runs[1:], new_result]
 
     # Combine and rebuild aggregates
-    all_runs: list[EvaluationResultSummary] = other_runs + matching_runs
+    all_runs: list[ExecutionBasedEvaluationResultSummary] = other_runs + matching_runs
     aggregates = _rebuild_aggregates(all_runs)
 
     # Write back
@@ -171,7 +171,7 @@ def result_refresh(
         logger.info(f"Refreshing: {leaderboard_path.name}")
 
         leaderboard: Leaderboard = Leaderboard.load(leaderboard_path)
-        runs: list[EvaluationResultSummary] = list(leaderboard.runs)
+        runs: list[ExecutionBasedEvaluationResultSummary] = list(leaderboard.runs)
 
         if not runs:
             logger.warning(f"No runs found in {leaderboard_path.name}, skipping")
diff --git a/src/bcbench/commands/run.py b/src/bcbench/commands/run.py
index 38832a1b8..4b069155d 100644
--- a/src/bcbench/commands/run.py
+++ b/src/bcbench/commands/run.py
@@ -19,9 +19,7 @@
     RepoPath,
 )
 from bcbench.config import get_config
-from bcbench.dataset.dataset_entry import _BugFixTestGenBase
 from bcbench.logger import get_logger
-from bcbench.operations import setup_repo_postbuild, setup_repo_prebuild
 
 logger = get_logger(__name__)
 _config = get_config()
@@ -46,9 +44,7 @@ def run_mini(
         uv run bcbench run mini microsoft__BCApps-5633 --step-limit 5 --category bug-fix
     """
     entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0]
-    setup_repo_prebuild(entry, repo_path)
-    if isinstance(entry, _BugFixTestGenBase):
-        setup_repo_postbuild(entry, repo_path, category)
+    category.pipeline.setup_workspace(entry, repo_path)
 
     run_mini_agent(
         entry=entry,
@@ -78,9 +74,7 @@ def run_copilot(
         uv run bcbench run copilot microsoft__BCApps-5633 --category bug-fix --repo-path /path/to/BCApps
     """
     entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0]
-    setup_repo_prebuild(entry, repo_path)
-    if isinstance(entry, _BugFixTestGenBase):
-        setup_repo_postbuild(entry, repo_path, category)
+    category.pipeline.setup_workspace(entry, repo_path)
 
     run_copilot_agent(entry=entry, repo_path=repo_path, model=model, category=category, output_dir=output_dir, al_mcp=al_mcp, container_name=container_name)
 
@@ -104,9 +98,7 @@ def run_claude(
         uv run bcbench run claude microsoft__BCApps-5633 --category bug-fix --repo-path /path/to/BCApps
     """
     entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0]
-    setup_repo_prebuild(entry, repo_path)
-    if isinstance(entry, _BugFixTestGenBase):
-        setup_repo_postbuild(entry, repo_path, category)
+    category.pipeline.setup_workspace(entry, repo_path)
 
     run_claude_code(entry=entry, repo_path=repo_path, model=model, category=category, output_dir=output_dir, al_mcp=al_mcp, container_name=container_name)
 
diff --git a/src/bcbench/dataset/dataset_entry.py b/src/bcbench/dataset/dataset_entry.py
index 2d073a156..c2620caa0 100644
--- a/src/bcbench/dataset/dataset_entry.py
+++ b/src/bcbench/dataset/dataset_entry.py
@@ -37,12 +37,12 @@ class BaseDatasetEntry(BaseModel):
 
     metadata: EntryMetadata = Field(default_factory=EntryMetadata)
 
-    repo: str = Field(default="microsoftInternal/NAV", pattern=r"^[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+$")
+    repo: str = Field(default="microsoft/BCApps", pattern=r"^[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+$")
     instance_id: str = Field(pattern=_config.file_patterns.instance_pattern)
     base_commit: str = Field(pattern=r"^[a-fA-F0-9]{40}$")
     created_at: Annotated[str, Field(min_length=1)]
     environment_setup_version: str = Field(pattern=r"^[0-9]{2}\.[0-9]{1}$")
-    project_paths: Annotated[list[str], Field(min_length=2)]
+    project_paths: list[str] = []
     patch: Annotated[str, Field(min_length=1)]
 
     @classmethod
@@ -85,7 +85,7 @@ def save_to_file(self, filepath: Path | str) -> None:
             handle.write("\n")
 
     @abstractmethod
-    def get_task(self, transform_image_paths: bool = False) -> str:
+    def get_task(self) -> str:
         pass
 
     @abstractmethod
@@ -116,15 +116,9 @@ class _BugFixTestGenBase(BaseDatasetEntry):
     def problem_statement_dir(self) -> Path:
         return _config.paths.problem_statement_dir / self.instance_id
 
-    def get_task(self, transform_image_paths: bool = False) -> str:
+    def get_task(self) -> str:
         readme_path = self.problem_statement_dir / _config.file_patterns.problem_statement_readme
-        content: str = readme_path.read_text(encoding="utf-8")
-
-        if not transform_image_paths:
-            return content
-
-        dest_dir = _config.file_patterns.problem_statement_dest_dir
-        return re.sub(r"!\[([^\]]*)\]\(\./([^)]+)\)", rf"![\1]({dest_dir}/\2)", content)
+        return readme_path.read_text(encoding="utf-8")
 
     @model_validator(mode="after")
     def validate_baseapp_patches_are_w1_only(self) -> Self:
diff --git a/src/bcbench/evaluate/base.py b/src/bcbench/evaluate/base.py
index 8c2dbb2d6..fd7850354 100644
--- a/src/bcbench/evaluate/base.py
+++ b/src/bcbench/evaluate/base.py
@@ -2,6 +2,7 @@
 
 from abc import ABC, abstractmethod
 from collections.abc import Callable
+from pathlib import Path
 
 from bcbench.config import get_config
 from bcbench.dataset import BaseDatasetEntry
@@ -23,6 +24,14 @@ class EvaluationPipeline[E: BaseDatasetEntry](ABC):
     The execute() method provides a template orchestrating the overall evaluation flow.
     """
 
+    @abstractmethod
+    def setup_workspace(self, entry: E, repo_path: Path) -> None:
+        """Prepare the workspace for agent execution (no build).
+
+        Used by the `run` command to set up the repo without building.
+        """
+        raise NotImplementedError()
+
     @abstractmethod
     def setup(self, context: EvaluationContext[E]) -> None:
         """Setup environment: e.g. clean repo, checkout base commit, initial build.
diff --git a/src/bcbench/evaluate/bugfix.py b/src/bcbench/evaluate/bugfix.py
index 9df7eee67..b575407fe 100644
--- a/src/bcbench/evaluate/bugfix.py
+++ b/src/bcbench/evaluate/bugfix.py
@@ -1,4 +1,5 @@
 from collections.abc import Callable
+from pathlib import Path
 
 from bcbench.dataset import BugFixEntry
 from bcbench.evaluate.base import EvaluationPipeline
@@ -9,8 +10,8 @@
     build_and_publish_projects,
     categorize_projects,
     clean_project_paths,
+    copy_problem_statement_folder,
     run_tests,
-    setup_repo_postbuild,
     setup_repo_prebuild,
     stage_and_get_diff,
 )
@@ -25,6 +26,10 @@
 class BugFixPipeline(EvaluationPipeline[BugFixEntry]):
     """Pipeline for bug-fix evaluation category."""
 
+    def setup_workspace(self, entry: BugFixEntry, repo_path: Path) -> None:
+        setup_repo_prebuild(entry, repo_path)
+        copy_problem_statement_folder(entry, repo_path)
+
     def setup(self, context: EvaluationContext[BugFixEntry]) -> None:
         setup_repo_prebuild(context.entry, context.repo_path)
 
@@ -35,7 +40,7 @@ def setup(self, context: EvaluationContext[BugFixEntry]) -> None:
             context.entry.environment_setup_version,
         )
 
-        setup_repo_postbuild(context.entry, context.repo_path, context.category)
+        copy_problem_statement_folder(context.entry, context.repo_path)
 
     def run_agent(self, context: EvaluationContext[BugFixEntry], agent_runner: Callable) -> None:
         with github_log_group(f"{context.agent_name} -- Entry: {context.entry.instance_id}"):
diff --git a/src/bcbench/evaluate/testgeneration.py b/src/bcbench/evaluate/testgeneration.py
index 11642dee1..f0e3848ee 100644
--- a/src/bcbench/evaluate/testgeneration.py
+++ b/src/bcbench/evaluate/testgeneration.py
@@ -1,6 +1,10 @@
 from collections.abc import Callable
+from pathlib import Path
+
+import yaml
 
 from bcbench.collection.patch_utils import extract_file_paths_from_patch
+from bcbench.config import get_config
 from bcbench.dataset import TestEntry, TestGenEntry
 from bcbench.evaluate.base import EvaluationPipeline
 from bcbench.exceptions import BuildError, NoTestsExtractedError, TestExecutionError
@@ -10,8 +14,8 @@
     build_and_publish_projects,
     categorize_projects,
     clean_project_paths,
+    copy_problem_statement_folder,
     extract_tests_from_patch,
-    setup_repo_postbuild,
     setup_repo_prebuild,
     stage_and_get_diff,
 )
@@ -20,13 +24,44 @@
 from bcbench.types import EvaluationContext
 
 logger = get_logger(__name__)
+_config = get_config()
+
+__all__ = ["TestGenerationPipeline", "_get_test_generation_input_mode"]
+
 
-__all__ = ["TestGenerationPipeline"]
+def _get_test_generation_input_mode() -> str:  # returns the validated "test-generation-input" mode read from the shared agent config.yaml
+    config_file: Path = _config.paths.agent_share_dir / "config.yaml"
+    shared_config = yaml.safe_load(config_file.read_text(encoding="utf-8")) or {}  # an empty YAML document parses to None
+    input_mode: str = (shared_config.get("prompt") or {}).get("test-generation-input", "problem-statement")  # tolerate a missing or null "prompt" section
+
+    valid_modes: set[str] = {"gold-patch", "problem-statement", "both"}
+    if input_mode not in valid_modes:
+        raise ValueError(f"Invalid test-generation-input mode: '{input_mode}'. Must be one of {valid_modes}. Note: Use hyphens, not underscores (e.g., 'gold-patch' not 'gold_patch')")
+
+    return input_mode
 
 
 class TestGenerationPipeline(EvaluationPipeline[TestGenEntry]):
     """Pipeline for test-generation evaluation category."""
 
+    def _apply_input_postbuild(self, entry: TestGenEntry, repo_path: Path) -> None:  # stage the agent's inputs in repo_path according to the configured input mode
+        input_mode = _get_test_generation_input_mode()
+        logger.info(f"Test generation input mode: {input_mode}")
+        match input_mode:
+            case "gold-patch":  # apply the entry's patch only
+                apply_patch(repo_path, entry.patch, f"{entry.instance_id} gold patch")
+            case "both":  # apply the entry's patch and copy the problem statement folder
+                apply_patch(repo_path, entry.patch, f"{entry.instance_id} gold patch")
+                copy_problem_statement_folder(entry, repo_path)
+            case "problem-statement":  # copy the problem statement folder only; no patch is applied
+                copy_problem_statement_folder(entry, repo_path)
+            case _:  # defensive: the mode was already validated by _get_test_generation_input_mode
+                raise ValueError(f"Unhandled test generation input mode: {input_mode}")
+
+    def setup_workspace(self, entry: TestGenEntry, repo_path: Path) -> None:
+        setup_repo_prebuild(entry, repo_path)
+        self._apply_input_postbuild(entry, repo_path)
+
     def setup(self, context: EvaluationContext[TestGenEntry]) -> None:
         setup_repo_prebuild(context.entry, context.repo_path)
 
@@ -37,7 +72,7 @@ def setup(self, context: EvaluationContext[TestGenEntry]) -> None:
             context.entry.environment_setup_version,
         )
 
-        setup_repo_postbuild(context.entry, context.repo_path, context.category)
+        self._apply_input_postbuild(context.entry, context.repo_path)
 
     def run_agent(self, context: EvaluationContext[TestGenEntry], agent_runner: Callable) -> None:
         with github_log_group(f"{context.agent_name} -- Entry: {context.entry.instance_id}"):
diff --git a/src/bcbench/operations/__init__.py b/src/bcbench/operations/__init__.py
index 05fd52171..45d7dcf5e 100644
--- a/src/bcbench/operations/__init__.py
+++ b/src/bcbench/operations/__init__.py
@@ -17,7 +17,7 @@
 )
 from bcbench.operations.instruction_operations import copy_problem_statement_folder, setup_custom_agent, setup_instructions_from_config
 from bcbench.operations.project_operations import categorize_projects
-from bcbench.operations.setup_operations import setup_repo_postbuild, setup_repo_prebuild
+from bcbench.operations.setup_operations import setup_repo_prebuild
 from bcbench.operations.skills_operations import setup_agent_skills
 from bcbench.operations.test_operations import extract_tests_from_patch
 
@@ -38,7 +38,6 @@
     "setup_agent_skills",
     "setup_custom_agent",
     "setup_instructions_from_config",
-    "setup_repo_postbuild",
     "setup_repo_prebuild",
     "stage_and_get_diff",
 ]
diff --git a/src/bcbench/operations/setup_operations.py b/src/bcbench/operations/setup_operations.py
index bc502ff9f..4682ed721 100644
--- a/src/bcbench/operations/setup_operations.py
+++ b/src/bcbench/operations/setup_operations.py
@@ -2,39 +2,15 @@
 
 from pathlib import Path
 
-import yaml
-
 from bcbench.config import get_config
-from bcbench.dataset.dataset_entry import BaseDatasetEntry, _BugFixTestGenBase
+from bcbench.dataset.dataset_entry import BaseDatasetEntry
 from bcbench.logger import get_logger
-from bcbench.operations.git_operations import apply_patch, checkout_commit, clean_repo
-from bcbench.operations.instruction_operations import copy_problem_statement_folder
-from bcbench.types import EvaluationCategory
+from bcbench.operations.git_operations import checkout_commit, clean_repo
 
 logger = get_logger(__name__)
 _config = get_config()
 
-__all__ = ["_get_test_generation_input_mode", "setup_repo_postbuild", "setup_repo_prebuild"]
-
-
-def _get_test_generation_input_mode() -> str:
-    """Read test-generation input mode from shared agent config.
-
-    Returns:
-        str: The validated input mode: "gold-patch", "problem-statement", or "both"
-
-    Raises:
-        ValueError: If the input mode is not one of the valid values
-    """
-    config_file: Path = _config.paths.agent_share_dir / "config.yaml"
-    shared_config = yaml.safe_load(config_file.read_text())
-    input_mode: str = shared_config.get("prompt", {}).get("test-generation-input", "problem-statement")
-
-    valid_modes: set[str] = {"gold-patch", "problem-statement", "both"}
-    if input_mode not in valid_modes:
-        raise ValueError(f"Invalid test-generation-input mode: '{input_mode}'. Must be one of {valid_modes}. Note: Use hyphens, not underscores (e.g., 'gold-patch' not 'gold_patch')")
-
-    return input_mode
+__all__ = ["setup_repo_prebuild"]
 
 
 def setup_repo_prebuild(entry: BaseDatasetEntry, repo_path: Path) -> None:
@@ -42,36 +18,15 @@ def setup_repo_prebuild(entry: BaseDatasetEntry, repo_path: Path) -> None:
 
     This is the first phase of repo setup that should be called BEFORE build_and_publish_projects.
     It prepares a clean slate at the base commit without any patches or problem statements.
+    Does nothing for entries without a base_commit (e.g. categories that start from a blank project).
 
     Args:
         entry: Dataset entry with instance metadata
         repo_path: Path to the repository
     """
+    if not entry.base_commit:
+        logger.info(f"Skipping prebuild setup for {entry.instance_id} (no base_commit)")
+        return
+
     clean_repo(repo_path)
     checkout_commit(repo_path, entry.base_commit)
-
-
-def setup_repo_postbuild(entry: _BugFixTestGenBase, repo_path: Path, category: EvaluationCategory) -> None:
-    """Setup repository after building for bug-fix and test-generation categories.
-
-    This is the second phase of repo setup that should be called AFTER build_and_publish_projects.
-    For test-generation, this ensures the gold patch is applied only after the base code is built,
-    so the agent sees the fixed code but tests run against the unfixed published app.
-
-    Note: Other categories should implement their own postbuild setup.
-    """
-    if category == EvaluationCategory.TEST_GENERATION:
-        input_mode: str = _get_test_generation_input_mode()
-        logger.info(f"Test generation input mode: {input_mode}")
-        match input_mode:
-            case "gold-patch":
-                apply_patch(repo_path, entry.patch, f"{entry.instance_id} gold patch")
-            case "both":
-                apply_patch(repo_path, entry.patch, f"{entry.instance_id} gold patch")
-                copy_problem_statement_folder(entry, repo_path)
-            case "problem-statement":
-                copy_problem_statement_folder(entry, repo_path)
-            case _:
-                raise ValueError(f"Unhandled test generation input mode: {input_mode}")
-    else:
-        copy_problem_statement_folder(entry, repo_path)
diff --git a/src/bcbench/results/__init__.py b/src/bcbench/results/__init__.py
index f769c58af..162f2f678 100644
--- a/src/bcbench/results/__init__.py
+++ b/src/bcbench/results/__init__.py
@@ -1,23 +1,25 @@
-from bcbench.results.base import create_result_from_json
+from bcbench.results.base import ExecutionBasedEvaluationResult
 from bcbench.results.bceval_export import write_bceval_results
 from bcbench.results.display import create_console_summary, create_github_job_summary
-from bcbench.results.evaluation_result import (
+from bcbench.results.metrics import bootstrap_ci, pass_at_k, pass_hat_k
+from bcbench.results.summary import (
     BaseEvaluationResult,
     EvaluationResultSummary,
+    ExecutionBasedEvaluationResultSummary,
     Leaderboard,
     LeaderboardAggregate,
 )
-from bcbench.results.metrics import bootstrap_ci, pass_at_k, pass_hat_k
 
 __all__ = [
     "BaseEvaluationResult",
     "EvaluationResultSummary",
+    "ExecutionBasedEvaluationResult",
+    "ExecutionBasedEvaluationResultSummary",
     "Leaderboard",
     "LeaderboardAggregate",
     "bootstrap_ci",
     "create_console_summary",
     "create_github_job_summary",
-    "create_result_from_json",
     "pass_at_k",
     "pass_hat_k",
     "write_bceval_results",
diff --git a/src/bcbench/results/base.py b/src/bcbench/results/base.py
index fffe9beba..495eaaf27 100644
--- a/src/bcbench/results/base.py
+++ b/src/bcbench/results/base.py
@@ -16,16 +16,14 @@ class BaseEvaluationResult(BaseModel):
     """Base class for all evaluation results with shared metrics across categories."""
 
     instance_id: str
-    project: str  # TODO: move to category-specific subclasses?
+    project: str
     model: str
     agent_name: str
     category: EvaluationCategory
 
-    resolved: bool
-    build: bool
     timeout: bool = False
 
-    generated_patch: str = ""
+    output: str = ""
     error_message: str | None = None
 
     metrics: AgentMetrics | None = None
@@ -35,26 +33,10 @@ class BaseEvaluationResult(BaseModel):
     def _create_from_context(
         cls,
         context: "EvaluationContext",
-        resolved: bool,
-        build: bool,
         error_message: str | None = None,
-        generated_patch: str = "",
+        output: str = "",
         **kwargs: Any,
     ) -> Self:
-        """Create result from EvaluationContext with validation and metric extraction.
-
-        Args:
-            context: Evaluation context with configuration
-            resolved: Whether the evaluation was successful
-            build: Whether the build succeeded
-            error_message: Optional error message if evaluation failed
-            generated_patch: The generated patch content
-            **kwargs: Additional category-specific fields
-
-        Returns:
-            Result instance (base or category-specific subclass)
-        """
-        # Warn about missing metrics if they are not present
         if not context.metrics:
             logger.warning(f"Creating result for {context.entry.instance_id} with no agent metrics - performance data will be unavailable")
         elif missing_metrics := [name for name in AgentMetrics.model_fields if getattr(context.metrics, name) is None]:
@@ -64,64 +46,86 @@ def _create_from_context(
         return cls(
             instance_id=context.entry.instance_id,
             project=project,
-            resolved=resolved,
-            build=build,
             model=context.model.replace(".", "-"),
             category=context.category,
             agent_name=context.agent_name,
-            generated_patch=generated_patch,
+            output=output,
             error_message=error_message,
             metrics=context.metrics,
             experiment=context.experiment,
             **kwargs,
         )
 
-    @classmethod
-    def create_success(cls, context: "EvaluationContext", generated_patch: str, **kwargs: Any) -> Self:
-        return cls._create_from_context(context, resolved=True, build=True, generated_patch=generated_patch, **kwargs)
-
-    @classmethod
-    def create_build_failure(cls, context: "EvaluationContext", generated_patch: str, error_msg: str, **kwargs: Any) -> Self:
-        return cls._create_from_context(context, resolved=False, build=False, error_message=error_msg, generated_patch=generated_patch, **kwargs)
-
-    @classmethod
-    def create_test_failure(cls, context: "EvaluationContext", generated_patch: str, error_msg: str = "Tests failed", **kwargs: Any) -> Self:
-        return cls._create_from_context(context, resolved=False, build=True, error_message=error_msg, generated_patch=generated_patch, **kwargs)
-
     @classmethod
     def create_agent_timeout_failure(cls, context: "EvaluationContext", **kwargs: Any) -> Self:
-        return cls._create_from_context(context, resolved=False, build=False, timeout=True, error_message="Agent timed out", **kwargs)
+        return cls._create_from_context(context, timeout=True, error_message="Agent timed out", **kwargs)
 
     def save(self, output_dir: Path, result_file: str) -> None:
         output_file = output_dir / result_file
         with open(output_file, "a", encoding="utf-8") as f:
             result_dict = self.model_dump(mode="json")
-            result_dict["category"] = self.category.value
             # Per-instance JSONL result files are uploaded as workflow artifacts and are the only inputs required by the summarize-results workflow.
             f.write(json.dumps(result_dict) + "\n")
 
         logger.info(f"Saved evaluation result for {self.instance_id} to {output_file}")
 
+    @property
+    def status_label(self) -> str:
+        """Short human-readable label for the result status shown in tables (e.g. 'Completed', 'Timeout')."""
+        if self.timeout:
+            return "Timeout"
+        if self.error_message:
+            return "Error"
+        return "Completed"
+
+    @property
+    def category_metrics(self) -> dict[str, int | float | bool]:
+        """Category-specific metrics included in bceval export metadata.
+
+        Keys become metadata fields; values must be JSON-serializable scalars.
+        Subclasses override to add metrics like 'resolved', 'build', etc.
+        """
+        return {}
 
-def create_result_from_json(payload: dict[str, Any]) -> BaseEvaluationResult:
-    """Create appropriate result instance from JSON payload based on category.
+    @property
+    def display_row(self) -> dict[str, str]:
+        """Extra columns for per-instance detail tables.
 
-    Args:
-        payload: Dictionary containing result data
+        Keys are column headers; values are the cell text for this result.
+        Subclasses override to surface category-specific per-instance info.
+        """
+        return {}
 
-    Returns:
-        BugFixResult or TestGenerationResult instance based on category
-    """
-    # Import here to avoid circular dependencies
-    from bcbench.results.bugfix import BugFixResult
-    from bcbench.results.testgeneration import TestGenerationResult
+    @classmethod
+    def from_json(cls, payload: dict[str, Any]) -> "BaseEvaluationResult":
+        category = EvaluationCategory(payload["category"])
+        return category.result_class.model_validate(payload)
+
+
+class ExecutionBasedEvaluationResult(BaseEvaluationResult):
+    """Result for categories that involve building/compiling AL code and have binary pass/fail outcomes."""
+
+    resolved: bool = False
+    build: bool = False
 
-    category = EvaluationCategory(payload["category"])
+    @classmethod
+    def create_success(cls, context: "EvaluationContext", output: str, **kwargs: Any) -> Self:
+        return cls._create_from_context(context, output=output, resolved=True, build=True, **kwargs)
+
+    @classmethod
+    def create_build_failure(cls, context: "EvaluationContext", output: str, error_msg: str, **kwargs: Any) -> Self:
+        return cls._create_from_context(context, output=output, error_message=error_msg, resolved=False, build=False, **kwargs)
 
-    match category:
-        case EvaluationCategory.BUG_FIX:
-            return BugFixResult.model_validate(payload)
-        case EvaluationCategory.TEST_GENERATION:
-            return TestGenerationResult.model_validate(payload)
-        case _:
-            raise ValueError(f"Unknown evaluation category: {category}")
+    @classmethod
+    def create_test_failure(cls, context: "EvaluationContext", output: str, error_msg: str = "Tests failed", **kwargs: Any) -> Self:
+        return cls._create_from_context(context, output=output, error_message=error_msg, resolved=False, build=True, **kwargs)
+
+    @property
+    def status_label(self) -> str:
+        if self.timeout:
+            return "Timeout"
+        return "Success" if self.resolved else "Failed"
+
+    @property
+    def category_metrics(self) -> dict[str, int | float | bool]:
+        return {"resolved": self.resolved, "build": self.build}
diff --git a/src/bcbench/results/bceval_export.py b/src/bcbench/results/bceval_export.py
index eadb79fb0..ad28f4727 100644
--- a/src/bcbench/results/bceval_export.py
+++ b/src/bcbench/results/bceval_export.py
@@ -9,7 +9,6 @@
 from bcbench.dataset import BaseDatasetEntry
 from bcbench.logger import get_logger
 from bcbench.results.base import BaseEvaluationResult
-from bcbench.results.testgeneration import TestGenerationResult
 from bcbench.types import EvaluationCategory
 
 logger = get_logger(__name__)
@@ -39,23 +38,18 @@ def write_bceval_results(results: list[BaseEvaluationResult], out_dir: Path, run
                 "llm_duration": (result.metrics.llm_duration if result.metrics else None) or 0,
                 "latency": (result.metrics.execution_time if result.metrics else None) or 0,
                 "turn_count": (result.metrics.turn_count if result.metrics else None) or 0,
-                "resolved": result.resolved,
-                "build": result.build,
+                **result.category_metrics,
                 "run_id": run_id,
                 "project": result.project,
                 "error_message": result.error_message,
                 "tool_usage": (result.metrics.tool_usage if result.metrics and result.metrics.tool_usage else None) or 0,
             }
 
-            if isinstance(result, TestGenerationResult):
-                metadata["pre_patch_failed"] = result.pre_patch_failed
-                metadata["post_patch_passed"] = result.post_patch_passed
-
             bceval_result = {
                 "id": result.instance_id,
                 "input": input,
                 "expected": expected,
-                "output": result.generated_patch,
+                "output": result.output,
                 "context": "",
                 "metadata": metadata,
                 "tags": [],
diff --git a/src/bcbench/results/bugfix.py b/src/bcbench/results/bugfix.py
index 505de5236..bc55dbe82 100644
--- a/src/bcbench/results/bugfix.py
+++ b/src/bcbench/results/bugfix.py
@@ -1,9 +1,5 @@
-from bcbench.results.base import BaseEvaluationResult
+from bcbench.results.base import ExecutionBasedEvaluationResult
 
 
-class BugFixResult(BaseEvaluationResult):
-    """Result class for bug-fix evaluation category.
-
-    Inherits all shared metrics from BaseEvaluationResult.
-    Category-specific fields can be added here as needed.
-    """
+class BugFixResult(ExecutionBasedEvaluationResult):
+    """Result class for bug-fix evaluation category."""
diff --git a/src/bcbench/results/display.py b/src/bcbench/results/display.py
index bec719bc7..f0354f3a2 100644
--- a/src/bcbench/results/display.py
+++ b/src/bcbench/results/display.py
@@ -1,30 +1,31 @@
+from collections.abc import Sequence
+
 from rich.console import Console
 from rich.table import Table
 
 from bcbench.config import get_config
 from bcbench.logger import get_logger
 from bcbench.results.base import BaseEvaluationResult
-from bcbench.results.evaluation_result import _calculate_average_tool_usage
+from bcbench.results.summary import EvaluationResultSummary, calculate_average_tool_usage
 
 logger = get_logger(__name__)
 console = Console()
 
 
-def create_console_summary(results: list[BaseEvaluationResult]) -> None:
+def create_console_summary(results: Sequence[BaseEvaluationResult], summary: EvaluationResultSummary) -> None:
     total = len(results)
-    resolved = sum(r.resolved for r in results)
-    failed = total - resolved
+    display_metrics: dict[str, int | float | bool] = summary.display_summary()
 
     console.print("\n[bold cyan]Evaluation Results Summary[/bold cyan]")
     console.print(f"Total Processed: [bold]{total}[/bold], using [bold]{results[0].agent_name}({results[0].model})[/bold]")
     console.print(f"Category: [bold]{results[0].category.value}[/bold]")
-    console.print(f"Resolved: [bold green]{resolved}[/bold green]")
-    console.print(f"Failed: [bold red]{failed}[/bold red]")
+    for key, value in display_metrics.items():
+        console.print(f"{key.replace('_', ' ').title()}: [bold]{value}[/bold]")
 
     # Display average tool usage if available
     tool_usages = [r.metrics.tool_usage for r in results if r.metrics and r.metrics.tool_usage is not None]
     if tool_usages:
-        avg_usage = _calculate_average_tool_usage(tool_usages)
+        avg_usage = calculate_average_tool_usage(tool_usages)
         if avg_usage:
             console.print("\n[bold cyan]Average Tool Usage[/bold cyan]")
             sorted_tools = sorted(avg_usage.items(), key=lambda x: x[1], reverse=True)
@@ -35,6 +36,12 @@ def create_console_summary(results: list[BaseEvaluationResult]) -> None:
     table.add_column("Instance ID", style="cyan", no_wrap=True)
     table.add_column("Project", style="magenta", no_wrap=True)
     table.add_column("Status", justify="center")
+
+    # Dynamic columns: headers are the keys of the first result's display_row property
+    extra_columns = list(results[0].display_row.keys()) if results else []
+    for col_name in extra_columns:
+        table.add_column(col_name, style="yellow")
+
     table.add_column("MCP Servers", style="yellow")
     table.add_column("Custom Instructions", style="yellow")
     table.add_column("Skills", style="yellow")
@@ -42,12 +49,14 @@ def create_console_summary(results: list[BaseEvaluationResult]) -> None:
     table.add_column("Error Message", style="dim")
 
     for result in results:
-        status = "[green]Success[/green]" if result.resolved else "[red]Failed[/red]"
+        has_error = result.error_message is not None or result.timeout
+        status = f"[red]{result.status_label}[/red]" if has_error else f"[green]{result.status_label}[/green]"
         mcp_servers = ", ".join(result.experiment.mcp_servers) if result.experiment and result.experiment.mcp_servers else "N/A"
         custom_instructions = "Yes" if result.experiment and result.experiment.custom_instructions else "No"
         skills = "Yes" if result.experiment and result.experiment.skills_enabled else "No"
         custom_agent = result.experiment.custom_agent if result.experiment and result.experiment.custom_agent else "N/A"
-        table.add_row(result.instance_id, result.project, status, mcp_servers, custom_instructions, skills, custom_agent, result.error_message or "")
+        extra_values = list(result.display_row.values())
+        table.add_row(result.instance_id, result.project, status, *extra_values, mcp_servers, custom_instructions, skills, custom_agent, result.error_message or "")
 
     console.print(table)
     console.print()
@@ -61,12 +70,12 @@ def _get_short_error_message(error_message: str | None) -> str:
     return first_line.replace("|", "\\|")
 
 
-def create_github_job_summary(results: list[BaseEvaluationResult]) -> None:
+def create_github_job_summary(results: Sequence[BaseEvaluationResult], summary: EvaluationResultSummary) -> None:
     total = len(results)
-    resolved = sum(r.resolved for r in results)
-    failed = total - resolved
+    display_metrics: dict[str, int | float | bool] = summary.display_summary()
+    errors = sum(1 for r in results if r.error_message or r.timeout)
 
-    success_icon = ":white_check_mark:" if failed == 0 else ":x:"
+    success_icon = ":white_check_mark:" if errors == 0 else ":x:"
 
     mcp_servers = ", ".join(results[0].experiment.mcp_servers) if results[0].experiment and results[0].experiment.mcp_servers else "None"
     custom_instructions = "Yes" if results[0].experiment and results[0].experiment.custom_instructions else "No"
@@ -77,31 +86,50 @@ def create_github_job_summary(results: list[BaseEvaluationResult]) -> None:
     tool_usage_section = ""
     tool_usages = [r.metrics.tool_usage for r in results if r.metrics and r.metrics.tool_usage is not None]
     if tool_usages:
-        avg_usage = _calculate_average_tool_usage(tool_usages)
+        avg_usage = calculate_average_tool_usage(tool_usages)
         if avg_usage:
             sorted_tools = sorted(avg_usage.items(), key=lambda x: x[1], reverse=True)
             tool_lines = [f"  - `{tool}`: {count}" for tool, count in sorted_tools]
             tool_usage_section = "\n\n## Average Tool Usage\n" + "\n".join(tool_lines)
 
+    # Build category-specific summary lines
+    display_lines = "\n".join(f"- {key.replace('_', ' ').title()}: {value}" for key, value in display_metrics.items())
+
     markdown_summary = f"""Total entries processed: {total}, using **{results[0].agent_name} ({results[0].model})**
 - Category: `{results[0].category.value}`
 - MCP Servers used: {mcp_servers}
 - Custom Instructions: {custom_instructions}
 - Skills: {skills}
 - Custom Agent: {custom_agent}
-- Successful evaluations: {resolved} :white_check_mark:
-- Failed evaluations: {failed} {success_icon}{tool_usage_section}
+{display_lines}
+- Errors: {errors} {success_icon}{tool_usage_section}
 
 ## Detailed Results
 
-| Instance ID | Project | Status | Error Message |
-|-------------|---------|--------|---------------|
 """
+
+    # Dynamic columns: headers are the keys of the first result's display_row property
+    extra_columns = list(results[0].display_row.keys()) if results else []
+    extra_headers = " | ".join(extra_columns)
+    extra_separator = " | ".join("------" for _ in extra_columns)
+
+    if extra_columns:
+        markdown_summary += f"| Instance ID | Project | Status | {extra_headers} | Error Message |\n"
+        markdown_summary += f"|-------------|---------|--------|{extra_separator}|---------------|\n"
+    else:
+        markdown_summary += "| Instance ID | Project | Status | Error Message |\n"
+        markdown_summary += "|-------------|---------|--------|---------------|\n"
+
     for result in results:
-        status_icon = ":white_check_mark:" if result.resolved else ":x:"
-        status_text = f"{status_icon} {'Success' if result.resolved else 'Failed'}"
+        has_error = result.error_message is not None or result.timeout
+        status_icon = ":x:" if has_error else ":white_check_mark:"
+        status_text = f"{status_icon} {result.status_label}"
         error_msg = _get_short_error_message(result.error_message)
-        markdown_summary += f"| `{result.instance_id}` | `{result.project}` | {status_text} | {error_msg} |\n"
+        extra_values = " | ".join(result.display_row.values())
+        if extra_columns:
+            markdown_summary += f"| `{result.instance_id}` | `{result.project}` | {status_text} | {extra_values} | {error_msg} |\n"
+        else:
+            markdown_summary += f"| `{result.instance_id}` | `{result.project}` | {status_text} | {error_msg} |\n"
 
     _write_github_step_summary(markdown_summary)
 
diff --git a/src/bcbench/results/evaluation_result.py b/src/bcbench/results/summary.py
similarity index 59%
rename from src/bcbench/results/evaluation_result.py
rename to src/bcbench/results/summary.py
index 79aae3e39..858b19acd 100644
--- a/src/bcbench/results/evaluation_result.py
+++ b/src/bcbench/results/summary.py
@@ -1,11 +1,13 @@
 import json
 import tomllib
+from abc import ABC, abstractmethod
 from collections import Counter
+from collections.abc import Sequence
 from datetime import date
 from pathlib import Path
-from typing import Any, Sequence
+from typing import Any
 
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 
 from bcbench.logger import get_logger
 from bcbench.results.base import BaseEvaluationResult
@@ -28,12 +30,14 @@ def _get_benchmark_version() -> str:
         return tomllib.load(f).get("project", {}).get("version", "unknown")
 
 
-class EvaluationResultSummary(BaseModel):
+class EvaluationResultSummary(BaseModel, ABC):
+    """Base summary for a single evaluation run across all instances.
+
+    Contains agent metrics common to every category (tokens, duration, tool usage).
+    Category-specific metrics (resolved, build, etc.) live on subclasses.
+    """
+
     total: int
-    resolved: int
-    failed: int
-    build: int
-    percentage: float
 
     date: date
 
@@ -50,39 +54,38 @@ class EvaluationResultSummary(BaseModel):
     github_run_id: str | None = None
     experiment: ExperimentConfiguration | None = None
 
-    # Per-instance results for aggregate metrics calculation: instance_id -> resolved
-    instance_results: dict[str, bool] | None = None
-
-    # Benchmark version from pyproject.toml at evaluation time
     benchmark_version: str
 
+    @abstractmethod
+    def display_summary(self) -> dict[str, int | float]:
+        """Return category-specific metrics for console/GitHub summary display.
+
+        Subclasses must override. Keys become display labels (underscores replaced
+        with spaces and title-cased). Values are shown as-is.
+        """
+
     @classmethod
     def from_results(cls, results: Sequence[BaseEvaluationResult], run_id: str) -> "EvaluationResultSummary":
-        total = len(results)
-        resolved = sum(r.resolved for r in results)
+        """Create a summary from a list of per-instance results.
+
+        When called on the base class, dispatches to the correct subclass.
+        Subclasses override, call super().from_results(), and extend via model_copy().
+        """
+        if cls is EvaluationResultSummary:
+            summary_cls = results[0].category.summary_class
+            return summary_cls.from_results(results, run_id)
 
         durations = [r.metrics.execution_time for r in results if r.metrics and r.metrics.execution_time is not None]
         prompt_tokens = [r.metrics.prompt_tokens for r in results if r.metrics and r.metrics.prompt_tokens is not None]
         completion_tokens = [r.metrics.completion_tokens for r in results if r.metrics and r.metrics.completion_tokens is not None]
         llm_durations = [r.metrics.llm_duration for r in results if r.metrics and r.metrics.llm_duration is not None]
-
-        # Calculate average tool usage across all results
         tool_usages = [r.metrics.tool_usage for r in results if r.metrics and r.metrics.tool_usage is not None]
-        average_tool_usage = _calculate_average_tool_usage(tool_usages) if tool_usages else None
 
-        # Extract experiment configuration from first result (all should be same in a run)
         first_result = results[0]
         experiment = first_result.experiment if first_result.experiment and not first_result.experiment.is_empty() else None
 
-        # Create per-instance results for aggregate metrics calculation
-        instance_results = {r.instance_id: r.resolved for r in results}
-
         return cls(
-            total=total,
-            resolved=resolved,
-            percentage=round(resolved / total * 100, 1),
-            failed=total - resolved,
-            build=sum(r.build for r in results),
+            total=len(results),
             date=date.today(),
             category=first_result.category,
             model=first_result.model,
@@ -91,16 +94,19 @@ def from_results(cls, results: Sequence[BaseEvaluationResult], run_id: str) -> "
             average_prompt_tokens=sum(prompt_tokens) / len(prompt_tokens) if prompt_tokens else 0.0,
             average_completion_tokens=sum(completion_tokens) / len(completion_tokens) if completion_tokens else 0.0,
             average_llm_duration=sum(llm_durations) / len(llm_durations) if llm_durations else 0.0,
-            average_tool_usage=average_tool_usage,
+            average_tool_usage=calculate_average_tool_usage(tool_usages) if tool_usages else None,
             github_run_id=run_id,
             experiment=experiment,
-            instance_results=instance_results,
             benchmark_version=_get_benchmark_version(),
         )
 
+    @classmethod
+    def from_json(cls, payload: dict[str, Any]) -> "EvaluationResultSummary":
+        category = EvaluationCategory(payload["category"])
+        return category.summary_class.model_validate(payload)
+
     def to_dict(self) -> dict[str, Any]:
         data = self.model_dump(mode="json")
-        # Round numeric values for readability
         data["average_duration"] = round(data["average_duration"], 1)
         data["average_prompt_tokens"] = round(data["average_prompt_tokens"], 1)
         data["average_completion_tokens"] = round(data["average_completion_tokens"], 1)
@@ -115,15 +121,65 @@ def save(self, output_dir: Path, summary_file: str) -> None:
         logger.info(f"Saved evaluation summary to {output_file}")
 
 
+class ExecutionBasedEvaluationResultSummary(EvaluationResultSummary):
+    """Summary for categories with binary pass/fail outcomes (bug-fix, test-generation).
+
+    Fields match the original flat layout in the leaderboard JSON files.
+    """
+
+    resolved: int = 0
+    failed: int = 0
+    build: int = 0
+    percentage: float = 0.0
+
+    # Per-instance pass/fail for aggregate metrics (pass^k, CI)
+    instance_results: dict[str, bool] = Field(default_factory=dict)
+
+    def display_summary(self) -> dict[str, int | float]:
+        return {
+            "resolved": self.resolved,
+            "failed": self.failed,
+            "build": self.build,
+            "percentage": self.percentage,
+        }
+
+    @classmethod
+    def from_results(cls, results: Sequence[BaseEvaluationResult], run_id: str) -> "ExecutionBasedEvaluationResultSummary":
+        from bcbench.results.base import ExecutionBasedEvaluationResult
+
+        summary = super().from_results(results, run_id)
+        assert isinstance(summary, ExecutionBasedEvaluationResultSummary)
+        total = summary.total
+
+        resolved = sum(1 for r in results if isinstance(r, ExecutionBasedEvaluationResult) and r.resolved)
+        build = sum(1 for r in results if isinstance(r, ExecutionBasedEvaluationResult) and r.build)
+        instance_results = {r.instance_id: (isinstance(r, ExecutionBasedEvaluationResult) and r.resolved) for r in results}
+
+        return summary.model_copy(
+            update={
+                "resolved": resolved,
+                "failed": total - resolved,
+                "build": build,
+                "percentage": round(resolved / total * 100, 1) if total else 0.0,
+                "instance_results": instance_results,
+            }
+        )
+
+
+# ---------------------------------------------------------------------------
+# Leaderboard aggregation (execution-based categories only)
+# ---------------------------------------------------------------------------
+
+
 class LeaderboardAggregate(BaseModel):
+    """Aggregate metrics across multiple runs. Execution-based categories only for now."""
+
     model: str
     agent_name: str
     category: EvaluationCategory
     experiment: ExperimentConfiguration | None = None
 
-    # Total instances in benchmark
     total: int
-    # Number of runs aggregated
     num_runs: int
 
     average: float | None = None
@@ -131,68 +187,43 @@ class LeaderboardAggregate(BaseModel):
     ci_high: float | None = None
     pass_hat_5: float | None = None
 
-    # Averaged metrics across runs
     average_duration: float | None = None
 
-    # Benchmark version(s) from aggregated runs
     benchmark_version: str
 
     @classmethod
-    def from_runs(cls, runs: Sequence[EvaluationResultSummary]) -> "LeaderboardAggregate":
+    def from_runs(cls, runs: Sequence[ExecutionBasedEvaluationResultSummary]) -> "LeaderboardAggregate":
         if not runs:
             raise ValueError("Cannot create aggregate from empty runs list")
 
-        first_run: EvaluationResultSummary = runs[0]
-        total: int = first_run.total
-        num_runs: int = len(runs)
-
-        # All runs should have the same benchmark_version (enforced by _get_combination_key grouping)
-        benchmark_version: str = first_run.benchmark_version
+        first_run = runs[0]
+        total = first_run.total
+        num_runs = len(runs)
+        benchmark_version = first_run.benchmark_version
 
-        # Warn if runs have different instance counts
         unique_totals = {r.total for r in runs}
         if len(unique_totals) > 1:
             logger.warning(f"Aggregating runs with different instance counts for '{first_run.agent_name}' + '{first_run.model}': {sorted(unique_totals)}. pass^k metrics may be misleading.")
 
         # Average duration across runs
-        durations: list[float] = [r.average_duration for r in runs if r.average_duration]
-        average_duration: float | None = sum(durations) / len(durations) if durations else None
-
-        # Legacy single run without instance_results: use simple pass rate
-        if num_runs == 1 and not first_run.instance_results:
-            pass_rate = first_run.resolved / first_run.total if first_run.total > 0 else 0.0
-            return cls(
-                model=first_run.model,
-                agent_name=first_run.agent_name,
-                category=first_run.category,
-                experiment=first_run.experiment,
-                total=total,
-                num_runs=num_runs,
-                average=round(pass_rate, 3),
-                ci_low=None,
-                ci_high=None,
-                pass_hat_5=None,
-                average_duration=round(average_duration, 1) if average_duration else None,
-                benchmark_version=benchmark_version,
-            )
+        durations = [r.average_duration for r in runs if r.average_duration]
+        average_duration = sum(durations) / len(durations) if durations else None
 
         # Collect per-instance results across runs for pass^5
         instance_resolved: dict[str, list[bool]] = {}
         for run in runs:
-            if run.instance_results:
-                for instance_id, resolved in run.instance_results.items():
-                    if instance_id not in instance_resolved:
-                        instance_resolved[instance_id] = []
-                    instance_resolved[instance_id].append(resolved)
-
-        # Calculate per-run pass rates for average and CI
-        per_run_rates = [run.resolved / run.total for run in runs if run.total > 0]
+            for instance_id, outcome in run.instance_results.items():
+                if instance_id not in instance_resolved:
+                    instance_resolved[instance_id] = []
+                instance_resolved[instance_id].append(bool(outcome))
+
+        # Per-run scores for average and CI
+        per_run_rates = [run.percentage / 100.0 for run in runs]
         avg = round(sum(per_run_rates) / len(per_run_rates), 3) if per_run_rates else None
         ci_result = bootstrap_ci(per_run_rates)
         ci_low = round(ci_result["ci_low"], 3) if ci_result["ci_low"] is not None else None
         ci_high = round(ci_result["ci_high"], 3) if ci_result["ci_high"] is not None else None
 
-        # Calculate pass^5
         pass_hat_5_val = _calculate_pass_hat_k(instance_resolved, 5, num_runs) if num_runs >= 5 else None
 
         return cls(
@@ -212,7 +243,13 @@ def from_runs(cls, runs: Sequence[EvaluationResultSummary]) -> "LeaderboardAggre
 
 
 class Leaderboard(BaseModel):
-    runs: list[EvaluationResultSummary]
+    """Leaderboard for execution-based categories only.
+
+    Non-execution-based categories (e.g. code-review) will need a different
+    leaderboard model once they are introduced.
+    """
+
+    runs: list[ExecutionBasedEvaluationResultSummary]
     aggregate: list[LeaderboardAggregate]
 
     @classmethod
@@ -221,7 +258,6 @@ def load(cls, path: Path) -> "Leaderboard":
             return cls(runs=[], aggregate=[])
         with open(path, encoding="utf-8") as f:
             data = json.load(f)
-            # Handle empty arrays or invalid structures
             if not data or not isinstance(data, dict):
                 return cls(runs=[], aggregate=[])
             return cls.model_validate(data)
@@ -233,6 +269,11 @@ def to_dict(self) -> dict[str, Any]:
         }
 
 
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
 def _calculate_pass_hat_k(instance_resolved: dict[str, list[bool]], k: int, num_trials: int) -> float:
     if num_trials < k:
         return 0.0
@@ -245,16 +286,10 @@ def _calculate_pass_hat_k(instance_resolved: dict[str, list[bool]], k: int, num_
     return round(total_pass_hat_k / len(instance_resolved), 3)
 
 
-def _calculate_average_tool_usage(tool_usages: list[dict[str, int]]) -> dict[str, float]:
-    """Calculate average tool usage across multiple results.
-
-    Sums up all tool counts and divides by the number of results to get average.
-    """
+def calculate_average_tool_usage(tool_usages: list[dict[str, int]]) -> dict[str, float]:
+    """Return each tool's summed count divided by the number of results (2 decimals)."""
     if not tool_usages:
         return {}
 
-    aggregated = sum((Counter(usage) for usage in tool_usages), Counter())
-
-    # Calculate average (rounded to 2 decimal places)
-    num_results = len(tool_usages)
-    return {tool: round(count / num_results, 2) for tool, count in aggregated.items()}
+    totals = sum(map(Counter, tool_usages), Counter())
+    return {name: round(total / len(tool_usages), 2) for name, total in totals.items()}
diff --git a/src/bcbench/results/testgeneration.py b/src/bcbench/results/testgeneration.py
index fb80d0e05..0393f84ae 100644
--- a/src/bcbench/results/testgeneration.py
+++ b/src/bcbench/results/testgeneration.py
@@ -1,19 +1,26 @@
 from typing import Self
 
-from bcbench.results.base import BaseEvaluationResult
+from bcbench.results.base import ExecutionBasedEvaluationResult
 from bcbench.types import EvaluationContext
 
 
-class TestGenerationResult(BaseEvaluationResult):
-    """Result class for test-generation evaluation category.
-
-    Inherits all shared metrics from BaseEvaluationResult.
-    Tracks whether generated tests failed before patch and passed after patch.
-    """
+class TestGenerationResult(ExecutionBasedEvaluationResult):
+    """Test-generation result: did the generated tests fail pre-patch and pass post-patch."""
 
     pre_patch_failed: bool = False
     post_patch_passed: bool = False
 
+    @property
+    def category_metrics(self) -> dict[str, int | float | bool]:
+        """Parent metrics extended with the pre-/post-patch flags."""
+        return {**super().category_metrics, "pre_patch_failed": self.pre_patch_failed, "post_patch_passed": self.post_patch_passed}
+
+    @property
+    def display_row(self) -> dict[str, str]:
+        """Yes/No display cells for the pre-/post-patch flags."""
+        flags = {"Pre-Patch Failed": self.pre_patch_failed, "Post-Patch Passed": self.post_patch_passed}
+        return {label: "Yes" if flag else "No" for label, flag in flags.items()}
+
     @classmethod
-    def create_no_tests_extracted(cls, context: "EvaluationContext", generated_patch: str, error_message: str) -> Self:
-        return cls._create_from_context(context, resolved=False, build=False, generated_patch=generated_patch, error_message=error_message)
+    def create_no_tests_extracted(cls, context: "EvaluationContext", output: str, error_message: str) -> Self:
+        return cls._create_from_context(context, resolved=False, build=False, output=output, error_message=error_message)
diff --git a/src/bcbench/types.py b/src/bcbench/types.py
index c2177a8a6..80731d2e7 100644
--- a/src/bcbench/types.py
+++ b/src/bcbench/types.py
@@ -14,6 +14,8 @@
 if TYPE_CHECKING:
     from bcbench.dataset import BaseDatasetEntry
     from bcbench.evaluate.base import EvaluationPipeline
+    from bcbench.results.base import BaseEvaluationResult
+    from bcbench.results.summary import EvaluationResultSummary
 
 __all__ = ["AgentMetrics", "AgentType", "ContainerConfig", "EvaluationCategory", "EvaluationContext", "ExperimentConfiguration"]
 
@@ -126,6 +128,32 @@ def entry_class(self) -> type[BaseDatasetEntry]:
 
         raise ValueError(f"Unknown evaluation category: {self}")
 
+    @property
+    def result_class(self) -> type[BaseEvaluationResult]:
+        """Returns the result class for this category; raises ValueError if unhandled."""
+        from bcbench.results.bugfix import BugFixResult
+        from bcbench.results.testgeneration import TestGenerationResult
+
+        if self == EvaluationCategory.BUG_FIX:
+            return BugFixResult
+        if self == EvaluationCategory.TEST_GENERATION:
+            return TestGenerationResult
+
+        raise ValueError(f"Unknown evaluation category: {self}")
+
+    @property
+    def summary_class(self) -> type[EvaluationResultSummary]:
+        """Returns the EvaluationResultSummary subclass for this category."""
+        from bcbench.results.summary import ExecutionBasedEvaluationResultSummary
+
+        # Both categories handled here share the execution-based summary type;
+        # give a category its own case once it needs a different summary class.
+        match self:
+            case EvaluationCategory.BUG_FIX | EvaluationCategory.TEST_GENERATION:
+                return ExecutionBasedEvaluationResultSummary
+
+        raise ValueError(f"Unknown evaluation category: {self}")
+
     @property
     def pipeline(self) -> EvaluationPipeline:
         from bcbench.evaluate import BugFixPipeline, TestGenerationPipeline
diff --git a/tests/conftest.py b/tests/conftest.py
index 6c6ab4922..ce2dfaf34 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -103,7 +103,7 @@ def create_bugfix_result(
     agent_name: str = "copilot-cli",
     resolved: bool = True,
     build: bool = True,
-    generated_patch: str = "diff --git a/test.al b/test.al\n+fixed",
+    output: str = "diff --git a/test.al b/test.al\n+fixed",
     error_message: str | None = None,
     metrics: AgentMetrics | None = None,
 ) -> BugFixResult:
@@ -115,7 +115,7 @@ def create_bugfix_result(
         category=EvaluationCategory.BUG_FIX,
         resolved=resolved,
         build=build,
-        generated_patch=generated_patch,
+        output=output,
         error_message=error_message,
         metrics=metrics,
     )
@@ -128,7 +128,7 @@ def create_testgen_result(
     agent_name: str = "copilot-cli",
     resolved: bool = False,
     build: bool = True,
-    generated_patch: str = "diff --git a/test.al b/test.al\n+test",
+    output: str = "diff --git a/test.al b/test.al\n+test",
     error_message: str | None = None,
     metrics: AgentMetrics | None = None,
     pre_patch_failed: bool = False,
@@ -142,7 +142,7 @@ def create_testgen_result(
         category=EvaluationCategory.TEST_GENERATION,
         resolved=resolved,
         build=build,
-        generated_patch=generated_patch,
+        output=output,
         error_message=error_message,
         metrics=metrics,
         pre_patch_failed=pre_patch_failed,
diff --git a/tests/test_cli_commands.py b/tests/test_cli_commands.py
index 2290cff5c..12f678ecf 100644
--- a/tests/test_cli_commands.py
+++ b/tests/test_cli_commands.py
@@ -342,6 +342,7 @@ def sample_leaderboard_and_summary(tmp_path):
                 "failed": 4,
                 "build": 9,
                 "percentage": 60.0,
+                "instance_results": copilot_instance_results,
                 "date": "2025-01-10",
                 "model": "gpt-4o",
                 "category": "bug-fix",
@@ -356,7 +357,6 @@ def sample_leaderboard_and_summary(tmp_path):
                     "custom_instructions": True,
                     "custom_agent": None,
                 },
-                "instance_results": copilot_instance_results,
                 "benchmark_version": "0.1.0",
             },
             {
@@ -365,6 +365,7 @@ def sample_leaderboard_and_summary(tmp_path):
                 "failed": 3,
                 "build": 10,
                 "percentage": 70.0,
+                "instance_results": mini_instance_results,
                 "date": "2025-01-12",
                 "model": "gpt-4o",
                 "category": "bug-fix",
@@ -379,7 +380,6 @@ def sample_leaderboard_and_summary(tmp_path):
                     "custom_instructions": False,
                     "custom_agent": None,
                 },
-                "instance_results": mini_instance_results,
                 "benchmark_version": "0.1.0",
             },
         ],
@@ -430,6 +430,7 @@ def sample_leaderboard_and_summary(tmp_path):
                 "failed": 5,
                 "build": 8,
                 "percentage": 50.0,
+                "instance_results": testgen_instance_results,
                 "date": "2025-01-11",
                 "model": "gpt-4-turbo",
                 "category": "test-generation",
@@ -444,7 +445,6 @@ def sample_leaderboard_and_summary(tmp_path):
                     "custom_instructions": False,
                     "custom_agent": None,
                 },
-                "instance_results": testgen_instance_results,
                 "benchmark_version": "0.1.0",
             },
         ],
@@ -477,10 +477,11 @@ def sample_leaderboard_and_summary(tmp_path):
 
     new_summary = {
         "total": 10,
-        "resolved": 8,  # Improved from 6 to 8
+        "resolved": 8,
         "failed": 2,
-        "build": 10,  # Improved from 9 to 10
+        "build": 10,
         "percentage": 80.0,
+        "instance_results": new_summary_instance_results,
         "date": "2025-01-15",
         "model": "gpt-4o",
         "category": "bug-fix",
@@ -495,7 +496,6 @@ def sample_leaderboard_and_summary(tmp_path):
             "custom_instructions": True,
             "custom_agent": None,
         },
-        "instance_results": new_summary_instance_results,
         "benchmark_version": "0.1.0",
     }
 
@@ -561,6 +561,7 @@ def test_result_update_adds_new_entry(sample_leaderboard_and_summary):
         "failed": 1,
         "build": 10,
         "percentage": 90.0,
+        "instance_results": new_agent_instance_results,
         "date": "2025-01-16",
         "model": "gpt-4o",
         "category": "test-generation",
@@ -575,7 +576,6 @@ def test_result_update_adds_new_entry(sample_leaderboard_and_summary):
             "custom_instructions": False,
             "custom_agent": None,
         },
-        "instance_results": new_agent_instance_results,
         "benchmark_version": "0.1.0",
     }
 
@@ -629,6 +629,7 @@ def test_result_update_distinguishes_by_mcp_servers(sample_leaderboard_and_summa
         "failed": 3,
         "build": 9,
         "percentage": 70.0,
+        "instance_results": diff_mcp_instance_results,
         "date": "2025-01-17",
         "model": "gpt-4o",
         "category": "bug-fix",
@@ -643,7 +644,6 @@ def test_result_update_distinguishes_by_mcp_servers(sample_leaderboard_and_summa
             "custom_instructions": False,  # Different from existing True
             "custom_agent": None,
         },
-        "instance_results": diff_mcp_instance_results,
         "benchmark_version": "0.1.0",
     }
 
@@ -771,6 +771,7 @@ def test_result_update_stores_multiple_results_with_default_n(sample_leaderboard
         "failed": 2,
         "build": 10,
         "percentage": 80.0,
+        "instance_results": multi_results_instance,
         "date": "2025-01-15",
         "model": "gpt-4o",
         "category": "bug-fix",
@@ -785,7 +786,6 @@ def test_result_update_stores_multiple_results_with_default_n(sample_leaderboard
             "custom_instructions": True,
             "custom_agent": None,
         },
-        "instance_results": multi_results_instance,
         "benchmark_version": "0.1.0",
     }
 
@@ -824,6 +824,7 @@ def test_result_update_replaces_oldest_when_exceeding_n(sample_leaderboard_and_s
         "failed": 3,
         "build": 9,
         "percentage": 70.0,
+        "instance_results": oldest_instance_results,
         "model": "gpt-4o",
         "category": "bug-fix",
         "agent_name": "copilot",
@@ -836,7 +837,6 @@ def test_result_update_replaces_oldest_when_exceeding_n(sample_leaderboard_and_s
             "custom_instructions": True,
             "custom_agent": None,
         },
-        "instance_results": oldest_instance_results,
         "benchmark_version": "0.1.0",
     }
 
@@ -858,7 +858,16 @@ def test_result_update_replaces_oldest_when_exceeding_n(sample_leaderboard_and_s
 
     # Now add a 6th result - should replace oldest (2025-01-10)
     newest_instance_results = {f"test__inst_{i}": (i < 9) for i in range(10)}  # 9 resolved
-    summary_new = {**base_summary, "date": "2025-01-20", "github_run_id": "run_sixth", "resolved": 9, "instance_results": newest_instance_results}
+    summary_new = {
+        **base_summary,
+        "date": "2025-01-20",
+        "github_run_id": "run_sixth",
+        "resolved": 9,
+        "failed": 1,
+        "build": 10,
+        "percentage": 90.0,
+        "instance_results": newest_instance_results,
+    }
     with open(summary_path, "w") as f:
         json.dump(summary_new, f, indent=2)
 
@@ -927,17 +936,13 @@ def test_result_refresh_handles_empty_leaderboard(tmp_path):
 
 @pytest.mark.integration
 def test_result_refresh_handles_legacy_runs_without_instance_results(tmp_path):
-    """Test that refresh handles legacy runs that don't have instance_results."""
+    """Test that refresh handles runs without instance_results."""
     leaderboard_path = tmp_path / "bug-fix.json"
 
     legacy_data = {
         "runs": [
             {
                 "total": 10,
-                "resolved": 6,
-                "failed": 4,
-                "build": 9,
-                "percentage": 60.0,
                 "date": "2025-01-10",
                 "model": "gpt-4o",
                 "category": "bug-fix",
@@ -948,8 +953,11 @@ def test_result_refresh_handles_legacy_runs_without_instance_results(tmp_path):
                 "average_llm_duration": 70.0,
                 "github_run_id": "run_legacy",
                 "experiment": None,
-                "instance_results": None,  # Legacy: no instance_results
                 "benchmark_version": "0.1.0",
+                "resolved": 6,
+                "failed": 4,
+                "build": 9,
+                "percentage": 60.0,
             },
         ],
         "aggregate": [
@@ -996,6 +1004,7 @@ def test_result_refresh_separates_runs_by_benchmark_version(tmp_path):
                 "failed": 4,
                 "build": 10,
                 "percentage": 60.0,
+                "instance_results": {f"test__inst_{i}": (i < 6) for i in range(10)},
                 "date": "2025-01-10",
                 "model": "gpt-4o",
                 "category": "bug-fix",
@@ -1006,7 +1015,6 @@ def test_result_refresh_separates_runs_by_benchmark_version(tmp_path):
                 "average_llm_duration": 70.0,
                 "github_run_id": "run_v1",
                 "experiment": None,
-                "instance_results": {f"test__inst_{i}": (i < 6) for i in range(10)},
                 "benchmark_version": "0.1.0",
             },
             {
@@ -1015,6 +1023,7 @@ def test_result_refresh_separates_runs_by_benchmark_version(tmp_path):
                 "failed": 2,
                 "build": 10,
                 "percentage": 80.0,
+                "instance_results": {f"test__inst_{i}": (i < 8) for i in range(10)},
                 "date": "2025-01-15",
                 "model": "gpt-4o",
                 "category": "bug-fix",
@@ -1025,7 +1034,6 @@ def test_result_refresh_separates_runs_by_benchmark_version(tmp_path):
                 "average_llm_duration": 65.0,
                 "github_run_id": "run_v2",
                 "experiment": None,
-                "instance_results": {f"test__inst_{i}": (i < 8) for i in range(10)},
                 "benchmark_version": "0.2.0",
             },
         ],
@@ -1070,6 +1078,7 @@ def test_result_update_groups_by_benchmark_version(tmp_path):
                 "failed": 5,
                 "build": 10,
                 "percentage": 50.0,
+                "instance_results": {f"test__inst_{i}": (i < 5) for i in range(10)},
                 "date": "2025-01-10",
                 "model": "gpt-4o",
                 "category": "bug-fix",
@@ -1080,7 +1089,6 @@ def test_result_update_groups_by_benchmark_version(tmp_path):
                 "average_llm_duration": 70.0,
                 "github_run_id": "run_v1",
                 "experiment": None,
-                "instance_results": {f"test__inst_{i}": (i < 5) for i in range(10)},
                 "benchmark_version": "0.1.0",
             },
         ],
@@ -1113,6 +1121,7 @@ def test_result_update_groups_by_benchmark_version(tmp_path):
         "failed": 3,
         "build": 10,
         "percentage": 70.0,
+        "instance_results": {f"test__inst_{i}": (i < 7) for i in range(10)},
         "date": "2025-01-15",
         "model": "gpt-4o",
         "category": "bug-fix",
@@ -1123,7 +1132,6 @@ def test_result_update_groups_by_benchmark_version(tmp_path):
         "average_llm_duration": 65.0,
         "github_run_id": "run_v2",
         "experiment": None,
-        "instance_results": {f"test__inst_{i}": (i < 7) for i in range(10)},
         "benchmark_version": "0.2.0",
     }
 
diff --git a/tests/test_evaluation_summary.py b/tests/test_evaluation_summary.py
index a0c9586bf..7c3c97690 100644
--- a/tests/test_evaluation_summary.py
+++ b/tests/test_evaluation_summary.py
@@ -4,7 +4,7 @@
 import pytest
 
 from bcbench.config import get_config
-from bcbench.results.evaluation_result import EvaluationResultSummary
+from bcbench.results.summary import EvaluationResultSummary, ExecutionBasedEvaluationResultSummary
 from bcbench.types import AgentMetrics, EvaluationCategory, ExperimentConfiguration
 from tests.conftest import create_bugfix_result, create_testgen_result
 
@@ -13,7 +13,7 @@
 
 class TestEvaluationResultSummary:
     def test_summary_save_creates_json_file(self, tmp_path):
-        summary = EvaluationResultSummary(
+        summary = ExecutionBasedEvaluationResultSummary(
             total=10,
             resolved=8,
             failed=2,
@@ -43,6 +43,7 @@ def test_summary_save_creates_json_file(self, tmp_path):
         assert data["resolved"] == 8
         assert data["failed"] == 2
         assert data["build"] == 9
+        assert data["instance_results"] == {}
         assert data["date"] == "2025-01-15"
         assert data["model"] == "gpt-4o"
         assert data["agent_name"] == "copilot-cli"
@@ -51,7 +52,7 @@ def test_summary_save_creates_json_file(self, tmp_path):
         assert data["average_completion_tokens"] == 1200.0
 
     def test_summary_save_with_custom_filename(self, tmp_path):
-        summary = EvaluationResultSummary(
+        summary = ExecutionBasedEvaluationResultSummary(
             total=5,
             resolved=4,
             failed=1,
@@ -74,7 +75,7 @@ def test_summary_save_with_custom_filename(self, tmp_path):
         assert output_file.exists()
 
     def test_loading_existing_results(self):
-        from bcbench.results.evaluation_result import Leaderboard
+        from bcbench.results.summary import Leaderboard
 
         for category in EvaluationCategory:
             leaderboard_path = _config.paths.leaderboard_dir / f"{category.value}.json"
@@ -87,7 +88,7 @@ def test_loading_existing_results(self):
                 else:
                     # Old format: array of items
                     for item in data:
-                        EvaluationResultSummary.model_validate(item)
+                        ExecutionBasedEvaluationResultSummary.model_validate(item)
 
 
 class TestFromResults:
@@ -255,7 +256,7 @@ def test_summary_with_experiment_configuration(self):
             custom_instructions=True,
             custom_agent="custom-bc-agent",
         )
-        summary = EvaluationResultSummary(
+        summary = ExecutionBasedEvaluationResultSummary(
             total=5,
             resolved=3,
             failed=2,
@@ -279,7 +280,7 @@ def test_summary_with_experiment_configuration(self):
         assert summary.experiment.custom_agent == "custom-bc-agent"
 
     def test_summary_without_experiment_configuration(self):
-        summary = EvaluationResultSummary(
+        summary = ExecutionBasedEvaluationResultSummary(
             total=5,
             resolved=3,
             failed=2,
@@ -303,7 +304,7 @@ def test_summary_save_includes_experiment_in_json(self, tmp_path):
             mcp_servers=["pylance"],
             custom_instructions=True,
         )
-        summary = EvaluationResultSummary(
+        summary = ExecutionBasedEvaluationResultSummary(
             total=10,
             resolved=8,
             failed=2,
@@ -332,7 +333,7 @@ def test_summary_save_includes_experiment_in_json(self, tmp_path):
         assert data["experiment"]["custom_agent"] is None
 
     def test_summary_save_with_none_experiment(self, tmp_path):
-        summary = EvaluationResultSummary(
+        summary = ExecutionBasedEvaluationResultSummary(
             total=5,
             resolved=3,
             failed=2,
@@ -465,16 +466,16 @@ def test_from_results_creates_instance_results(self):
 
         summary = EvaluationResultSummary.from_results(results, run_id="test_run")
 
-        assert summary.instance_results is not None
-        assert len(summary.instance_results) == 3
-        assert summary.instance_results["test__1"] is True
-        assert summary.instance_results["test__2"] is False
-        assert summary.instance_results["test__3"] is True
+        instance_results = summary.instance_results
+        assert len(instance_results) == 3
+        assert instance_results["test__1"] is True
+        assert instance_results["test__2"] is False
+        assert instance_results["test__3"] is True
 
 
 class TestLeaderboardAggregate:
     def test_from_single_run_calculates_average(self):
-        from bcbench.results.evaluation_result import LeaderboardAggregate
+        from bcbench.results.summary import LeaderboardAggregate
 
         summary = EvaluationResultSummary.from_results(
             [
@@ -496,7 +497,7 @@ def test_from_single_run_calculates_average(self):
         assert agg.pass_hat_5 is None  # Not enough runs
 
     def test_from_multiple_runs_calculates_average_and_ci_bounds(self):
-        from bcbench.results.evaluation_result import LeaderboardAggregate
+        from bcbench.results.summary import LeaderboardAggregate
 
         run1 = EvaluationResultSummary.from_results(
             [
@@ -535,7 +536,7 @@ def test_from_multiple_runs_calculates_average_and_ci_bounds(self):
         assert agg.pass_hat_5 is None  # Not enough runs
 
     def test_average_and_ci_bounds_with_varying_results(self):
-        from bcbench.results.evaluation_result import LeaderboardAggregate
+        from bcbench.results.summary import LeaderboardAggregate
 
         # Create 3 runs where:
         # - run1: 3/3 resolved (100%)
@@ -579,7 +580,7 @@ def test_average_and_ci_bounds_with_varying_results(self):
         assert agg.pass_hat_5 is None  # Not enough runs
 
     def test_consistent_results_have_zero_ci(self):
-        from bcbench.results.evaluation_result import LeaderboardAggregate
+        from bcbench.results.summary import LeaderboardAggregate
 
         # All instances pass all runs
         run1 = EvaluationResultSummary.from_results(
@@ -615,7 +616,7 @@ def test_consistent_results_have_zero_ci(self):
 
 class TestLeaderboard:
     def test_aggregate_from_runs(self):
-        from bcbench.results.evaluation_result import LeaderboardAggregate
+        from bcbench.results.summary import LeaderboardAggregate
 
         run1 = EvaluationResultSummary.from_results(
             [
@@ -632,7 +633,7 @@ def test_aggregate_from_runs(self):
         assert agg.average == 0.5
 
     def test_leaderboard_to_dict(self):
-        from bcbench.results.evaluation_result import Leaderboard, LeaderboardAggregate
+        from bcbench.results.summary import Leaderboard, LeaderboardAggregate
 
         run1 = EvaluationResultSummary.from_results(
             [create_bugfix_result(instance_id="test__1", resolved=True)],
@@ -649,11 +650,10 @@ def test_leaderboard_to_dict(self):
         assert data["aggregate"][0]["average"] == 1.0
 
     def test_aggregate_from_legacy_runs_without_instance_results(self):
-        """Test that a single legacy run without instance_results uses pass rate ratio."""
-        from bcbench.results.evaluation_result import LeaderboardAggregate
+        """Test that a single run without instance_results uses pass rate from percentage."""
+        from bcbench.results.summary import LeaderboardAggregate
 
-        # Create a summary without instance_results (simulates legacy data)
-        legacy_run = EvaluationResultSummary(
+        legacy_run = ExecutionBasedEvaluationResultSummary(
             total=10,
             resolved=6,
             failed=4,
@@ -666,7 +666,6 @@ def test_aggregate_from_legacy_runs_without_instance_results(self):
             average_duration=100.0,
             average_prompt_tokens=1000.0,
             average_completion_tokens=500.0,
-            instance_results=None,  # Legacy: no instance_results
             benchmark_version="0.1.0",
         )
 
@@ -674,14 +673,14 @@ def test_aggregate_from_legacy_runs_without_instance_results(self):
 
         assert agg.num_runs == 1
         assert agg.total == 10
-        # Should fall back to pass rate (resolved/total) from the run
-        assert agg.average == 0.6  # 6/10 = 0.6
+        # Uses percentage / 100 as the run's pass rate
+        assert agg.average == 0.6  # 60.0% -> 0.6
         assert agg.ci_low is None
         assert agg.ci_high is None
         assert agg.pass_hat_5 is None
 
     def test_aggregate_includes_benchmark_version_from_runs(self):
-        from bcbench.results.evaluation_result import LeaderboardAggregate
+        from bcbench.results.summary import LeaderboardAggregate
 
         run1 = EvaluationResultSummary.from_results(
             [create_bugfix_result(instance_id="test__1", resolved=True)],
@@ -695,14 +694,15 @@ def test_aggregate_includes_benchmark_version_from_runs(self):
         assert agg.benchmark_version is not None
 
     def test_aggregate_allows_same_benchmark_versions(self):
-        from bcbench.results.evaluation_result import LeaderboardAggregate
+        from bcbench.results.summary import LeaderboardAggregate
 
-        run1 = EvaluationResultSummary(
+        run1 = ExecutionBasedEvaluationResultSummary(
             total=3,
             resolved=2,
             failed=1,
             build=3,
             percentage=66.7,
+            instance_results={"test__1": True, "test__2": True, "test__3": False},
             date=date.today(),
             model="gpt-4o",
             agent_name="copilot",
@@ -710,15 +710,15 @@ def test_aggregate_allows_same_benchmark_versions(self):
             average_duration=100.0,
             average_prompt_tokens=1000.0,
             average_completion_tokens=500.0,
-            instance_results={"test__1": True, "test__2": True, "test__3": False},
             benchmark_version="0.1.0",
         )
-        run2 = EvaluationResultSummary(
+        run2 = ExecutionBasedEvaluationResultSummary(
             total=3,
             resolved=1,
             failed=2,
             build=3,
             percentage=33.3,
+            instance_results={"test__1": False, "test__2": True, "test__3": False},
             date=date.today(),
             model="gpt-4o",
             agent_name="copilot",
@@ -726,7 +726,6 @@ def test_aggregate_allows_same_benchmark_versions(self):
             average_duration=100.0,
             average_prompt_tokens=1000.0,
             average_completion_tokens=500.0,
-            instance_results={"test__1": False, "test__2": True, "test__3": False},
             benchmark_version="0.1.0",  # Same version
         )
 
@@ -735,7 +734,7 @@ def test_aggregate_allows_same_benchmark_versions(self):
         assert agg.benchmark_version == "0.1.0"
 
     def test_load_empty_leaderboard_file(self, tmp_path):
-        from bcbench.results.evaluation_result import Leaderboard
+        from bcbench.results.summary import Leaderboard
 
         empty_file = tmp_path / "empty.json"
         empty_file.write_text("[]")
@@ -746,7 +745,7 @@ def test_load_empty_leaderboard_file(self, tmp_path):
         assert leaderboard.aggregate == []
 
     def test_load_empty_object_leaderboard_file(self, tmp_path):
-        from bcbench.results.evaluation_result import Leaderboard
+        from bcbench.results.summary import Leaderboard
 
         empty_file = tmp_path / "empty.json"
         empty_file.write_text("{}")
diff --git a/tests/test_get_task.py b/tests/test_get_task.py
index 783cc9418..64a2f97df 100644
--- a/tests/test_get_task.py
+++ b/tests/test_get_task.py
@@ -14,109 +14,3 @@ def test_returns_readme_content(self, tmp_path: Path):
             result = entry.get_task()
 
         assert result == content
-
-    def test_transform_image_paths_false_preserves_relative_paths(self, tmp_path: Path):
-        content = "# Task\n\n![diagram](./diagram.png)\n\nSome text."
-        problem_dir = create_problem_statement_dir(tmp_path, content)
-        entry = create_dataset_entry()
-
-        with patch.object(type(entry), "problem_statement_dir", property(lambda self: problem_dir)):
-            result = entry.get_task(transform_image_paths=False)
-
-        assert "![diagram](./diagram.png)" in result
-
-    def test_transform_image_paths_true_converts_to_problem_directory(self, tmp_path: Path):
-        content = "# Task\n\n![diagram](./diagram.png)\n\nSome text."
-        problem_dir = create_problem_statement_dir(tmp_path, content)
-        entry = create_dataset_entry()
-
-        with patch.object(type(entry), "problem_statement_dir", property(lambda self: problem_dir)):
-            result = entry.get_task(transform_image_paths=True)
-
-        assert "![diagram](problem/diagram.png)" in result
-        assert "./diagram.png" not in result
-
-    def test_transform_image_paths_handles_multiple_images(self, tmp_path: Path):
-        content = "# Task\n\n![first](./img1.png)\n\nText\n\n![second](./img2.png)"
-        problem_dir = create_problem_statement_dir(tmp_path, content)
-        entry = create_dataset_entry()
-
-        with patch.object(type(entry), "problem_statement_dir", property(lambda self: problem_dir)):
-            result = entry.get_task(transform_image_paths=True)
-
-        assert "![first](problem/img1.png)" in result
-        assert "![second](problem/img2.png)" in result
-
-    def test_transform_image_paths_preserves_alt_text(self, tmp_path: Path):
-        content = "![Complex Alt Text with spaces](./image.png)"
-        problem_dir = create_problem_statement_dir(tmp_path, content)
-        entry = create_dataset_entry()
-
-        with patch.object(type(entry), "problem_statement_dir", property(lambda self: problem_dir)):
-            result = entry.get_task(transform_image_paths=True)
-
-        assert "![Complex Alt Text with spaces](problem/image.png)" in result
-
-    def test_transform_image_paths_handles_empty_alt_text(self, tmp_path: Path):
-        content = "![](./image.png)"
-        problem_dir = create_problem_statement_dir(tmp_path, content)
-        entry = create_dataset_entry()
-
-        with patch.object(type(entry), "problem_statement_dir", property(lambda self: problem_dir)):
-            result = entry.get_task(transform_image_paths=True)
-
-        assert "![](problem/image.png)" in result
-
-    def test_transform_image_paths_handles_nested_paths(self, tmp_path: Path):
-        content = "![diagram](./images/subdir/diagram.png)"
-        problem_dir = create_problem_statement_dir(tmp_path, content)
-        entry = create_dataset_entry()
-
-        with patch.object(type(entry), "problem_statement_dir", property(lambda self: problem_dir)):
-            result = entry.get_task(transform_image_paths=True)
-
-        assert "![diagram](problem/images/subdir/diagram.png)" in result
-
-    def test_transform_image_paths_ignores_absolute_urls(self, tmp_path: Path):
-        content = "![external](https://example.com/image.png)"
-        problem_dir = create_problem_statement_dir(tmp_path, content)
-        entry = create_dataset_entry()
-
-        with patch.object(type(entry), "problem_statement_dir", property(lambda self: problem_dir)):
-            result = entry.get_task(transform_image_paths=True)
-
-        assert "![external](https://example.com/image.png)" in result
-
-    def test_transform_image_paths_ignores_non_relative_paths(self, tmp_path: Path):
-        content = "![other](images/diagram.png)"
-        problem_dir = create_problem_statement_dir(tmp_path, content)
-        entry = create_dataset_entry()
-
-        with patch.object(type(entry), "problem_statement_dir", property(lambda self: problem_dir)):
-            result = entry.get_task(transform_image_paths=True)
-
-        # Paths without ./ prefix should not be transformed
-        assert "![other](images/diagram.png)" in result
-
-    def test_transform_image_paths_handles_mixed_content(self, tmp_path: Path):
-        content = """# Problem
-
-![local](./diagram.png)
-
-Some text with [a link](./doc.md) that is not an image.
-
-![external](https://example.com/img.png)
-
-![another local](./screenshot.jpg)
-"""
-        problem_dir = create_problem_statement_dir(tmp_path, content)
-        entry = create_dataset_entry()
-
-        with patch.object(type(entry), "problem_statement_dir", property(lambda self: problem_dir)):
-            result = entry.get_task(transform_image_paths=True)
-
-        assert "![local](problem/diagram.png)" in result
-        assert "![another local](problem/screenshot.jpg)" in result
-        assert "![external](https://example.com/img.png)" in result
-        # Regular links should be preserved (not images)
-        assert "[a link](./doc.md)" in result
diff --git a/tests/test_result_hierarchy.py b/tests/test_result_hierarchy.py
new file mode 100644
index 000000000..8420a61a0
--- /dev/null
+++ b/tests/test_result_hierarchy.py
@@ -0,0 +1,393 @@
+"""Tests for the result and summary class hierarchies after the category refactor.
+
+Covers:
+- BaseEvaluationResult vs ExecutionBasedEvaluationResult field separation
+- status_label, category_metrics, display_row polymorphism
+- from_json dispatch to correct subclass
+- EvaluationResultSummary.from_results dispatch and super() chain
+- ExecutionBasedEvaluationResultSummary category-specific aggregation
+- display_summary on summaries
+- display.py console/GitHub summary rendering
+"""
+
+from datetime import date
+
+import pytest
+
+from bcbench.results.base import BaseEvaluationResult, ExecutionBasedEvaluationResult
+from bcbench.results.bugfix import BugFixResult
+from bcbench.results.display import create_console_summary, create_github_job_summary
+from bcbench.results.summary import (
+    EvaluationResultSummary,
+    ExecutionBasedEvaluationResultSummary,
+)
+from bcbench.results.testgeneration import TestGenerationResult
+from bcbench.types import AgentMetrics, EvaluationCategory
+from tests.conftest import create_bugfix_result, create_evaluation_context, create_testgen_result
+
+
+def _make_config_with_summary(summary_path: str):
+    """Wrap the real config in a MagicMock whose env.github_step_summary is summary_path."""
+    from unittest.mock import MagicMock
+
+    from bcbench.config import get_config
+
+    # wraps= keeps the real config behind the mock; only the summary path
+    # attribute is replaced, and only on the mock's env child.
+    wrapped = MagicMock(wraps=get_config())
+    wrapped.env.github_step_summary = summary_path
+    return wrapped
+
+
+# ---------------------------------------------------------------------------
+# BaseEvaluationResult
+# ---------------------------------------------------------------------------
+
+
+class TestBaseEvaluationResult:
+    """Where `resolved`/`build` are declared, and which results inherit them."""
+
+    def test_base_has_no_resolved_or_build(self):
+        assert not {"resolved", "build"} & BaseEvaluationResult.model_fields.keys()
+
+    def test_execution_based_has_resolved_and_build(self):
+        assert {"resolved", "build"} <= ExecutionBasedEvaluationResult.model_fields.keys()
+
+    def test_bugfix_inherits_execution_based(self):
+        assert issubclass(BugFixResult, ExecutionBasedEvaluationResult)
+
+    def test_testgen_inherits_execution_based(self):
+        assert issubclass(TestGenerationResult, ExecutionBasedEvaluationResult)
+
+
+# ---------------------------------------------------------------------------
+# status_label
+# ---------------------------------------------------------------------------
+
+
+class TestStatusLabel:
+    """status_label for resolved, timed-out and failed bug-fix results."""
+
+    def test_base_completed(self):
+        assert create_bugfix_result(resolved=True).status_label == "Success"
+
+    def test_base_timeout(self):
+        timed_out = create_bugfix_result(resolved=False, build=False)
+        timed_out.timeout = True
+        assert timed_out.status_label == "Timeout"
+
+    def test_execution_based_success(self):
+        assert create_bugfix_result(resolved=True, build=True).status_label == "Success"
+
+    def test_execution_based_failed(self):
+        failed = create_bugfix_result(resolved=False, build=True, error_message="Tests failed")
+        assert failed.status_label == "Failed"
+
+
+# ---------------------------------------------------------------------------
+# category_metrics
+# ---------------------------------------------------------------------------
+
+
+class TestCategoryMetrics:
+    """category_metrics contents for bug-fix and test-generation results."""
+
+    def test_bugfix_category_metrics(self):
+        assert create_bugfix_result(resolved=True, build=True).category_metrics == {"resolved": True, "build": True}
+
+    def test_bugfix_failed_category_metrics(self):
+        assert create_bugfix_result(resolved=False, build=False).category_metrics == {"resolved": False, "build": False}
+
+    def test_testgen_category_metrics_includes_extra_fields(self):
+        testgen = create_testgen_result(resolved=True, build=True, pre_patch_failed=True, post_patch_passed=True)
+        produced = testgen.category_metrics
+        # Every flag passed to the factory above must be reported back as True.
+        expected_true = ("resolved", "build", "pre_patch_failed", "post_patch_passed")
+        for key in expected_true:
+            assert produced[key] is True
+
+    def test_testgen_category_metrics_defaults(self):
+        produced = create_testgen_result().category_metrics
+        # Neither patch flag is passed to the factory here.
+        for key in ("pre_patch_failed", "post_patch_passed"):
+            assert produced[key] is False
+
+
+# ---------------------------------------------------------------------------
+# display_row
+# ---------------------------------------------------------------------------
+
+
+class TestDisplayRow:
+    """Extra table columns each result type contributes through display_row."""
+
+    def test_bugfix_display_row_is_empty(self):
+        assert create_bugfix_result().display_row == {}
+
+    def test_testgen_display_row_has_columns(self):
+        testgen = create_testgen_result(pre_patch_failed=True, post_patch_passed=False)
+        rendered = testgen.display_row
+        assert rendered["Pre-Patch Failed"] == "Yes"
+        assert rendered["Post-Patch Passed"] == "No"
+
+    def test_testgen_display_row_no_flags(self):
+        rendered = create_testgen_result(pre_patch_failed=False, post_patch_passed=False).display_row
+        for column in ("Pre-Patch Failed", "Post-Patch Passed"):
+            assert rendered[column] == "No"
+
+
+# ---------------------------------------------------------------------------
+# from_json dispatch
+# ---------------------------------------------------------------------------
+
+
+class TestFromJsonDispatch:
+    """BaseEvaluationResult.from_json rebuilds the subclass matching the dumped payload."""
+
+    def test_from_json_returns_bugfix_result(self):
+        restored = BaseEvaluationResult.from_json(create_bugfix_result().model_dump(mode="json"))
+        assert isinstance(restored, BugFixResult)
+
+    def test_from_json_returns_testgen_result(self):
+        serialized = create_testgen_result(pre_patch_failed=True).model_dump(mode="json")
+        restored = BaseEvaluationResult.from_json(serialized)
+        assert isinstance(restored, TestGenerationResult)
+        assert restored.pre_patch_failed is True
+
+    def test_from_json_preserves_all_fields(self):
+        source = create_bugfix_result(
+            instance_id="test__round-trip",
+            resolved=True,
+            build=True,
+            output="patch content",
+            error_message=None,
+        )
+        restored = BaseEvaluationResult.from_json(source.model_dump(mode="json"))
+        assert (restored.instance_id, restored.output) == (source.instance_id, source.output)
+
+    def test_from_json_unknown_category_raises(self):
+        # Same dump as a valid bug-fix result, except for the category value.
+        serialized = {**create_bugfix_result().model_dump(mode="json"), "category": "nonexistent"}
+        with pytest.raises(ValueError, match="nonexistent"):
+            BaseEvaluationResult.from_json(serialized)
+
+
+# ---------------------------------------------------------------------------
+# create_agent_timeout_failure
+# ---------------------------------------------------------------------------
+
+
+class TestCreateAgentTimeout:
+    def test_timeout_sets_fields(self, tmp_path):
+        # Build the failure result straight from a fresh evaluation context.
+        failure = BugFixResult.create_agent_timeout_failure(create_evaluation_context(tmp_path))
+        assert failure.timeout is True
+        assert failure.error_message == "Agent timed out"
+        assert failure.status_label == "Timeout"
+
+
+# ---------------------------------------------------------------------------
+# EvaluationResultSummary.from_results — dispatch + super() chain
+# ---------------------------------------------------------------------------
+
+
+class TestSummaryFromResults:
+    """from_results: which summary subclass is returned and the aggregates it holds."""
+
+    def test_base_dispatches_to_execution_based_for_bugfix(self):
+        batch = [create_bugfix_result(instance_id="test__1", resolved=True)]
+        aggregated = EvaluationResultSummary.from_results(batch, run_id="run1")
+        assert isinstance(aggregated, ExecutionBasedEvaluationResultSummary)
+
+    def test_base_dispatches_to_execution_based_for_testgen(self):
+        batch = [create_testgen_result(instance_id="test__1")]
+        aggregated = EvaluationResultSummary.from_results(batch, run_id="run1")
+        assert isinstance(aggregated, ExecutionBasedEvaluationResultSummary)
+
+    def test_subclass_direct_call_also_works(self):
+        batch = [create_bugfix_result(instance_id="test__1", resolved=True)]
+        aggregated = ExecutionBasedEvaluationResultSummary.from_results(batch, run_id="run1")
+        assert isinstance(aggregated, ExecutionBasedEvaluationResultSummary)
+        assert aggregated.resolved == 1
+
+    def test_common_fields_computed(self):
+        """Totals, identity fields and per-run averages are filled in by from_results."""
+        fast_run = AgentMetrics(execution_time=100.0, prompt_tokens=1000, completion_tokens=500)
+        slow_run = AgentMetrics(execution_time=200.0, prompt_tokens=3000, completion_tokens=1500)
+        batch = [
+            create_bugfix_result(instance_id="test__1", resolved=True, metrics=fast_run),
+            create_bugfix_result(instance_id="test__2", resolved=False, metrics=slow_run),
+        ]
+        aggregated = EvaluationResultSummary.from_results(batch, run_id="run1")
+
+        # model / agent_name are not passed above, so these values presumably
+        # come from the conftest factory defaults.
+        assert aggregated.total == 2
+        assert aggregated.model == "gpt-4o"
+        assert aggregated.agent_name == "copilot-cli"
+        assert aggregated.date == date.today()
+
+        # Each average is checked against the mean of the two runs' metrics:
+        # (100 + 200) / 2, (1000 + 3000) / 2 and (500 + 1500) / 2 respectively.
+        assert aggregated.average_duration == pytest.approx(150.0)
+        assert aggregated.average_prompt_tokens == pytest.approx(2000.0)
+        assert aggregated.average_completion_tokens == pytest.approx(1000.0)
+
+    def test_category_specific_fields_computed(self):
+        # (resolved, build) flags for test__1 .. test__3, in that order.
+        outcomes = [(True, True), (False, True), (False, False)]
+        batch = [
+            create_bugfix_result(instance_id=f"test__{n}", resolved=ok, build=built)
+            for n, (ok, built) in enumerate(outcomes, start=1)
+        ]
+        aggregated = EvaluationResultSummary.from_results(batch, run_id="run1")
+
+        assert isinstance(aggregated, ExecutionBasedEvaluationResultSummary)
+        assert aggregated.resolved == 1
+        assert (aggregated.failed, aggregated.build) == (2, 2)
+        assert aggregated.percentage == pytest.approx(33.3)
+
+    def test_instance_results_populated(self):
+        expected = {"test__a": True, "test__b": False}
+        batch = [create_bugfix_result(instance_id=key, resolved=flag) for key, flag in expected.items()]
+        aggregated = EvaluationResultSummary.from_results(batch, run_id="run1")
+
+        assert isinstance(aggregated, ExecutionBasedEvaluationResultSummary)
+        assert aggregated.instance_results == expected
+
+
+# ---------------------------------------------------------------------------
+# display_summary
+# ---------------------------------------------------------------------------
+
+
+class TestDisplaySummary:
+    """display_summary() output of an execution-based summary."""
+
+    def test_execution_based_display_summary(self):
+        # The four aggregate fields expected back from display_summary().
+        headline = {"resolved": 7, "failed": 3, "build": 9, "percentage": 70.0}
+        aggregated = ExecutionBasedEvaluationResultSummary(
+            total=10,
+            date=date.today(),
+            model="gpt-4o",
+            agent_name="copilot",
+            category=EvaluationCategory.BUG_FIX,
+            average_duration=100.0,
+            average_prompt_tokens=1000.0,
+            average_completion_tokens=500.0,
+            benchmark_version="0.1.0",
+            **headline,
+        )
+        assert aggregated.display_summary() == headline
+
+
+# ---------------------------------------------------------------------------
+# Summary from_json dispatch
+# ---------------------------------------------------------------------------
+
+
+class TestSummaryFromJson:
+    """EvaluationResultSummary.from_json: subclass selection and rejection by category."""
+
+    def test_from_json_returns_execution_based_for_bugfix(self):
+        common = {
+            "total": 5,
+            "date": "2025-01-15",
+            "model": "gpt-4o",
+            "category": "bug-fix",
+            "agent_name": "copilot",
+            "average_duration": 100.0,
+            "average_prompt_tokens": 1000.0,
+            "average_completion_tokens": 500.0,
+            "benchmark_version": "0.1.0",
+        }
+        execution_fields = {"resolved": 3, "failed": 2, "build": 4, "percentage": 60.0}
+        loaded = EvaluationResultSummary.from_json({**common, **execution_fields})
+        assert isinstance(loaded, ExecutionBasedEvaluationResultSummary)
+        assert loaded.resolved == 3
+
+    def test_from_json_unknown_category_raises(self):
+        # Category value that the test expects from_json to reject.
+        payload = {
+            "total": 5,
+            "date": "2025-01-15",
+            "model": "gpt-4o",
+            "category": "nonexistent",
+            "agent_name": "copilot",
+            "average_duration": 100.0,
+            "average_prompt_tokens": 1000.0,
+            "average_completion_tokens": 500.0,
+            "benchmark_version": "0.1.0",
+        }
+        with pytest.raises(ValueError, match="nonexistent"):
+            EvaluationResultSummary.from_json(payload)
+
+
+# ---------------------------------------------------------------------------
+# display.py — console and GitHub summary
+# ---------------------------------------------------------------------------
+
+
+class TestConsoleSummary:
+    """Text that create_console_summary writes to stdout."""
+
+    def test_console_summary_renders(self, capsys):
+        passing = create_bugfix_result(instance_id="test__1", resolved=True)
+        failing = create_bugfix_result(instance_id="test__2", resolved=False, error_message="Build failed")
+        batch = [passing, failing]
+        create_console_summary(batch, EvaluationResultSummary.from_results(batch, run_id=""))
+        printed = capsys.readouterr().out
+        for fragment in ("test__1", "test__2", "Evaluation Results Summary"):
+            assert fragment in printed
+
+    def test_console_summary_shows_testgen_data_values(self, capsys):
+        batch = [
+            create_testgen_result(instance_id="test__1", resolved=True, pre_patch_failed=True, post_patch_passed=True),
+        ]
+        create_console_summary(batch, EvaluationResultSummary.from_results(batch, run_id=""))
+        printed = capsys.readouterr().out
+        # Rich truncates column headers, but data values "Yes" should appear
+        assert "Yes" in printed
+        assert "test__1" in printed
+
+
+class TestGitHubJobSummary:
+    """Markdown that create_github_job_summary leaves in the configured summary file."""
+
+    def test_github_summary_renders_markdown(self, tmp_path, monkeypatch):
+        target = tmp_path / "summary.md"
+        # Point display's get_config at a config whose step-summary path is the temp file.
+        monkeypatch.setattr("bcbench.results.display.get_config", lambda: _make_config_with_summary(str(target)))
+        batch = [
+            create_bugfix_result(instance_id="test__1", resolved=True),
+            create_bugfix_result(instance_id="test__2", resolved=False, error_message="Build failed"),
+        ]
+        create_github_job_summary(batch, EvaluationResultSummary.from_results(batch, run_id=""))
+        markdown = target.read_text()
+        for fragment in ("test__1", "test__2", "bug-fix"):
+            assert fragment in markdown
+
+    def test_github_summary_includes_testgen_columns(self, tmp_path, monkeypatch):
+        target = tmp_path / "summary.md"
+        monkeypatch.setattr("bcbench.results.display.get_config", lambda: _make_config_with_summary(str(target)))
+        batch = [
+            create_testgen_result(instance_id="test__1", resolved=True, pre_patch_failed=True, post_patch_passed=True),
+        ]
+        create_github_job_summary(batch, EvaluationResultSummary.from_results(batch, run_id=""))
+        markdown = target.read_text()
+        assert "Pre-Patch Failed" in markdown
+        assert "Post-Patch Passed" in markdown
+
+    def test_github_summary_includes_tool_usage(self, tmp_path, monkeypatch):
+        target = tmp_path / "summary.md"
+        monkeypatch.setattr("bcbench.results.display.get_config", lambda: _make_config_with_summary(str(target)))
+        # Unlike the tests above, this result carries tool_usage in its metrics.
+        usage = AgentMetrics(execution_time=100.0, tool_usage={"bash": 5, "view": 3})
+        batch = [
+            create_bugfix_result(instance_id="test__1", resolved=True, metrics=usage),
+        ]
+        create_github_job_summary(batch, EvaluationResultSummary.from_results(batch, run_id=""))
+        markdown = target.read_text()
+        assert "Tool Usage" in markdown
+        assert "bash" in markdown
diff --git a/tests/test_result_serialization.py b/tests/test_result_serialization.py
index c8e5487b3..7644fc8a8 100644
--- a/tests/test_result_serialization.py
+++ b/tests/test_result_serialization.py
@@ -2,8 +2,8 @@
 
 import pytest
 
-from bcbench.results.base import create_result_from_json
-from bcbench.results.evaluation_result import EvaluationResultSummary
+from bcbench.results.base import BaseEvaluationResult
+from bcbench.results.summary import EvaluationResultSummary
 from bcbench.types import AgentMetrics, EvaluationCategory, ExperimentConfiguration
 from tests.conftest import create_bugfix_result, create_testgen_result
 
@@ -55,10 +55,10 @@ def test_bug_fix_category_loads_from_string(self):
             "category": "bug-fix",
             "resolved": True,
             "build": True,
-            "generated_patch": "patch",
+            "output": "patch",
         }
 
-        result = create_result_from_json(payload)
+        result = BaseEvaluationResult.from_json(payload)
 
         assert result.category == EvaluationCategory.BUG_FIX
 
@@ -71,10 +71,10 @@ def test_test_generation_category_loads_from_string(self):
             "category": "test-generation",
             "resolved": False,
             "build": True,
-            "generated_patch": "test patch",
+            "output": "test patch",
         }
 
-        result = create_result_from_json(payload)
+        result = BaseEvaluationResult.from_json(payload)
 
         assert result.category == EvaluationCategory.TEST_GENERATION
 
@@ -88,7 +88,7 @@ def test_round_trip_bug_fix(self, tmp_path):
         with open(tmp_path / "test.jsonl") as f:
             data = json.loads(f.readline())
 
-        loaded = create_result_from_json(data)
+        loaded = BaseEvaluationResult.from_json(data)
 
         assert loaded.category == original.category
         assert loaded.category == EvaluationCategory.BUG_FIX
@@ -103,7 +103,7 @@ def test_round_trip_test_generation(self, tmp_path):
         with open(tmp_path / "test.jsonl") as f:
             data = json.loads(f.readline())
 
-        loaded = create_result_from_json(data)
+        loaded = BaseEvaluationResult.from_json(data)
 
         assert loaded.category == original.category
         assert loaded.category == EvaluationCategory.TEST_GENERATION
@@ -136,7 +136,9 @@ def test_summary_category_loads_from_string(self):
             "benchmark_version": "0.1.0",
         }
 
-        summary = EvaluationResultSummary.model_validate(payload)
+        from bcbench.results.summary import EvaluationResultSummary
+
+        summary = EvaluationResultSummary.from_json(payload)
 
         # Pydantic handles the enum conversion automatically
         assert summary.category == EvaluationCategory.TEST_GENERATION
@@ -210,7 +212,7 @@ def test_tool_usage_loads_from_json(self):
             "category": "bug-fix",
             "resolved": True,
             "build": True,
-            "generated_patch": "patch",
+            "output": "patch",
             "metrics": {
                 "execution_time": 100.0,
                 "prompt_tokens": 5000,
@@ -219,7 +221,7 @@ def test_tool_usage_loads_from_json(self):
             },
         }
 
-        result = create_result_from_json(payload)
+        result = BaseEvaluationResult.from_json(payload)
 
         assert result.metrics is not None
         assert result.metrics.tool_usage is not None
@@ -243,10 +245,22 @@ def test_tool_usage_round_trip(self, tmp_path):
         with open(tmp_path / "test.jsonl") as f:
             data = json.loads(f.readline())
 
-        loaded = create_result_from_json(data)
+        loaded = BaseEvaluationResult.from_json(data)
 
         assert loaded.metrics is not None
         assert loaded.metrics.tool_usage is not None
         assert original.metrics is not None
         assert original.metrics.tool_usage is not None
         assert loaded.metrics.tool_usage == original.metrics.tool_usage
+
+    def test_model_dump_json_serializes_category_as_string_value(self):
+        """JSON-mode dumps carry the category as its plain string value."""
+        expected_by_result = (
+            (create_bugfix_result(), "bug-fix"),
+            (create_testgen_result(), "test-generation"),
+        )
+
+        for result, expected in expected_by_result:
+            dumped_category = result.model_dump(mode="json")["category"]
+            assert dumped_category == expected
+            assert isinstance(dumped_category, str)
diff --git a/tests/test_testgeneration_validation.py b/tests/test_testgeneration_validation.py
index 074d60de0..f29253ef6 100644
--- a/tests/test_testgeneration_validation.py
+++ b/tests/test_testgeneration_validation.py
@@ -3,7 +3,7 @@
 import pytest
 import yaml
 
-from bcbench.operations.setup_operations import _get_test_generation_input_mode
+from bcbench.evaluate.testgeneration import _get_test_generation_input_mode
 
 
 def test_get_test_generation_input_mode_valid_gold_patch():
diff --git a/tests/test_version.py b/tests/test_version.py
index 654f4ff41..5082fc4a7 100644
--- a/tests/test_version.py
+++ b/tests/test_version.py
@@ -1,6 +1,6 @@
 """Tests for version utility."""
 
-from bcbench.results.evaluation_result import _get_benchmark_version
+from bcbench.results.summary import _get_benchmark_version
 
 
 def test_get_benchmark_version_returns_string():