diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 884968fb5..1eafcb1fc 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -32,11 +32,6 @@ jobs:
       - name: Run tests with coverage
         run: uv run pytest --cov=src/bcbench --cov-report=term-missing
 
-  get-entries:
-    uses: ./.github/workflows/get-entries.yml
-    with:
-      test-run: true
-
   select-category:
     runs-on: ubuntu-latest
     outputs:
@@ -50,6 +45,13 @@ jobs:
           $selected = $categories | Get-Random
           echo "category=$selected" >> $env:GITHUB_OUTPUT
 
+  get-entries:
+    needs: select-category
+    uses: ./.github/workflows/get-entries.yml
+    with:
+      test-run: true
+      category: ${{ needs.select-category.outputs.category }}
+
   mock-evaluation:
     runs-on: ubuntu-latest
     needs: [get-entries, select-category]
@@ -79,7 +81,7 @@ jobs:
           retention-days: 1
 
   summarize-results:
-    needs: mock-evaluation
+    needs: [mock-evaluation, select-category]
     uses: ./.github/workflows/summarize-results.yml
     permissions:
       contents: write
@@ -89,5 +91,5 @@ jobs:
       model: ${{ github.run_id }}
       agent: "mock-agent"
       mock: true
-      category: "delete"
+      category: ${{ needs.select-category.outputs.category }}
     secrets: inherit
diff --git a/.github/workflows/claude-evaluation.yml b/.github/workflows/claude-evaluation.yml
index c523be844..69542c85e 100644
--- a/.github/workflows/claude-evaluation.yml
+++ b/.github/workflows/claude-evaluation.yml
@@ -57,6 +57,7 @@ jobs:
     uses: ./.github/workflows/get-entries.yml
     with:
       test-run: ${{ inputs.test-run }}
+      category: ${{ inputs.category }}
 
   evaluate-with-claude-code:
     runs-on: [self-hosted, 1ES.Pool=GitHub-BCBench]
diff --git a/.github/workflows/copilot-evaluation.yml b/.github/workflows/copilot-evaluation.yml
index d82914c12..3c0ef9162 100644
--- a/.github/workflows/copilot-evaluation.yml
+++ b/.github/workflows/copilot-evaluation.yml
@@ -65,6 +65,7 @@ jobs:
     uses: ./.github/workflows/get-entries.yml
     with:
       test-run: ${{ inputs.test-run }}
+      category: ${{ inputs.category }}
 
   evaluate-with-copilot-cli:
     runs-on: [self-hosted, 1ES.Pool=GitHub-BCBench]
diff --git a/.github/workflows/dataset-validation.yml b/.github/workflows/dataset-validation.yml
index d2d0900a1..180b1b53b 100644
--- a/.github/workflows/dataset-validation.yml
+++ b/.github/workflows/dataset-validation.yml
@@ -11,7 +11,7 @@ on:
         default: true
         type: boolean
   schedule:
-    - cron: "0 0 * * 0" # Weekly on Sundays at midnight
+    - cron: "0 0 * * 0"
 
 jobs:
   get-entries:
@@ -19,6 +19,7 @@ jobs:
     with:
       modified-only: false
       test-run: ${{ inputs.test-run || false }}
+      category: "bug-fix"
 
   verify-build-and-tests:
     runs-on: [self-hosted, 1ES.Pool=GitHub-BCBench]
diff --git a/.github/workflows/get-entries.yml b/.github/workflows/get-entries.yml
index 10a44edf1..31c952d87 100644
--- a/.github/workflows/get-entries.yml
+++ b/.github/workflows/get-entries.yml
@@ -15,6 +15,11 @@ on:
         required: false
         type: boolean
         default: false
+      category:
+        description: Evaluation category
+        required: true
+        type: string
+        default: "bug-fix"
     outputs:
       entries:
         description: JSON array of dataset entries
@@ -37,7 +42,7 @@ jobs:
       - name: Get entries for matrix
         id: get-entries
         run: |
-          cmd="uv run bcbench dataset list --github-output entries"
+          cmd="uv run bcbench dataset list --category ${{ inputs.category }} --github-output entries"
 
           if [[ "${{ inputs.modified-only }}" == "true" ]]; then
             cmd="$cmd --modified-only"
diff --git a/.github/workflows/mini-evaluation.yml b/.github/workflows/mini-evaluation.yml
index 29430f0cc..8149947e3 100644
--- a/.github/workflows/mini-evaluation.yml
+++ b/.github/workflows/mini-evaluation.yml
@@ -38,6 +38,7 @@ jobs:
     uses: ./.github/workflows/get-entries.yml
     with:
       test-run: ${{ inputs.test-run }}
+      category: ${{ inputs.category }}
 
   evaluate-with-mini-agent:
     runs-on: [self-hosted, 1ES.Pool=GitHub-BCBench]
diff --git a/.github/workflows/summarize-results.yml b/.github/workflows/summarize-results.yml
index bd65b480a..a492f7a86 100644
--- a/.github/workflows/summarize-results.yml
+++ b/.github/workflows/summarize-results.yml
@@ -53,7 +53,7 @@ jobs:
           merge-multiple: true
 
       - name: Summarize evaluation results
-        run: uv run bcbench result summarize --result-dir "${{ inputs.results-dir }}" --bceval-output "${{ env.BCEVAL_RESULT_FILE }}" --summary-output "${{ env.SUMMARY_OUTPUT_FILE }}"
+        run: uv run bcbench result summarize --category "${{ inputs.category }}" --result-dir "${{ inputs.results-dir }}" --bceval-output "${{ env.BCEVAL_RESULT_FILE }}" --summary-output "${{ env.SUMMARY_OUTPUT_FILE }}"
 
       - name: Upload evaluation summary to artifacts
         uses: actions/upload-artifact@v6
diff --git a/src/bcbench/agent/claude/agent.py b/src/bcbench/agent/claude/agent.py
index ab71fa200..b07c48c57 100644
--- a/src/bcbench/agent/claude/agent.py
+++ b/src/bcbench/agent/claude/agent.py
@@ -8,7 +8,7 @@
 from bcbench.agent.claude.metrics import parse_metrics
 from bcbench.agent.shared import build_mcp_config, build_prompt
 from bcbench.config import get_config
-from bcbench.dataset import DatasetEntry
+from bcbench.dataset import BaseDatasetEntry
 from bcbench.exceptions import AgentError, AgentTimeoutError
 from bcbench.logger import get_logger
 from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_instructions_from_config
@@ -19,7 +19,7 @@
 
 
 def run_claude_code(
-    entry: DatasetEntry, model: str, category: EvaluationCategory, repo_path: Path, output_dir: Path, al_mcp: bool = False, container_name: str = "bcbench"
+    entry: BaseDatasetEntry, model: str, category: EvaluationCategory, repo_path: Path, output_dir: Path, al_mcp: bool = False, container_name: str = "bcbench"
 ) -> tuple[AgentMetrics | None, ExperimentConfiguration]:
     """Run Claude Code on a single dataset entry.
 
diff --git a/src/bcbench/agent/copilot/agent.py b/src/bcbench/agent/copilot/agent.py
index 2fdc98078..ab2f1a79e 100644
--- a/src/bcbench/agent/copilot/agent.py
+++ b/src/bcbench/agent/copilot/agent.py
@@ -10,7 +10,7 @@
 from bcbench.agent.copilot.metrics import parse_metrics
 from bcbench.agent.shared import build_mcp_config, build_prompt
 from bcbench.config import get_config
-from bcbench.dataset import DatasetEntry
+from bcbench.dataset import BaseDatasetEntry
 from bcbench.exceptions import AgentError, AgentTimeoutError
 from bcbench.logger import get_logger
 from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_instructions_from_config
@@ -21,7 +21,7 @@
 
 
 def run_copilot_agent(
-    entry: DatasetEntry, model: str, category: EvaluationCategory, repo_path: Path, output_dir: Path, al_mcp: bool = False, container_name: str = "bcbench"
+    entry: BaseDatasetEntry, model: str, category: EvaluationCategory, repo_path: Path, output_dir: Path, al_mcp: bool = False, container_name: str = "bcbench"
 ) -> tuple[AgentMetrics | None, ExperimentConfiguration]:
     """Run GitHub Copilot CLI agent on a single dataset entry.
 
diff --git a/src/bcbench/agent/mini/agent.py b/src/bcbench/agent/mini/agent.py
index 9f82ab64c..fec150331 100644
--- a/src/bcbench/agent/mini/agent.py
+++ b/src/bcbench/agent/mini/agent.py
@@ -8,7 +8,7 @@
 import yaml
 
 from bcbench.config import get_config
-from bcbench.dataset import DatasetEntry
+from bcbench.dataset import BaseDatasetEntry
 from bcbench.exceptions import ConfigurationError
 from bcbench.logger import get_logger
 from bcbench.types import AgentMetrics, EvaluationCategory, ExperimentConfiguration
@@ -52,7 +52,7 @@ def parse_action(self, response: dict) -> dict:
 
 
 def run_mini_agent(
-    entry: DatasetEntry,
+    entry: BaseDatasetEntry,
     repo_path: Path,
     model: str,
     category: EvaluationCategory,
diff --git a/src/bcbench/agent/shared/mcp.py b/src/bcbench/agent/shared/mcp.py
index 85046d6e0..822df035a 100644
--- a/src/bcbench/agent/shared/mcp.py
+++ b/src/bcbench/agent/shared/mcp.py
@@ -6,7 +6,7 @@
 from jinja2 import Template
 from packaging.version import Version
 
-from bcbench.dataset import DatasetEntry
+from bcbench.dataset import BaseDatasetEntry
 from bcbench.exceptions import AgentError
 from bcbench.logger import get_logger
 
@@ -144,7 +144,7 @@ def _build_server_entry(server: dict[str, Any], template_context: dict[str, Any]
             raise AgentError(f"Unsupported MCP server type: {server_type}")
 
 
-def build_mcp_config(config: dict[str, Any], entry: DatasetEntry, repo_path: Path, al_mcp: bool = False, container_name: str = "bcbench") -> tuple[str | None, list[str] | None]:
+def build_mcp_config(config: dict[str, Any], entry: BaseDatasetEntry, repo_path: Path, al_mcp: bool = False, container_name: str = "bcbench") -> tuple[str | None, list[str] | None]:
     mcp_servers: list[dict[str, Any]] = config.get("mcp", {}).get("servers", [])
 
     if not al_mcp:
diff --git a/src/bcbench/agent/shared/prompt.py b/src/bcbench/agent/shared/prompt.py
index b819ada9f..474105993 100644
--- a/src/bcbench/agent/shared/prompt.py
+++ b/src/bcbench/agent/shared/prompt.py
@@ -2,11 +2,11 @@
 
 from jinja2 import Template
 
-from bcbench.dataset import DatasetEntry
+from bcbench.dataset import BaseDatasetEntry
 from bcbench.types import EvaluationCategory
 
 
-def build_prompt(entry: DatasetEntry, repo_path: Path, config: dict, category: EvaluationCategory, al_mcp: bool = False) -> str:
+def build_prompt(entry: BaseDatasetEntry, repo_path: Path, config: dict, category: EvaluationCategory, al_mcp: bool = False) -> str:
     prompt_config = config.get("prompt", {})
     template_str = prompt_config.get(f"{category.value}-template")
     include_project_paths = prompt_config.get("include_project_paths")
diff --git a/src/bcbench/cli_options.py b/src/bcbench/cli_options.py
index 923e376c1..d8221f27f 100644
--- a/src/bcbench/cli_options.py
+++ b/src/bcbench/cli_options.py
@@ -9,12 +9,8 @@
 
 # Type aliases for cleaner command signatures
 # Note: Defaults are provided in function signatures, not here
-DatasetPath = Annotated[Path, typer.Option(help="Path to dataset file")]
-
 RepoPath = Annotated[Path, typer.Option(help="Path to repository")]
 
-SchemaPath = Annotated[Path, typer.Option(help="Path to schema file")]
-
 OutputDir = Annotated[Path, typer.Option(help="Directory to save evaluation results")]
 
 RunId = Annotated[str, typer.Option(envvar="GITHUB_RUN_ID", help="Unique identifier for this evaluation run")]
diff --git a/src/bcbench/collection/build_entry.py b/src/bcbench/collection/build_entry.py
index 05a6ec5c2..88e340e67 100644
--- a/src/bcbench/collection/build_entry.py
+++ b/src/bcbench/collection/build_entry.py
@@ -1,4 +1,4 @@
-"""Builder functions for creating DatasetEntry from ADO sources."""
+"""Builder functions for creating BugFixEntry from ADO sources."""
 
 from pathlib import Path
 from typing import Any
@@ -7,7 +7,7 @@
 from bcbench.collection.patch_utils import extract_file_paths_from_patch, extract_patches, find_project_paths_from_diff
 from bcbench.collection.version_resolver import determine_environment_setup_version
 from bcbench.config import get_config
-from bcbench.dataset import DatasetEntry
+from bcbench.dataset import BugFixEntry
 from bcbench.operations.git_operations import checkout_commit
 from bcbench.operations.test_operations import extract_tests_from_patch
 
@@ -51,7 +51,7 @@ def build_dataset_entry_from_ado(
     base_commit: str,
     commit: str,
     diff_path: list[str] | None = None,
-) -> DatasetEntry:
+) -> BugFixEntry:
     created_at = extract_creation_date(pr_data)
     patch, patch_fix, patch_test = extract_patches(repo_path, base_commit, commit, diff_path=diff_path)
     problem_statement, hints = extract_problem_statement(work_item_data)
@@ -73,7 +73,7 @@ def build_dataset_entry_from_ado(
         hints=hints,
     )
 
-    return DatasetEntry(
+    return BugFixEntry(
         instance_id=instance_id,
         base_commit=base_commit,
         created_at=created_at,
diff --git a/src/bcbench/collection/collect_gh.py b/src/bcbench/collection/collect_gh.py
index 4a9046a99..651b61dd0 100644
--- a/src/bcbench/collection/collect_gh.py
+++ b/src/bcbench/collection/collect_gh.py
@@ -9,7 +9,7 @@
 from bcbench.collection.gh_client import GHClient
 from bcbench.collection.patch_utils import extract_file_paths_from_patch, find_project_paths_from_diff, separate_patches
 from bcbench.config import get_config
-from bcbench.dataset import DatasetEntry
+from bcbench.dataset import BugFixEntry
 from bcbench.exceptions import CollectionError
 from bcbench.logger import get_logger
 from bcbench.operations.test_operations import extract_tests_from_patch
@@ -58,7 +58,7 @@ def collect_gh_entry(pr_number: int, output: Path, repo: str = "microsoft/BCApps
 
         save_problem_statement(instance_id=instance_id, problem_statement=problem_statement)
 
-        entry = DatasetEntry(
+        entry = BugFixEntry(
             repo=repo,
             instance_id=instance_id,
             base_commit=base_commit,
diff --git a/src/bcbench/collection/collect_nav.py b/src/bcbench/collection/collect_nav.py
index ed4a1050b..3c8dd7694 100644
--- a/src/bcbench/collection/collect_nav.py
+++ b/src/bcbench/collection/collect_nav.py
@@ -6,7 +6,7 @@
 from bcbench.collection.ado_client import ADOClient
 from bcbench.collection.build_entry import build_dataset_entry_from_ado
 from bcbench.config import get_config
-from bcbench.dataset import DatasetEntry
+from bcbench.dataset import BugFixEntry
 from bcbench.exceptions import CollectionError
 from bcbench.logger import get_logger
 
@@ -35,7 +35,7 @@ def collect_nav_entry(
         if len(parents) != 1:
             raise CollectionError("Commit has multiple parents, cannot determine base commit.")
 
-        entry: DatasetEntry = build_dataset_entry_from_ado(
+        entry: BugFixEntry = build_dataset_entry_from_ado(
             pr_number=pr_number,
             repo_path=repo_path,
             pr_data=pr_data,
diff --git a/src/bcbench/commands/collect.py b/src/bcbench/commands/collect.py
index 18eac9cf8..2df9a03d9 100644
--- a/src/bcbench/commands/collect.py
+++ b/src/bcbench/commands/collect.py
@@ -1,9 +1,11 @@
 """CLI commands for collecting dataset entries."""
 
+from pathlib import Path
+
 import typer
 from typing_extensions import Annotated
 
-from bcbench.cli_options import DatasetPath, RepoPath
+from bcbench.cli_options import RepoPath
 from bcbench.collection import collect_gh_entry, collect_nav_entry
 from bcbench.config import get_config
 
@@ -15,7 +17,7 @@
 @collect_app.command("nav")
 def collect_nav(
     pr_number: Annotated[int, typer.Argument(help="Pull request number to collect")],
-    output: DatasetPath = _config.paths.dataset_path,
+    output: Annotated[Path, typer.Option(help="Path to output dataset file")] = _config.paths.dataset_dir / "bcbench.jsonl",
     repo_path: RepoPath = _config.paths.testbed_path,
     diff_path: Annotated[
         list[str] | None,
@@ -35,7 +37,7 @@ def collect_nav(
 @collect_app.command("gh")
 def collect_gh(
     pr_number: Annotated[int, typer.Argument(help="Pull request number to collect")],
-    output: DatasetPath = _config.paths.dataset_path,
+    output: Annotated[Path, typer.Option(help="Path to output dataset file")] = _config.paths.dataset_dir / "bcbench.jsonl",
     repo: Annotated[str, typer.Option(help="GitHub repository in OWNER/REPO format")] = "microsoft/BCApps",
 ):
     """
diff --git a/src/bcbench/commands/dataset.py b/src/bcbench/commands/dataset.py
index 4ba0d9dda..732faee7a 100644
--- a/src/bcbench/commands/dataset.py
+++ b/src/bcbench/commands/dataset.py
@@ -1,52 +1,34 @@
 """CLI commands for dataset operations."""
 
 import json
-from pathlib import Path
 
 import typer
 from typing_extensions import Annotated
 
-from bcbench.cli_options import DatasetPath
+from bcbench.cli_options import EvaluationCategoryOption
 from bcbench.config import get_config
-from bcbench.dataset import DatasetEntry
-from bcbench.dataset.dataset_loader import load_dataset_entries
-from bcbench.dataset.reviewer import run_dataset_reviewer
+from bcbench.dataset import BaseDatasetEntry
+from bcbench.dataset.dataset_entry import _BugFixTestGenBase
 from bcbench.exceptions import ConfigurationError
 from bcbench.logger import get_logger
+from bcbench.types import EvaluationCategory
 
 logger = get_logger(__name__)
-_config = get_config()
 
 dataset_app = typer.Typer(help="Query and analyze dataset")
 
 
-@dataset_app.command("review")
-def review_dataset(
-    dataset_path: Annotated[Path, typer.Argument(help="Path to dataset JSONL file")] = _config.paths.dataset_path,
-    results_dir: Annotated[
-        Path | None, typer.Option("--results-dir", "-r", help="Directory containing result JSONL files to show resolution stats", exists=True, file_okay=False, dir_okay=True)
-    ] = None,
-):
-    """
-    Review dataset entries using a TUI.
-
-    Opens a split-pane view showing entry information and problem statement.
-    Use arrow keys to navigate between entries.
-
-    If --results-dir is provided, shows resolution stats (e.g., "2/5 resolved")
-    for each entry based on the results in that directory.
-    """
-    run_dataset_reviewer(dataset_path, results_dir)
-
-
 @dataset_app.command("list")
 def list_entries(
-    dataset_path: DatasetPath = _config.paths.dataset_path,
+    category: EvaluationCategoryOption = EvaluationCategory.BUG_FIX,
     github_output: Annotated[str | None, typer.Option(help="Write JSON output to GITHUB_OUTPUT with this key name")] = None,
     modified_only: Annotated[bool, typer.Option(help="Only list entries that have been modified in git diff")] = False,
     test_run: Annotated[bool, typer.Option(help="Indicate this is a test run (with 2 entries)")] = False,
 ):
     """List dataset entry IDs."""
+    entry_cls = category.entry_class
+    resolved_path = category.dataset_path
+
     if modified_only:
         import subprocess
 
@@ -59,19 +41,19 @@ def list_entries(
                 "--no-color",
                 "--diff-filter=AM",
                 "--",
-                str(dataset_path),
+                str(resolved_path),
             ],
             capture_output=True,
             text=True,
             encoding="utf-8",
             check=True,
-            cwd=dataset_path.parent,
+            cwd=resolved_path.parent,
         )
         diff_output: str = result.stdout
         entry_ids: list[str] = _modified_instance_ids_from_diff(diff_output)
     else:
-        dataset_entries: list[DatasetEntry] = load_dataset_entries(dataset_path, random=2 if test_run else None)
-        entry_ids: list[str] = [e.instance_id for e in dataset_entries]
+        entries: list[BaseDatasetEntry] = entry_cls.load(resolved_path, random=2 if test_run else None)
+        entry_ids: list[str] = [e.instance_id for e in entries]
 
     print(f"Found {len(entry_ids)} entry(ies){' (modified only)' if modified_only else ''}:")
     for entry_id in entry_ids:
@@ -84,7 +66,7 @@ def list_entries(
 @dataset_app.command("view")
 def view_entry(
     entry_id: Annotated[str, typer.Argument(help="Entry ID to view")],
-    dataset_path: DatasetPath = _config.paths.dataset_path,
+    category: EvaluationCategoryOption = EvaluationCategory.BUG_FIX,
     show_patch: Annotated[bool, typer.Option(help="Show patch in output")] = False,
 ):
     """View a specific dataset entry with rich formatting."""
@@ -92,7 +74,7 @@ def view_entry(
     from rich.panel import Panel
     from rich.table import Table
 
-    entry = load_dataset_entries(dataset_path, entry_id=entry_id)[0]
+    entry: BaseDatasetEntry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0]
     console = Console()
 
     info_table = Table(show_header=False, box=None)
@@ -109,7 +91,6 @@ def view_entry(
         "\n".join(entry.project_paths) if entry.project_paths else "N/A",
     )
 
-    # Add metadata fields dynamically
     metadata_dict = entry.metadata.model_dump()
     for field_name, field_value in metadata_dict.items():
         display_name = field_name.replace("_", " ").title()
@@ -123,33 +104,35 @@ def view_entry(
     if show_patch:
         console.print("\n[bold cyan]Patch:[/bold cyan]")
         console.print(Panel(entry.patch or "[dim]Empty[/dim]", border_style="magenta"))
-        console.print("\n[bold cyan]Test Patch:[/bold cyan]")
-        console.print(Panel(entry.test_patch or "[dim]Empty[/dim]", border_style="magenta"))
-
-    console.print("\n[bold cyan]FAIL_TO_PASS Tests:[/bold cyan]")
-    if entry.fail_to_pass:
-        test_table = Table()
-        test_table.add_column("Codeunit ID", style="magenta")
-        test_table.add_column("Functions", style="yellow")
-        for test in entry.fail_to_pass:
-            test_table.add_row(str(test.codeunitID), ", ".join(test.functionName))
-        console.print(test_table)
-    else:
-        console.print("[dim]No FAIL_TO_PASS tests[/dim]")
-
-    console.print("\n[bold cyan]PASS_TO_PASS Tests:[/bold cyan]")
-    if entry.pass_to_pass:
-        test_table = Table()
-        test_table.add_column("Codeunit ID", style="magenta")
-        test_table.add_column("Functions", style="yellow")
-        for test in entry.pass_to_pass:
-            test_table.add_row(
-                str(test.codeunitID),
-                ", ".join(test.functionName),
-            )
-        console.print(test_table)
-    else:
-        console.print("[dim]No PASS_TO_PASS tests[/dim]")
+
+    # Display category-specific fields
+    if isinstance(entry, _BugFixTestGenBase):
+        bugfix_entry = entry
+        if show_patch:
+            console.print("\n[bold cyan]Test Patch:[/bold cyan]")
+            console.print(Panel(bugfix_entry.test_patch or "[dim]Empty[/dim]", border_style="magenta"))
+
+        console.print("\n[bold cyan]FAIL_TO_PASS Tests:[/bold cyan]")
+        if bugfix_entry.fail_to_pass:
+            test_table = Table()
+            test_table.add_column("Codeunit ID", style="magenta")
+            test_table.add_column("Functions", style="yellow")
+            for test in bugfix_entry.fail_to_pass:
+                test_table.add_row(str(test.codeunitID), ", ".join(test.functionName))
+            console.print(test_table)
+        else:
+            console.print("[dim]No FAIL_TO_PASS tests[/dim]")
+
+        console.print("\n[bold cyan]PASS_TO_PASS Tests:[/bold cyan]")
+        if bugfix_entry.pass_to_pass:
+            test_table = Table()
+            test_table.add_column("Codeunit ID", style="magenta")
+            test_table.add_column("Functions", style="yellow")
+            for test in bugfix_entry.pass_to_pass:
+                test_table.add_row(str(test.codeunitID), ", ".join(test.functionName))
+            console.print(test_table)
+        else:
+            console.print("[dim]No PASS_TO_PASS tests[/dim]")
 
 
 def _modified_instance_ids_from_diff(diff_output: str) -> list[str]:
diff --git a/src/bcbench/commands/evaluate.py b/src/bcbench/commands/evaluate.py
index 6e41b2f92..ffef5d745 100644
--- a/src/bcbench/commands/evaluate.py
+++ b/src/bcbench/commands/evaluate.py
@@ -13,7 +13,6 @@
     ContainerPassword,
     ContainerUsername,
     CopilotModel,
-    DatasetPath,
     EvaluationCategoryOption,
     FoundryModel,
     OutputDir,
@@ -21,11 +20,11 @@
     RunId,
 )
 from bcbench.config import get_config
-from bcbench.dataset import DatasetEntry, load_dataset_entries
-from bcbench.evaluate import EvaluationPipeline, create_pipeline
+from bcbench.dataset import BaseDatasetEntry
+from bcbench.evaluate import EvaluationPipeline
 from bcbench.logger import get_logger
 from bcbench.results import BaseEvaluationResult
-from bcbench.types import AgentMetrics, EvaluationContext, ExperimentConfiguration
+from bcbench.types import AgentMetrics, ContainerConfig, EvaluationContext, ExperimentConfiguration
 
 logger = get_logger(__name__)
 _config = get_config()
@@ -33,6 +32,14 @@
 evaluate_app = typer.Typer(help="Evaluate agents on benchmark datasets")
 
 
+def _prepare_run_dir(output_dir: Path, run_id: str) -> Path:
+    run_dir = output_dir / run_id
+    if run_dir.exists():
+        shutil.rmtree(run_dir)
+    run_dir.mkdir(parents=True)
+    return run_dir
+
+
 @evaluate_app.command("mini")
 def evaluate_mini(
     entry_id: Annotated[str, typer.Argument(help="Entry ID to run")],
@@ -41,7 +48,6 @@ def evaluate_mini(
     password: ContainerPassword,
     category: EvaluationCategoryOption,
     model: FoundryModel = "gpt-5.1-codex-mini",
-    dataset_path: DatasetPath = _config.paths.dataset_path,
     repo_path: RepoPath = _config.paths.testbed_path,
     output_dir: OutputDir = _config.paths.evaluation_results_path,
     run_id: RunId = "mini_test_run",
@@ -51,14 +57,8 @@ def evaluate_mini(
 
     To only run the agent to generate a patch without building/testing, use 'bcbench run mini' instead.
     """
-    entries: list[DatasetEntry] = load_dataset_entries(dataset_path, entry_id=entry_id)
-    entry: DatasetEntry = entries[0]
-    logger.info(f"Loaded {entry_id} entry from dataset")
-
-    run_dir: Path = output_dir / run_id
-    if run_dir.exists():
-        shutil.rmtree(run_dir)
-    run_dir.mkdir(parents=True)
+    entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0]
+    run_dir = _prepare_run_dir(output_dir, run_id)
 
     logger.info(f"Running evaluation on entry {entry_id} with mini-bc-agent")
 
@@ -66,15 +66,13 @@ def evaluate_mini(
         entry=entry,
         repo_path=repo_path,
         result_dir=run_dir,
-        container_name=container_name,
-        username=username,
-        password=password,
+        container=ContainerConfig(name=container_name, username=username, password=password),
         model=model,
         agent_name="mini-bc-agent",
         category=category,
     )
 
-    pipeline = create_pipeline(category)
+    pipeline = category.pipeline
     pipeline.execute(
         context,
         lambda ctx: run_mini_agent(
@@ -98,7 +96,6 @@ def evaluate_copilot(
     password: ContainerPassword,
     category: EvaluationCategoryOption,
     model: CopilotModel = "claude-haiku-4.5",
-    dataset_path: DatasetPath = _config.paths.dataset_path,
     repo_path: RepoPath = _config.paths.testbed_path,
     output_dir: OutputDir = _config.paths.evaluation_results_path,
     run_id: RunId = "copilot_test_run",
@@ -109,14 +106,8 @@ def evaluate_copilot(
 
     To only run the agent to generate a patch without building/testing, use 'bcbench run copilot' instead.
     """
-    entries: list[DatasetEntry] = load_dataset_entries(dataset_path, entry_id=entry_id)
-    entry: DatasetEntry = entries[0]
-    logger.info(f"Loaded {entry_id} entry from dataset")
-
-    run_dir: Path = output_dir / run_id
-    if run_dir.exists():
-        shutil.rmtree(run_dir)
-    run_dir.mkdir(parents=True)
+    entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0]
+    run_dir = _prepare_run_dir(output_dir, run_id)
 
     logger.info(f"Running evaluation on entry {entry_id} with GitHub Copilot CLI")
 
@@ -124,15 +115,13 @@ def evaluate_copilot(
         entry=entry,
         repo_path=repo_path,
         result_dir=run_dir,
-        container_name=container_name,
-        username=username,
-        password=password,
+        container=ContainerConfig(name=container_name, username=username, password=password),
         model=model,
         agent_name="GitHub Copilot",
         category=category,
     )
 
-    pipeline = create_pipeline(category)
+    pipeline = category.pipeline
     pipeline.execute(
         context,
         lambda ctx: run_copilot_agent(
@@ -142,7 +131,7 @@ def evaluate_copilot(
             model=ctx.model,
             output_dir=ctx.result_dir,
             al_mcp=al_mcp,
-            container_name=ctx.container_name,
+            container_name=ctx.get_container().name,
         ),
     )
 
@@ -158,7 +147,6 @@ def evaluate_claude_code(
     password: ContainerPassword,
     category: EvaluationCategoryOption,
     model: ClaudeCodeModel = "claude-haiku-4-5",
-    dataset_path: DatasetPath = _config.paths.dataset_path,
     repo_path: RepoPath = _config.paths.testbed_path,
     output_dir: OutputDir = _config.paths.evaluation_results_path,
     run_id: RunId = "claude_code_test_run",
@@ -169,14 +157,8 @@ def evaluate_claude_code(
 
     To only run the agent to generate a patch without building/testing, use 'bcbench run claude' instead.
     """
-    entries: list[DatasetEntry] = load_dataset_entries(dataset_path, entry_id=entry_id)
-    entry: DatasetEntry = entries[0]
-    logger.info(f"Loaded {entry_id} entry from dataset")
-
-    run_dir: Path = output_dir / run_id
-    if run_dir.exists():
-        shutil.rmtree(run_dir)
-    run_dir.mkdir(parents=True)
+    entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0]
+    run_dir = _prepare_run_dir(output_dir, run_id)
 
     logger.info(f"Running evaluation on entry {entry_id} with Claude Code")
 
@@ -184,15 +166,13 @@ def evaluate_claude_code(
         entry=entry,
         repo_path=repo_path,
         result_dir=run_dir,
-        container_name=container_name,
-        username=username,
-        password=password,
+        container=ContainerConfig(name=container_name, username=username, password=password),
         model=model,
         agent_name="Claude Code",
         category=category,
     )
 
-    pipeline = create_pipeline(category)
+    pipeline = category.pipeline
     pipeline.execute(
         context,
         lambda ctx: run_claude_code(
@@ -202,7 +182,7 @@ def evaluate_claude_code(
             model=ctx.model,
             output_dir=ctx.result_dir,
             al_mcp=al_mcp,
-            container_name=ctx.container_name,
+            container_name=ctx.get_container().name,
         ),
     )
 
@@ -214,21 +194,14 @@ def evaluate_claude_code(
 def evaluate_mock(
     entry_id: Annotated[str, typer.Argument(help="Entry ID to run")],
     category: EvaluationCategoryOption,
-    dataset_path: DatasetPath = _config.paths.dataset_path,
     output_dir: OutputDir = _config.paths.evaluation_results_path,
     run_id: RunId = "mock_run",
 ):
     """
     Evaluate mock agent on single dataset entry for testing purposes.
     """
-    entries: list[DatasetEntry] = load_dataset_entries(dataset_path, entry_id=entry_id)
-    entry: DatasetEntry = entries[0]
-    logger.info(f"Loaded {entry_id} entry from dataset")
-
-    run_dir: Path = output_dir / run_id
-    if run_dir.exists():
-        shutil.rmtree(run_dir)
-    run_dir.mkdir(parents=True)
+    entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0]
+    run_dir = _prepare_run_dir(output_dir, run_id)
 
     logger.info(f"Running evaluation on entry {entry_id} with mock agent")
 
@@ -236,9 +209,6 @@ def evaluate_mock(
         entry=entry,
         repo_path=Path(),
         result_dir=run_dir,
-        container_name="no container",
-        username="",
-        password="",
         model="mock-model",
         agent_name="mock-agent",
         category=category,
@@ -251,17 +221,17 @@ def evaluate_mock(
     logger.info(f"Results saved to: {run_dir}")
 
 
-class MockEvaluationPipeline(EvaluationPipeline):
+class MockEvaluationPipeline(EvaluationPipeline[BaseDatasetEntry]):
     """Mock pipeline for testing evaluation infrastructure.
 
     This pipeline simulates agent execution without requiring actual BC container setup.
     It randomly generates different scenarios to test result handling and serialization.
     """
 
-    def setup(self, context: EvaluationContext) -> None:
+    def setup(self, context: EvaluationContext[BaseDatasetEntry]) -> None:
         logger.info("Mock pipeline: Skipping setup")
 
-    def run_agent(self, context: EvaluationContext, agent_runner: Callable) -> None:
+    def run_agent(self, context: EvaluationContext[BaseDatasetEntry], agent_runner: Callable) -> None:
         """Generate random agent metrics and experiment configuration."""
         logger.info("Mock pipeline: Generating random metrics and experiment configuration")
 
@@ -290,7 +260,7 @@ def run_agent(self, context: EvaluationContext, agent_runner: Callable) -> None:
         logger.info(f"Using agent metrics: {context.metrics}")
         logger.info(f"Using experiment configuration: {context.experiment}")
 
-    def evaluate(self, context: EvaluationContext) -> None:
+    def evaluate(self, context: EvaluationContext[BaseDatasetEntry]) -> None:
         """Create random evaluation result to test different outcome scenarios."""
         logger.info("Mock pipeline: Generating random evaluation result")
 
diff --git a/src/bcbench/commands/result.py b/src/bcbench/commands/result.py
index eb51534c7..40cb6d375 100644
--- a/src/bcbench/commands/result.py
+++ b/src/bcbench/commands/result.py
@@ -6,7 +6,7 @@
 import typer
 from typing_extensions import Annotated
 
-from bcbench.cli_options import DatasetPath, OutputDir, RunId
+from bcbench.cli_options import EvaluationCategoryOption, OutputDir, RunId
 from bcbench.config import get_config
 from bcbench.logger import get_logger
 from bcbench.results import (
@@ -19,8 +19,6 @@
     create_result_from_json,
     write_bceval_results,
 )
-from bcbench.results.reviewer import run_instance_reviewer, run_reviewer
-from bcbench.types import EvaluationCategory
 
 logger = get_logger(__name__)
 
@@ -30,45 +28,12 @@
 result_app = typer.Typer(help="Process and display evaluation results")
 
 
-@result_app.command("review")
-def result_review(
-    results_file: Annotated[Path, typer.Argument(help="Path to results JSONL file to review (or directory if --instance-id is used)", exists=True, file_okay=True, dir_okay=True)],
-    category: Annotated[EvaluationCategory, typer.Option("--category", "-c", help="Evaluation category: bug-fix (compare patch) or test-generation (compare test_patch)")],
-    dataset_path: DatasetPath = _config.paths.dataset_path,
-    instance_id: Annotated[str | None, typer.Option("--instance-id", "-i", help="Review a single instance across all runs in a directory")] = None,
-    include_resolved: Annotated[bool, typer.Option("--include-resolved", help="Include resolved instances in the review")] = False,
-):
-    """
-    Review evaluation results and annotate failure categories using a TUI.
-
-    Opens a split-pane view showing expected (gold patch) vs actual (agent output).
-    Use j/k or arrows to navigate, 1-7 to select failure category.
-    Categories are auto-saved on navigate and quit.
-
-    Two modes:
-    - Default: Review all unresolved results in a single JSONL file
-    - With --instance-id: Review one instance across all runs in a directory
-
-    Use --category to switch between bug-fix (compares patch) and test-generation (compares test_patch).
-    """
-    if instance_id:
-        if not results_file.is_dir():
-            logger.error(f"When using --instance-id, the path must be a directory containing JSONL files: {results_file}")
-            raise typer.Exit(code=1)
-        run_instance_reviewer(results_file, instance_id, dataset_path, category)
-    else:
-        if not results_file.is_file():
-            logger.error(f"Expected a JSONL file, got directory: {results_file}. Use --instance-id to review across runs.")
-            raise typer.Exit(code=1)
-        run_reviewer(results_file, dataset_path, category, include_resolved)
-
-
 @result_app.command("summarize")
 def result_summarize(
     run_id: RunId,
+    category: EvaluationCategoryOption,
     result_dir: OutputDir = _config.paths.evaluation_results_path,
     result_pattern: Annotated[str, typer.Option(help="Pattern for the per instances result files")] = f"*{_config.file_patterns.result_pattern}",
-    dataset_path: DatasetPath = _config.paths.dataset_path,
     summary_output: Annotated[str, typer.Option(help="Output filename for summary JSON")] = "evaluation_summary.json",
     bceval_output: Annotated[str, typer.Option(help="Output filename for bceval results")] = "bceval_results.jsonl",
 ):
@@ -106,7 +71,7 @@ def result_summarize(
         logger.error("No results found in the result files")
         raise typer.Exit(code=1)
 
-    write_bceval_results(results, run_dir, run_id, dataset_path, bceval_output)
+    write_bceval_results(results, run_dir, run_id, bceval_output, category)
 
     if _config.env.github_actions:
         create_github_job_summary(results)
diff --git a/src/bcbench/commands/run.py b/src/bcbench/commands/run.py
index a026d1435..38832a1b8 100644
--- a/src/bcbench/commands/run.py
+++ b/src/bcbench/commands/run.py
@@ -13,14 +13,13 @@
     ClaudeCodeModel,
     ContainerName,
     CopilotModel,
-    DatasetPath,
     EvaluationCategoryOption,
     FoundryModel,
     OutputDir,
     RepoPath,
 )
 from bcbench.config import get_config
-from bcbench.dataset import DatasetEntry, load_dataset_entries
+from bcbench.dataset.dataset_entry import _BugFixTestGenBase
 from bcbench.logger import get_logger
 from bcbench.operations import setup_repo_postbuild, setup_repo_prebuild
 
@@ -35,7 +34,6 @@ def run_mini(
     entry_id: Annotated[str, typer.Argument(help="Entry ID to run")],
     category: EvaluationCategoryOption,
     model: FoundryModel = "gpt-5.1-codex-mini",
-    dataset_path: DatasetPath = _config.paths.dataset_path,
     repo_path: RepoPath = _config.paths.testbed_path,
     output_dir: OutputDir = _config.paths.evaluation_results_path,
 ):
@@ -47,10 +45,10 @@ def run_mini(
     Example:
         uv run bcbench run mini microsoft__BCApps-5633 --step-limit 5 --category bug-fix
     """
-    entry: DatasetEntry = load_dataset_entries(dataset_path, entry_id=entry_id)[0]
-
+    entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0]
     setup_repo_prebuild(entry, repo_path)
-    setup_repo_postbuild(entry, repo_path, category)
+    if isinstance(entry, _BugFixTestGenBase):
+        setup_repo_postbuild(entry, repo_path, category)
 
     run_mini_agent(
         entry=entry,
@@ -67,7 +65,6 @@ def run_copilot(
     category: EvaluationCategoryOption,
     container_name: ContainerName,
     model: CopilotModel = "claude-haiku-4.5",
-    dataset_path: DatasetPath = _config.paths.dataset_path,
     repo_path: RepoPath = _config.paths.testbed_path,
     output_dir: OutputDir = _config.paths.evaluation_results_path,
     al_mcp: Annotated[bool, typer.Option("--al-mcp", help="Enable AL MCP server")] = False,
@@ -80,10 +77,10 @@ def run_copilot(
     Example:
         uv run bcbench run copilot microsoft__BCApps-5633 --category bug-fix --repo-path /path/to/BCApps
     """
-    entry: DatasetEntry = load_dataset_entries(dataset_path, entry_id=entry_id)[0]
-
+    entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0]
     setup_repo_prebuild(entry, repo_path)
-    setup_repo_postbuild(entry, repo_path, category)
+    if isinstance(entry, _BugFixTestGenBase):
+        setup_repo_postbuild(entry, repo_path, category)
 
     run_copilot_agent(entry=entry, repo_path=repo_path, model=model, category=category, output_dir=output_dir, al_mcp=al_mcp, container_name=container_name)
 
@@ -94,7 +91,6 @@ def run_claude(
     category: EvaluationCategoryOption,
     container_name: ContainerName,
     model: ClaudeCodeModel = "claude-haiku-4-5",
-    dataset_path: DatasetPath = _config.paths.dataset_path,
     repo_path: RepoPath = _config.paths.testbed_path,
     output_dir: OutputDir = _config.paths.evaluation_results_path,
     al_mcp: Annotated[bool, typer.Option("--al-mcp", help="Enable AL MCP server")] = False,
@@ -107,10 +103,10 @@ def run_claude(
     Example:
         uv run bcbench run claude microsoft__BCApps-5633 --category bug-fix --repo-path /path/to/BCApps
     """
-    entry: DatasetEntry = load_dataset_entries(dataset_path, entry_id=entry_id)[0]
-
+    entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0]
     setup_repo_prebuild(entry, repo_path)
-    setup_repo_postbuild(entry, repo_path, category)
+    if isinstance(entry, _BugFixTestGenBase):
+        setup_repo_postbuild(entry, repo_path, category)
 
     run_claude_code(entry=entry, repo_path=repo_path, model=model, category=category, output_dir=output_dir, al_mcp=al_mcp, container_name=container_name)
 
diff --git a/src/bcbench/config.py b/src/bcbench/config.py
index 06fdffd11..c9a35df69 100644
--- a/src/bcbench/config.py
+++ b/src/bcbench/config.py
@@ -35,14 +35,12 @@ class PathConfig:
     """File and directory paths."""
 
     bc_bench_root: Path
-    dataset_path: Path
     dataset_dir: Path
     problem_statement_dir: Path
     testbed_path: Path
     ps_script_path: Path
     evaluation_results_path: Path
     leaderboard_dir: Path
-    copilot_dir: Path
     agent_share_dir: Path
 
     @classmethod
@@ -51,13 +49,11 @@ def from_root(cls, root: Path) -> PathConfig:
         return cls(
             bc_bench_root=root,
             dataset_dir=root / "dataset",
-            dataset_path=root / "dataset" / "bcbench.jsonl",
             problem_statement_dir=root / "dataset" / "problemstatement",
             testbed_path=root.parent / "NAV",
             ps_script_path=root / "scripts",
             evaluation_results_path=root / "evaluation_results",
             leaderboard_dir=root / "docs" / "_data",
-            copilot_dir=root / "src" / "bcbench" / "agent" / "copilot",
             agent_share_dir=root / "src" / "bcbench" / "agent" / "shared",
         )
 
diff --git a/src/bcbench/dataset/__init__.py b/src/bcbench/dataset/__init__.py
index 6af72372f..4e6e205fa 100644
--- a/src/bcbench/dataset/__init__.py
+++ b/src/bcbench/dataset/__init__.py
@@ -1,12 +1,10 @@
 """Dataset module for querying, validating and analyze dataset entries."""
 
-from bcbench.dataset.dataset_entry import DatasetEntry, TestEntry
-from bcbench.dataset.dataset_loader import load_dataset_entries
-from bcbench.dataset.reviewer import run_dataset_reviewer
+from bcbench.dataset.dataset_entry import BaseDatasetEntry, BugFixEntry, TestEntry, TestGenEntry
 
 __all__ = [
-    "DatasetEntry",
+    "BaseDatasetEntry",
+    "BugFixEntry",
     "TestEntry",
-    "load_dataset_entries",
-    "run_dataset_reviewer",
+    "TestGenEntry",
 ]
diff --git a/src/bcbench/dataset/dataset_entry.py b/src/bcbench/dataset/dataset_entry.py
index 76a1361a0..2d073a156 100644
--- a/src/bcbench/dataset/dataset_entry.py
+++ b/src/bcbench/dataset/dataset_entry.py
@@ -2,16 +2,18 @@
 
 import json
 import re
+from abc import abstractmethod
 from pathlib import Path
-from typing import Annotated
+from typing import Annotated, Self
 
 from pydantic import BaseModel, ConfigDict, Field, model_validator
 
 from bcbench.config import get_config
+from bcbench.exceptions import EntryNotFoundError
 
 _config = get_config()
 
-__all__ = ["DatasetEntry", "TestEntry"]
+__all__ = ["BaseDatasetEntry", "BugFixEntry", "TestEntry", "TestGenEntry"]
 
 
 class TestEntry(BaseModel):
@@ -28,7 +30,9 @@ class EntryMetadata(BaseModel):
     image_count: int | None = None
 
 
-class DatasetEntry(BaseModel):
+class BaseDatasetEntry(BaseModel):
+    """Base class for all dataset entries. Contains common properties shared across categories."""
+
     model_config = ConfigDict(frozen=True, populate_by_name=True)
 
     metadata: EntryMetadata = Field(default_factory=EntryMetadata)
@@ -39,90 +43,116 @@ class DatasetEntry(BaseModel):
     created_at: Annotated[str, Field(min_length=1)]
     environment_setup_version: str = Field(pattern=r"^[0-9]{2}\.[0-9]{1}$")
     project_paths: Annotated[list[str], Field(min_length=2)]
-    fail_to_pass: Annotated[list[TestEntry], Field(alias="FAIL_TO_PASS", min_length=1)]
-    pass_to_pass: Annotated[list[TestEntry], Field(alias="PASS_TO_PASS")] = []
-    test_patch: Annotated[str, Field(min_length=1)]
     patch: Annotated[str, Field(min_length=1)]
 
-    @model_validator(mode="after")
-    def validate_baseapp_patches_are_w1_only(self) -> DatasetEntry:
-        """Validate that patches only modify files in the expected layer (currently W1).
+    @classmethod
+    def load(cls, dataset_path: Path, entry_id: str | None = None, random: int | None = None) -> list[Self]:
+        if not dataset_path.exists():
+            raise FileNotFoundError(f"Dataset file not found: {dataset_path}")
 
-        Only applicable to BaseApp, patches should only modify W1 layer files, not country-specific layers (IT, DE, etc.).
-        """
-        if self.extract_project_name() != "BaseApp":
-            return self
+        entries: list[Self] = []
 
-        # Check both patch and test_patch
-        for patch in (self.patch, self.test_patch):
-            patch_paths = re.findall(r"^diff --git a/(.+?) b/", patch, re.MULTILINE)
+        with open(dataset_path, encoding="utf-8") as file:
+            for line in file:
+                stripped_line: str = line.strip()
+                if not stripped_line:
+                    continue
 
-            for patch_path in patch_paths:
-                match = re.match(r"App/Layers/([^/]+)/", patch_path)
-                if match:
-                    layer = match.group(1)
-                    if layer != "W1":
-                        raise ValueError(f"Patch modifies non-W1 layer '{layer}': {patch_path}")
+                entry = cls.model_validate_json(stripped_line)
 
-        return self
+                if entry_id:
+                    if entry.instance_id == entry_id:
+                        return [entry]
+                    continue
 
-    @property
-    def problem_statement_dir(self) -> Path:
-        return _config.paths.problem_statement_dir / self.instance_id
+                entries.append(entry)
+
+        if entry_id:
+            raise EntryNotFoundError(entry_id)
+
+        if random is not None and random > 0:
+            import random as random_module
+
+            return random_module.sample(entries, min(random, len(entries)))
+
+        return entries
 
     def save_to_file(self, filepath: Path | str) -> None:
         path = Path(filepath)
         path.parent.mkdir(parents=True, exist_ok=True)
         with path.open("a", encoding="utf-8") as handle:
-            # For JSONL format, always write compact JSON without indentation
             json.dump(self.model_dump(by_alias=True, mode="json"), handle, ensure_ascii=False)
             handle.write("\n")
 
+    @abstractmethod
     def get_task(self, transform_image_paths: bool = False) -> str:
-        """Get the task description.
+        pass
 
-        problem_statment and hints_text stored in the README.md file under the problem statement directory.
+    @abstractmethod
+    def get_expected_output(self) -> str:
+        pass
 
-        Args:
-            transform_image_paths: Whether to transform relative image paths to include the problem statement directory. Needed when passing to Agents.
-        """
-        readme_path = self.problem_statement_dir / _config.file_patterns.problem_statement_readme
+    def extract_project_name(self) -> str:
+        if not self.project_paths:
+            return ""
+
+        path = self.project_paths[0]
+        parts = path.replace("\\", "/").split("/")
+
+        if len(parts) >= 4:
+            return parts[-2] if parts[-1].lower() in ("app", "test") else parts[-1]
+
+        return parts[-1] if parts else ""
 
+
+class _BugFixTestGenBase(BaseDatasetEntry):
+    """Shared schema for bug-fix and test-generation entries (same JSONL, different semantics)."""
+
+    fail_to_pass: Annotated[list[TestEntry], Field(alias="FAIL_TO_PASS", min_length=1)]
+    pass_to_pass: Annotated[list[TestEntry], Field(alias="PASS_TO_PASS")] = []
+    test_patch: Annotated[str, Field(min_length=1)]
+
+    @property
+    def problem_statement_dir(self) -> Path:
+        return _config.paths.problem_statement_dir / self.instance_id
+
+    def get_task(self, transform_image_paths: bool = False) -> str:
+        readme_path = self.problem_statement_dir / _config.file_patterns.problem_statement_readme
         content: str = readme_path.read_text(encoding="utf-8")
 
         if not transform_image_paths:
             return content
 
-        # Transform relative image paths: ![alt text](./diagram.png) -> ![alt text](problem/diagram.png)
         dest_dir = _config.file_patterns.problem_statement_dest_dir
         return re.sub(r"!\[([^\]]*)\]\(\./([^)]+)\)", rf"![\1]({dest_dir}/\2)", content)
 
-    def extract_project_name(self) -> str:
-        """Extract the project name from the first project path.
+    @model_validator(mode="after")
+    def validate_baseapp_patches_are_w1_only(self) -> Self:
+        if self.extract_project_name() != "BaseApp":
+            return self
 
-        Examples:
-            App\\Apps\\W1\\Sustainability\\app -> Sustainability
-            App\\Layers\\W1\\BaseApp -> BaseApp
-            src\\Apps\\W1\\Shopify\\App -> Shopify
+        for patch in (self.patch, self.test_patch):
+            patch_paths = re.findall(r"^diff --git a/(.+?) b/", patch, re.MULTILINE)
 
-        Returns:
-            The extracted project name, or empty string if no project paths.
-        """
-        if not self.project_paths:
-            return ""
+            for patch_path in patch_paths:
+                match = re.match(r"App/Layers/([^/]+)/", patch_path)
+                if match:
+                    layer = match.group(1)
+                    if layer != "W1":
+                        raise ValueError(f"Patch modifies non-W1 layer '{layer}': {patch_path}")
 
-        # Take the first project path
-        path = self.project_paths[0]
+        return self
 
-        # Split by backslash or forward slash
-        parts = path.replace("\\", "/").split("/")
 
-        # Look for the meaningful project name
-        # Pattern: App\Apps\W1\<ProjectName>\app or App\Layers\W1\<ProjectName>
-        if len(parts) >= 4:
-            # For paths like App\Apps\W1\Sustainability\app, return "Sustainability"
-            # For paths like App\Layers\W1\BaseApp, return "BaseApp"
-            return parts[-2] if parts[-1].lower() in ("app", "test") else parts[-1]
+class BugFixEntry(_BugFixTestGenBase):
+    """Dataset entry for the bug-fix category."""
 
-        # Fallback to the last meaningful part
-        return parts[-1] if parts else ""
+    def get_expected_output(self) -> str:
+        return self.patch
+
+
+class TestGenEntry(_BugFixTestGenBase):
+    """Dataset entry for the test-generation category."""
+
+    def get_expected_output(self) -> str:
+        return self.test_patch
diff --git a/src/bcbench/dataset/dataset_loader.py b/src/bcbench/dataset/dataset_loader.py
deleted file mode 100644
index 8eb1b3f12..000000000
--- a/src/bcbench/dataset/dataset_loader.py
+++ /dev/null
@@ -1,51 +0,0 @@
-"""Utilities for loading dataset entries from JSONL files."""
-
-from pathlib import Path
-
-from bcbench.dataset.dataset_entry import DatasetEntry
-from bcbench.exceptions import EntryNotFoundError
-
-__all__ = ["load_dataset_entries"]
-
-
-def load_dataset_entries(dataset_path: Path, entry_id: str | None = None, random: int | None = None) -> list[DatasetEntry]:
-    """
-    Load dataset entries from a JSONL file.
-
-    Examples:
-        # Load a single entry by ID
-        entries = load_dataset_entries(path, entry_id="NAV_12345")
-
-        # Load 2 random entries
-        entries = load_dataset_entries(path, random=2)
-    """
-    if not dataset_path.exists():
-        raise FileNotFoundError(f"Dataset file not found: {dataset_path}")
-
-    entries: list[DatasetEntry] = []
-
-    with open(dataset_path, encoding="utf-8") as file:
-        for line in file:
-            stripped_line: str = line.strip()
-            if not stripped_line:
-                continue
-
-            entry = DatasetEntry.model_validate_json(stripped_line)
-
-            # If searching for specific entry_id, return immediately when found
-            if entry_id:
-                if entry.instance_id == entry_id:
-                    return [entry]
-                continue
-
-            entries.append(entry)
-
-    if entry_id:
-        raise EntryNotFoundError(entry_id)
-
-    if random is not None and random > 0:
-        import random as random_module
-
-        return random_module.sample(entries, min(random, len(entries)))
-
-    return entries
diff --git a/src/bcbench/dataset/reviewer.py b/src/bcbench/dataset/reviewer.py
deleted file mode 100644
index 18dfc0123..000000000
--- a/src/bcbench/dataset/reviewer.py
+++ /dev/null
@@ -1,234 +0,0 @@
-"""
-Dataset Review TUI for browsing dataset entries.
-
-Shows dataset entry info and problem statement. Use arrow keys to navigate.
-If results directory is provided, shows resolution stats across runs.
-
-Copilot wrote all of this code, seems to work, haven't read it.
-"""
-
-import json
-from pathlib import Path
-
-from textual.app import App, ComposeResult
-from textual.binding import Binding
-from textual.containers import Horizontal, Vertical, VerticalScroll
-from textual.widgets import Footer, Header, Label, Static
-
-from bcbench.dataset import DatasetEntry, load_dataset_entries
-
-
-class DatasetReviewer(App):
-    """TUI for reviewing dataset entries."""
-
-    TITLE = "Dataset Review"
-
-    CSS = """
-    #content-container {
-        height: 1fr;
-    }
-    .content-panel {
-        height: 1fr;
-        border: solid blue;
-    }
-    #info-panel {
-        height: auto;
-        max-height: 12;
-    }
-    #info-columns {
-        height: auto;
-    }
-    .info-column {
-        width: 1fr;
-        height: auto;
-    }
-    #problem-panel {
-        border: solid green;
-    }
-    """
-
-    BINDINGS = [  # noqa: RUF012
-        Binding("right", "next_item", "Next →"),
-        Binding("left", "prev_item", "← Prev"),
-        Binding("escape", "quit", "Quit"),
-    ]
-
-    def __init__(self, dataset_path: Path, results_dir: Path | None = None):
-        super().__init__()
-        self.dataset_path = dataset_path
-        self.results_dir = results_dir
-        self.entries: list[DatasetEntry] = []
-        self.current_index = 0
-        self.resolution_cache: dict[str, tuple[int, int, list[str]]] = {}  # instance_id -> (resolved, total, failure_categories)
-
-    def compose(self) -> ComposeResult:
-        yield Header()
-        yield Label("", id="progress")
-        with Vertical(id="content-container"):
-            with VerticalScroll(id="info-panel", classes="content-panel"), Horizontal(id="info-columns"):
-                yield Static("", id="info-left", classes="info-column")
-                yield Static("", id="info-right", classes="info-column")
-            with VerticalScroll(id="problem-panel", classes="content-panel"):
-                yield Static("", id="problem-content")
-        yield Footer()
-
-    def on_mount(self) -> None:
-        self._load_data()
-        self._update_display()
-
-    def _load_data(self) -> None:
-        self.entries = load_dataset_entries(self.dataset_path)
-        if self.results_dir:
-            self._load_resolution_stats()
-
-    def _load_resolution_stats(self) -> None:
-        if not self.results_dir or not self.results_dir.exists():
-            return
-
-        for jsonl_file in self.results_dir.glob("*.jsonl"):
-            with open(jsonl_file, encoding="utf-8") as f:
-                for line in f:
-                    if not line.strip():
-                        continue
-                    try:
-                        result = json.loads(line)
-                        instance_id = result.get("instance_id") or result.get("InstanceID", "")
-                        if not instance_id:
-                            continue
-
-                        scores = result.get("scores", {})
-                        resolved = scores.get("ResolutionRate", 0) == 1
-
-                        if instance_id not in self.resolution_cache:
-                            self.resolution_cache[instance_id] = (0, 0, [])
-
-                        current_resolved, current_total, failure_cats = self.resolution_cache[instance_id]
-
-                        # Collect failure category if not resolved
-                        if not resolved:
-                            failure_cat = result.get("review", {}).get("failure_category")
-                            if failure_cat:
-                                failure_cats = [*failure_cats, failure_cat]
-
-                        self.resolution_cache[instance_id] = (
-                            current_resolved + (1 if resolved else 0),
-                            current_total + 1,
-                            failure_cats,
-                        )
-                    except json.JSONDecodeError:
-                        continue
-
-    def _get_current_entry(self) -> DatasetEntry | None:
-        if not self.entries or self.current_index >= len(self.entries):
-            return None
-        return self.entries[self.current_index]
-
-    def _get_resolution_text(self, instance_id: str) -> str:
-        if not self.results_dir:
-            return ""
-        if instance_id not in self.resolution_cache:
-            return " | [dim]No runs found[/dim]"
-        resolved, total, failure_cats = self.resolution_cache[instance_id]
-        if resolved == total:
-            return f" | [green]{resolved}/{total} resolved[/green]"
-
-        # Build failure category summary
-        failure_text = ""
-        if failure_cats:
-            from collections import Counter
-
-            counts = Counter(failure_cats)
-            failure_text = " | " + ", ".join(f"{cat}: {cnt}" for cat, cnt in counts.most_common())
-
-        if resolved == 0:
-            return f" | [red]{resolved}/{total} resolved[/red]{failure_text}"
-        return f" | [yellow]{resolved}/{total} resolved[/yellow]{failure_text}"
-
-    def _format_entry_info_left(self, entry: DatasetEntry) -> str:
-        lines = [
-            f"[cyan bold]Repo:[/cyan bold] {entry.repo}",
-            f"[cyan bold]Instance ID:[/cyan bold] {entry.instance_id}",
-            f"[cyan bold]Base Commit:[/cyan bold] {entry.base_commit}",
-            f"[cyan bold]Created At:[/cyan bold] {entry.created_at}",
-            f"[cyan bold]Version:[/cyan bold] {entry.environment_setup_version}",
-        ]
-
-        # Project Paths
-        lines.append("")
-        lines.append("[cyan bold]Project Paths:[/cyan bold]")
-        for path in entry.project_paths:
-            lines.append(f"  {path}")
-
-        return "\n".join(lines)
-
-    def _format_entry_info_right(self, entry: DatasetEntry) -> str:
-        lines = []
-
-        # Metadata
-        metadata = entry.metadata.model_dump()
-        if any(v is not None for v in metadata.values()):
-            lines.append("[bold]Metadata:[/bold]")
-            for field_name, field_value in metadata.items():
-                if field_value is not None:
-                    display_name = field_name.replace("_", " ").title()
-                    lines.append(f"  [dim]{display_name}:[/dim] {field_value}")
-            lines.append("")
-
-        # FAIL_TO_PASS tests
-        lines.append("[bold]FAIL_TO_PASS:[/bold]")
-        if entry.fail_to_pass:
-            for test in entry.fail_to_pass:
-                lines.append(f"  [magenta]{test.codeunitID}[/magenta]: {', '.join(test.functionName)}")
-        else:
-            lines.append("  [dim]None[/dim]")
-
-        # PASS_TO_PASS tests
-        lines.append("")
-        lines.append("[bold]PASS_TO_PASS:[/bold]")
-        if entry.pass_to_pass:
-            for test in entry.pass_to_pass:
-                lines.append(f"  [magenta]{test.codeunitID}[/magenta]: {', '.join(test.functionName)}")
-        else:
-            lines.append("  [dim]None[/dim]")
-
-        return "\n".join(lines)
-
-    def _update_display(self) -> None:
-        entry = self._get_current_entry()
-        if not entry:
-            self.query_one("#progress", Label).update("No entries in dataset")
-            return
-
-        resolution_text = self._get_resolution_text(entry.instance_id)
-        progress_text = f"[{self.current_index + 1}/{len(self.entries)}]{resolution_text} | {entry.instance_id}"
-        self.query_one("#progress", Label).update(progress_text)
-
-        info_panel = self.query_one("#info-panel", VerticalScroll)
-        problem_panel = self.query_one("#problem-panel", VerticalScroll)
-        info_panel.border_title = "Entry Information"
-        problem_panel.border_title = "Problem Statement"
-
-        self.query_one("#info-left", Static).update(self._format_entry_info_left(entry))
-        self.query_one("#info-right", Static).update(self._format_entry_info_right(entry))
-        self.query_one("#problem-content", Static).update(entry.get_task())
-
-        info_panel.scroll_home(animate=False)
-        problem_panel.scroll_home(animate=False)
-
-    def action_next_item(self) -> None:
-        if self.current_index < len(self.entries) - 1:
-            self.current_index += 1
-            self._update_display()
-
-    def action_prev_item(self) -> None:
-        if self.current_index > 0:
-            self.current_index -= 1
-            self._update_display()
-
-    def action_quit(self) -> None:
-        self.exit()
-
-
-def run_dataset_reviewer(dataset_path: Path, results_dir: Path | None = None) -> None:
-    app = DatasetReviewer(dataset_path, results_dir)
-    app.run()
diff --git a/src/bcbench/evaluate/__init__.py b/src/bcbench/evaluate/__init__.py
index 79c109648..9a29b5f09 100644
--- a/src/bcbench/evaluate/__init__.py
+++ b/src/bcbench/evaluate/__init__.py
@@ -1,5 +1,7 @@
 """Evaluation module for running pipelines and creating results."""
 
-from bcbench.evaluate.base import EvaluationPipeline, create_pipeline
+from bcbench.evaluate.base import EvaluationPipeline
+from bcbench.evaluate.bugfix import BugFixPipeline
+from bcbench.evaluate.testgeneration import TestGenerationPipeline
 
-__all__ = ["EvaluationPipeline", "create_pipeline"]
+__all__ = ["BugFixPipeline", "EvaluationPipeline", "TestGenerationPipeline"]
diff --git a/src/bcbench/evaluate/base.py b/src/bcbench/evaluate/base.py
index 87ec251b2..8c2dbb2d6 100644
--- a/src/bcbench/evaluate/base.py
+++ b/src/bcbench/evaluate/base.py
@@ -1,19 +1,22 @@
+from __future__ import annotations
+
 from abc import ABC, abstractmethod
 from collections.abc import Callable
 
 from bcbench.config import get_config
+from bcbench.dataset import BaseDatasetEntry
 from bcbench.exceptions import AgentTimeoutError
 from bcbench.logger import get_logger
 from bcbench.results import BaseEvaluationResult
-from bcbench.types import AgentMetrics, EvaluationCategory, EvaluationContext, ExperimentConfiguration
+from bcbench.types import AgentMetrics, EvaluationContext, ExperimentConfiguration
 
 logger = get_logger(__name__)
 _config = get_config()
 
-__all__ = ["EvaluationPipeline", "create_pipeline"]
+__all__ = ["EvaluationPipeline"]
 
 
-class EvaluationPipeline(ABC):
+class EvaluationPipeline[E: BaseDatasetEntry](ABC):
     """Abstract base class for evaluation pipelines.
 
     Subclasses implement category-specific setup, agent execution, and validation logic.
@@ -21,7 +24,7 @@ class EvaluationPipeline(ABC):
     """
 
     @abstractmethod
-    def setup(self, context: EvaluationContext) -> None:
+    def setup(self, context: EvaluationContext[E]) -> None:
         """Setup environment: e.g. clean repo, checkout base commit, initial build.
 
         Args:
@@ -33,7 +36,7 @@ def setup(self, context: EvaluationContext) -> None:
         raise NotImplementedError()
 
     @abstractmethod
-    def run_agent(self, context: EvaluationContext, agent_runner: Callable) -> None:
+    def run_agent(self, context: EvaluationContext[E], agent_runner: Callable) -> None:
         """Run the agent and capture metrics.
 
         Args:
@@ -46,7 +49,7 @@ def run_agent(self, context: EvaluationContext, agent_runner: Callable) -> None:
         raise NotImplementedError()
 
     @abstractmethod
-    def evaluate(self, context: EvaluationContext) -> None:
+    def evaluate(self, context: EvaluationContext[E]) -> None:
         """Evaluate results: e.g. apply patches, build, run tests.
 
         Implementation should raise category-specific exceptions on failure.
@@ -61,8 +64,8 @@ def evaluate(self, context: EvaluationContext) -> None:
 
     def execute(
         self,
-        context: EvaluationContext,
-        agent_runner: Callable[[EvaluationContext], tuple[AgentMetrics | None, ExperimentConfiguration | None]],
+        context: EvaluationContext[E],
+        agent_runner: Callable[[EvaluationContext[E]], tuple[AgentMetrics | None, ExperimentConfiguration | None]],
     ) -> None:
         """Template method orchestrating the evaluation flow.
 
@@ -90,7 +93,7 @@ def execute(
 
         self.evaluate(context)
 
-    def save_result(self, context: EvaluationContext, result: BaseEvaluationResult) -> None:
+    def save_result(self, context: EvaluationContext[E], result: BaseEvaluationResult) -> None:
         """Save result directly using result object.
 
         Args:
@@ -99,20 +102,3 @@ def save_result(self, context: EvaluationContext, result: BaseEvaluationResult)
         """
 
         result.save(context.result_dir, f"{context.entry.instance_id}{_config.file_patterns.result_pattern}")
-
-
-def create_pipeline(category: EvaluationCategory) -> EvaluationPipeline:
-    """Factory function to create evaluation pipeline based on category."""
-    from bcbench.evaluate.bugfix import BugFixPipeline
-    from bcbench.evaluate.testgeneration import TestGenerationPipeline
-
-    match category:
-        case EvaluationCategory.BUG_FIX:
-            logger.info(f"Using BugFixPipeline for category: {category}")
-            return BugFixPipeline()
-        case EvaluationCategory.TEST_GENERATION:
-            logger.info(f"Using TestGenerationPipeline for category: {category}")
-            return TestGenerationPipeline()
-        case _:
-            raise ValueError(f"Unknown evaluation category: {category}")
-    raise RuntimeError("Unreachable: no pipeline returned")
diff --git a/src/bcbench/evaluate/bugfix.py b/src/bcbench/evaluate/bugfix.py
index 310d455d1..9df7eee67 100644
--- a/src/bcbench/evaluate/bugfix.py
+++ b/src/bcbench/evaluate/bugfix.py
@@ -1,5 +1,6 @@
 from collections.abc import Callable
 
+from bcbench.dataset import BugFixEntry
 from bcbench.evaluate.base import EvaluationPipeline
 from bcbench.exceptions import BuildError, TestExecutionError
 from bcbench.logger import get_logger, github_log_group
@@ -21,42 +22,30 @@
 __all__ = ["BugFixPipeline"]
 
 
-class BugFixPipeline(EvaluationPipeline):
-    """Pipeline for bug-fix evaluation category.
+class BugFixPipeline(EvaluationPipeline[BugFixEntry]):
+    """Pipeline for bug-fix evaluation category."""
 
-    Workflow:
-    1. Setup: clean repo, checkout base commit, copy problem statement, build
-    2. Run agent: execute agent to generate fix patch
-    3. evaluate: apply test patch, build, run tests
-    """
-
-    def setup(self, context: EvaluationContext) -> None:
+    def setup(self, context: EvaluationContext[BugFixEntry]) -> None:
         setup_repo_prebuild(context.entry, context.repo_path)
 
         build_and_publish_projects(
             context.repo_path,
             context.entry.project_paths,
-            context.container_name,
-            context.username,
-            context.password,
+            context.get_container(),
             context.entry.environment_setup_version,
         )
 
         setup_repo_postbuild(context.entry, context.repo_path, context.category)
 
-    def run_agent(self, context: EvaluationContext, agent_runner: Callable) -> None:
+    def run_agent(self, context: EvaluationContext[BugFixEntry], agent_runner: Callable) -> None:
         with github_log_group(f"{context.agent_name} -- Entry: {context.entry.instance_id}"):
             context.metrics, context.experiment = agent_runner(context)
 
-    def evaluate(self, context: EvaluationContext) -> None:
-        """Apply test patch, build, and run tests.
-
-        Creates and saves appropriate result based on validation outcome.
-        """
+    def evaluate(self, context: EvaluationContext[BugFixEntry]) -> None:
+        container = context.get_container()
         test_projects, _app_projects = categorize_projects(context.entry.project_paths)
 
         # Clean test projects to revert any unintended agent changes before capturing diff
-        # Evaluation focuses on valid changes (app code), treating unintended modifications as out-of-scope noise
         clean_project_paths(context.repo_path, test_projects)
 
         generated_patch = stage_and_get_diff(context.repo_path)
@@ -67,12 +56,10 @@ def evaluate(self, context: EvaluationContext) -> None:
             build_and_publish_projects(
                 context.repo_path,
                 context.entry.project_paths,
-                context.container_name,
-                context.username,
-                context.password,
+                container,
                 context.entry.environment_setup_version,
             )
-            run_tests(context.entry, context.container_name, context.username, context.password)
+            run_tests(context.entry, container)
 
             result = BugFixResult.create_success(context, generated_patch)
             logger.info(f"Successfully completed {context.entry.instance_id}")
diff --git a/src/bcbench/evaluate/testgeneration.py b/src/bcbench/evaluate/testgeneration.py
index 0a8758ab2..11642dee1 100644
--- a/src/bcbench/evaluate/testgeneration.py
+++ b/src/bcbench/evaluate/testgeneration.py
@@ -1,7 +1,7 @@
 from collections.abc import Callable
 
 from bcbench.collection.patch_utils import extract_file_paths_from_patch
-from bcbench.dataset import TestEntry
+from bcbench.dataset import TestEntry, TestGenEntry
 from bcbench.evaluate.base import EvaluationPipeline
 from bcbench.exceptions import BuildError, NoTestsExtractedError, TestExecutionError
 from bcbench.logger import get_logger, github_log_group
@@ -24,42 +24,27 @@
 __all__ = ["TestGenerationPipeline"]
 
 
-class TestGenerationPipeline(EvaluationPipeline):
-    """Pipeline for test-generation evaluation category.
+class TestGenerationPipeline(EvaluationPipeline[TestGenEntry]):
+    """Pipeline for test-generation evaluation category."""
 
-    Workflow:
-    1. Setup: clean repo, checkout base commit, build, then apply gold patch/problem statement
-    2. Run agent: execute agent to generate test code
-    3. Evaluate: build, run tests with expected failures, then apply original patch, build, run tests with expected passes
-
-    Input modes (configured in copilot/config.yaml):
-    - problem-statement: Agent receives bug description, generates tests from problem statement
-    - gold-patch: Agent sees the fixed code, generates tests that verify the fix works
-
-    Note: For test-generation, we build BEFORE applying the gold patch so the published app
-    doesn't include the fix. The agent sees the fix in source but tests run against unfixed app.
-    """
-
-    def setup(self, context: EvaluationContext) -> None:
+    def setup(self, context: EvaluationContext[TestGenEntry]) -> None:
         setup_repo_prebuild(context.entry, context.repo_path)
 
         build_and_publish_projects(
             context.repo_path,
             context.entry.project_paths,
-            context.container_name,
-            context.username,
-            context.password,
+            context.get_container(),
             context.entry.environment_setup_version,
         )
 
-        # Apply gold patch / problem statement AFTER build so fix isn't included in published app
         setup_repo_postbuild(context.entry, context.repo_path, context.category)
 
-    def run_agent(self, context: EvaluationContext, agent_runner: Callable) -> None:
+    def run_agent(self, context: EvaluationContext[TestGenEntry], agent_runner: Callable) -> None:
         with github_log_group(f"{context.agent_name} -- Entry: {context.entry.instance_id}"):
             context.metrics, context.experiment = agent_runner(context)
 
-    def evaluate(self, context: EvaluationContext) -> None:
+    def evaluate(self, context: EvaluationContext[TestGenEntry]) -> None:
+        container = context.get_container()
         test_projects, app_projects = categorize_projects(context.entry.project_paths)
 
         # Clean app projects to revert any unintended agent changes before capturing diff
@@ -83,24 +68,20 @@ def evaluate(self, context: EvaluationContext) -> None:
             build_and_publish_projects(
                 context.repo_path,
                 test_projects,
-                context.container_name,
-                context.username,
-                context.password,
+                container,
                 context.entry.environment_setup_version,
             )
-            run_test_suite(generated_tests, "Fail", context.container_name, context.username, context.password)
+            run_test_suite(generated_tests, "Fail", container)
 
             apply_patch(context.repo_path, context.entry.patch, f"{context.entry.instance_id} patch")
 
             build_and_publish_projects(
                 context.repo_path,
                 app_projects,
-                context.container_name,
-                context.username,
-                context.password,
+                container,
                 context.entry.environment_setup_version,
             )
-            run_test_suite(generated_tests, "Pass", context.container_name, context.username, context.password)
+            run_test_suite(generated_tests, "Pass", container)
 
             result = TestGenerationResult.create_success(context, generated_patch, pre_patch_failed=True, post_patch_passed=True)
             logger.info(f"Successfully completed {context.entry.instance_id}")
diff --git a/src/bcbench/operations/bc_operations.py b/src/bcbench/operations/bc_operations.py
index 76baa631b..cca41e654 100644
--- a/src/bcbench/operations/bc_operations.py
+++ b/src/bcbench/operations/bc_operations.py
@@ -8,9 +8,11 @@
 from pydantic import TypeAdapter
 
 from bcbench.config import get_config
-from bcbench.dataset import DatasetEntry, TestEntry
+from bcbench.dataset import TestEntry
+from bcbench.dataset.dataset_entry import _BugFixTestGenBase
 from bcbench.exceptions import BuildError, BuildTimeoutExpired, TestExecutionError, TestExecutionTimeoutExpired
 from bcbench.logger import get_logger
+from bcbench.types import ContainerConfig
 
 logger = get_logger(__name__)
 _config = get_config()
@@ -116,7 +118,7 @@ def build_ps_dataset_tests_script(container_name: str, username: str, password:
     )
 
 
-def build_and_publish_projects(repo_path: Path, project_paths: list[str], container_name: str, username: str, password: str, version: str) -> None:
+def build_and_publish_projects(repo_path: Path, project_paths: list[str], container: ContainerConfig, version: str) -> None:
     """Build and publish all projects."""
     logger.info(f"Building and publishing {len(project_paths)} projects")
 
@@ -125,9 +127,9 @@ def build_and_publish_projects(repo_path: Path, project_paths: list[str], contai
         logger.info(f"Building project: {full_project_path}")
 
         ps_script = build_ps_app_build_and_publish(
-            container_name=container_name,
-            username=username,
-            password=password,
+            container_name=container.name,
+            username=container.username,
+            password=container.password,
             project_path=full_project_path,
             version=version,
         )
@@ -157,26 +159,26 @@ def build_and_publish_projects(repo_path: Path, project_paths: list[str], contai
     logger.info("All projects built and published")
 
 
-def run_tests(entry: DatasetEntry, container_name: str, username: str, password: str) -> None:
+def run_tests(entry: _BugFixTestGenBase, container: ContainerConfig) -> None:
     if entry.fail_to_pass:
         logger.info(f"Running {len(entry.fail_to_pass)} fail-to-pass tests")
-        run_test_suite(entry.fail_to_pass, "Pass", container_name, username, password)
+        run_test_suite(entry.fail_to_pass, "Pass", container)
 
     if entry.pass_to_pass:
         logger.info(f"Running {len(entry.pass_to_pass)} pass-to-pass tests")
-        run_test_suite(entry.pass_to_pass, "Pass", container_name, username, password)
+        run_test_suite(entry.pass_to_pass, "Pass", container)
 
     logger.info("All tests completed")
 
 
-def run_test_suite(test_entries: list[TestEntry], expectation: Literal["Pass", "Fail"], container_name: str, username: str, password: str) -> None:
+def run_test_suite(test_entries: list[TestEntry], expectation: Literal["Pass", "Fail"], container: ContainerConfig) -> None:
     """Run a suite of tests."""
     test_entries_json: str = TypeAdapter(list[TestEntry]).dump_json(test_entries).decode()
 
     ps_script = build_ps_dataset_tests_script(
-        container_name=container_name,
-        username=username,
-        password=password,
+        container_name=container.name,
+        username=container.username,
+        password=container.password,
         test_entries_json=test_entries_json,
         expectation=expectation,
     )
diff --git a/src/bcbench/operations/instruction_operations.py b/src/bcbench/operations/instruction_operations.py
index 761f28c02..a7120267f 100644
--- a/src/bcbench/operations/instruction_operations.py
+++ b/src/bcbench/operations/instruction_operations.py
@@ -2,7 +2,8 @@
 from shutil import copytree, rmtree
 
 from bcbench.config import get_config
-from bcbench.dataset import DatasetEntry
+from bcbench.dataset import BaseDatasetEntry
+from bcbench.dataset.dataset_entry import _BugFixTestGenBase
 from bcbench.logger import get_logger
 from bcbench.types import AgentType
 
@@ -10,7 +11,7 @@
 _config = get_config()
 
 
-def setup_instructions_from_config(agent_config: dict, entry: DatasetEntry, repo_path: Path, agent_type: AgentType) -> bool:
+def setup_instructions_from_config(agent_config: dict, entry: BaseDatasetEntry, repo_path: Path, agent_type: AgentType) -> bool:
     """
     Setup custom instructions from config if enabled.
 
@@ -47,7 +48,7 @@ def setup_instructions_from_config(agent_config: dict, entry: DatasetEntry, repo
     return instructions_enabled
 
 
-def setup_custom_agent(agent_config: dict, entry: DatasetEntry, repo_path: Path, agent_type: AgentType) -> str | None:
+def setup_custom_agent(agent_config: dict, entry: BaseDatasetEntry, repo_path: Path, agent_type: AgentType) -> str | None:
     """
     Setup custom agents in the repository if available.
     """
@@ -83,7 +84,7 @@ def _get_source_instructions_path(repo_name: str) -> Path:
     return instructions_path
 
 
-def copy_problem_statement_folder(entry: DatasetEntry, repo_path: Path) -> None:
+def copy_problem_statement_folder(entry: _BugFixTestGenBase, repo_path: Path) -> None:
     """
     Copy problem statement folder to the testbed repository root.
 
diff --git a/src/bcbench/operations/setup_operations.py b/src/bcbench/operations/setup_operations.py
index f2924ddf2..bc502ff9f 100644
--- a/src/bcbench/operations/setup_operations.py
+++ b/src/bcbench/operations/setup_operations.py
@@ -5,7 +5,7 @@
 import yaml
 
 from bcbench.config import get_config
-from bcbench.dataset import DatasetEntry
+from bcbench.dataset.dataset_entry import BaseDatasetEntry, _BugFixTestGenBase
 from bcbench.logger import get_logger
 from bcbench.operations.git_operations import apply_patch, checkout_commit, clean_repo
 from bcbench.operations.instruction_operations import copy_problem_statement_folder
@@ -37,7 +37,7 @@ def _get_test_generation_input_mode() -> str:
     return input_mode
 
 
-def setup_repo_prebuild(entry: DatasetEntry, repo_path: Path) -> None:
+def setup_repo_prebuild(entry: BaseDatasetEntry, repo_path: Path) -> None:
     """Setup repository before building - clean and checkout base commit.
 
     This is the first phase of repo setup that should be called BEFORE build_and_publish_projects.
@@ -51,17 +51,14 @@ def setup_repo_prebuild(entry: DatasetEntry, repo_path: Path) -> None:
     checkout_commit(repo_path, entry.base_commit)
 
 
-def setup_repo_postbuild(entry: DatasetEntry, repo_path: Path, category: EvaluationCategory) -> None:
-    """Setup repository after building - apply patches and copy problem statements.
+def setup_repo_postbuild(entry: _BugFixTestGenBase, repo_path: Path, category: EvaluationCategory) -> None:
+    """Setup repository after building for bug-fix and test-generation categories.
 
     This is the second phase of repo setup that should be called AFTER build_and_publish_projects.
     For test-generation, this ensures the gold patch is applied only after the base code is built,
     so the agent sees the fixed code but tests run against the unfixed published app.
 
-    Args:
-        entry: Dataset entry with instance metadata
-        repo_path: Path to the repository
-        category: Evaluation category (bug-fix or test-generation)
+    Note: Other categories should implement their own postbuild setup.
     """
     if category == EvaluationCategory.TEST_GENERATION:
         input_mode: str = _get_test_generation_input_mode()
diff --git a/src/bcbench/operations/skills_operations.py b/src/bcbench/operations/skills_operations.py
index da327eaa8..67f25d219 100644
--- a/src/bcbench/operations/skills_operations.py
+++ b/src/bcbench/operations/skills_operations.py
@@ -1,7 +1,7 @@
 from pathlib import Path
 from shutil import copytree, rmtree
 
-from bcbench.dataset.dataset_entry import DatasetEntry
+from bcbench.dataset.dataset_entry import BaseDatasetEntry
 from bcbench.logger import get_logger
 from bcbench.operations.instruction_operations import _get_source_instructions_path
 from bcbench.types import AgentType
@@ -9,7 +9,7 @@
 logger = get_logger(__name__)
 
 
-def setup_agent_skills(agent_config: dict, entry: DatasetEntry, repo_path: Path, agent_type: AgentType) -> bool:
+def setup_agent_skills(agent_config: dict, entry: BaseDatasetEntry, repo_path: Path, agent_type: AgentType) -> bool:
     """
     Setup skills in the repository if available.
 
diff --git a/src/bcbench/results/base.py b/src/bcbench/results/base.py
index 8e9783727..fffe9beba 100644
--- a/src/bcbench/results/base.py
+++ b/src/bcbench/results/base.py
@@ -2,7 +2,7 @@
 
 import json
 from pathlib import Path
-from typing import Any, TypeVar
+from typing import Any, Self
 
 from pydantic import BaseModel
 
@@ -11,9 +11,6 @@
 
 logger = get_logger(__name__)
 
-# Type variable for proper return type hints in class methods
-T = TypeVar("T", bound="BaseEvaluationResult")
-
 
 class BaseEvaluationResult(BaseModel):
     """Base class for all evaluation results with shared metrics across categories."""
@@ -36,14 +33,14 @@ class BaseEvaluationResult(BaseModel):
 
     @classmethod
     def _create_from_context(
-        cls: type[T],
+        cls,
         context: "EvaluationContext",
         resolved: bool,
         build: bool,
         error_message: str | None = None,
         generated_patch: str = "",
         **kwargs: Any,
-    ) -> T:
+    ) -> Self:
         """Create result from EvaluationContext with validation and metric extraction.
 
         Args:
@@ -80,19 +77,19 @@ def _create_from_context(
         )
 
     @classmethod
-    def create_success(cls: type[T], context: "EvaluationContext", generated_patch: str, **kwargs: Any) -> T:
+    def create_success(cls, context: "EvaluationContext", generated_patch: str, **kwargs: Any) -> Self:
         return cls._create_from_context(context, resolved=True, build=True, generated_patch=generated_patch, **kwargs)
 
     @classmethod
-    def create_build_failure(cls: type[T], context: "EvaluationContext", generated_patch: str, error_msg: str, **kwargs: Any) -> T:
+    def create_build_failure(cls, context: "EvaluationContext", generated_patch: str, error_msg: str, **kwargs: Any) -> Self:
         return cls._create_from_context(context, resolved=False, build=False, error_message=error_msg, generated_patch=generated_patch, **kwargs)
 
     @classmethod
-    def create_test_failure(cls: type[T], context: "EvaluationContext", generated_patch: str, error_msg: str = "Tests failed", **kwargs: Any) -> T:
+    def create_test_failure(cls, context: "EvaluationContext", generated_patch: str, error_msg: str = "Tests failed", **kwargs: Any) -> Self:
         return cls._create_from_context(context, resolved=False, build=True, error_message=error_msg, generated_patch=generated_patch, **kwargs)
 
     @classmethod
-    def create_agent_timeout_failure(cls: type[T], context: "EvaluationContext", **kwargs: Any) -> T:
+    def create_agent_timeout_failure(cls, context: "EvaluationContext", **kwargs: Any) -> Self:
         return cls._create_from_context(context, resolved=False, build=False, timeout=True, error_message="Agent timed out", **kwargs)
 
     def save(self, output_dir: Path, result_file: str) -> None:
diff --git a/src/bcbench/results/bceval_export.py b/src/bcbench/results/bceval_export.py
index 043e0f84d..eadb79fb0 100644
--- a/src/bcbench/results/bceval_export.py
+++ b/src/bcbench/results/bceval_export.py
@@ -6,7 +6,7 @@
 from pathlib import Path
 from typing import Any
 
-from bcbench.dataset import DatasetEntry, load_dataset_entries
+from bcbench.dataset import BaseDatasetEntry
 from bcbench.logger import get_logger
 from bcbench.results.base import BaseEvaluationResult
 from bcbench.results.testgeneration import TestGenerationResult
@@ -15,9 +15,10 @@
 logger = get_logger(__name__)
 
 
-def write_bceval_results(results: list[BaseEvaluationResult], out_dir: Path, run_id: str, dataset_path: Path, output_filename: str) -> None:
+def write_bceval_results(results: list[BaseEvaluationResult], out_dir: Path, run_id: str, output_filename: str, category: EvaluationCategory) -> None:
     """Write results into a JSONL file for bceval consumption."""
-    dataset_entries: list[DatasetEntry] = load_dataset_entries(dataset_path)
+    entry_cls = category.entry_class
+    dataset_entries: list[BaseDatasetEntry] = entry_cls.load(category.dataset_path)
 
     output_file = out_dir / output_filename
     with open(output_file, "w") as f:
@@ -28,7 +29,8 @@ def write_bceval_results(results: list[BaseEvaluationResult], out_dir: Path, run
                 logger.error(f"No matching dataset entry found for instance_id: {result.instance_id}")
                 continue
 
-            input, expected = get_info_from_dataset_entry(matching_entries[0], result.category)
+            matched_entry = matching_entries[0]
+            input, expected = matched_entry.get_task(), matched_entry.get_expected_output()
 
             metadata: dict[str, Any] = {
                 "model": result.model,
@@ -61,22 +63,3 @@ def write_bceval_results(results: list[BaseEvaluationResult], out_dir: Path, run
             f.write(json.dumps(bceval_result) + "\n")
 
     logger.info(f"Wrote bceval results to: {output_file}")
-
-
-def get_info_from_dataset_entry(entry: DatasetEntry, category: EvaluationCategory) -> tuple[str, str]:
-    """
-    Extract relevant info from DatasetEntry for bceval results.
-
-    Args:
-        entry: The DatasetEntry instance
-        category: The evaluation category
-    Returns:
-        A tuple of (input, expected output)
-    """
-    match category:
-        case EvaluationCategory.BUG_FIX:
-            return entry.get_task(), entry.patch
-        case EvaluationCategory.TEST_GENERATION:
-            return entry.get_task(), entry.test_patch
-        case _:
-            raise ValueError(f"Unsupported evaluation category: {category}")
diff --git a/src/bcbench/results/reviewer.py b/src/bcbench/results/reviewer.py
deleted file mode 100644
index 4b5d760da..000000000
--- a/src/bcbench/results/reviewer.py
+++ /dev/null
@@ -1,376 +0,0 @@
-"""
-Failure Mode Analysis TUI for reviewing evaluation results.
-
-Shows expected vs actual diffs side-by-side. Use arrow keys to navigate,
-number keys to select failure category. Press Escape to quit.
-
-Copilot wrote most of this code, seems to work, haven't read it all carefully.
-"""
-
-import json
-from abc import abstractmethod
-from pathlib import Path
-
-from textual.app import App, ComposeResult
-from textual.binding import Binding
-from textual.containers import Horizontal, VerticalScroll
-from textual.widgets import Footer, Header, Label, Static
-
-from bcbench.types import EvaluationCategory
-
-
-def _get_patch_field(category: EvaluationCategory) -> str:
-    return "patch" if category == EvaluationCategory.BUG_FIX else "test_patch"
-
-
-def _get_expected_panel_title(category: EvaluationCategory) -> str:
-    return "Expected (Gold)" if category == EvaluationCategory.BUG_FIX else "Expected (Test Patch)"
-
-
-# Failure categories (name, description)
-FAILURE_CATEGORIES = [
-    ("Wrong Solution", "Incorrect approach/logic"),
-    ("Syntax Error", "AL syntax issues"),
-    ("Incorrect File", "Wrong file edited"),
-    ("Missing Using", "Missing using statement for namespace"),
-    ("Timeout", "Agent timed out and made no changes"),
-    ("Other", "Doesn't fit above"),
-    ("Flag For Review", "Needs further review"),
-]
-
-
-class BaseReviewer(App):
-    """Base class for failure mode review TUIs."""
-
-    CSS = """
-    #diff-container {
-        height: 1fr;
-    }
-    .diff-panel {
-        width: 1fr;
-        border: solid green;
-    }
-    #actual-panel {
-        border: solid red;
-    }
-    """
-
-    BINDINGS = [  # noqa: RUF012
-        Binding("right", "next_item", "Next →"),
-        Binding("left", "prev_item", "← Prev"),
-        Binding("1", "select_category(0)", "Wrong Solution"),
-        Binding("2", "select_category(1)", "Syntax Error"),
-        Binding("3", "select_category(2)", "Incorrect File"),
-        Binding("4", "select_category(3)", "Missing Using"),
-        Binding("5", "select_category(4)", "Timeout"),
-        Binding("6", "select_category(5)", "Other"),
-        Binding("7", "select_category(6)", "Flag For Review"),
-        Binding("escape", "quit", "Quit"),
-    ]
-
-    def __init__(self, dataset_path: Path, category: EvaluationCategory = EvaluationCategory.BUG_FIX):
-        super().__init__()
-        self.dataset_path = dataset_path
-        self.category = category
-        self.current_index = 0
-
-    def compose(self) -> ComposeResult:
-        yield Header()
-        yield Label("", id="progress")
-        with Horizontal(id="diff-container"):
-            with VerticalScroll(id="expected-panel", classes="diff-panel"):
-                yield Static("", id="expected-content")
-            with VerticalScroll(id="actual-panel", classes="diff-panel"):
-                yield Static("", id="actual-content")
-        yield Footer()
-
-    def on_mount(self) -> None:
-        self._load_data()
-        self._update_display()
-
-    @abstractmethod
-    def _load_data(self) -> None:
-        pass
-
-    @abstractmethod
-    def _get_current_result(self) -> dict | None:
-        pass
-
-    @abstractmethod
-    def _get_expected_patch(self, result: dict) -> str:
-        pass
-
-    @abstractmethod
-    def _get_progress_text(self, result: dict) -> str:
-        pass
-
-    @abstractmethod
-    def _get_actual_panel_title(self) -> str:
-        pass
-
-    @abstractmethod
-    def _save(self) -> None:
-        pass
-
-    @abstractmethod
-    def _get_item_count(self) -> int:
-        pass
-
-    @abstractmethod
-    def _get_reviewed_count(self) -> int:
-        pass
-
-    def _update_display(self) -> None:
-        result = self._get_current_result()
-        if not result:
-            self.query_one("#progress", Label).update(self._get_empty_message())
-            return
-
-        self.query_one("#progress", Label).update(self._get_progress_text(result))
-
-        expected = self._get_expected_patch(result)
-        actual = result.get("output", "No output")
-
-        exp_panel = self.query_one("#expected-panel", VerticalScroll)
-        act_panel = self.query_one("#actual-panel", VerticalScroll)
-        exp_panel.border_title = _get_expected_panel_title(self.category)
-        act_panel.border_title = self._get_actual_panel_title()
-        self.query_one("#expected-content", Static).update(self._format_diff(expected))
-        self.query_one("#actual-content", Static).update(self._format_diff(actual))
-        exp_panel.scroll_home(animate=False)
-        act_panel.scroll_home(animate=False)
-
-    def _get_empty_message(self) -> str:
-        return "No results to review"
-
-    def _format_diff(self, text: str) -> str:
-        if not text:
-            return "(empty)"
-        lines = []
-        for line in text.split("\n"):
-            escaped = line.replace("[", r"\[")
-            if line.startswith(("+++", "---", "@@")):
-                lines.append(f"[cyan]{escaped}[/cyan]")
-            elif line.startswith("+"):
-                lines.append(f"[green]{escaped}[/green]")
-            elif line.startswith("-"):
-                lines.append(f"[red]{escaped}[/red]")
-            else:
-                lines.append(escaped)
-        return "\n".join(lines)
-
-    def _format_status(self, result: dict) -> str:
-        scores = result.get("scores", {})
-        build = scores.get("BuildRate", 0) == 1
-        resolved = scores.get("ResolutionRate", 0) == 1
-        build_str = "[green]✓[/green]" if build else "[red]✗[/red]"
-        resolved_str = "[green]✓[/green]" if resolved else "[red]✗[/red]"
-        current_category = result.get("review", {}).get("failure_category") or "[dim]None[/dim]"
-        parts: list[str] = [f"Build: {build_str}"]
-        if self.category == EvaluationCategory.TEST_GENERATION:
-            post_passed = scores.get("PostPatchPassedRate", 0) == 1
-            pre_failed = scores.get("PrePatchFailedRate", 0) == 1
-            post_str = "[green]✓[/green]" if post_passed else "[red]✗[/red]"
-            pre_str = "[green]✓[/green]" if pre_failed else "[red]✗[/red]"
-            parts.append(f"PrePatchFailed: {pre_str}")
-            parts.append(f"PostPatchPassed: {post_str}")
-        parts.append(f"Resolved: {resolved_str}")
-        parts.append(f"Category: {current_category}")
-        return " | ".join(parts)
-
-    def _set_category(self, idx: int) -> None:
-        result = self._get_current_result()
-        if not result:
-            return
-        if "review" not in result:
-            result["review"] = {}
-        result["review"]["failure_category"] = FAILURE_CATEGORIES[idx][0]
-        self._save()
-        self.notify(f"Set: {FAILURE_CATEGORIES[idx][0]}")
-        self._update_display()
-
-    def action_next_item(self) -> None:
-        if self.current_index < self._get_item_count() - 1:
-            self.current_index += 1
-            self._update_display()
-
-    def action_prev_item(self) -> None:
-        if self.current_index > 0:
-            self.current_index -= 1
-            self._update_display()
-
-    def action_select_category(self, index: int) -> None:
-        if 0 <= index < len(FAILURE_CATEGORIES):
-            self._set_category(index)
-
-    def action_quit(self) -> None:
-        self._save()
-        self.exit()
-
-
-class InstanceAcrossRunsReviewer(BaseReviewer):
-    """Review a single instance across all runs in a directory."""
-
-    TITLE = "Instance Across Runs Review"
-
-    def __init__(self, results_dir: Path, instance_id: str, dataset_path: Path, category: EvaluationCategory = EvaluationCategory.BUG_FIX):
-        super().__init__(dataset_path, category)
-        self.results_dir = results_dir
-        self.instance_id = instance_id
-        self.run_results: list[tuple[str, dict, Path]] = []  # (run_id, result_dict, file_path)
-        self.expected_patch: str = ""
-
-    def _load_data(self) -> None:
-        with open(self.dataset_path, encoding="utf-8") as f:
-            for line in f:
-                if line.strip():
-                    entry = json.loads(line)
-                    if entry["instance_id"] == self.instance_id:
-                        self.expected_patch = entry.get(_get_patch_field(self.category), "")
-                        break
-
-        for jsonl_file in sorted(self.results_dir.glob("*.jsonl")):
-            run_id = jsonl_file.stem
-            with open(jsonl_file, encoding="utf-8") as f:
-                for line in f:
-                    if not line.strip():
-                        continue
-                    result = json.loads(line)
-                    result_instance_id = result.get("instance_id") or result.get("InstanceID", "")
-                    if result_instance_id == self.instance_id:
-                        self.run_results.append((run_id, result, jsonl_file))
-                        break
-
-    def _get_current_result(self) -> dict | None:
-        if not self.run_results or self.current_index >= len(self.run_results):
-            return None
-        return self.run_results[self.current_index][1]
-
-    def _get_expected_patch(self, result: dict) -> str:
-        return self.expected_patch or "Not found in dataset"
-
-    def _get_progress_text(self, result: dict) -> str:
-        run_id = self.run_results[self.current_index][0]
-        status = self._format_status(result)
-        return f"[{self.current_index + 1}/{len(self.run_results)} runs] (reviewed: {self._get_reviewed_count()}) | Run: {run_id} | {status} | {self.instance_id}"
-
-    def _get_actual_panel_title(self) -> str:
-        run_id = self.run_results[self.current_index][0]
-        return f"Actual (Run: {run_id})"
-
-    def _get_empty_message(self) -> str:
-        return f"No results found for instance: {self.instance_id}"
-
-    def _get_item_count(self) -> int:
-        return len(self.run_results)
-
-    def _get_reviewed_count(self) -> int:
-        return sum(1 for _, r, _ in self.run_results if r.get("review", {}).get("failure_category"))
-
-    def _save(self) -> None:
-        if not self.run_results or self.current_index >= len(self.run_results):
-            return
-        _, result, file_path = self.run_results[self.current_index]
-
-        all_results = []
-        with open(file_path, encoding="utf-8") as f:
-            for line in f:
-                if line.strip():
-                    all_results.append(json.loads(line))
-
-        for i, r in enumerate(all_results):
-            r_instance_id = r.get("instance_id") or r.get("InstanceID", "")
-            if r_instance_id == self.instance_id:
-                all_results[i] = result
-                break
-
-        self._write_results(file_path, all_results)
-
-    @staticmethod
-    def _write_results(file_path: Path, results: list[dict]) -> None:
-        with open(file_path, "w", encoding="utf-8") as f:
-            for r in results:
-                if "review" in r:
-                    ordered = {"review": r["review"]}
-                    ordered.update({k: v for k, v in r.items() if k != "review"})
-                    f.write(json.dumps(ordered) + "\n")
-                else:
-                    f.write(json.dumps(r) + "\n")
-
-
-class FailureModeAnalysis(BaseReviewer):
-    """Review and categorize agent failures in a single results file."""
-
-    TITLE = "Failure Mode Analysis"
-
-    def __init__(self, results_path: Path, dataset_path: Path, category: EvaluationCategory = EvaluationCategory.BUG_FIX, include_resolved: bool = False):
-        super().__init__(dataset_path, category)
-        self.results_path = results_path
-        self.results: list[dict] = []
-        self.unresolved_indices: list[int] = []
-        self.dataset_lookup: dict[str, str] = {}
-        self.include_resolved = include_resolved
-
-    def _load_data(self) -> None:
-        # Build dataset lookup and track order
-        dataset_order: dict[str, int] = {}
-        with open(self.dataset_path, encoding="utf-8") as f:
-            for idx, line in enumerate(f):
-                if line.strip():
-                    entry = json.loads(line)
-                    instance_id = entry["instance_id"]
-                    self.dataset_lookup[instance_id] = entry.get(_get_patch_field(self.category), "")
-                    dataset_order[instance_id] = idx
-
-        with open(self.results_path, encoding="utf-8") as f:
-            for line in f:
-                if line.strip():
-                    self.results.append(json.loads(line))
-
-        if self.include_resolved:
-            # Sort indices by dataset order to match dataset review iteration
-            self.unresolved_indices = sorted(
-                range(len(self.results)),
-                key=lambda i: dataset_order.get(self.results[i].get("instance_id") or self.results[i].get("InstanceID", ""), float("inf")),
-            )
-        else:
-            self.unresolved_indices = [i for i, r in enumerate(self.results) if not r.get("resolved", False) and r.get("scores", {}).get("ResolutionRate", 0) == 0]
-
-    def _get_current_result(self) -> dict | None:
-        if not self.unresolved_indices or self.current_index >= len(self.unresolved_indices):
-            return None
-        return self.results[self.unresolved_indices[self.current_index]]
-
-    def _get_expected_patch(self, result: dict) -> str:
-        instance_id = result.get("instance_id") or result.get("InstanceID", "unknown")
-        return self.dataset_lookup.get(instance_id, "Not found in dataset")
-
-    def _get_progress_text(self, result: dict) -> str:
-        instance_id = result.get("instance_id") or result.get("InstanceID", "unknown")
-        status = self._format_status(result)
-        return f"[{self.current_index + 1}/{len(self.unresolved_indices)}] (reviewed: {self._get_reviewed_count()}) | {status} | {instance_id}"
-
-    def _get_actual_panel_title(self) -> str:
-        return "Actual (Agent)"
-
-    def _get_empty_message(self) -> str:
-        return "No results to review" if self.include_resolved else "No unresolved results"
-
-    def _get_item_count(self) -> int:
-        return len(self.unresolved_indices)
-
-    def _get_reviewed_count(self) -> int:
-        return sum(1 for i in self.unresolved_indices if self.results[i].get("review", {}).get("failure_category"))
-
-    def _save(self) -> None:
-        InstanceAcrossRunsReviewer._write_results(self.results_path, self.results)
-
-
-def run_reviewer(results_path: Path, dataset_path: Path, category: EvaluationCategory = EvaluationCategory.BUG_FIX, include_resolved: bool = False) -> None:
-    app = FailureModeAnalysis(results_path, dataset_path, category, include_resolved)
-    app.run()
-
-
-def run_instance_reviewer(results_dir: Path, instance_id: str, dataset_path: Path, category: EvaluationCategory = EvaluationCategory.BUG_FIX) -> None:
-    app = InstanceAcrossRunsReviewer(results_dir, instance_id, dataset_path, category)
-    app.run()
diff --git a/src/bcbench/results/testgeneration.py b/src/bcbench/results/testgeneration.py
index 01516ee69..fb80d0e05 100644
--- a/src/bcbench/results/testgeneration.py
+++ b/src/bcbench/results/testgeneration.py
@@ -1,4 +1,6 @@
-from bcbench.results.base import BaseEvaluationResult, T
+from typing import Self
+
+from bcbench.results.base import BaseEvaluationResult
 from bcbench.types import EvaluationContext
 
 
@@ -13,5 +15,5 @@ class TestGenerationResult(BaseEvaluationResult):
     post_patch_passed: bool = False
 
     @classmethod
-    def create_no_tests_extracted(cls: type[T], context: "EvaluationContext", generated_patch: str, error_message: str) -> T:
+    def create_no_tests_extracted(cls, context: "EvaluationContext", generated_patch: str, error_message: str) -> Self:
         return cls._create_from_context(context, resolved=False, build=False, generated_patch=generated_patch, error_message=error_message)
diff --git a/src/bcbench/types.py b/src/bcbench/types.py
index 6489414ae..c2177a8a6 100644
--- a/src/bcbench/types.py
+++ b/src/bcbench/types.py
@@ -12,9 +12,10 @@
 from bcbench.logger import get_logger
 
 if TYPE_CHECKING:
-    from bcbench.dataset import DatasetEntry
+    from bcbench.dataset import BaseDatasetEntry
+    from bcbench.evaluate.base import EvaluationPipeline
 
-__all__ = ["AgentMetrics", "AgentType", "EvaluationCategory", "EvaluationContext", "ExperimentConfiguration"]
+__all__ = ["AgentMetrics", "AgentType", "ContainerConfig", "EvaluationCategory", "EvaluationContext", "ExperimentConfiguration"]
 
 logger = get_logger(__name__)
 
@@ -101,9 +102,52 @@ class EvaluationCategory(str, Enum):
     # CODE_REVIEW = "code-review"
     # EVENT_REQUEST = "event-request"
 
+    @property
+    def dataset_path(self) -> Path:
+        from bcbench.config import get_config
+
+        match self:
+            case EvaluationCategory.BUG_FIX:
+                return get_config().paths.dataset_dir / "bcbench.jsonl"
+            case EvaluationCategory.TEST_GENERATION:
+                return get_config().paths.dataset_dir / "bcbench.jsonl"
+
+        raise ValueError(f"Unknown evaluation category: {self}")
+
+    @property
+    def entry_class(self) -> type[BaseDatasetEntry]:
+        from bcbench.dataset import BugFixEntry, TestGenEntry
+
+        match self:
+            case EvaluationCategory.BUG_FIX:
+                return BugFixEntry
+            case EvaluationCategory.TEST_GENERATION:
+                return TestGenEntry
+
+        raise ValueError(f"Unknown evaluation category: {self}")
+
+    @property
+    def pipeline(self) -> EvaluationPipeline:
+        from bcbench.evaluate import BugFixPipeline, TestGenerationPipeline
+
+        match self:
+            case EvaluationCategory.BUG_FIX:
+                return BugFixPipeline()
+            case EvaluationCategory.TEST_GENERATION:
+                return TestGenerationPipeline()
+
+        raise ValueError(f"Unknown evaluation category: {self}")
+
+
+@dataclass(frozen=True)
+class ContainerConfig:
+    name: str
+    username: str
+    password: str
+
 
 @dataclass
-class EvaluationContext:
+class EvaluationContext[E: BaseDatasetEntry]:
     """Context object containing all configuration for evaluation pipeline.
 
     This bundles related configuration together to avoid long parameter lists
@@ -111,15 +155,10 @@ class EvaluationContext:
     """
 
     # Core configuration
-    entry: DatasetEntry
+    entry: E
     repo_path: Path
     result_dir: Path
 
-    # BC Container configuration
-    container_name: str
-    password: str
-    username: str
-
     # Agent metadata
     agent_name: str
     model: str
@@ -127,8 +166,16 @@ class EvaluationContext:
     # Evaluation category
     category: EvaluationCategory
 
+    # BC Container configuration (optional — not all categories require a container)
+    container: ContainerConfig | None = None
+
     # Agent execution metrics
     metrics: AgentMetrics | None = None
 
     # Experiment configuration
     experiment: ExperimentConfiguration | None = None
+
+    def get_container(self) -> ContainerConfig:
+        if self.container is None:
+            raise ValueError(f"Container configuration is required for {self.category.value} evaluation")
+        return self.container
diff --git a/tests/conftest.py b/tests/conftest.py
index a1ee29531..6c6ab4922 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,7 +1,7 @@
 """
 Shared test fixtures for BC-Bench tests.
 
-This module provides reusable fixtures that create valid DatasetEntry objects
+This module provides reusable fixtures that create valid BugFixEntry objects
 meeting all Pydantic validation requirements.
 """
 
@@ -12,12 +12,13 @@
 
 import pytest
 
-from bcbench.dataset import DatasetEntry, TestEntry
+from bcbench.dataset import BugFixEntry, TestEntry
+from bcbench.dataset.dataset_entry import _BugFixTestGenBase
 from bcbench.results.bugfix import BugFixResult
 from bcbench.results.testgeneration import TestGenerationResult
-from bcbench.types import AgentMetrics, EvaluationCategory, EvaluationContext
+from bcbench.types import AgentMetrics, ContainerConfig, EvaluationCategory, EvaluationContext
 
-# Valid test data that passes all DatasetEntry validation rules
+# Valid test data that passes all BugFixEntry validation rules
 VALID_INSTANCE_ID = "microsoftInternal__NAV-123456"
 VALID_REPO = "microsoftInternal/NAV"
 VALID_BASE_COMMIT = "a" * 40  # 40-character hex string
@@ -49,7 +50,7 @@ def create_dataset_entry(
     created_at: str = VALID_CREATED_AT,
     fail_to_pass: list[TestEntry] | None = None,
     pass_to_pass: list[TestEntry] | None = None,
-) -> DatasetEntry:
+) -> BugFixEntry:
     if project_paths is None:
         project_paths = VALID_PROJECT_PATHS.copy()
     if fail_to_pass is None:
@@ -57,7 +58,7 @@ def create_dataset_entry(
     if pass_to_pass is None:
         pass_to_pass = []
 
-    return DatasetEntry(
+    return BugFixEntry(
         instance_id=instance_id,
         repo=repo,
         base_commit=base_commit,
@@ -73,24 +74,22 @@ def create_dataset_entry(
 
 def create_evaluation_context(
     tmp_path: Path,
-    entry: DatasetEntry | None = None,
+    entry: BugFixEntry | None = None,
     agent_name: str = "test-agent",
     model: str = "test-model",
     category: EvaluationCategory = EvaluationCategory.BUG_FIX,
     container_name: str = "test-container",
     password: str = "test-password",
     username: str = "test-user",
-) -> EvaluationContext:
+) -> EvaluationContext[BugFixEntry]:
     if entry is None:
         entry = create_dataset_entry()
 
-    return EvaluationContext(
+    return EvaluationContext[BugFixEntry](
         entry=entry,
         repo_path=tmp_path / "repo",
         result_dir=tmp_path / "results",
-        container_name=container_name,
-        password=password,
-        username=username,
+        container=ContainerConfig(name=container_name, username=username, password=password),
         agent_name=agent_name,
         model=model,
         category=category,
@@ -151,7 +150,7 @@ def create_testgen_result(
     )
 
 
-def create_dataset_file(tmp_path: Path, entries: list[DatasetEntry] | None = None) -> Path:
+def create_dataset_file(tmp_path: Path, entries: list[BugFixEntry] | None = None) -> Path:
     if entries is None:
         entries = [create_dataset_entry()]
 
@@ -188,12 +187,12 @@ def sample_test_entry() -> TestEntry:
 
 
 @pytest.fixture
-def sample_dataset_entry() -> DatasetEntry:
+def sample_dataset_entry() -> BugFixEntry:
     return create_dataset_entry()
 
 
 @pytest.fixture
-def sample_evaluation_context(tmp_path: Path) -> EvaluationContext:
+def sample_evaluation_context(tmp_path: Path) -> EvaluationContext[BugFixEntry]:
     return create_evaluation_context(tmp_path)
 
 
@@ -225,10 +224,9 @@ def sample_bugfix_result_with_metrics() -> BugFixResult:
 
 
 @pytest.fixture
-def sample_dataset_entry_with_problem_statement(tmp_path: Path) -> Generator[DatasetEntry]:
+def sample_dataset_entry_with_problem_statement(tmp_path: Path) -> Generator[BugFixEntry]:
     problem_dir = create_problem_statement_dir(tmp_path)
     entry = create_dataset_entry()
 
-    # Patch the problem_statement_dir property to return our temp directory
-    with patch.object(type(entry), "problem_statement_dir", property(lambda self: problem_dir)):
+    with patch.object(_BugFixTestGenBase, "problem_statement_dir", property(lambda self: problem_dir)):
         yield entry
diff --git a/tests/test_agent_skills.py b/tests/test_agent_skills.py
index c49028c74..4025d72f2 100644
--- a/tests/test_agent_skills.py
+++ b/tests/test_agent_skills.py
@@ -6,7 +6,7 @@
 from tempfile import TemporaryDirectory
 from unittest.mock import MagicMock
 
-from bcbench.dataset import DatasetEntry
+from bcbench.dataset import BaseDatasetEntry
 from bcbench.operations import setup_agent_skills
 from bcbench.operations.instruction_operations import _get_source_instructions_path
 from bcbench.types import AgentType
@@ -24,7 +24,7 @@ def test_setup_agent_skills():
 
     with TemporaryDirectory() as tmpdir:
         repo_path = Path(tmpdir)
-        entry = MagicMock(spec=DatasetEntry)
+        entry = MagicMock(spec=BaseDatasetEntry)
         entry.repo = "microsoftInternal/NAV"
         config = {"skills": {"enabled": True}}
 
@@ -69,7 +69,7 @@ def test_nonexistent_skills():
     """Test that setup_agent_skills raises FileNotFoundError for nonexistent repo."""
     with TemporaryDirectory() as tmpdir:
         repo_path = Path(tmpdir)
-        entry = MagicMock(spec=DatasetEntry)
+        entry = MagicMock(spec=BaseDatasetEntry)
         entry.repo = "nonexistent/repo"
         config = {"skills": {"enabled": True}}
 
@@ -92,7 +92,7 @@ def test_overwrite_skill_folder_files():
 
     with TemporaryDirectory() as tmpdir:
         repo_path = Path(tmpdir)
-        entry = MagicMock(spec=DatasetEntry)
+        entry = MagicMock(spec=BaseDatasetEntry)
         entry.repo = "microsoftInternal/NAV"
         config = {"skills": {"enabled": True}}
 
@@ -122,7 +122,7 @@ def test_overwrite_skill_folder_files():
 def test_path_specific_skills_copied():
     with TemporaryDirectory() as tmpdir:
         repo_path = Path(tmpdir)
-        entry = MagicMock(spec=DatasetEntry)
+        entry = MagicMock(spec=BaseDatasetEntry)
         entry.repo = "microsoftInternal/NAV"
         config = {"skills": {"enabled": True}}
 
@@ -141,7 +141,7 @@ def test_path_specific_skills_copied():
 def test_path_specific_skills_removed_before_copy():
     with TemporaryDirectory() as tmpdir:
         repo_path = Path(tmpdir)
-        entry = MagicMock(spec=DatasetEntry)
+        entry = MagicMock(spec=BaseDatasetEntry)
         entry.repo = "microsoftInternal/NAV"
         config = {"skills": {"enabled": True}}
 
@@ -166,7 +166,7 @@ def test_skills_disabled():
     """When skills disabled, should return False and not create directory."""
     with TemporaryDirectory() as tmpdir:
         repo_path = Path(tmpdir)
-        entry = MagicMock(spec=DatasetEntry)
+        entry = MagicMock(spec=BaseDatasetEntry)
         entry.repo = "microsoftInternal/NAV"
         config = {"skills": {"enabled": False}}
 
diff --git a/tests/test_cli_commands.py b/tests/test_cli_commands.py
index eb5c99dd4..2290cff5c 100644
--- a/tests/test_cli_commands.py
+++ b/tests/test_cli_commands.py
@@ -1,14 +1,14 @@
 """Integration tests for CLI commands using Typer's CliRunner."""
 
 import json
-from unittest.mock import patch
+from unittest.mock import PropertyMock, patch
 
 import pytest
 from typer.testing import CliRunner
 
 from bcbench.cli import app
-from bcbench.dataset import DatasetEntry
-from bcbench.types import AgentMetrics
+from bcbench.dataset.dataset_entry import _BugFixTestGenBase
+from bcbench.types import AgentMetrics, EvaluationCategory
 from tests.conftest import (
     create_bugfix_result,
     create_dataset_entry,
@@ -95,18 +95,21 @@ def test_result_summarize_creates_all_outputs(sample_results_directory, problem_
     base_path, run_id, dataset_path = sample_results_directory
     results_dir = base_path / run_id
 
-    with patch.object(DatasetEntry, "problem_statement_dir", property(lambda self: problem_statement_dir)):
+    with (
+        patch.object(_BugFixTestGenBase, "problem_statement_dir", property(lambda self: problem_statement_dir)),
+        patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=dataset_path),
+    ):
         result = runner.invoke(
             app,
             [
                 "result",
                 "summarize",
+                "--category",
+                "bug-fix",
                 "--run-id",
                 run_id,
                 "--result-dir",
                 str(base_path),
-                "--dataset-path",
-                str(dataset_path),
             ],
         )
 
@@ -126,18 +129,21 @@ def test_result_summarize_verifies_summary_calculations(sample_results_directory
     base_path, run_id, dataset_path = sample_results_directory
     results_dir = base_path / run_id
 
-    with patch.object(DatasetEntry, "problem_statement_dir", property(lambda self: problem_statement_dir)):
+    with (
+        patch.object(_BugFixTestGenBase, "problem_statement_dir", property(lambda self: problem_statement_dir)),
+        patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=dataset_path),
+    ):
         result = runner.invoke(
             app,
             [
                 "result",
                 "summarize",
+                "--category",
+                "bug-fix",
                 "--run-id",
                 run_id,
                 "--result-dir",
                 str(base_path),
-                "--dataset-path",
-                str(dataset_path),
             ],
         )
 
@@ -162,6 +168,8 @@ def test_result_summarize_missing_directory_fails_gracefully(tmp_path):
         [
             "result",
             "summarize",
+            "--category",
+            "bug-fix",
             "--run-id",
             "nonexistent_run",
             "--result-dir",
@@ -187,6 +195,8 @@ def test_result_summarize_no_matching_files_fails_gracefully(tmp_path):
         [
             "result",
             "summarize",
+            "--category",
+            "bug-fix",
             "--run-id",
             run_id,
             "--result-dir",
@@ -201,18 +211,21 @@ def test_result_summarize_no_matching_files_fails_gracefully(tmp_path):
 def test_result_summarize_with_custom_pattern(sample_results_directory, problem_statement_dir):
     base_path, run_id, dataset_path = sample_results_directory
 
-    with patch.object(DatasetEntry, "problem_statement_dir", property(lambda self: problem_statement_dir)):
+    with (
+        patch.object(_BugFixTestGenBase, "problem_statement_dir", property(lambda self: problem_statement_dir)),
+        patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=dataset_path),
+    ):
         result = runner.invoke(
             app,
             [
                 "result",
                 "summarize",
+                "--category",
+                "bug-fix",
                 "--run-id",
                 run_id,
                 "--result-dir",
                 str(base_path),
-                "--dataset-path",
-                str(dataset_path),
                 "--result-pattern",
                 "*.jsonl",
             ],
@@ -223,15 +236,14 @@ def test_result_summarize_with_custom_pattern(sample_results_directory, problem_
 
 @pytest.mark.integration
 def test_dataset_list_displays_all_entries(sample_dataset_file_for_cli):
-    result = runner.invoke(
-        app,
-        [
-            "dataset",
-            "list",
-            "--dataset-path",
-            str(sample_dataset_file_for_cli),
-        ],
-    )
+    with patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=sample_dataset_file_for_cli):
+        result = runner.invoke(
+            app,
+            [
+                "dataset",
+                "list",
+            ],
+        )
 
     assert result.exit_code == 0
     assert "microsoftInternal__NAV-1" in result.stdout
@@ -244,15 +256,14 @@ def test_dataset_list_displays_all_entries(sample_dataset_file_for_cli):
 def test_dataset_list_missing_file_fails_gracefully(tmp_path):
     nonexistent_path = tmp_path / "nonexistent.jsonl"
 
-    result = runner.invoke(
-        app,
-        [
-            "dataset",
-            "list",
-            "--dataset-path",
-            str(nonexistent_path),
-        ],
-    )
+    with patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=nonexistent_path):
+        result = runner.invoke(
+            app,
+            [
+                "dataset",
+                "list",
+            ],
+        )
 
     assert result.exit_code != 0
 
@@ -262,15 +273,14 @@ def test_dataset_list_empty_file_shows_zero_entries(tmp_path):
     empty_dataset = tmp_path / "empty.jsonl"
     empty_dataset.write_text("")
 
-    result = runner.invoke(
-        app,
-        [
-            "dataset",
-            "list",
-            "--dataset-path",
-            str(empty_dataset),
-        ],
-    )
+    with patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=empty_dataset):
+        result = runner.invoke(
+            app,
+            [
+                "dataset",
+                "list",
+            ],
+        )
 
     assert result.exit_code == 0
     assert "Found 0 entry(ies)" in result.stdout
@@ -281,15 +291,14 @@ def test_dataset_list_single_entry(tmp_path):
     entry = create_dataset_entry(instance_id="microsoftInternal__NAV-100")
     dataset_path = create_dataset_file(tmp_path, [entry])
 
-    result = runner.invoke(
-        app,
-        [
-            "dataset",
-            "list",
-            "--dataset-path",
-            str(dataset_path),
-        ],
-    )
+    with patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=dataset_path):
+        result = runner.invoke(
+            app,
+            [
+                "dataset",
+                "list",
+            ],
+        )
 
     assert result.exit_code == 0
     assert "microsoftInternal__NAV-100" in result.stdout
@@ -298,15 +307,14 @@ def test_dataset_list_single_entry(tmp_path):
 
 @pytest.mark.integration
 def test_dataset_list_verifies_entry_format(sample_dataset_file_for_cli):
-    result = runner.invoke(
-        app,
-        [
-            "dataset",
-            "list",
-            "--dataset-path",
-            str(sample_dataset_file_for_cli),
-        ],
-    )
+    with patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=sample_dataset_file_for_cli):
+        result = runner.invoke(
+            app,
+            [
+                "dataset",
+                "list",
+            ],
+        )
 
     assert result.exit_code == 0
     # Should contain instance_id of first entry
diff --git a/tests/test_custom_instructions.py b/tests/test_custom_instructions.py
index 1a3eb54a2..caf7a56f4 100644
--- a/tests/test_custom_instructions.py
+++ b/tests/test_custom_instructions.py
@@ -8,7 +8,7 @@
 from unittest.mock import MagicMock
 
 from bcbench.config import get_config
-from bcbench.dataset import DatasetEntry
+from bcbench.dataset import BaseDatasetEntry
 from bcbench.operations.instruction_operations import (
     _get_source_instructions_path,
     setup_instructions_from_config,
@@ -30,7 +30,7 @@ def test_setup_custom_instructions():
 
     with TemporaryDirectory() as tmpdir:
         repo_path = Path(tmpdir)
-        entry = MagicMock(spec=DatasetEntry)
+        entry = MagicMock(spec=BaseDatasetEntry)
         entry.repo = "microsoftInternal/NAV"
         config = {"instructions": {"enabled": True}}
 
@@ -85,7 +85,7 @@ def test_overwrite_existing_instructions():
 
     with TemporaryDirectory() as tmpdir:
         repo_path = Path(tmpdir)
-        entry = MagicMock(spec=DatasetEntry)
+        entry = MagicMock(spec=BaseDatasetEntry)
         entry.repo = "microsoftInternal/NAV"
         config = {"instructions": {"enabled": True}}
 
@@ -110,7 +110,7 @@ def test_overwrite_existing_instructions():
 def test_path_specific_instructions_removed_before_copy():
     with TemporaryDirectory() as tmpdir:
         repo_path = Path(tmpdir)
-        entry = MagicMock(spec=DatasetEntry)
+        entry = MagicMock(spec=BaseDatasetEntry)
         entry.repo = "microsoftInternal/NAV"
         config = {"instructions": {"enabled": True}}
 
@@ -133,7 +133,7 @@ def test_path_specific_instructions_removed_before_copy():
 def test_no_path_specific_instructions_warning():
     with TemporaryDirectory() as tmpdir:
         repo_path = Path(tmpdir)
-        entry = MagicMock(spec=DatasetEntry)
+        entry = MagicMock(spec=BaseDatasetEntry)
         entry.repo = "microsoftInternal/NAV"
         config = {"instructions": {"enabled": True}}
 
@@ -149,7 +149,7 @@ def test_no_path_specific_instructions_warning():
 def test_empty_instructions_folder_warning():
     with TemporaryDirectory() as tmpdir:
         repo_path = Path(tmpdir)
-        entry = MagicMock(spec=DatasetEntry)
+        entry = MagicMock(spec=BaseDatasetEntry)
         entry.repo = "microsoftInternal/NAV"
         config = {"instructions": {"enabled": True}}
 
@@ -167,7 +167,7 @@ def test_claude_instructions_renamed():
 
     with TemporaryDirectory() as tmpdir:
         repo_path = Path(tmpdir)
-        entry = MagicMock(spec=DatasetEntry)
+        entry = MagicMock(spec=BaseDatasetEntry)
         entry.repo = "microsoftInternal/NAV"
         config = {"instructions": {"enabled": True}}
 
diff --git a/tests/test_ps_templates.py b/tests/test_ps_templates.py
index 82cb7e983..7cc0bb658 100644
--- a/tests/test_ps_templates.py
+++ b/tests/test_ps_templates.py
@@ -5,6 +5,7 @@
 from bcbench.config import get_config
 from bcbench.dataset import TestEntry
 from bcbench.operations import bc_operations
+from bcbench.types import ContainerConfig
 
 _config = get_config()
 
@@ -225,9 +226,7 @@ def test_test_entries_serialized_as_json(self, mock_subprocess):
         bc_operations.run_test_suite(
             test_entries=test_entries,
             expectation="Pass",
-            container_name="bcserver",
-            username="admin",
-            password="Test123",
+            container=ContainerConfig(name="bcserver", username="admin", password="Test123"),
         )
 
         assert len(mock_subprocess) == 1
@@ -248,9 +247,7 @@ def test_multiple_test_entries_serialized_as_json(self, mock_subprocess):
         bc_operations.run_test_suite(
             test_entries=test_entries,
             expectation="Pass",
-            container_name="bcserver",
-            username="admin",
-            password="Test123",
+            container=ContainerConfig(name="bcserver", username="admin", password="Test123"),
         )
 
         assert len(mock_subprocess) == 1
diff --git a/tests/test_result_writer.py b/tests/test_result_writer.py
index 97138b203..4133f8714 100644
--- a/tests/test_result_writer.py
+++ b/tests/test_result_writer.py
@@ -1,9 +1,9 @@
 import json
-from unittest.mock import patch
+from unittest.mock import PropertyMock, patch
 
-from bcbench.dataset import DatasetEntry
+from bcbench.dataset.dataset_entry import _BugFixTestGenBase
 from bcbench.results.bceval_export import write_bceval_results
-from bcbench.types import AgentMetrics
+from bcbench.types import AgentMetrics, EvaluationCategory
 from tests.conftest import VALID_INSTANCE_ID, create_bugfix_result
 
 
@@ -12,13 +12,16 @@ def test_writes_bceval_results_with_all_fields(self, tmp_path, sample_dataset_fi
         output_dir = tmp_path / "output"
         output_dir.mkdir()
 
-        with patch.object(DatasetEntry, "problem_statement_dir", property(lambda self: problem_statement_dir)):
+        with (
+            patch.object(_BugFixTestGenBase, "problem_statement_dir", property(lambda self: problem_statement_dir)),
+            patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=sample_dataset_file),
+        ):
             write_bceval_results(
                 results=[sample_bugfix_result_with_metrics],
                 out_dir=output_dir,
                 run_id="test_run_123",
-                dataset_path=sample_dataset_file,
                 output_filename="results.jsonl",
+                category=EvaluationCategory.BUG_FIX,
             )
 
         output_file = output_dir / "results.jsonl"
@@ -43,13 +46,16 @@ def test_handles_none_prompt_tokens(self, tmp_path, sample_dataset_file, sample_
         output_dir = tmp_path / "output"
         output_dir.mkdir()
 
-        with patch.object(DatasetEntry, "problem_statement_dir", property(lambda self: problem_statement_dir)):
+        with (
+            patch.object(_BugFixTestGenBase, "problem_statement_dir", property(lambda self: problem_statement_dir)),
+            patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=sample_dataset_file),
+        ):
             write_bceval_results(
                 results=[sample_testgen_result],
                 out_dir=output_dir,
                 run_id="test_run_456",
-                dataset_path=sample_dataset_file,
                 output_filename="results.jsonl",
+                category=EvaluationCategory.TEST_GENERATION,
             )
 
         output_file = output_dir / "results.jsonl"
@@ -65,13 +71,16 @@ def test_handles_mixed_results(self, tmp_path, sample_dataset_file, sample_bugfi
         output_dir = tmp_path / "output"
         output_dir.mkdir()
 
-        with patch.object(DatasetEntry, "problem_statement_dir", property(lambda self: problem_statement_dir)):
+        with (
+            patch.object(_BugFixTestGenBase, "problem_statement_dir", property(lambda self: problem_statement_dir)),
+            patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=sample_dataset_file),
+        ):
             write_bceval_results(
                 results=[sample_bugfix_result_with_metrics, sample_testgen_result],
                 out_dir=output_dir,
                 run_id="test_run_789",
-                dataset_path=sample_dataset_file,
                 output_filename="results.jsonl",
+                category=EvaluationCategory.BUG_FIX,
             )
 
         output_file = output_dir / "results.jsonl"
@@ -94,13 +103,16 @@ def test_includes_expected_fields_in_bceval_format(self, tmp_path, sample_datase
         output_dir = tmp_path / "output"
         output_dir.mkdir()
 
-        with patch.object(DatasetEntry, "problem_statement_dir", property(lambda self: problem_statement_dir)):
+        with (
+            patch.object(_BugFixTestGenBase, "problem_statement_dir", property(lambda self: problem_statement_dir)),
+            patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=sample_dataset_file),
+        ):
             write_bceval_results(
                 results=[sample_bugfix_result_with_metrics],
                 out_dir=output_dir,
                 run_id="test_run_abc",
-                dataset_path=sample_dataset_file,
                 output_filename="results.jsonl",
+                category=EvaluationCategory.BUG_FIX,
             )
 
         output_file = output_dir / "results.jsonl"
@@ -127,13 +139,14 @@ def test_skips_results_without_matching_dataset_entry(self, tmp_path, sample_dat
             metrics=AgentMetrics(prompt_tokens=1000, completion_tokens=200),
         )
 
-        write_bceval_results(
-            results=[non_matching_result],
-            out_dir=output_dir,
-            run_id="test_run_xyz",
-            dataset_path=sample_dataset_file,
-            output_filename="results.jsonl",
-        )
+        with patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=sample_dataset_file):
+            write_bceval_results(
+                results=[non_matching_result],
+                out_dir=output_dir,
+                run_id="test_run_xyz",
+                output_filename="results.jsonl",
+                category=EvaluationCategory.BUG_FIX,
+            )
 
         output_file = output_dir / "results.jsonl"
         with open(output_file) as f:
@@ -148,13 +161,16 @@ def test_handles_partial_none_metrics(self, tmp_path, sample_dataset_file, probl
 
         result = create_bugfix_result(metrics=AgentMetrics(execution_time=100.0, prompt_tokens=None, completion_tokens=1500))
 
-        with patch.object(DatasetEntry, "problem_statement_dir", property(lambda self: problem_statement_dir)):
+        with (
+            patch.object(_BugFixTestGenBase, "problem_statement_dir", property(lambda self: problem_statement_dir)),
+            patch.object(EvaluationCategory, "dataset_path", new_callable=PropertyMock, return_value=sample_dataset_file),
+        ):
             write_bceval_results(
                 results=[result],
                 out_dir=output_dir,
                 run_id="test_run_partial",
-                dataset_path=sample_dataset_file,
                 output_filename="results.jsonl",
+                category=EvaluationCategory.BUG_FIX,
             )
 
         output_file = output_dir / "results.jsonl"
diff --git a/tests/test_type_exhaustiveness.py b/tests/test_type_exhaustiveness.py
index a229628b8..3da3844f7 100644
--- a/tests/test_type_exhaustiveness.py
+++ b/tests/test_type_exhaustiveness.py
@@ -1,8 +1,6 @@
 from pathlib import Path
 
-from bcbench.dataset import DatasetEntry
-from bcbench.evaluate import create_pipeline
-from bcbench.results.bceval_export import get_info_from_dataset_entry
+from bcbench.dataset import BugFixEntry
 from bcbench.types import AgentType, EvaluationCategory
 
 
@@ -23,13 +21,23 @@ def test_all_agent_types_have_instruction_filename():
 
 def test_all_categories_have_pipelines():
     for category in EvaluationCategory:
-        pipeline = create_pipeline(category)
+        pipeline = category.pipeline
         assert pipeline is not None
 
 
-def test_all_categories_handled_in_get_info_from_dataset_entry(sample_dataset_entry_with_problem_statement: DatasetEntry):
+def test_all_categories_have_entry_classes():
     for category in EvaluationCategory:
-        input_text, expected_output = get_info_from_dataset_entry(sample_dataset_entry_with_problem_statement, category)
+        entry_cls = category.entry_class
+        assert entry_cls is not None
+
+
+def test_all_categories_handled_in_get_expected_output(sample_dataset_entry_with_problem_statement: BugFixEntry):
+    for category in EvaluationCategory:
+        entry_cls = category.entry_class
+        # Reconstruct entry as the category-specific type so get_expected_output() works
+        entry = entry_cls.model_validate(sample_dataset_entry_with_problem_statement.model_dump(by_alias=True))
+        input_text = entry.get_task()
+        expected_output = entry.get_expected_output()
         assert isinstance(input_text, str)
         assert isinstance(expected_output, str)
         assert len(expected_output) > 0