microsoft · haoranpb · Apr 8, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
diff --git a/.github/actions/setup-bc-container/action.yml b/.github/actions/setup-bc-container/action.yml
@@ -14,6 +14,10 @@ inputs:
   github-token:
     description: GitHub token for accessing public repositories
     required: true
+  skip-container:
+    description: Set to 'true' to skip BC container setup and only clone the repository
+    required: false
+    default: "false"
 
 outputs:
   repo_path:
@@ -24,6 +28,7 @@ runs:
   using: composite
   steps:
     - name: Generate BC container name and credentials
+      if: inputs.skip-container != 'true'
       run: |
         # Generate a 32-character random password using Get-Random
         # The password is short-lived and only used for the duration of the workflow
@@ -38,6 +43,7 @@ runs:
       shell: pwsh
 
     - name: Install BcContainerHelper module
+      if: inputs.skip-container != 'true'
       run: Install-Module -Name BcContainerHelper -Force -AllowClobber -AllowPrerelease
       shell: pwsh
 
@@ -59,5 +65,5 @@ runs:
         $env:ADO_TOKEN = az account get-access-token --resource "499b84ac-1321-427f-aa17-267ca6975798" --query accessToken -o tsv
         Write-Output "::add-mask::$env:ADO_TOKEN"
 
-        .\scripts\Setup-ContainerAndRepository.ps1 -InstanceId "${{ inputs.instance-id }}"
+        .\scripts\Setup-ContainerAndRepository.ps1 -InstanceId "${{ inputs.instance-id }}" ${{ inputs.skip-container == 'true' && '-SkipContainer' || '' }}
       shell: pwsh
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
@@ -14,6 +14,9 @@ This is a benchmark for evaluating coding agents on real-world Business Central
 - Uses `uv` for dependency management: e.g. `uv add <package>` to add packages, `uv run <command>` to run commands
 - Uses `pre-commit` for code quality checks (ruff linting/formatting, trailing whitespace, etc.)
 
+## Categories
+BC-Bench is category-based and designed to grow over time. It currently has two categories, `bug-fix` and `test-generation`. They share the same dataset tasks and execution-based setup, but use different prompts, expected outputs, and evaluation pipelines. Future categories such as `code-review` can be added within the same overall benchmark structure, though they may require different inputs, setup, or evaluation methods.
+
 ## Coding Patterns and Guidelines
 
 - Prefer strong typing and type hints

diff --git a/notebooks/bug-fix/overview.ipynb b/notebooks/bug-fix/overview.ipynb
@@ -269,7 +269,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "id": "8b5bb1be",
    "metadata": {},
    "outputs": [
@@ -291,7 +291,7 @@
     "merged_df[\"image_bin\"] = pd.cut(merged_df[\"image_count\"], bins=bins, labels=labels)\n",
     "\n",
     "# Add problem statement char count\n",
-    "ps_chars = {entry.instance_id: len(entry.get_task(transform_image_paths=False)) for entry in bcbench_dataset}\n",
+    "ps_chars = {entry.instance_id: len(entry.get_task()) for entry in bcbench_dataset}\n",
     "merged_df[\"ps_chars\"] = merged_df[\"instance_id\"].map(ps_chars)\n",
     "\n",
     "instance_df = (\n",

diff --git a/scripts/Setup-ContainerAndRepository.ps1 b/scripts/Setup-ContainerAndRepository.ps1
@@ -25,7 +25,10 @@ param(
     [SecureString]$Password,
 
     [Parameter(Mandatory = $false)]
-    [string]$RepoPath
+    [string]$RepoPath,
+
+    [Parameter(Mandatory = $false)]
+    [switch]$SkipContainer
 )
 
 [DatasetEntry[]] $entries = Get-DatasetEntries -DatasetPath $DatasetPath -Version $Version -InstanceId $InstanceId
@@ -37,9 +40,7 @@ else {
     Write-Log "Found $($entries.Count) dataset entries to process." -Level Info
 }
 
-Write-Log "Setting up BC container and repository for version $Version, Dataset Path: $DatasetPath" -Level Info
-
-[PSCredential]$credential = Get-BCCredential -Username $Username -Password $Password
+Write-Log "Setting up repository for version $Version, Dataset Path: $DatasetPath" -Level Info
 
 if (-not $RepoPath) {
     $RepoPath = Join-Path -Path $env:GITHUB_WORKSPACE -ChildPath "testbed"
@@ -56,27 +57,34 @@ if (Test-Path $RepoPath) {
 Write-Log "Cloning repository $($entries[0].repo) to $RepoPath" -Level Info
 Invoke-GitCloneWithRetry -RepoUrl $cloneInfo.Url -Token $cloneInfo.Token -ClonePath $RepoPath -CommitSha $commitSha -SparseCheckoutPaths $cloneInfo.SparseCheckoutPaths
 
-Import-Module BcContainerHelper -Force -DisableNameChecking
+if (-not $SkipContainer) {
+    [PSCredential]$credential = Get-BCCredential -Username $Username -Password $Password
 
-Write-Log "Container name: $ContainerName" -Level Info
+    Import-Module BcContainerHelper -Force -DisableNameChecking
 
-if (Test-ContainerExists -containerName $ContainerName) {
-    throw "Container $ContainerName already exists. This indicates the machine was not properly cleaned up from a previous run."
-}
+    Write-Log "Container name: $ContainerName" -Level Info
 
-Write-Log "Creating container $ContainerName for version $Version..." -Level Info
+    if (Test-ContainerExists -containerName $ContainerName) {
+        throw "Container $ContainerName already exists. This indicates the machine was not properly cleaned up from a previous run."
+    }
 
-# Get BC artifact URL
-[string] $url = Get-BCArtifactUrl -version $Version -Country $Country
-Write-Log "Retrieved artifact URL: $url" -Level Info
+    Write-Log "Creating container $ContainerName for version $Version..." -Level Info
 
-# Create container synchronously with NAV folder shared
-New-BCContainerSync -ContainerName $ContainerName -Version $Version -ArtifactUrl $url -Credential $credential -AdditionalFolders @($RepoPath)
+    # Get BC artifact URL
+    [string] $url = Get-BCArtifactUrl -version $Version -Country $Country
+    Write-Log "Retrieved artifact URL: $url" -Level Info
 
-# Create compiler folder synchronously
-New-BCCompilerFolderSync -ContainerName $ContainerName -ArtifactUrl $url
+    # Create container synchronously with NAV folder shared
+    New-BCContainerSync -ContainerName $ContainerName -Version $Version -ArtifactUrl $url -Credential $credential -AdditionalFolders @($RepoPath)
 
-Initialize-ContainerForDevelopment -ContainerName $ContainerName -RepoVersion ([System.Version]$Version)
+    # Create compiler folder synchronously
+    New-BCCompilerFolderSync -ContainerName $ContainerName -ArtifactUrl $url
+
+    Initialize-ContainerForDevelopment -ContainerName $ContainerName -RepoVersion ([System.Version]$Version)
+}
+else {
+    Write-Log "Skipping BC container setup (SkipContainer flag set)" -Level Info
+}
 
 # Set output for GitHub Actions or return path
 if ($env:GITHUB_OUTPUT) {

diff --git a/src/bcbench/agent/mini/agent.py b/src/bcbench/agent/mini/agent.py
@@ -73,7 +73,9 @@ def run_mini_agent(
 
     logger.info(f"Running mini-bc-agent on: {entry.instance_id}")
 
-    task: str = entry.get_task(transform_image_paths=True)
+    from bcbench.agent.shared.prompt import _transform_image_paths
+
+    task: str = _transform_image_paths(entry.get_task())
 
     # Lazy import and create agent
     from minisweagent.models.litellm_model import LitellmModel

diff --git a/src/bcbench/agent/shared/prompt.py b/src/bcbench/agent/shared/prompt.py
@@ -1,10 +1,19 @@
+import re
 from pathlib import Path
 
 from jinja2 import Template
 
+from bcbench.config import get_config
 from bcbench.dataset import BaseDatasetEntry
 from bcbench.types import EvaluationCategory
 
+_config = get_config()
+
+
+def _transform_image_paths(content: str) -> str:  # rewrites markdown images "![alt](./x)" to "![alt](<dest_dir>/x)"
+    dest_dir = _config.file_patterns.problem_statement_dest_dir  # inserted via the callable below so backslashes stay literal
+    return re.sub(r"!\[([^\]]*)\]\(\./([^)]+)\)", lambda m: f"![{m.group(1)}]({dest_dir}/{m.group(2)})", content)
+
 
 def build_prompt(entry: BaseDatasetEntry, repo_path: Path, config: dict, category: EvaluationCategory, al_mcp: bool = False) -> str:
     prompt_config = config.get("prompt", {})
@@ -15,10 +24,12 @@ def build_prompt(entry: BaseDatasetEntry, repo_path: Path, config: dict, categor
     is_gold_patch: bool = category == EvaluationCategory.TEST_GENERATION and test_gen_input in ("gold-patch", "both")
     is_problem_statement: bool = category == EvaluationCategory.TEST_GENERATION and test_gen_input in ("problem-statement", "both")
 
+    task = _transform_image_paths(entry.get_task())
+
     template = Template(template_str)
     return template.render(
         repo_path=repo_path,
-        task=entry.get_task(transform_image_paths=True),
+        task=task,
         project_paths=", ".join(entry.project_paths),
         include_project_paths=include_project_paths,
         is_gold_patch=is_gold_patch,  # only relevant for test-generation

diff --git a/src/bcbench/commands/dataset.py b/src/bcbench/commands/dataset.py
@@ -93,8 +93,9 @@ def view_entry(
 
     metadata_dict = entry.metadata.model_dump()
     for field_name, field_value in metadata_dict.items():
-        display_name = field_name.replace("_", " ").title()
-        info_table.add_row(f"[dim]Metadata:[/dim] {display_name}", str(field_value) if field_value else "N/A")
+        if field_value is not None:
+            display_name = field_name.replace("_", " ").title()
+            info_table.add_row(f"[dim]Metadata:[/dim] {display_name}", str(field_value))
 
     console.print(Panel(info_table, title="[bold]Entry Information[/bold]", border_style="blue"))
 

diff --git a/src/bcbench/commands/evaluate.py b/src/bcbench/commands/evaluate.py
@@ -23,7 +23,7 @@
 from bcbench.dataset import BaseDatasetEntry
 from bcbench.evaluate import EvaluationPipeline
 from bcbench.logger import get_logger
-from bcbench.results import BaseEvaluationResult
+from bcbench.results import BaseEvaluationResult, ExecutionBasedEvaluationResult
 from bcbench.types import AgentMetrics, ContainerConfig, EvaluationContext, ExperimentConfiguration
 
 logger = get_logger(__name__)
@@ -228,6 +228,9 @@ class MockEvaluationPipeline(EvaluationPipeline[BaseDatasetEntry]):
     It randomly generates different scenarios to test result handling and serialization.
     """
 
+    def setup_workspace(self, entry: BaseDatasetEntry, repo_path: Path) -> None:
+        logger.info("Mock pipeline: Skipping workspace setup")  # no-op: only logs; entry and repo_path are unused
+
     def setup(self, context: EvaluationContext[BaseDatasetEntry]) -> None:
         logger.info("Mock pipeline: Skipping setup")
 
@@ -271,11 +274,11 @@ def evaluate(self, context: EvaluationContext[BaseDatasetEntry]) -> None:
         result: BaseEvaluationResult
         match scenario:
             case "success":
-                result = BaseEvaluationResult.create_success(context, "MOCK_PATCH_CONTENT")
+                result = ExecutionBasedEvaluationResult.create_success(context, "MOCK_PATCH_CONTENT")
             case "build-fail":
-                result = BaseEvaluationResult.create_build_failure(context, "MOCK_PATCH_CONTENT", "Mock build failure")
+                result = ExecutionBasedEvaluationResult.create_build_failure(context, "MOCK_PATCH_CONTENT", "Mock build failure")
             case "test-fail":
-                result = BaseEvaluationResult.create_test_failure(context, "MOCK_PATCH_CONTENT", "Mock test failure")
+                result = ExecutionBasedEvaluationResult.create_test_failure(context, "MOCK_PATCH_CONTENT", "Mock test failure")
             case _:
                 raise ValueError("Invalid mock scenario, this should not happen")
 

diff --git a/src/bcbench/commands/result.py b/src/bcbench/commands/result.py
@@ -12,11 +12,11 @@
 from bcbench.results import (
     BaseEvaluationResult,
     EvaluationResultSummary,
+    ExecutionBasedEvaluationResultSummary,
     Leaderboard,
     LeaderboardAggregate,
     create_console_summary,
     create_github_job_summary,
-    create_result_from_json,
     write_bceval_results,
 )
 
@@ -65,21 +65,21 @@ def result_summarize(
     for results_path in result_files:
         logger.info(f"Reading results from: {results_path}")
         with open(results_path) as f:
-            results.extend(create_result_from_json(json.loads(line)) for line in f if line.strip())
+            results.extend(BaseEvaluationResult.from_json(json.loads(line)) for line in f if line.strip())
 
     if not results:
         logger.error("No results found in the result files")
         raise typer.Exit(code=1)
 
     write_bceval_results(results, run_dir, run_id, bceval_output, category)
 
+    summary = EvaluationResultSummary.from_results(results, run_id=run_id)
+
     if _config.env.github_actions:
-        create_github_job_summary(results)
+        create_github_job_summary(results, summary)
     else:
-        create_console_summary(results)
+        create_console_summary(results, summary)
 
-    # Save summary JSON
-    summary = EvaluationResultSummary.from_results(results, run_id=run_id)
     summary.save(run_dir, summary_output)
 
 
@@ -90,8 +90,8 @@ def _get_combination_key(result: EvaluationResultSummary) -> tuple[str, str, str
     return (result.agent_name, result.model, exp_key, result.benchmark_version)
 
 
-def _rebuild_aggregates(runs: list[EvaluationResultSummary]) -> list[LeaderboardAggregate]:
-    grouped: defaultdict[tuple[str, str, str | None, str], list[EvaluationResultSummary]] = defaultdict(list)
+def _rebuild_aggregates(runs: list[ExecutionBasedEvaluationResultSummary]) -> list[LeaderboardAggregate]:
+    grouped: defaultdict[tuple[str, str, str | None, str], list[ExecutionBasedEvaluationResultSummary]] = defaultdict(list)
     for run in runs:
         grouped[_get_combination_key(run)].append(run)
     return [LeaderboardAggregate.from_runs(group) for group in grouped.values()]
@@ -111,7 +111,7 @@ def result_update(
     """
     logger.info(f"Loading evaluation summary from: {evaluation_summary}")
     with open(evaluation_summary, encoding="utf-8") as f:
-        new_result = EvaluationResultSummary.model_validate_json(f.read())
+        new_result = ExecutionBasedEvaluationResultSummary.model_validate_json(f.read())
 
     logger.info(f"Processing result for agent '{new_result.agent_name}' with model '{new_result.model}' in category '{new_result.category.value}'")
 
@@ -120,13 +120,13 @@ def result_update(
 
     # Load existing leaderboard
     leaderboard: Leaderboard = Leaderboard.load(leaderboard_path)
-    runs: list[EvaluationResultSummary] = list(leaderboard.runs)
+    runs: list[ExecutionBasedEvaluationResultSummary] = list(leaderboard.runs)
     logger.info(f"Loaded {len(runs)} existing runs")
 
     # Find runs matching this combination
     new_result_key = _get_combination_key(new_result)
-    matching_runs: list[EvaluationResultSummary] = [r for r in runs if _get_combination_key(r) == new_result_key]
-    other_runs: list[EvaluationResultSummary] = [r for r in runs if _get_combination_key(r) != new_result_key]
+    matching_runs: list[ExecutionBasedEvaluationResultSummary] = [r for r in runs if _get_combination_key(r) == new_result_key]
+    other_runs: list[ExecutionBasedEvaluationResultSummary] = [r for r in runs if _get_combination_key(r) != new_result_key]
 
     if len(matching_runs) < n:
         logger.info(f"Adding run ({len(matching_runs) + 1}/{n}) for '{new_result.agent_name}' + '{new_result.model}'")
@@ -137,7 +137,7 @@ def result_update(
         matching_runs = [*matching_runs[1:], new_result]
 
     # Combine and rebuild aggregates
-    all_runs: list[EvaluationResultSummary] = other_runs + matching_runs
+    all_runs: list[ExecutionBasedEvaluationResultSummary] = other_runs + matching_runs
     aggregates = _rebuild_aggregates(all_runs)
 
     # Write back
@@ -171,7 +171,7 @@ def result_refresh(
         logger.info(f"Refreshing: {leaderboard_path.name}")
 
         leaderboard: Leaderboard = Leaderboard.load(leaderboard_path)
-        runs: list[EvaluationResultSummary] = list(leaderboard.runs)
+        runs: list[ExecutionBasedEvaluationResultSummary] = list(leaderboard.runs)
 
         if not runs:
             logger.warning(f"No runs found in {leaderboard_path.name}, skipping")

diff --git a/src/bcbench/commands/run.py b/src/bcbench/commands/run.py
@@ -19,9 +19,7 @@
     RepoPath,
 )
 from bcbench.config import get_config
-from bcbench.dataset.dataset_entry import _BugFixTestGenBase
 from bcbench.logger import get_logger
-from bcbench.operations import setup_repo_postbuild, setup_repo_prebuild
 
 logger = get_logger(__name__)
 _config = get_config()
@@ -46,9 +44,7 @@ def run_mini(
         uv run bcbench run mini microsoft__BCApps-5633 --step-limit 5 --category bug-fix
     """
     entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0]
-    setup_repo_prebuild(entry, repo_path)
-    if isinstance(entry, _BugFixTestGenBase):
-        setup_repo_postbuild(entry, repo_path, category)
+    category.pipeline.setup_workspace(entry, repo_path)
 
     run_mini_agent(
         entry=entry,
@@ -78,9 +74,7 @@ def run_copilot(
         uv run bcbench run copilot microsoft__BCApps-5633 --category bug-fix --repo-path /path/to/BCApps
     """
     entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0]
-    setup_repo_prebuild(entry, repo_path)
-    if isinstance(entry, _BugFixTestGenBase):
-        setup_repo_postbuild(entry, repo_path, category)
+    category.pipeline.setup_workspace(entry, repo_path)
 
     run_copilot_agent(entry=entry, repo_path=repo_path, model=model, category=category, output_dir=output_dir, al_mcp=al_mcp, container_name=container_name)
 
@@ -104,9 +98,7 @@ def run_claude(
         uv run bcbench run claude microsoft__BCApps-5633 --category bug-fix --repo-path /path/to/BCApps
     """
     entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0]
-    setup_repo_prebuild(entry, repo_path)
-    if isinstance(entry, _BugFixTestGenBase):
-        setup_repo_postbuild(entry, repo_path, category)
+    category.pipeline.setup_workspace(entry, repo_path)
 
     run_claude_code(entry=entry, repo_path=repo_path, model=model, category=category, output_dir=output_dir, al_mcp=al_mcp, container_name=container_name)