Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion .github/actions/setup-bc-container/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ inputs:
github-token:
description: GitHub token for accessing public repositories
required: true
skip-container:
description: Skip BC container setup (only clone repository)
required: false
default: "false"

outputs:
repo_path:
Expand All @@ -24,6 +28,7 @@ runs:
using: composite
steps:
- name: Generate BC container name and credentials
if: inputs.skip-container != 'true'
run: |
# Generate a 32-character random password using Get-Random
# The password is short-lived and only used for the duration of the workflow
Expand All @@ -38,6 +43,7 @@ runs:
shell: pwsh

- name: Install BcContainerHelper module
if: inputs.skip-container != 'true'
run: Install-Module -Name BcContainerHelper -Force -AllowClobber -AllowPrerelease
shell: pwsh

Expand All @@ -59,5 +65,5 @@ runs:
$env:ADO_TOKEN = az account get-access-token --resource "499b84ac-1321-427f-aa17-267ca6975798" --query accessToken -o tsv
Write-Output "::add-mask::$env:ADO_TOKEN"

.\scripts\Setup-ContainerAndRepository.ps1 -InstanceId "${{ inputs.instance-id }}"
.\scripts\Setup-ContainerAndRepository.ps1 -InstanceId "${{ inputs.instance-id }}" ${{ inputs.skip-container == 'true' && '-SkipContainer' || '' }}
shell: pwsh
3 changes: 3 additions & 0 deletions .github/copilot-instructions.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ This is a benchmark for evaluating coding agents on real-world Business Central
- Uses `uv` for dependency management: e.g. `uv add <package>` to add packages, `uv run <command>` to run commands
- Uses `pre-commit` for code quality checks (ruff linting/formatting, trailing whitespace, etc.)

## Categories
BC-Bench is category-based and designed to grow over time. It currently has two categories, `bug-fix` and `test-generation`. They share the same dataset tasks and execution-based setup, but use different prompts, expected outputs, and evaluation pipelines. Future categories such as `code-review` can be added within the same overall benchmark structure, though they may require different inputs, setup, or evaluation methods.

## Coding Patterns and Guidelines

- Prefer strong typing and type hints
Expand Down
4 changes: 2 additions & 2 deletions notebooks/bug-fix/overview.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"id": "8b5bb1be",
"metadata": {},
"outputs": [
Expand All @@ -291,7 +291,7 @@
"merged_df[\"image_bin\"] = pd.cut(merged_df[\"image_count\"], bins=bins, labels=labels)\n",
"\n",
"# Add problem statement char count\n",
"ps_chars = {entry.instance_id: len(entry.get_task(transform_image_paths=False)) for entry in bcbench_dataset}\n",
"ps_chars = {entry.instance_id: len(entry.get_task()) for entry in bcbench_dataset}\n",
"merged_df[\"ps_chars\"] = merged_df[\"instance_id\"].map(ps_chars)\n",
"\n",
"instance_df = (\n",
Expand Down
44 changes: 26 additions & 18 deletions scripts/Setup-ContainerAndRepository.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@ param(
[SecureString]$Password,

[Parameter(Mandatory = $false)]
[string]$RepoPath
[string]$RepoPath,

[Parameter(Mandatory = $false)]
[switch]$SkipContainer
)

[DatasetEntry[]] $entries = Get-DatasetEntries -DatasetPath $DatasetPath -Version $Version -InstanceId $InstanceId
Expand All @@ -37,9 +40,7 @@ else {
Write-Log "Found $($entries.Count) dataset entries to process." -Level Info
}

Write-Log "Setting up BC container and repository for version $Version, Dataset Path: $DatasetPath" -Level Info

[PSCredential]$credential = Get-BCCredential -Username $Username -Password $Password
Write-Log "Setting up repository for version $Version, Dataset Path: $DatasetPath" -Level Info

if (-not $RepoPath) {
$RepoPath = Join-Path -Path $env:GITHUB_WORKSPACE -ChildPath "testbed"
Expand All @@ -56,27 +57,34 @@ if (Test-Path $RepoPath) {
Write-Log "Cloning repository $($entries[0].repo) to $RepoPath" -Level Info
Invoke-GitCloneWithRetry -RepoUrl $cloneInfo.Url -Token $cloneInfo.Token -ClonePath $RepoPath -CommitSha $commitSha -SparseCheckoutPaths $cloneInfo.SparseCheckoutPaths

Import-Module BcContainerHelper -Force -DisableNameChecking
if (-not $SkipContainer) {
[PSCredential]$credential = Get-BCCredential -Username $Username -Password $Password

Write-Log "Container name: $ContainerName" -Level Info
Import-Module BcContainerHelper -Force -DisableNameChecking

if (Test-ContainerExists -containerName $ContainerName) {
throw "Container $ContainerName already exists. This indicates the machine was not properly cleaned up from a previous run."
}
Write-Log "Container name: $ContainerName" -Level Info

Write-Log "Creating container $ContainerName for version $Version..." -Level Info
if (Test-ContainerExists -containerName $ContainerName) {
throw "Container $ContainerName already exists. This indicates the machine was not properly cleaned up from a previous run."
}

# Get BC artifact URL
[string] $url = Get-BCArtifactUrl -version $Version -Country $Country
Write-Log "Retrieved artifact URL: $url" -Level Info
Write-Log "Creating container $ContainerName for version $Version..." -Level Info

# Create container synchronously with NAV folder shared
New-BCContainerSync -ContainerName $ContainerName -Version $Version -ArtifactUrl $url -Credential $credential -AdditionalFolders @($RepoPath)
# Get BC artifact URL
[string] $url = Get-BCArtifactUrl -version $Version -Country $Country
Write-Log "Retrieved artifact URL: $url" -Level Info

# Create compiler folder synchronously
New-BCCompilerFolderSync -ContainerName $ContainerName -ArtifactUrl $url
# Create container synchronously with NAV folder shared
New-BCContainerSync -ContainerName $ContainerName -Version $Version -ArtifactUrl $url -Credential $credential -AdditionalFolders @($RepoPath)

Initialize-ContainerForDevelopment -ContainerName $ContainerName -RepoVersion ([System.Version]$Version)
# Create compiler folder synchronously
New-BCCompilerFolderSync -ContainerName $ContainerName -ArtifactUrl $url

Initialize-ContainerForDevelopment -ContainerName $ContainerName -RepoVersion ([System.Version]$Version)
}
else {
Write-Log "Skipping BC container setup (SkipContainer flag set)" -Level Info
}

# Set output for GitHub Actions or return path
if ($env:GITHUB_OUTPUT) {
Expand Down
4 changes: 3 additions & 1 deletion src/bcbench/agent/mini/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,9 @@ def run_mini_agent(

logger.info(f"Running mini-bc-agent on: {entry.instance_id}")

task: str = entry.get_task(transform_image_paths=True)
from bcbench.agent.shared.prompt import _transform_image_paths

task: str = _transform_image_paths(entry.get_task())

# Lazy import and create agent
from minisweagent.models.litellm_model import LitellmModel
Expand Down
13 changes: 12 additions & 1 deletion src/bcbench/agent/shared/prompt.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
import re
from pathlib import Path

from jinja2 import Template

from bcbench.config import get_config
from bcbench.dataset import BaseDatasetEntry
from bcbench.types import EvaluationCategory

_config = get_config()


def _transform_image_paths(content: str) -> str:
    """Re-root ``./``-relative Markdown image links under the problem-statement directory.

    Every image reference of the form ``![alt](./path)`` in *content* becomes
    ``![alt](<dest_dir>/path)``, where ``<dest_dir>`` is read from
    ``_config.file_patterns.problem_statement_dest_dir``. Image links that do not
    start with ``./`` and all other text are returned unchanged.

    Args:
        content: Markdown text that may contain ``./``-relative image links.

    Returns:
        The text with the matching image links rewritten.
    """
    dest_dir = _config.file_patterns.problem_statement_dest_dir

    def _reroot(match: re.Match[str]) -> str:
        # Build the replacement in a callable rather than a template string:
        # re.sub parses backslashes in a template, so a destination such as
        # ``C:\problem`` would raise "bad escape" or be silently altered.
        return f"![{match.group(1)}]({dest_dir}/{match.group(2)})"

    return re.sub(r"!\[([^\]]*)\]\(\./([^)]+)\)", _reroot, content)


def build_prompt(entry: BaseDatasetEntry, repo_path: Path, config: dict, category: EvaluationCategory, al_mcp: bool = False) -> str:
prompt_config = config.get("prompt", {})
Expand All @@ -15,10 +24,12 @@ def build_prompt(entry: BaseDatasetEntry, repo_path: Path, config: dict, categor
is_gold_patch: bool = category == EvaluationCategory.TEST_GENERATION and test_gen_input in ("gold-patch", "both")
is_problem_statement: bool = category == EvaluationCategory.TEST_GENERATION and test_gen_input in ("problem-statement", "both")

task = _transform_image_paths(entry.get_task())

template = Template(template_str)
return template.render(
repo_path=repo_path,
task=entry.get_task(transform_image_paths=True),
task=task,
project_paths=", ".join(entry.project_paths),
include_project_paths=include_project_paths,
is_gold_patch=is_gold_patch, # only relevant for test-generation
Expand Down
5 changes: 3 additions & 2 deletions src/bcbench/commands/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,9 @@ def view_entry(

metadata_dict = entry.metadata.model_dump()
for field_name, field_value in metadata_dict.items():
display_name = field_name.replace("_", " ").title()
info_table.add_row(f"[dim]Metadata:[/dim] {display_name}", str(field_value) if field_value else "N/A")
if field_value is not None:
display_name = field_name.replace("_", " ").title()
info_table.add_row(f"[dim]Metadata:[/dim] {display_name}", str(field_value))

console.print(Panel(info_table, title="[bold]Entry Information[/bold]", border_style="blue"))

Expand Down
11 changes: 7 additions & 4 deletions src/bcbench/commands/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from bcbench.dataset import BaseDatasetEntry
from bcbench.evaluate import EvaluationPipeline
from bcbench.logger import get_logger
from bcbench.results import BaseEvaluationResult
from bcbench.results import BaseEvaluationResult, ExecutionBasedEvaluationResult
from bcbench.types import AgentMetrics, ContainerConfig, EvaluationContext, ExperimentConfiguration

logger = get_logger(__name__)
Expand Down Expand Up @@ -228,6 +228,9 @@ class MockEvaluationPipeline(EvaluationPipeline[BaseDatasetEntry]):
It randomly generates different scenarios to test result handling and serialization.
"""

def setup_workspace(self, entry: BaseDatasetEntry, repo_path: Path) -> None:
    """Mock workspace setup: log that the step is skipped and do nothing else.

    Both ``entry`` and ``repo_path`` are accepted but unused.
    """
    logger.info("Mock pipeline: Skipping workspace setup")

def setup(self, context: EvaluationContext[BaseDatasetEntry]) -> None:
    """Mock setup: log that the step is skipped and do nothing else.

    ``context`` is accepted but unused.
    """
    logger.info("Mock pipeline: Skipping setup")

Expand Down Expand Up @@ -271,11 +274,11 @@ def evaluate(self, context: EvaluationContext[BaseDatasetEntry]) -> None:
result: BaseEvaluationResult
match scenario:
case "success":
result = BaseEvaluationResult.create_success(context, "MOCK_PATCH_CONTENT")
result = ExecutionBasedEvaluationResult.create_success(context, "MOCK_PATCH_CONTENT")
case "build-fail":
result = BaseEvaluationResult.create_build_failure(context, "MOCK_PATCH_CONTENT", "Mock build failure")
result = ExecutionBasedEvaluationResult.create_build_failure(context, "MOCK_PATCH_CONTENT", "Mock build failure")
case "test-fail":
result = BaseEvaluationResult.create_test_failure(context, "MOCK_PATCH_CONTENT", "Mock test failure")
result = ExecutionBasedEvaluationResult.create_test_failure(context, "MOCK_PATCH_CONTENT", "Mock test failure")
case _:
raise ValueError("Invalid mock scenario, this should not happen")

Expand Down
28 changes: 14 additions & 14 deletions src/bcbench/commands/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@
from bcbench.results import (
BaseEvaluationResult,
EvaluationResultSummary,
ExecutionBasedEvaluationResultSummary,
Leaderboard,
LeaderboardAggregate,
create_console_summary,
create_github_job_summary,
create_result_from_json,
write_bceval_results,
)

Expand Down Expand Up @@ -65,21 +65,21 @@ def result_summarize(
for results_path in result_files:
logger.info(f"Reading results from: {results_path}")
with open(results_path) as f:
results.extend(create_result_from_json(json.loads(line)) for line in f if line.strip())
results.extend(BaseEvaluationResult.from_json(json.loads(line)) for line in f if line.strip())

if not results:
logger.error("No results found in the result files")
raise typer.Exit(code=1)

write_bceval_results(results, run_dir, run_id, bceval_output, category)

summary = EvaluationResultSummary.from_results(results, run_id=run_id)

if _config.env.github_actions:
create_github_job_summary(results)
create_github_job_summary(results, summary)
else:
create_console_summary(results)
create_console_summary(results, summary)

# Save summary JSON
summary = EvaluationResultSummary.from_results(results, run_id=run_id)
summary.save(run_dir, summary_output)


Expand All @@ -90,8 +90,8 @@ def _get_combination_key(result: EvaluationResultSummary) -> tuple[str, str, str
return (result.agent_name, result.model, exp_key, result.benchmark_version)


def _rebuild_aggregates(runs: list[EvaluationResultSummary]) -> list[LeaderboardAggregate]:
grouped: defaultdict[tuple[str, str, str | None, str], list[EvaluationResultSummary]] = defaultdict(list)
def _rebuild_aggregates(runs: list[ExecutionBasedEvaluationResultSummary]) -> list[LeaderboardAggregate]:
    """Build one leaderboard aggregate per distinct combination key.

    Runs are bucketed by the tuple returned from ``_get_combination_key``; each
    bucket is then handed to ``LeaderboardAggregate.from_runs``. Buckets are
    emitted in the order their key first appears in *runs*, and runs keep their
    input order within a bucket.

    Args:
        runs: Run summaries to aggregate.

    Returns:
        One aggregate per distinct key; an empty list when *runs* is empty.
    """
    runs_by_key: dict[tuple[str, str, str | None, str], list[ExecutionBasedEvaluationResultSummary]] = {}
    for summary in runs:
        runs_by_key.setdefault(_get_combination_key(summary), []).append(summary)
    return list(map(LeaderboardAggregate.from_runs, runs_by_key.values()))
Expand All @@ -111,7 +111,7 @@ def result_update(
"""
logger.info(f"Loading evaluation summary from: {evaluation_summary}")
with open(evaluation_summary, encoding="utf-8") as f:
new_result = EvaluationResultSummary.model_validate_json(f.read())
new_result = ExecutionBasedEvaluationResultSummary.model_validate_json(f.read())

logger.info(f"Processing result for agent '{new_result.agent_name}' with model '{new_result.model}' in category '{new_result.category.value}'")

Expand All @@ -120,13 +120,13 @@ def result_update(

# Load existing leaderboard
leaderboard: Leaderboard = Leaderboard.load(leaderboard_path)
runs: list[EvaluationResultSummary] = list(leaderboard.runs)
runs: list[ExecutionBasedEvaluationResultSummary] = list(leaderboard.runs)
logger.info(f"Loaded {len(runs)} existing runs")

# Find runs matching this combination
new_result_key = _get_combination_key(new_result)
matching_runs: list[EvaluationResultSummary] = [r for r in runs if _get_combination_key(r) == new_result_key]
other_runs: list[EvaluationResultSummary] = [r for r in runs if _get_combination_key(r) != new_result_key]
matching_runs: list[ExecutionBasedEvaluationResultSummary] = [r for r in runs if _get_combination_key(r) == new_result_key]
other_runs: list[ExecutionBasedEvaluationResultSummary] = [r for r in runs if _get_combination_key(r) != new_result_key]

if len(matching_runs) < n:
logger.info(f"Adding run ({len(matching_runs) + 1}/{n}) for '{new_result.agent_name}' + '{new_result.model}'")
Expand All @@ -137,7 +137,7 @@ def result_update(
matching_runs = [*matching_runs[1:], new_result]

# Combine and rebuild aggregates
all_runs: list[EvaluationResultSummary] = other_runs + matching_runs
all_runs: list[ExecutionBasedEvaluationResultSummary] = other_runs + matching_runs
aggregates = _rebuild_aggregates(all_runs)

# Write back
Expand Down Expand Up @@ -171,7 +171,7 @@ def result_refresh(
logger.info(f"Refreshing: {leaderboard_path.name}")

leaderboard: Leaderboard = Leaderboard.load(leaderboard_path)
runs: list[EvaluationResultSummary] = list(leaderboard.runs)
runs: list[ExecutionBasedEvaluationResultSummary] = list(leaderboard.runs)

if not runs:
logger.warning(f"No runs found in {leaderboard_path.name}, skipping")
Expand Down
14 changes: 3 additions & 11 deletions src/bcbench/commands/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,7 @@
RepoPath,
)
from bcbench.config import get_config
from bcbench.dataset.dataset_entry import _BugFixTestGenBase
from bcbench.logger import get_logger
from bcbench.operations import setup_repo_postbuild, setup_repo_prebuild

logger = get_logger(__name__)
_config = get_config()
Expand All @@ -46,9 +44,7 @@ def run_mini(
uv run bcbench run mini microsoft__BCApps-5633 --step-limit 5 --category bug-fix
"""
entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0]
setup_repo_prebuild(entry, repo_path)
if isinstance(entry, _BugFixTestGenBase):
setup_repo_postbuild(entry, repo_path, category)
category.pipeline.setup_workspace(entry, repo_path)

run_mini_agent(
entry=entry,
Expand Down Expand Up @@ -78,9 +74,7 @@ def run_copilot(
uv run bcbench run copilot microsoft__BCApps-5633 --category bug-fix --repo-path /path/to/BCApps
"""
entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0]
setup_repo_prebuild(entry, repo_path)
if isinstance(entry, _BugFixTestGenBase):
setup_repo_postbuild(entry, repo_path, category)
category.pipeline.setup_workspace(entry, repo_path)

run_copilot_agent(entry=entry, repo_path=repo_path, model=model, category=category, output_dir=output_dir, al_mcp=al_mcp, container_name=container_name)

Expand All @@ -104,9 +98,7 @@ def run_claude(
uv run bcbench run claude microsoft__BCApps-5633 --category bug-fix --repo-path /path/to/BCApps
"""
entry = category.entry_class.load(category.dataset_path, entry_id=entry_id)[0]
setup_repo_prebuild(entry, repo_path)
if isinstance(entry, _BugFixTestGenBase):
setup_repo_postbuild(entry, repo_path, category)
category.pipeline.setup_workspace(entry, repo_path)

run_claude_code(entry=entry, repo_path=repo_path, model=model, category=category, output_dir=output_dir, al_mcp=al_mcp, container_name=container_name)

Expand Down
Loading
Loading