Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,6 @@ jobs:
- name: Run tests with coverage
run: uv run pytest --cov=src/bcbench --cov-report=term-missing

get-entries:
uses: ./.github/workflows/get-entries.yml
with:
test-run: true

select-category:
runs-on: ubuntu-latest
outputs:
Expand All @@ -50,6 +45,13 @@ jobs:
$selected = $categories | Get-Random
echo "category=$selected" >> $env:GITHUB_OUTPUT

get-entries:
needs: select-category
uses: ./.github/workflows/get-entries.yml
with:
test-run: true
category: ${{ needs.select-category.outputs.category }}

mock-evaluation:
runs-on: ubuntu-latest
needs: [get-entries, select-category]
Expand Down Expand Up @@ -79,7 +81,7 @@ jobs:
retention-days: 1

summarize-results:
needs: mock-evaluation
needs: [mock-evaluation, select-category]
uses: ./.github/workflows/summarize-results.yml
permissions:
contents: write
Expand All @@ -89,5 +91,5 @@ jobs:
model: ${{ github.run_id }}
agent: "mock-agent"
mock: true
category: "delete"
category: ${{ needs.select-category.outputs.category }}
secrets: inherit
1 change: 1 addition & 0 deletions .github/workflows/claude-evaluation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ jobs:
uses: ./.github/workflows/get-entries.yml
with:
test-run: ${{ inputs.test-run }}
category: ${{ inputs.category }}

evaluate-with-claude-code:
runs-on: [self-hosted, 1ES.Pool=GitHub-BCBench]
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/copilot-evaluation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ jobs:
uses: ./.github/workflows/get-entries.yml
with:
test-run: ${{ inputs.test-run }}
category: ${{ inputs.category }}

evaluate-with-copilot-cli:
runs-on: [self-hosted, 1ES.Pool=GitHub-BCBench]
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/dataset-validation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,15 @@ on:
default: true
type: boolean
schedule:
- cron: "0 0 * * 0" # Weekly on Sundays at midnight
- cron: "0 0 * * 0"

jobs:
get-entries:
uses: ./.github/workflows/get-entries.yml
with:
modified-only: false
test-run: ${{ inputs.test-run || false }}
category: "bug-fix"

verify-build-and-tests:
runs-on: [self-hosted, 1ES.Pool=GitHub-BCBench]
Expand Down
7 changes: 6 additions & 1 deletion .github/workflows/get-entries.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ on:
required: false
type: boolean
default: false
category:
description: Evaluation category
required: true
type: string
default: "bug-fix"
outputs:
entries:
description: JSON array of dataset entries
Expand All @@ -37,7 +42,7 @@ jobs:
- name: Get entries for matrix
id: get-entries
run: |
cmd="uv run bcbench dataset list --github-output entries"
cmd="uv run bcbench dataset list --category ${{ inputs.category }} --github-output entries"

if [[ "${{ inputs.modified-only }}" == "true" ]]; then
cmd="$cmd --modified-only"
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/mini-evaluation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ jobs:
uses: ./.github/workflows/get-entries.yml
with:
test-run: ${{ inputs.test-run }}
category: ${{ inputs.category }}

evaluate-with-mini-agent:
runs-on: [self-hosted, 1ES.Pool=GitHub-BCBench]
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/summarize-results.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ jobs:
merge-multiple: true

- name: Summarize evaluation results
run: uv run bcbench result summarize --result-dir "${{ inputs.results-dir }}" --bceval-output "${{ env.BCEVAL_RESULT_FILE }}" --summary-output "${{ env.SUMMARY_OUTPUT_FILE }}"
run: uv run bcbench result summarize --category "${{ inputs.category }}" --result-dir "${{ inputs.results-dir }}" --bceval-output "${{ env.BCEVAL_RESULT_FILE }}" --summary-output "${{ env.SUMMARY_OUTPUT_FILE }}"

- name: Upload evaluation summary to artifacts
uses: actions/upload-artifact@v6
Expand Down
4 changes: 2 additions & 2 deletions src/bcbench/agent/claude/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from bcbench.agent.claude.metrics import parse_metrics
from bcbench.agent.shared import build_mcp_config, build_prompt
from bcbench.config import get_config
from bcbench.dataset import DatasetEntry
from bcbench.dataset import BaseDatasetEntry
from bcbench.exceptions import AgentError, AgentTimeoutError
from bcbench.logger import get_logger
from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_instructions_from_config
Expand All @@ -19,7 +19,7 @@


def run_claude_code(
entry: DatasetEntry, model: str, category: EvaluationCategory, repo_path: Path, output_dir: Path, al_mcp: bool = False, container_name: str = "bcbench"
entry: BaseDatasetEntry, model: str, category: EvaluationCategory, repo_path: Path, output_dir: Path, al_mcp: bool = False, container_name: str = "bcbench"
) -> tuple[AgentMetrics | None, ExperimentConfiguration]:
"""Run Claude Code on a single dataset entry.

Expand Down
4 changes: 2 additions & 2 deletions src/bcbench/agent/copilot/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from bcbench.agent.copilot.metrics import parse_metrics
from bcbench.agent.shared import build_mcp_config, build_prompt
from bcbench.config import get_config
from bcbench.dataset import DatasetEntry
from bcbench.dataset import BaseDatasetEntry
from bcbench.exceptions import AgentError, AgentTimeoutError
from bcbench.logger import get_logger
from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_instructions_from_config
Expand All @@ -21,7 +21,7 @@


def run_copilot_agent(
entry: DatasetEntry, model: str, category: EvaluationCategory, repo_path: Path, output_dir: Path, al_mcp: bool = False, container_name: str = "bcbench"
entry: BaseDatasetEntry, model: str, category: EvaluationCategory, repo_path: Path, output_dir: Path, al_mcp: bool = False, container_name: str = "bcbench"
) -> tuple[AgentMetrics | None, ExperimentConfiguration]:
"""Run GitHub Copilot CLI agent on a single dataset entry.

Expand Down
4 changes: 2 additions & 2 deletions src/bcbench/agent/mini/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import yaml

from bcbench.config import get_config
from bcbench.dataset import DatasetEntry
from bcbench.dataset import BaseDatasetEntry
from bcbench.exceptions import ConfigurationError
from bcbench.logger import get_logger
from bcbench.types import AgentMetrics, EvaluationCategory, ExperimentConfiguration
Expand Down Expand Up @@ -52,7 +52,7 @@ def parse_action(self, response: dict) -> dict:


def run_mini_agent(
entry: DatasetEntry,
entry: BaseDatasetEntry,
repo_path: Path,
model: str,
category: EvaluationCategory,
Expand Down
4 changes: 2 additions & 2 deletions src/bcbench/agent/shared/mcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from jinja2 import Template
from packaging.version import Version

from bcbench.dataset import DatasetEntry
from bcbench.dataset import BaseDatasetEntry
from bcbench.exceptions import AgentError
from bcbench.logger import get_logger

Expand Down Expand Up @@ -144,7 +144,7 @@ def _build_server_entry(server: dict[str, Any], template_context: dict[str, Any]
raise AgentError(f"Unsupported MCP server type: {server_type}")


def build_mcp_config(config: dict[str, Any], entry: DatasetEntry, repo_path: Path, al_mcp: bool = False, container_name: str = "bcbench") -> tuple[str | None, list[str] | None]:
def build_mcp_config(config: dict[str, Any], entry: BaseDatasetEntry, repo_path: Path, al_mcp: bool = False, container_name: str = "bcbench") -> tuple[str | None, list[str] | None]:
mcp_servers: list[dict[str, Any]] = config.get("mcp", {}).get("servers", [])

if not al_mcp:
Expand Down
4 changes: 2 additions & 2 deletions src/bcbench/agent/shared/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@

from jinja2 import Template

from bcbench.dataset import DatasetEntry
from bcbench.dataset import BaseDatasetEntry
from bcbench.types import EvaluationCategory


def build_prompt(entry: DatasetEntry, repo_path: Path, config: dict, category: EvaluationCategory, al_mcp: bool = False) -> str:
def build_prompt(entry: BaseDatasetEntry, repo_path: Path, config: dict, category: EvaluationCategory, al_mcp: bool = False) -> str:
prompt_config = config.get("prompt", {})
template_str = prompt_config.get(f"{category.value}-template")
include_project_paths = prompt_config.get("include_project_paths")
Expand Down
4 changes: 0 additions & 4 deletions src/bcbench/cli_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,8 @@

# Type aliases for cleaner command signatures
# Note: Defaults are provided in function signatures, not here
DatasetPath = Annotated[Path, typer.Option(help="Path to dataset file")]

RepoPath = Annotated[Path, typer.Option(help="Path to repository")]

SchemaPath = Annotated[Path, typer.Option(help="Path to schema file")]

OutputDir = Annotated[Path, typer.Option(help="Directory to save evaluation results")]

RunId = Annotated[str, typer.Option(envvar="GITHUB_RUN_ID", help="Unique identifier for this evaluation run")]
Expand Down
8 changes: 4 additions & 4 deletions src/bcbench/collection/build_entry.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Builder functions for creating DatasetEntry from ADO sources."""
"""Builder functions for creating BugFixEntry from ADO sources."""

from pathlib import Path
from typing import Any
Expand All @@ -7,7 +7,7 @@
from bcbench.collection.patch_utils import extract_file_paths_from_patch, extract_patches, find_project_paths_from_diff
from bcbench.collection.version_resolver import determine_environment_setup_version
from bcbench.config import get_config
from bcbench.dataset import DatasetEntry
from bcbench.dataset import BugFixEntry
from bcbench.operations.git_operations import checkout_commit
from bcbench.operations.test_operations import extract_tests_from_patch

Expand Down Expand Up @@ -51,7 +51,7 @@ def build_dataset_entry_from_ado(
base_commit: str,
commit: str,
diff_path: list[str] | None = None,
) -> DatasetEntry:
) -> BugFixEntry:
created_at = extract_creation_date(pr_data)
patch, patch_fix, patch_test = extract_patches(repo_path, base_commit, commit, diff_path=diff_path)
problem_statement, hints = extract_problem_statement(work_item_data)
Expand All @@ -73,7 +73,7 @@ def build_dataset_entry_from_ado(
hints=hints,
)

return DatasetEntry(
return BugFixEntry(
instance_id=instance_id,
base_commit=base_commit,
created_at=created_at,
Expand Down
4 changes: 2 additions & 2 deletions src/bcbench/collection/collect_gh.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from bcbench.collection.gh_client import GHClient
from bcbench.collection.patch_utils import extract_file_paths_from_patch, find_project_paths_from_diff, separate_patches
from bcbench.config import get_config
from bcbench.dataset import DatasetEntry
from bcbench.dataset import BugFixEntry
from bcbench.exceptions import CollectionError
from bcbench.logger import get_logger
from bcbench.operations.test_operations import extract_tests_from_patch
Expand Down Expand Up @@ -58,7 +58,7 @@ def collect_gh_entry(pr_number: int, output: Path, repo: str = "microsoft/BCApps

save_problem_statement(instance_id=instance_id, problem_statement=problem_statement)

entry = DatasetEntry(
entry = BugFixEntry(
repo=repo,
instance_id=instance_id,
base_commit=base_commit,
Expand Down
4 changes: 2 additions & 2 deletions src/bcbench/collection/collect_nav.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from bcbench.collection.ado_client import ADOClient
from bcbench.collection.build_entry import build_dataset_entry_from_ado
from bcbench.config import get_config
from bcbench.dataset import DatasetEntry
from bcbench.dataset import BugFixEntry
from bcbench.exceptions import CollectionError
from bcbench.logger import get_logger

Expand Down Expand Up @@ -35,7 +35,7 @@ def collect_nav_entry(
if len(parents) != 1:
raise CollectionError("Commit has multiple parents, cannot determine base commit.")

entry: DatasetEntry = build_dataset_entry_from_ado(
entry: BugFixEntry = build_dataset_entry_from_ado(
pr_number=pr_number,
repo_path=repo_path,
pr_data=pr_data,
Expand Down
8 changes: 5 additions & 3 deletions src/bcbench/commands/collect.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
"""CLI commands for collecting dataset entries."""

from pathlib import Path

import typer
from typing_extensions import Annotated

from bcbench.cli_options import DatasetPath, RepoPath
from bcbench.cli_options import RepoPath
from bcbench.collection import collect_gh_entry, collect_nav_entry
from bcbench.config import get_config

Expand All @@ -15,7 +17,7 @@
@collect_app.command("nav")
def collect_nav(
pr_number: Annotated[int, typer.Argument(help="Pull request number to collect")],
output: DatasetPath = _config.paths.dataset_path,
output: Annotated[Path, typer.Option(help="Path to output dataset file")] = _config.paths.dataset_dir / "bcbench.jsonl",
repo_path: RepoPath = _config.paths.testbed_path,
diff_path: Annotated[
list[str] | None,
Expand All @@ -35,7 +37,7 @@ def collect_nav(
@collect_app.command("gh")
def collect_gh(
pr_number: Annotated[int, typer.Argument(help="Pull request number to collect")],
output: DatasetPath = _config.paths.dataset_path,
output: Annotated[Path, typer.Option(help="Path to output dataset file")] = _config.paths.dataset_dir / "bcbench.jsonl",
repo: Annotated[str, typer.Option(help="GitHub repository in OWNER/REPO format")] = "microsoft/BCApps",
):
"""
Expand Down
Loading
Loading