microsoft · Jiawen-CS · Mar 18, 2026 · Mar 18, 2026 · Mar 18, 2026 · Mar 18, 2026
diff --git a/.github/actions/setup-bc-container/action.yml b/.github/actions/setup-bc-container/action.yml
@@ -32,7 +32,16 @@ runs:
         # Mask the password in GitHub Actions logs
         Write-Output "::add-mask::$password"
 
-        "BC_CONTAINER_NAME=bcbench-$("${{ inputs.instance-id }}".Split('-')[1])" | Out-File -FilePath $env:GITHUB_ENV -Append
+        # Derive the container suffix from the numeric ticket ID in instance-id. The first "<letters>-<digits>"
+        # match wins, so a trailing __cf-N counterfactual suffix is ignored; otherwise use the legacy '-' split.
+        # e.g. "microsoftInternal__NAV-210528__cf-1" -> "210528", "microsoft__BCApps-4699" -> "4699"
+        $instanceId = "${{ inputs.instance-id }}"
+        $ticketNumber = if ($instanceId -match '[A-Za-z]+-(\d+)') { $Matches[1] } else { $instanceId.Split('-')[1] }
+        if ([string]::IsNullOrWhiteSpace($ticketNumber)) {  # fail fast instead of exporting a bare "bcbench-" name
+            throw "Could not derive a ticket number from instance-id '$instanceId'"
+        }
+
+        "BC_CONTAINER_NAME=bcbench-$ticketNumber" | Out-File -FilePath $env:GITHUB_ENV -Append
         "BC_CONTAINER_USERNAME=admin" | Out-File -FilePath $env:GITHUB_ENV -Append
         "BC_CONTAINER_PASSWORD=$password" | Out-File -FilePath $env:GITHUB_ENV -Append
       shell: pwsh

diff --git a/.github/prompts/create-counterfactual.prompt.md b/.github/prompts/create-counterfactual.prompt.md
@@ -0,0 +1,85 @@
+---
+description: "Create counterfactual (CF) dataset entries for BC-Bench. Provide the base instance_id and describe the code changes for each variant."
+---
+
+# Create Counterfactual Dataset Entries
+
+You are helping create counterfactual (CF) entries for the BC-Bench benchmark dataset.
+
+## Context
+
+Read these files first to understand the workflow:
+- `COUNTERFACTUAL.md` — authoring guide
+- `dataset/bcbench.jsonl` — find the base entry by instance_id
+- `dataset/counterfactual.jsonl` — existing CF entries (match format/key ordering)
+
+## Input Required from User
+
+The user will provide:
+1. **Base instance_id** — e.g. `microsoftInternal__NAV-224009`
+2. **CF variants** — for each variant:
+   - What code changes to make in `test/after/` (test modifications)
+   - What code changes to make in `fix/after/` (fix modifications, often unchanged)
+   - A short variant description
+   - The failure layer (`L1-syntax-representation`, `L2-execution-validation`, `L3-event-driven-paradigm`, `L4-workflow-business-logic`, `L5-toolchain-ecosystem`), if already known — it is classified post-hoc, so it is not required at creation time
+3. **Problem statement** — either a pre-written README path or content to generate
+
+## Workflow (per variant)
+
+### Step 1: Analyze the base entry
+```bash
+python -c "import json; [print(json.dumps(json.loads(l), indent=2)) for l in open('dataset/bcbench.jsonl') if '<BASE_ID>' in l]"
+```
+- Understand the patch (fix) and test_patch (test) diffs
+- Read the base problem statement from `dataset/problemstatement/<instance_id>/README.md`
+
+### Step 2: Extract workspace
+```bash
+uv run bcbench dataset cf-extract <base_instance_id> -o cf-<short-name>
+```
+- Patch-only mode creates padded files — use `Get-Content ... | Where-Object { $_.Trim() }` to view only the non-blank lines
+
+### Step 3: Edit the after/ files
+- Apply the user's described code changes to `test/after/` and/or `fix/after/`
+- If the fix needs to be **reversed** (e.g. CF removes a filter instead of adding one), swap fix/before and fix/after contents:
+  ```powershell
+  $before = Get-Content "fix\before\<path>" -Raw
+  $after = Get-Content "fix\after\<path>" -Raw
+  Set-Content "fix\before\<path>" -Value $after -NoNewline
+  Set-Content "fix\after\<path>" -Value $before -NoNewline
+  ```
+- Verify edits with `Get-Content ... | Where-Object { $_.Trim() }`
+
+### Step 4: Create the CF entry
+```bash
+uv run bcbench dataset cf-create ./cf-<short-name> \
+  -d "<variant description>"
+```
+
+**This command automatically handles:**
+- Patch regeneration from before/after files
+- `FAIL_TO_PASS` auto-detection from [Test] procedures in test patch
+- `PASS_TO_PASS` auto-population from the base entry
+- Canonical key ordering in counterfactual.jsonl
+- Problem statement directory scaffolding (copies base README **and all image/asset files** as template)
+
+### Step 5: Edit problem statement README
+- If user provided a pre-written README, copy it to the scaffolded directory at `dataset/problemstatement/<cf_instance_id>/README.md`
+- Otherwise, edit the scaffolded README to describe the variant
+- **Images & assets are copied automatically** by `cf-create`. Verify with `Get-ChildItem dataset/problemstatement/<cf_instance_id>/` that all referenced images are present.
+
+### Step 6: Verify
+```bash
+uv run pytest tests/test_dataset_integrity.py tests/test_counterfactual.py -q
+```
+Confirm all tests pass. Then briefly show the created entry's key fields.
+
+## Key Rules
+- Fix patch is usually **unchanged** from base (same bug fix, different test scenario)
+- If the CF requires a **different** fix, the fix/after file should contain the CF's gold fix code
+- Test patch is the primary thing that changes between variants
+- **No manual key reordering needed** — cf-create handles this automatically
+- **No manual PASS_TO_PASS needed** — cf-create copies from base entry automatically
+- Problem statement directory naming: `<base_id>__cf-N` (double underscore + hyphen), e.g. `microsoftInternal__NAV-224009__cf-1`
+
+{{{ input }}}
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -51,6 +51,7 @@ jobs:
     with:
       test-run: true
       category: ${{ needs.select-category.outputs.category }}
+      include-counterfactual: false
 
   mock-evaluation:
     runs-on: ubuntu-latest

diff --git a/.github/workflows/claude-evaluation.yml b/.github/workflows/claude-evaluation.yml
@@ -23,6 +23,7 @@ on:
         options:
           - "bug-fix"
           - "test-generation"
+          - "counterfactual-evaluation"
       test-run:
         description: "Indicate this is a test run (with few entries)"
         required: false
@@ -33,6 +34,24 @@ on:
         required: false
         default: false
         type: boolean
+      batch:
+        description: "Batch index (1-based) to run; only applies when batch-count is not 0"
+        required: false
+        default: "0"
+        type: choice  # choice values are strings; the get-entries call in this workflow converts them with fromJSON
+        options:
+          - "0"
+          - "1"
+          - "2"
+          - "3"
+      batch-count:
+        description: "Total number of batches to split into (0=no splitting)"
+        required: false
+        default: "0"
+        type: choice
+        options:
+          - "0"
+          - "3"
       repeat:
         description: "Number of times to run sequentially (ignored for test runs)"
         required: false
@@ -58,6 +77,9 @@ jobs:
     with:
       test-run: ${{ inputs.test-run }}
       category: ${{ inputs.category }}
+      include-counterfactual: false
+      batch: ${{ inputs.batch && fromJSON(inputs.batch) || 0 }}
+      batch-count: ${{ inputs.batch-count && fromJSON(inputs.batch-count) || 0 }}
 
   evaluate-with-claude-code:
     runs-on: [GitHub-BCBench]
@@ -154,4 +176,4 @@ jobs:
       workflow-file: claude-evaluation.yml
       repeat: ${{ inputs.repeat }}
       workflow-inputs: |
-        {"model": "${{ inputs.model }}", "category": "${{ inputs.category }}", "test-run": "${{ inputs.test-run }}", "al-mcp": "${{ inputs.al-mcp }}"}
+        {"model": "${{ inputs.model }}", "category": "${{ inputs.category }}", "test-run": "${{ inputs.test-run }}", "al-mcp": "${{ inputs.al-mcp }}", "batch": "${{ inputs.batch }}", "batch-count": "${{ inputs.batch-count }}"}
diff --git a/.github/workflows/copilot-evaluation.yml b/.github/workflows/copilot-evaluation.yml
@@ -31,6 +31,7 @@ on:
         options:
           - "bug-fix"
           - "test-generation"
+          - "counterfactual-evaluation"
       test-run:
         description: "Indicate this is a test run (with few entries)"
         required: false
@@ -41,6 +42,24 @@ on:
         required: false
         default: false
         type: boolean
+      batch:
+        description: "Batch index (1-based) to run; only applies when batch-count is not 0"
+        required: false
+        default: "0"
+        type: choice  # choice values are strings; the get-entries call in this workflow converts them with fromJSON
+        options:
+          - "0"
+          - "1"
+          - "2"
+          - "3"
+      batch-count:
+        description: "Total number of batches to split into (0=no splitting)"
+        required: false
+        default: "0"
+        type: choice
+        options:
+          - "0"
+          - "3"
       repeat:
         description: "Number of times to run sequentially (ignored for test runs)"
         required: false
@@ -66,6 +85,9 @@ jobs:
     with:
       test-run: ${{ inputs.test-run }}
       category: ${{ inputs.category }}
+      include-counterfactual: false
+      batch: ${{ inputs.batch && fromJSON(inputs.batch) || 0 }}
+      batch-count: ${{ inputs.batch-count && fromJSON(inputs.batch-count) || 0 }}
 
   evaluate-with-copilot-cli:
     runs-on: [GitHub-BCBench]
@@ -168,4 +190,4 @@ jobs:
       workflow-file: copilot-evaluation.yml
       repeat: ${{ inputs.repeat }}
       workflow-inputs: |
-        {"model": "${{ inputs.model }}", "category": "${{ inputs.category }}", "test-run": "${{ inputs.test-run }}", "al-mcp": "${{ inputs.al-mcp }}"}
+        {"model": "${{ inputs.model }}", "category": "${{ inputs.category }}", "test-run": "${{ inputs.test-run }}", "al-mcp": "${{ inputs.al-mcp }}", "batch": "${{ inputs.batch }}", "batch-count": "${{ inputs.batch-count }}"}
diff --git a/.github/workflows/dataset-validation.yml b/.github/workflows/dataset-validation.yml
@@ -3,23 +3,35 @@ permissions:
   contents: read
 
 on:
+  push:
+    branches:
+      - master_thesis
+    paths:
+      - "dataset/**"
   workflow_dispatch:
     inputs:
       test-run:
         description: "Indicate this is a test run (with few entries)"
         required: false
         default: true
         type: boolean
+      base-ref:
+        description: "Git ref to diff against for modified-only (e.g., HEAD~1)"
+        required: false
+        default: "origin/main"
+        type: string
   schedule:
     - cron: "0 0 * * 0"
 
 jobs:
   get-entries:
     uses: ./.github/workflows/get-entries.yml
     with:
-      modified-only: false
+      modified-only: ${{ github.event_name == 'push' }}  # diff-based selection only for push-triggered runs
+      base-ref: ${{ inputs.base-ref || 'HEAD~1' }}  # NOTE(review): get-entries forwards base-ref only together with modified-only, so the workflow_dispatch base-ref input looks unused — confirm
       test-run: ${{ inputs.test-run || false }}
       category: "bug-fix"
+      include-counterfactual: false  # makes get-entries pass --no-include-counterfactual; CF entries are listed by get-cf-entries
 
   verify-build-and-tests:
     runs-on: [GitHub-BCBench]
@@ -54,3 +66,45 @@ jobs:
         timeout-minutes: 60
         run: .\scripts\Verify-BuildAndTests.ps1 -InstanceId "${{ matrix.entry }}" -RepoPath "${{ steps.setup-env.outputs.repo_path }}"
         shell: pwsh
+
+  get-cf-entries:  # lists entries of the counterfactual-evaluation category for verify-counterfactual-entries
+    uses: ./.github/workflows/get-entries.yml
+    with:
+      modified-only: ${{ github.event_name == 'push' }}  # true only for push-triggered runs
+      base-ref: ${{ inputs.base-ref || 'HEAD~1' }}  # falls back to HEAD~1 when no base-ref input is set
+      test-run: ${{ inputs.test-run || false }}
+      category: "counterfactual-evaluation"
+
+  verify-counterfactual-entries:  # one matrix job per entry: set up the BC container, then run Verify-BuildAndTests.ps1
+    runs-on: [GitHub-BCBench]
+    needs: get-cf-entries
+    if: needs.get-cf-entries.outputs.entries != '[]'  # skip the matrix when the entry list is an empty JSON array
+    environment:
+      name: ado-read
+      deployment: false
+    permissions:
+      contents: read
+      id-token: write  # NOTE(review): presumably for OIDC login with the azure-client-id/tenant-id passed below — confirm
+    name: cf-${{ matrix.entry }}
+    strategy:
+      fail-fast: false  # keep verifying the remaining entries when one of them fails
+      matrix:
+        entry: ${{ fromJson(needs.get-cf-entries.outputs.entries) }}  # entries output is a JSON array of instance ids
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v5
+
+      - name: Setup BC container
+        id: setup-env  # referenced below for its repo_path output
+        timeout-minutes: 40
+        uses: ./.github/actions/setup-bc-container
+        with:
+          instance-id: ${{ matrix.entry }}
+          azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
+          azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Run build and test verification for ${{ matrix.entry }}
+        timeout-minutes: 60
+        run: .\scripts\Verify-BuildAndTests.ps1 -InstanceId "${{ matrix.entry }}" -RepoPath "${{ steps.setup-env.outputs.repo_path }}"
+        shell: pwsh
diff --git a/.github/workflows/get-entries.yml b/.github/workflows/get-entries.yml
@@ -10,6 +10,11 @@ on:
         required: false
         type: boolean
         default: false
+      base-ref:
+        description: Git ref to diff against when using modified-only (e.g., HEAD~1, a commit SHA)
+        required: false
+        type: string
+        default: origin/main
       test-run:
         description: Indicate this is a test run (with 2 entries)
         required: false
@@ -20,6 +25,21 @@ on:
         required: true
         type: string
         default: "bug-fix"
+      include-counterfactual:
+        description: Include counterfactual entries from counterfactual.jsonl
+        required: false
+        type: boolean
+        default: true
+      batch:
+        description: "Batch index (1-based) for splitting large datasets across runs"
+        required: false
+        type: number
+        default: 0
+      batch-count:
+        description: "Total number of batches (0 = no splitting)"
+        required: false
+        type: number
+        default: 0
     outputs:
       entries:
         description: JSON array of dataset entries
@@ -45,9 +65,19 @@ jobs:
           cmd="uv run bcbench dataset list --category ${{ inputs.category }} --github-output entries"
 
           if [[ "${{ inputs.modified-only }}" == "true" ]]; then
-            cmd="$cmd --modified-only"
+            cmd="$cmd --modified-only --base-ref '${{ inputs.base-ref }}'"
           elif [[ "${{ inputs.test-run }}" == "true" ]]; then
             cmd="$cmd --test-run"
           fi
 
+          if [[ "${{ inputs.include-counterfactual }}" == "false" ]]; then
+            cmd="$cmd --no-include-counterfactual"
+          fi
+          # Batch splitting is driven by batch-count alone; batch is forwarded only alongside it.
+          if [[ "${{ inputs.batch-count }}" != "0" ]]; then
+            cmd="$cmd --batch ${{ inputs.batch }} --batch-count ${{ inputs.batch-count }}"
+          fi
+          # NOTE(review): workflow inputs are interpolated into $cmd and then eval'd; pass them via env: and quote them to rule out shell injection.
+          echo "Running: $cmd"
           eval "$cmd"
+          echo "entries output: $(cat "$GITHUB_OUTPUT")"
diff --git a/.github/workflows/mini-evaluation.yml b/.github/workflows/mini-evaluation.yml
@@ -39,6 +39,7 @@ jobs:
     with:
       test-run: ${{ inputs.test-run }}
       category: ${{ inputs.category }}
+      include-counterfactual: false
 
   evaluate-with-mini-agent:
     runs-on: [GitHub-BCBench]