diff --git a/scripts/Get-WorkflowSummary.ps1 b/scripts/Get-WorkflowSummary.ps1
new file mode 100644
index 000000000..1bdd40466
--- /dev/null
+++ b/scripts/Get-WorkflowSummary.ps1
@@ -0,0 +1,562 @@
+using module .\BCBenchUtils.psm1
+
+<#
+    .SYNOPSIS
+    Gets evaluation summary from GitHub Actions workflow runs and downloads JSONL files (even if embedded in zip files).
+
+    .DESCRIPTION
+    Fetches workflow run summaries from the copilot-evaluation.yml workflow and extracts
+    failure information including instance IDs, projects, and error messages.
+
+    Additionally:
+    - Downloads run artifacts
+    - Locates .jsonl files (either directly in artifacts OR inside .zip files)
+    - Optionally copies discovered JSONL files into a stable output folder
+
+    .PARAMETER RunId
+    Optional specific run ID to fetch. If not provided, fetches the most recent run(s).
+
+    .PARAMETER Last
+    Number of recent runs to fetch (default: 1). Ignored if RunId is specified.
+
+    .PARAMETER Branch
+    Filter runs by branch name.
+
+    .PARAMETER Status
+    Filter runs by status (completed, in_progress, queued, etc.).
+
+    .PARAMETER Repository
+    GitHub repo in OWNER/REPO format (default: microsoft/BC-Bench).
+
+    .PARAMETER Workflow
+    Workflow file name (default: copilot-evaluation.yml).
+
+    .PARAMETER DownloadJsonl
+    If true (default), downloads artifacts and searches for jsonl (including inside zip).
+
+    .PARAMETER JsonlOutputRoot
+    If provided, copies all found jsonl files into subfolders per runId for easy access.
+
+    .PARAMETER KeepArtifacts
+    If set, does not delete temp artifact download folders (useful for debugging).
+
+    .PARAMETER Category
+    If provided, only reports runs whose parsed summary category matches this value; non-matching runs are skipped.
+#> + +param( + [Parameter(Mandatory = $false)] + [string]$RunId, + + [Parameter(Mandatory = $false)] + [int]$Last = 1, + + [Parameter(Mandatory = $false)] + [string]$Branch, + + [Parameter(Mandatory = $false)] + [ValidateSet("completed", "in_progress", "queued", "waiting", "requested", "pending")] + [string]$Status = "completed", + + [Parameter(Mandatory = $false)] + [string]$Repository = "microsoft/BC-Bench", + + [Parameter(Mandatory = $false)] + [string]$Workflow = "copilot-evaluation.yml", + + [Parameter(Mandatory = $false)] + [bool]$DownloadJsonl = $true, + + [Parameter(Mandatory = $false)] + [string]$JsonlOutputRoot, + + [Parameter(Mandatory = $false)] + [switch]$KeepArtifacts, + + [Parameter(Mandatory = $false)] + [string]$Category +) + +function Get-WorkflowRuns { + param( + [string]$Repo, + [string]$WorkflowFile, + [int]$Limit, + [string]$BranchFilter, + [string]$StatusFilter + ) + + $args = @( + "run", "list", + "--repo", $Repo, + "--workflow", $WorkflowFile, + "--limit", $Limit, + "--json", "databaseId,displayTitle,conclusion,status,createdAt,headBranch,url" + ) + + if ($BranchFilter) { $args += @("--branch", $BranchFilter) } + if ($StatusFilter) { $args += @("--status", $StatusFilter) } + + $result = gh @args 2>&1 + if ($LASTEXITCODE -ne 0) { + throw "Failed to fetch workflow runs: $result" + } + + return $result | ConvertFrom-Json +} + +function Get-RunDetails { + param( + [string]$Repo, + [string]$RunId + ) + + $json = gh run view $RunId --repo $Repo --json "databaseId,displayTitle,conclusion,status,createdAt,headBranch,url" 2>&1 + if ($LASTEXITCODE -ne 0) { + throw "Failed to fetch run details for run $RunId`: $json" + } + + return $json | ConvertFrom-Json +} + +function Get-JobSummary { + param( + [string]$Repo, + [string]$RunId + ) + + $jobs = gh run view $RunId --repo $Repo --json jobs 2>&1 + if ($LASTEXITCODE -ne 0) { + throw "Failed to fetch jobs for run $RunId`: $jobs" + } + + return ($jobs | ConvertFrom-Json).jobs +} + +function 
Get-SummarizeJobOutput { + <# + .SYNOPSIS + Gets the output from an artifact containing the evaluation summary markdown. + #> + param( + [string]$Repo, + [string]$RunId + ) + + $tempDir = Join-Path $env:TEMP "gh-run-$RunId-summary" + + try { + gh run download $RunId --repo $Repo --dir $tempDir --pattern "evaluation-summary*" 2>&1 | Out-Null + + if (Test-Path $tempDir) { + $summaryFiles = Get-ChildItem -Path $tempDir -Filter "*.md" -Recurse -ErrorAction SilentlyContinue + if ($summaryFiles) { + return Get-Content $summaryFiles[0].FullName -Raw + } + } + } + catch { + Write-Log "Could not download summary artifact: $_" -Level Warning + } + finally { + if (Test-Path $tempDir) { + Remove-Item $tempDir -Recurse -Force -ErrorAction SilentlyContinue + } + } + + return $null +} + +function Get-JobStepSummary { + <# + .SYNOPSIS + Fallback: fetches run logs (may be large). Useful if summary artifact isn't present. + #> + param( + [string]$Repo, + [string]$RunId + ) + + try { + $logs = gh run view $RunId --repo $Repo --log 2>&1 + if ($LASTEXITCODE -eq 0 -and $logs) { return $logs } + } + catch { + Write-Log "Error fetching job logs: $_" -Level Warning + } + + return $null +} + +function Parse-EvaluationSummary { + <# + .SYNOPSIS + Parses the evaluation summary markdown/log text to extract failure information. 
+    #>
+    param(
+        [string]$SummaryText
+    )
+
+    # Aggregate result object. RawSummary keeps the unparsed text so callers can
+    # re-inspect anything this parser did not extract.
+    $result = [PSCustomObject]@{
+        TotalEntries          = 0
+        Model                 = ""
+        Category              = ""
+        SuccessfulEvaluations = 0
+        FailedEvaluations     = 0
+        FailedInstances       = @()
+        ToolUsage             = @{}
+        RawSummary            = $SummaryText
+    }
+
+    # Nothing to parse: return the empty skeleton rather than $null so callers
+    # can rely on the shape of the object.
+    if ([string]::IsNullOrWhiteSpace($SummaryText)) {
+        return $result
+    }
+
+    # Header line, e.g. "Total entries processed: 12, using gpt-4o"
+    if ($SummaryText -match "Total entries processed:\s*(\d+),\s*using\s+(.+)") {
+        $result.TotalEntries = [int]$Matches[1]
+        $result.Model = $Matches[2].Trim()
+    }
+
+    if ($SummaryText -match "Category:\s*(.+)") {
+        $result.Category = $Matches[1].Trim()
+    }
+
+    if ($SummaryText -match "Successful evaluations:\s*(\d+)") {
+        $result.SuccessfulEvaluations = [int]$Matches[1]
+    }
+
+    if ($SummaryText -match "Failed evaluations:\s*(\d+)") {
+        $result.FailedEvaluations = [int]$Matches[1]
+    }
+
+    # Markdown table format:
+    #   Instance ID | Project | Status | Error Message
+    # NOTE: use dedicated variables here instead of assigning to $matches —
+    # $Matches is an automatic variable populated by the -match operators above,
+    # and clobbering it is flagged by PSScriptAnalyzer
+    # (PSAvoidAssignmentToAutomaticVariable) and invites subtle bugs.
+    $tablePattern = "(?m)^([a-zA-Z0-9_]+__[a-zA-Z0-9-]+)\s*\|\s*([^\|]+)\s*\|\s*❌\s*Failed\s*\|\s*(.+)$"
+    $tableMatches = [regex]::Matches($SummaryText, $tablePattern)
+
+    foreach ($tableMatch in $tableMatches) {
+        $result.FailedInstances += [PSCustomObject]@{
+            InstanceId   = $tableMatch.Groups[1].Value.Trim()
+            Project      = $tableMatch.Groups[2].Value.Trim()
+            ErrorMessage = $tableMatch.Groups[3].Value.Trim()
+        }
+    }
+
+    # Alternative whitespace-separated format (fallback when no table rows matched)
+    if ($result.FailedInstances.Count -eq 0) {
+        $altPattern = "(?m)^([^\s\t]+)\s+([^\s\t]+)\s+❌\s*Failed\s+(.+)$"
+        $altMatches = [regex]::Matches($SummaryText, $altPattern)
+
+        foreach ($altMatch in $altMatches) {
+            $result.FailedInstances += [PSCustomObject]@{
+                InstanceId   = $altMatch.Groups[1].Value.Trim()
+                Project      = $altMatch.Groups[2].Value.Trim()
+                ErrorMessage = $altMatch.Groups[3].Value.Trim()
+            }
+        }
+    }
+
+    # Tool usage lines like: toolName: 1.0
+    $toolPattern = "(?m)^(\w+):\s*([\d.]+)$"
+    $toolMatches = [regex]::Matches($SummaryText, $toolPattern)
+
+    foreach ($match in $toolMatches) {
+        $toolName = $match.Groups[1].Value
+        $toolCount = [double]$match.Groups[2].Value
+
+ if ($toolName -notin @("Category", "Total", "Successful", "Failed", "MCP")) { + $result.ToolUsage[$toolName] = $toolCount + } + } + + return $result +} + +function Download-RunArtifacts { + <# + .SYNOPSIS + Downloads all artifacts for a run into a destination folder using gh run download. + #> + param( + [string]$Repo, + [string]$RunId, + [string]$Destination + ) + + New-Item -ItemType Directory -Force -Path $Destination | Out-Null + + $result = gh run download $RunId --repo $Repo --dir $Destination 2>&1 + if ($LASTEXITCODE -ne 0) { + throw "Failed to download artifacts for run $RunId`: $result" + } +} + +function Expand-ZipsRecursively { + <# + .SYNOPSIS + Expands all zip files found under Root into sibling folders (zipname_extracted), recursively. + #> + param( + [string]$Root + ) + + $expandedAny = $false + + while ($true) { + $zips = Get-ChildItem -Path $Root -Recurse -Filter *.zip -ErrorAction SilentlyContinue + if (-not $zips -or $zips.Count -eq 0) { break } + + $didExpand = $false + foreach ($zip in $zips) { + $dest = Join-Path $zip.Directory.FullName ($zip.BaseName + "_extracted") + if (Test-Path $dest) { continue } + + try { + Expand-Archive -Path $zip.FullName -DestinationPath $dest -Force + $didExpand = $true + $expandedAny = $true + } + catch { + Write-Log "Failed to expand zip $($zip.FullName): $_" -Level Warning + } + } + + if (-not $didExpand) { break } + } + + return $expandedAny +} + +function Get-JsonlFilesFromDownloadedArtifacts { + <# + .SYNOPSIS + Finds .jsonl files in a downloaded artifact folder. Also expands any .zip files and searches again. 
+ #> + param( + [string]$ArtifactsRoot + ) + + # First pass: maybe jsonl is directly present + $jsonl = Get-ChildItem -Path $ArtifactsRoot -Recurse -Filter *.jsonl -ErrorAction SilentlyContinue + + # If none, expand zips and search again + if (-not $jsonl -or $jsonl.Count -eq 0) { + $expanded = Expand-ZipsRecursively -Root $ArtifactsRoot + if ($expanded) { + $jsonl = Get-ChildItem -Path $ArtifactsRoot -Recurse -Filter *.jsonl -ErrorAction SilentlyContinue + } + } + + return $jsonl +} + +function Copy-JsonlToOutputRoot { + param( + [System.IO.FileInfo[]]$JsonlFiles, + [string]$OutputRoot, + [string]$RunId + ) + + if (-not $OutputRoot) { return $null } + + $target = Join-Path $OutputRoot ("run-" + $RunId) + New-Item -ItemType Directory -Force -Path $target | Out-Null + + $copied = @() + foreach ($f in $JsonlFiles) { + $name = $f.Name + $dest = Join-Path $target $name + + # Avoid name collisions + $i = 1 + while (Test-Path $dest) { + $dest = Join-Path $target ("{0}_{1}{2}" -f [IO.Path]::GetFileNameWithoutExtension($name), $i, [IO.Path]::GetExtension($name)) + $i++ + } + + Copy-Item -Path $f.FullName -Destination $dest -Force + $copied += Get-Item $dest + } + + return $copied +} + +# ----------------------- +# Main execution +# ----------------------- +Write-Log "Fetching workflow runs from $Repository..." 
-Level Info + +try { + if ($RunId) { + # Fetch full run details so url/branch/createdAt are not null + $runDetails = Get-RunDetails -Repo $Repository -RunId $RunId + $runs = @($runDetails) + } + else { + $runs = Get-WorkflowRuns -Repo $Repository -WorkflowFile $Workflow -Limit $Last -BranchFilter $Branch -StatusFilter $Status + } + + if (-not $runs -or $runs.Count -eq 0) { + Write-Log "No workflow runs found matching criteria" -Level Warning + exit 0 + } + + Write-Log "Found $($runs.Count) run(s) to process" -Level Success + + $allResults = @() + + foreach ($run in $runs) { + $currentRunId = $run.databaseId + + if ($run.conclusion -in @("cancelled", "skipped")) { + Write-Log "Skipping run $currentRunId because conclusion is $($run.conclusion)" -Level Warning + continue + } + + Write-Log "`nProcessing run $currentRunId..." -Level Info + + if ($run.displayTitle) { Write-Log " Title: $($run.displayTitle)" -Level Info } + if ($run.headBranch) { Write-Log " Branch: $($run.headBranch)" -Level Info } + if ($run.conclusion) { Write-Log " Conclusion: $($run.conclusion)" -Level Info } + + # ----------------------- + # Download JSONL from ZIP inside artifacts + # ----------------------- + $jsonlFiles = @() + $jsonlCopied = @() + $artifactsDir = $null + + if ($DownloadJsonl) { + $artifactsDir = Join-Path $env:TEMP ("bcbench-artifacts-" + $currentRunId) + if (Test-Path $artifactsDir) { + Remove-Item $artifactsDir -Recurse -Force -ErrorAction SilentlyContinue + } + + Write-Log " Downloading artifacts for run $currentRunId..." -Level Info + try { + Download-RunArtifacts -Repo $Repository -RunId $currentRunId -Destination $artifactsDir + } + catch { + Write-Log "No artifacts for run $currentRunId, continuing..." 
-Level Warning + $jsonlFiles = @() + } + + $jsonlFiles = Get-JsonlFilesFromDownloadedArtifacts -ArtifactsRoot $artifactsDir + + if ($jsonlFiles -and $jsonlFiles.Count -gt 0) { + Write-Log " Found JSONL files: $($jsonlFiles.Count)" -Level Success + + if ($JsonlOutputRoot) { + New-Item -ItemType Directory -Force -Path $JsonlOutputRoot | Out-Null + $jsonlCopied = Copy-JsonlToOutputRoot -JsonlFiles $jsonlFiles -OutputRoot $JsonlOutputRoot -RunId $currentRunId + Write-Log " Copied JSONL files to: $(Join-Path $JsonlOutputRoot ('run-' + $currentRunId))" -Level Success + } + else { + $jsonlFiles | Select-Object -First 2 | ForEach-Object { + Write-Log " JSONL: $($_.FullName)" -Level Info + } + } + } + else { + Write-Log " No JSONL files found in artifacts (even after expanding zips)." -Level Warning + } + } + + # ----------------------- + # Existing: retrieve summary + # ----------------------- + $summaryText = Get-SummarizeJobOutput -Repo $Repository -RunId $currentRunId + if (-not $summaryText) { + $summaryText = Get-JobStepSummary -Repo $Repository -RunId $currentRunId + } + + if ($summaryText) { + $parsed = Parse-EvaluationSummary -SummaryText $summaryText + # ✅ Category filter + if ($Category) { + if (-not $parsed.Category) { + Write-Log "Skipping run $currentRunId (no category found)" -Level Warning + continue + } + + if ($parsed.Category -ne $Category) { + Write-Log "Skipping run $currentRunId (category '$($parsed.Category)' does not match '$Category')" -Level Info + continue + } + } + $parsed | Add-Member -NotePropertyName "RunId" -NotePropertyValue $currentRunId + $parsed | Add-Member -NotePropertyName "RunUrl" -NotePropertyValue $run.url + $parsed | Add-Member -NotePropertyName "Branch" -NotePropertyValue $run.headBranch + $parsed | Add-Member -NotePropertyName "CreatedAt" -NotePropertyValue $run.createdAt + + $parsed | Add-Member -NotePropertyName "JsonlFilesCount" -NotePropertyValue ($jsonlFiles.Count) + + # PS 5.1-safe selection (instead of ??) 
+ $jsonlToAttach = $null + if ($jsonlCopied -and $jsonlCopied.Count -gt 0) { + $jsonlToAttach = $jsonlCopied.FullName + } + elseif ($jsonlFiles -and $jsonlFiles.Count -gt 0) { + $jsonlToAttach = $jsonlFiles.FullName + } + + $parsed | Add-Member -NotePropertyName "JsonlFiles" -NotePropertyValue $jsonlToAttach + + $allResults += $parsed + + # Display summary + Write-Log "`n === Evaluation Summary ===" -Level Success + Write-Log " Model: $($parsed.Model)" -Level Info + Write-Log " Category: $($parsed.Category)" -Level Info + Write-Log " Total Entries: $($parsed.TotalEntries)" -Level Info + Write-Log " Successful: $($parsed.SuccessfulEvaluations) ✅" -Level Success + Write-Log " Failed: $($parsed.FailedEvaluations) ❌" -Level $(if ($parsed.FailedEvaluations -gt 0) { "Error" } else { "Success" }) + + if ($parsed.FailedInstances.Count -gt 0) { + Write-Log "`n Failed Instances:" -Level Warning + foreach ($instance in $parsed.FailedInstances) { + Write-Log " - $($instance.InstanceId) ($($instance.Project)): $($instance.ErrorMessage)" -Level Warning + } + } + + if ($parsed.ToolUsage.Count -gt 0) { + Write-Log "`n Tool Usage:" -Level Info + foreach ($tool in $parsed.ToolUsage.GetEnumerator() | Sort-Object Value -Descending) { + Write-Log " $($tool.Key): $($tool.Value)" -Level Info + } + } + } + else { + Write-Log " Could not retrieve summary for run $currentRunId" -Level Warning + + # At minimum, show job-level failures + $jobs = Get-JobSummary -Repo $Repository -RunId $currentRunId + $failedJobs = $jobs | Where-Object { $_.conclusion -eq "failure" } + + if ($failedJobs) { + Write-Log " Failed jobs: $($failedJobs.Count)" -Level Error + foreach ($job in $failedJobs) { + Write-Log " - $($job.name): $($job.conclusion)" -Level Warning + } + } + } + + # Cleanup unless requested otherwise + if ($DownloadJsonl -and $artifactsDir -and (Test-Path $artifactsDir) -and (-not $KeepArtifacts)) { + Remove-Item $artifactsDir -Recurse -Force -ErrorAction SilentlyContinue + } + elseif 
($DownloadJsonl -and $artifactsDir -and (Test-Path $artifactsDir) -and $KeepArtifacts) { + Write-Log " Keeping artifacts folder: $artifactsDir" -Level Info + } + } + + # Return results for pipeline use + if ($allResults.Count -gt 0) { + return $allResults + } +} +catch { + Write-Log "Error: $_" -Level Error + Write-Log $_.ScriptStackTrace -Level Error + exit 1 +} diff --git a/scripts/bcbench_analyze_artifacts.py b/scripts/bcbench_analyze_artifacts.py new file mode 100644 index 000000000..daa2c1af8 --- /dev/null +++ b/scripts/bcbench_analyze_artifacts.py @@ -0,0 +1,565 @@ +#!/usr/bin/env python3 +""" +bcbench_analyze_artifacts.py + +Analyze BC-Bench GitHub Actions artifacts that you already downloaded. + +Supports TWO input modes (no GitHub API, no tokens), which can be COMBINED: + +1) ZIP mode: point to a folder of artifact .zip files you downloaded from Actions UI + - Uses --zips-dir or repeated --zip + - Supports run subfolders like: + artifacts/manual/1/*.zip + artifacts/manual/2/*.zip + artifacts/manual/3/*.zip + Each immediate subfolder is treated as one "run". + +2) EXTRACTED mode: point to a folder that ALREADY contains extracted artifact content + - Uses --extracted-dir + - Also works if you point --zips-dir to a folder with *no zip files* but with extracted subfolders. + +Both modes can be used together (e.g. --zips-dir artifacts/manual --extracted-dir out2) +to merge zip-based and pre-extracted runs into a single analysis. 
+ +Outputs (under --out): + artifacts_extracted/ (only in ZIP mode) + files/ (collected *.jsonl/*.txt) + summary.csv + top_failures.csv + errors_summary.csv + grouped_errors.csv (+ grouped_errors.xlsx if openpyxl is available) + extracted_tests//meta.json + extraction_report.json + error_variations.json + extracted_tests//.diff/.al/.txt + _error.txt when available + +This script focuses on: +- top failing tests across the provided runs +- error-message variations (if error_message exists) +- extracting generated test code/patch (if generated_patch/test_code exists) +""" + +import argparse +import csv +import json +import re +import sys +import zipfile +from dataclasses import dataclass +from pathlib import Path +from typing import Any + + +def die(msg: str, code: int = 2) -> None: + print(f"ERROR: {msg}", file=sys.stderr) + sys.exit(code) + + +def safe_name(s: str) -> str: + s = re.sub(r"[^\w\-. ]+", "_", (s or "").strip()) + s = re.sub(r"\s+", " ", s).strip() + return s or "artifact" + + +def extract_zip_file(zip_path: Path, dest_dir: Path) -> None: + dest_dir.mkdir(parents=True, exist_ok=True) + with zipfile.ZipFile(zip_path) as z: + z.extractall(dest_dir) + + +def find_zip_files(root: Path) -> list[Path]: + if root.is_file() and root.suffix.lower() == ".zip": + return [root] + if root.is_dir(): + return sorted([p for p in root.rglob("*.zip") if p.is_file()]) + return [] + + +def rglob_files(root: Path, pattern: str) -> list[Path]: + return [p for p in root.rglob(pattern) if p.is_file()] + + +# ---------------------------- Grouped error reporting ---------------------------- +_ANSI_RE = re.compile(r"\x1b\[[0-9;]*m") +_TIME_RE = re.compile(r"\[[0-2]\d:[0-5]\d:[0-5]\d\]") +_WINPATH_RE = re.compile(r"[A-Z]:\\[^\n]+") + + +def _normalize_error_message(msg: str) -> str: + """Normalize error messages so similar failures group together.""" + if msg is None: + return "" + msg = str(msg).replace("\r\n", "\n") + msg = _ANSI_RE.sub("", msg) + msg = 
_TIME_RE.sub("[HH:MM:SS]", msg) + msg = _WINPATH_RE.sub("", msg) + + # Normalize common variable parts + msg = re.sub(r"Setting test codeunit range '\d+'", "Setting test codeunit range ''", msg) + msg = re.sub(r"\bCodeunit\s+\d+\b", "Codeunit ", msg) + msg = re.sub(r"\bline\s+\d+\b", "line ", msg, flags=re.IGNORECASE) + msg = re.sub(r"Line No\. = '.*?'", "Line No. = ''", msg) + + # Collapse whitespace and drop empty lines + return "\n".join(ln.rstrip() for ln in msg.strip().splitlines() if ln.strip()) + + +def _bucket_error(msg: str) -> str: + m = (msg or "").lower() + if "agent timed out" in m or "timed out" in m: + return "timeout" + if "build or publish failed" in m: + return "build/publish" + if "passed pre-patch" in m and "expected: fail" in m: + return "expectation_mismatch_prepatch_pass" + if "failed post-patch" in m and "expected: pass" in m: + return "expectation_mismatch_postpatch_fail" + if "ui handlers were not executed" in m: + return "missing_ui_handler" + if "must assign a lot number" in m or "must assign a serial number" in m or "checkitemtracking" in m: + return "item_tracking_not_handled" + if "assert.areequal failed" in m and ("integer" in m and "biginteger" in m): + return "assert_type_mismatch" + if "assert." 
in m and ("recordcount failed" in m or "areequal failed" in m or "isfalse failed" in m): + return "assert_failed" + return "other" + + +# ---------------------------- Record parsing ---------------------------- +def try_parse_jsonl_line(line: str) -> dict[str, Any] | None: + line = line.strip() + if not line: + return None + if line.startswith("{") and line.endswith("}"): + try: + return json.loads(line) + except json.JSONDecodeError: + return None + return None + + +def split_kv_records(text: str) -> list[str]: + text = text.strip() + if not text: + return [] + if text.startswith("instance_id "): + parts = re.split(r"\n(?=instance_id\s)", text) + return [p.strip() for p in parts if p.strip()] + return [text] + + +def parse_kv_record(block: str) -> dict[str, Any]: + b = block.replace("\r\n", "\n").replace("\r", "\n") + + # Extract generated_patch multiline + gen_patch = None + m = re.search(r"\bgenerated_patch\s", b) + if m: + start = m.end() + m2 = re.search(r"\nerror_message\s", b[start:]) + if m2: + gen_patch = b[start : start + m2.start()] + rest = b[start + m2.start() :] + head = b[: m.start()] + else: + gen_patch = b[start:] + rest = "" + head = b[: m.start()] + else: + head = b + rest = "" + + head_tokens = re.split(r"\s+", head.strip()) + data: dict[str, Any] = {} + i = 0 + while i < len(head_tokens) - 1: + key = head_tokens[i] + val = head_tokens[i + 1] + if key in {"instance_id", "project", "model", "agent_name", "category", "resolved", "build", "timeout"}: + data[key] = val + i += 2 + else: + i += 1 + + if gen_patch is not None: + data["generated_patch"] = gen_patch.strip("\n") + + # Parse error_message from rest + if rest: + rm = re.search(r"\berror_message\s", rest) + if rm: + start = rm.end() + stop = None + for key2 in [" metrics ", " execution_time ", " llm_duration ", "\nmetrics ", "\nexecution_time "]: + pos = rest.find(key2, start) + if pos != -1: + stop = pos + break + em = rest[start:].strip() if stop is None else rest[start:stop].strip() + 
data["error_message"] = em + + # Coerce booleans + for k in ["resolved", "build", "timeout"]: + if k in data: + v = str(data[k]).strip().lower() + if v in ("true", "false"): + data[k] = (v == "true") + + return data + + +def iter_records_from_file(path: Path) -> list[dict[str, Any]]: + content = path.read_text(encoding="utf-8", errors="replace") + + # JSONL + recs: list[dict[str, Any]] = [] + json_hits = 0 + for line in content.splitlines(): + obj = try_parse_jsonl_line(line) + if obj is not None: + recs.append(obj) + json_hits += 1 + if json_hits: + return recs + + # KV fallback + return [parse_kv_record(block) for block in split_kv_records(content)] + + +def get_test_id(rec: dict[str, Any]) -> str: + if isinstance(rec.get("instance_id"), str) and rec["instance_id"].strip(): + return rec["instance_id"].strip() + for k in ["test_name", "testName", "name", "id", "testId", "test_id", "title"]: + v = rec.get(k) + if isinstance(v, str) and v.strip(): + return v.strip() + return "unknown_test" + + +def get_category(rec: dict[str, Any]) -> str | None: + v = rec.get("category") + return v.strip() if isinstance(v, str) and v.strip() else None + + +def get_success_fail(rec: dict[str, Any]) -> str | None: + # KV schema + if isinstance(rec.get("resolved"), bool) or isinstance(rec.get("build"), bool) or isinstance(rec.get("timeout"), bool): + resolved = rec.get("resolved") + build = rec.get("build") + timeout = rec.get("timeout") + if resolved is True and build is True and timeout is False: + return "success" + return "fail" + + # Common JSON fields + if isinstance(rec.get("passed"), bool): + return "success" if rec["passed"] else "fail" + if isinstance(rec.get("success"), bool): + return "success" if rec["success"] else "fail" + + for k in ["status", "result", "outcome", "conclusion"]: + v = rec.get(k) + if isinstance(v, str): + vl = v.strip().lower() + if vl in ["passed", "pass", "success", "ok"]: + return "success" + if vl in ["failed", "fail", "error", "timeout", 
"cancelled", "canceled"]: + return "fail" + + return None + + +def extract_code_text(rec: dict[str, Any]) -> tuple[str, str] | None: + if isinstance(rec.get("generated_patch"), str) and rec["generated_patch"].strip(): + return (".diff", rec["generated_patch"]) + + for k in ["test_code", "testCode", "generated_code", "generatedCode", "code", "al", "al_code", "source"]: + v = rec.get(k) + if isinstance(v, str) and v.strip(): + ext = ".al" if ("codeunit" in v.lower() or "procedure" in v.lower()) else ".txt" + return (ext, v) + + return None + + +@dataclass +class Agg: + total: int = 0 + success: int = 0 + fail: int = 0 + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--zip", dest="zips", action="append", default=[], help="Path to an artifact .zip (repeatable)") + ap.add_argument( + "--zips-dir", + default=None, + help=( + "Directory containing artifact .zip files. If it contains run subfolders, each subfolder is treated as one run. " + "If it contains no zips, it's treated as extracted content." 
+ ), + ) + ap.add_argument("--extracted-dir", default=None, help="Directory containing already extracted artifact content") + ap.add_argument("--zip-depth", type=int, default=3, help="How deep to extract nested zip files (ZIP mode)") + ap.add_argument("--category", default="test-generation", help="Filter records by category") + ap.add_argument("--top", type=int, default=10, help="How many top failing tests to extract") + ap.add_argument("--out", default="out", help="Output directory") + args = ap.parse_args() + + out_root = Path(args.out) + extract_root = out_root / "artifacts_extracted" + files_root = out_root / "files" + out_root.mkdir(parents=True, exist_ok=True) + files_root.mkdir(parents=True, exist_ok=True) + + # ---------- Decide input mode (both --zips-dir and --extracted-dir can be combined) ---------- + extracted_dirs: list[Path] = [] + + # EXTRACTED mode: pre-extracted content folders + if args.extracted_dir: + root = Path(args.extracted_dir) + if not root.exists(): + die(f"--extracted-dir does not exist: {root}") + sub = [p for p in root.iterdir() if p.is_dir()] + extracted_dirs = sorted(sub) if sub else [root] + print(f"Using extracted content: {root} (runs={len(extracted_dirs)})") + + # ZIP mode: gather zip inputs and group by run folder when applicable + run_groups: list[tuple[str, list[Path]]] = [] + + # Group by immediate subfolders under --zips-dir (manual/1, manual/2, manual/3) + if args.zips_dir: + root_dir = Path(args.zips_dir) + if root_dir.exists() and root_dir.is_dir(): + subdirs = sorted([d for d in root_dir.iterdir() if d.is_dir()]) + if subdirs: + for sd in subdirs: + zips_in_sd = find_zip_files(sd) + if zips_in_sd: + run_groups.append((sd.name, zips_in_sd)) + + # Also include zips directly under root as one group (optional) + root_zips = sorted([z for z in root_dir.glob("*.zip") if z.is_file()]) + if root_zips: + run_groups.insert(0, (root_dir.name, root_zips)) + else: + # No subdirs; treat root as a single run + zips_in_root = 
find_zip_files(root_dir) + if zips_in_root: + run_groups.append((root_dir.name, zips_in_root)) + + # Explicit --zip files become their own run group if not already included + explicit_zip_inputs: list[Path] = [] + for z in args.zips: + explicit_zip_inputs.extend(find_zip_files(Path(z))) + explicit_zip_inputs = sorted(set(explicit_zip_inputs)) + if explicit_zip_inputs: + in_group = {zp for _, zs in run_groups for zp in zs} + for zp in explicit_zip_inputs: + if zp not in in_group: + run_groups.append((zp.stem, [zp])) + + if run_groups: + extract_root.mkdir(parents=True, exist_ok=True) + for run_i, (run_name, zips_for_run) in enumerate(run_groups, start=1): + tag = safe_name(run_name) + dest = extract_root / f"{run_i:03d}_{tag}" + dest.mkdir(parents=True, exist_ok=True) + print(f"Extract run [{run_i}/{len(run_groups)}]: {run_name} (zips={len(zips_for_run)}) -> {dest}") + + for i, zip_path in enumerate(zips_for_run, start=1): + zip_tag = safe_name(zip_path.stem) + zip_dest = dest / f"{i:03d}_{zip_tag}" + print(f" - Extract zip [{i}/{len(zips_for_run)}]: {zip_path} -> {zip_dest}") + extract_zip_file(zip_path, zip_dest) + + # Nested extraction inside this zip subtree + cur_level = [zip_dest] + for _depth in range(1, args.zip_depth + 1): + next_level: list[Path] = [] + for d in cur_level: + for nested in rglob_files(d, "*.zip"): + nested_tag = safe_name(nested.stem) + nested_dest = nested.parent / f"{nested_tag}__unzipped" + if nested_dest.exists(): + continue + try: + extract_zip_file(nested, nested_dest) + next_level.append(nested_dest) + except zipfile.BadZipFile: + continue + cur_level = next_level + if not cur_level: + break + + extracted_dirs.append(dest) + elif not extracted_dirs: + # No zips found and no extracted dirs. If --zips-dir exists, treat it as extracted content. 
+ if args.zips_dir and Path(args.zips_dir).exists(): + root = Path(args.zips_dir) + sub = [d for d in root.iterdir() if d.is_dir()] + extracted_dirs = sorted(sub) if sub else [root] + print(f"No .zip files found under --zips-dir; treating as extracted content: {root} (runs={len(extracted_dirs)})") + else: + die("No .zip files found. Use --zip or --zips-dir or --extracted-dir .") + + print(f"\nTotal runs to analyze: {len(extracted_dirs)}") + + # ---------- Collect jsonl/txt per extracted run ---------- + run_index = 0 + for d in extracted_dirs: + run_index += 1 + run_out = files_root / f"run-{run_index:03d}" + run_out.mkdir(parents=True, exist_ok=True) + + candidates = rglob_files(d, "*.jsonl") + rglob_files(d, "*.txt") + if not candidates: + print(f"[run {run_index:03d}] No .jsonl/.txt found under {d}") + continue + + seen: dict[str, int] = {} + for p in candidates: + base = p.name + if base in seen: + seen[base] += 1 + target = run_out / f"{seen[base]}_{base}" + else: + seen[base] = 0 + target = run_out / base + target.write_bytes(p.read_bytes()) + + print(f"[run {run_index:03d}] collected {len(candidates)} files -> {run_out}") + + # ---------- Analyze ---------- + category_filter = args.category.strip().lower() + agg: dict[str, Agg] = {} + rec_cache: dict[str, list[tuple[str, dict[str, Any]]]] = {} + + for run_folder in sorted(files_root.glob("run-*")): + run_id = run_folder.name + for f in list(run_folder.glob("*.jsonl")) + list(run_folder.glob("*.txt")): + for rec in iter_records_from_file(f): + cat = get_category(rec) + if category_filter and (not cat or cat.strip().lower() != category_filter): + continue + + tid = get_test_id(rec) + status = get_success_fail(rec) + if status is None: + continue + + a = agg.setdefault(tid, Agg()) + a.total += 1 + if status == "success": + a.success += 1 + else: + a.fail += 1 + + rec_cache.setdefault(tid, []).append((run_id, rec)) + + if not agg: + die(f"No records found for category='{args.category}'.") + + summary_csv = 
out_root / "summary.csv" + with summary_csv.open("w", newline="", encoding="utf-8") as f: + w = csv.writer(f) + w.writerow(["test_id", "category", "total", "success", "fail", "fail_rate"]) + for tid, a in sorted(agg.items(), key=lambda kv: (-kv[1].fail, kv[0].lower())): + rate = (a.fail / a.total) if a.total else 0.0 + w.writerow([tid, args.category, a.total, a.success, a.fail, f"{rate:.4f}"]) + + top = sorted(agg.items(), key=lambda kv: (kv[1].fail, kv[1].total), reverse=True)[: args.top] + + top_csv = out_root / "top_failures.csv" + with top_csv.open("w", newline="", encoding="utf-8") as f: + w = csv.writer(f) + w.writerow(["rank", "test_id", "fail", "total", "fail_rate"]) + for i, (tid, a) in enumerate(top, start=1): + rate = (a.fail / a.total) if a.total else 0.0 + w.writerow([i, tid, a.fail, a.total, f"{rate:.4f}"]) + + # ---------- Error variations + extracted code per top failing test ---------- + extracted_tests_root = out_root / "extracted_tests" + extracted_tests_root.mkdir(parents=True, exist_ok=True) + + errors_summary_csv = out_root / "errors_summary.csv" + with errors_summary_csv.open("w", newline="", encoding="utf-8") as f: + w = csv.writer(f) + w.writerow(["test_id", "error_rank", "count", "error_message"]) + + for tid, a in top: + test_folder = extracted_tests_root / safe_name(tid) + test_folder.mkdir(parents=True, exist_ok=True) + + (test_folder / "meta.json").write_text( + json.dumps( + { + "test_id": tid, + "category": args.category, + "total": a.total, + "success": a.success, + "fail": a.fail, + }, + indent=2, + ), + encoding="utf-8", + ) + + saved = 0 + for run_id, rec in rec_cache.get(tid, [])[:10]: + code_piece = extract_code_text(rec) + if code_piece: + ext, txt = code_piece + (test_folder / f"{run_id}{ext}").write_text(txt, encoding="utf-8") + saved += 1 + + em = rec.get("error_message") + if isinstance(em, str) and em.strip(): + (test_folder / f"{run_id}_error.txt").write_text(em, encoding="utf-8") + + (test_folder / 
"extraction_report.json").write_text( + json.dumps({"code_snippets_saved": saved}, indent=2), + encoding="utf-8", + ) + + variants: dict[str, int] = {} + for _run_id, rec in rec_cache.get(tid, []): + em = rec.get("error_message") + if not isinstance(em, str): + continue + em_norm = "\n".join([ln.rstrip() for ln in em.strip().splitlines()]).strip() + if not em_norm: + continue + variants[em_norm] = variants.get(em_norm, 0) + 1 + + variants_sorted = sorted(variants.items(), key=lambda kv: kv[1], reverse=True) + + (test_folder / "error_variations.json").write_text( + json.dumps( + { + "test_id": tid, + "total_failures": a.fail, + "distinct_error_messages": len(variants_sorted), + "variants": [{"count": c, "error_message": msg} for msg, c in variants_sorted], + }, + indent=2, + ), + encoding="utf-8", + ) + + for rank, (msg, c) in enumerate(variants_sorted, start=1): + msg_csv = msg if len(msg) <= 3000 else (msg[:3000] + "…") + w.writerow([tid, rank, c, msg_csv]) + + + print("\nDONE ✅") + if extract_root.exists(): + print(f"- Extracted zips -> {extract_root}") + print(f"- Collected files -> {files_root}") + print(f"- Summary -> {summary_csv}") + print(f"- Top failures -> {top_csv}") + print(f"- Error variations -> {errors_summary_csv}") + print(f"- Extracted tests -> {extracted_tests_root}") + + +if __name__ == "__main__": + main() diff --git a/scripts/group_errors_from_summary.py b/scripts/group_errors_from_summary.py new file mode 100644 index 000000000..ae807c169 --- /dev/null +++ b/scripts/group_errors_from_summary.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 + +import csv +import sys +from collections import defaultdict +from pathlib import Path + +# ---------------------------- +# Configuration +# ---------------------------- + +ERROR_GROUPS = [ + "Generated tests Passed pre-patch", + "Generated tests Failed post-patch", + "Build or publish failed", +] + +# ---------------------------- +# Helpers +# ---------------------------- + +def die(msg: str, code: int = 2) 
-> None:
+    """Print *msg* as an error on stderr and exit the process with *code*."""
+    print(f"ERROR: {msg}", file=sys.stderr)
+    sys.exit(code)
+
+
+def extract_error_group(error_message: str) -> str:
+    """
+    Determine high-level error group based on the FIRST meaningful line.
+
+    Scans the message line by line; the first non-blank (stripped) line is
+    matched against the known ERROR_GROUPS prefixes and the first matching
+    group name is returned.  If that first meaningful line matches no known
+    group, the line itself becomes the group (so unexpected errors still
+    bucket together by their leading line).  Empty or whitespace-only input
+    yields "Unknown".
+    """
+    if not error_message:
+        return "Unknown"
+
+    for raw_line in error_message.splitlines():
+        line = raw_line.strip()
+        if not line:
+            continue
+
+        for group in ERROR_GROUPS:
+            if line.startswith(group):
+                return group
+
+        # Fallback: first non-empty line
+        return line
+
+    return "Unknown"
+
+
+# ----------------------------
+# Core logic
+# ----------------------------
+
+def group_errors(errors_summary_csv: Path, out_dir: Path) -> Path:
+    """
+    Aggregate errors_summary.csv rows into high-level error groups.
+
+    Expects columns test_id, count and error_message; each row's weight is
+    its count (defaulting to 1 on a missing or unparseable value).  Per
+    group it accumulates total occurrences, the distinct failing test ids,
+    and a frequency table of the full error messages.  Writes
+    grouped_errors_summary.csv into out_dir, sorted by occurrences
+    (descending), and returns the path written.  Dies (exit 2) if the
+    input CSV holds no data rows.
+    """
+    groups = {}
+
+    with errors_summary_csv.open("r", encoding="utf-8", newline="") as f:
+        reader = csv.DictReader(f)
+
+        for row in reader:
+            test_id = (row.get("test_id") or "").strip()
+            error_message = row.get("error_message") or ""
+
+            # A malformed "count" cell must not abort the run; weight it 1.
+            try:
+                count = int(row.get("count", "1") or 1)
+            except ValueError:
+                count = 1
+
+            error_group = extract_error_group(error_message)
+
+            g = groups.setdefault(
+                error_group,
+                {
+                    "occurrences": 0,
+                    "tests": set(),
+                    "full_messages": defaultdict(int),
+                },
+            )
+
+            g["occurrences"] += count
+            if test_id:
+                g["tests"].add(test_id)
+            if error_message.strip():
+                g["full_messages"][error_message.strip()] += count
+
+    if not groups:
+        die("No data found in errors_summary.csv")
+
+    # Prepare output rows
+    out_rows = []
+    for error_group, g in groups.items():
+        # The most frequent full message serves as the representative example.
+        top_message = ""
+        if g["full_messages"]:
+            top_message = max(
+                g["full_messages"].items(),
+                key=lambda kv: kv[1],
+            )[0]
+
+        out_rows.append(
+            {
+                "error_group": error_group,
+                "occurrences": g["occurrences"],
+                "distinct_tests": len(g["tests"]),
+                # Only a small, sorted sample of ids to keep the CSV readable.
+                "example_test_ids": ",".join(sorted(g["tests"])[:5]),
+                "top_full_error_message": top_message,
+            }
+        )
+
+    out_rows.sort(key=lambda r: r["occurrences"], reverse=True)
+
+    out_csv = out_dir / "grouped_errors_summary.csv"
+    with out_csv.open("w", encoding="utf-8", newline="") as f:
+        writer = 
csv.DictWriter( + f, + fieldnames=[ + "error_group", + "occurrences", + "distinct_tests", + "example_test_ids", + "top_full_error_message", + ], + ) + writer.writeheader() + writer.writerows(out_rows) + + return out_csv + + +# ---------------------------- +# Entry point +# ---------------------------- + +def main() -> None: + if len(sys.argv) != 3: + die("Usage: python group_errors_from_summary.py ") + + errors_summary_csv = Path(sys.argv[1]) + out_dir = Path(sys.argv[2]) + + if not errors_summary_csv.exists(): + die(f"File not found: {errors_summary_csv}") + + out_dir.mkdir(parents=True, exist_ok=True) + + out_csv = group_errors(errors_summary_csv, out_dir) + print(f"✅ Grouped errors summary written to: {out_csv}") + + +if __name__ == "__main__": + main()