diff --git a/README.md b/README.md index db44fad..c3dd8f1 100644 --- a/README.md +++ b/README.md @@ -42,10 +42,12 @@ gitcortex extract --repo . \ --ignore "*_generated.go" ``` -Patterns match against the file path as emitted by `git log --raw` (forward-slash, repo-relative). Directory patterns like `vendor/*` exclude anything under that prefix. File-name patterns like `*.pb.go` match at any depth. +Patterns match against the file path as emitted by `git log --raw` (forward-slash, repo-relative). Directory patterns like `vendor/*` are **repo-root prefixes** — they exclude everything under `vendor/` at the top of the tree, but **not** nested occurrences like `pkg/vendor/foo.go` or `services/auth/vendor/bar.go`. For those you need explicit entries such as `--ignore "pkg/vendor/*"`. File-name patterns like `*.pb.go` and `package-lock.json` match at any depth via extract's basename match, so one entry covers every occurrence. Start permissive, run `gitcortex stats --stat hotspots --top 20` and `--stat churn-risk --top 20`, and add `--ignore` entries for whatever generated file type dominates the output. Re-extract until the top list represents real changes worth understanding. +**You don't need to get this right on the first try.** When `stats` runs on an un-filtered dataset and likely vendor/generated paths account for ≥10% of repo churn, it prints a warning to stderr with the matched buckets and a copy-pasteable `--ignore` invocation. The warning enumerates the exact nested prefixes it found (e.g. `wp-includes/js/dist/*`, `services/auth/vendor/*`), so monorepos and subproject-heavy layouts get the specific entries they need without guessing. Running the suggestion and re-extracting is the fastest path from raw repo to usable stats. + > Both commit-level (`Summary.TotalAdditions/Deletions`) and file-level aggregations recompute from the filtered set, so all totals stay consistent after `--ignore` — the extract step recalculates commit additions/deletions as the sum of non-ignored file records before writing them to JSONL. ## Privacy and reliability @@ -256,27 +258,27 @@ gitcortex stats --input data.jsonl --stat churn-risk --top 15 gitcortex stats --input data.jsonl --stat churn-risk --churn-half-life 60 # faster decay ``` -Real output from the Pi-hole repository (one sample per label): +Real output: ``` -PATH LABEL CHURN BF AGE TREND -automated install/basic-install.sh active 115.3 15 4121d 0.00 -.github/workflows/codeql-analysis.yml legacy-hotspot 66.2 2 1640d 0.26 -advanced/bash-completion/pihole-ftl.bash silo 16.5 1 240d 1.00 -test/_alpine_3_23.Dockerfile active-core 7.1 1 120d 1.00 -advanced/Templates/gravity.db.schema cold 0.0 1 2616d 1.00 +PATH LABEL RECENT CHURN BF AGE TREND +automated install/basic-install.sh active (age P90, trend P87) 115.3 15 4121d 0.00 +.github/workflows/codeql-analysis.yml active-core (age P30, trend P95) 66.2 2 1640d 0.26 +advanced/Scripts/utils.sh active-core (age P27, trend P94) 53.3 2 1523d 0.10 ``` | Label | Meaning | |-------|---------| | `cold` | Low recent churn — ignore. | | `active` | Shared ownership (bus factor ≥ 3). Healthy. | -| `active-core` | New code (< 180d), single author. Usually fine. | +| `active-core` | New code (younger than most of the repo), single author. Usually fine. | | `silo` | Old + concentrated + stable/growing. Knowledge bottleneck — plan transfer. | | `legacy-hotspot` | **Urgent.** Old + concentrated + declining. Deprecated paths still being touched. | Sort key is `recent_churn`; the label answers "is this activity a problem?". 
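+
+If you consume the churn-risk JSON programmatically, the two percentile fields are optional: they use plain PascalCase names like every other field on `ChurnRiskResult`, and they are omitted entirely when the dataset was too small for the adaptive path. A minimal decoding sketch; the `row` struct here is a hypothetical consumer-side subset, not part of gitcortex:
+
+```go
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+)
+
+// row mirrors just the ChurnRiskResult fields this example needs.
+type row struct {
+	Path            string
+	Label           string
+	AgePercentile   *int // nil when the repo had too few files for percentiles
+	TrendPercentile *int
+}
+
+func main() {
+	data := []byte(`{"Path":"a.go","Label":"legacy-hotspot","AgePercentile":92,"TrendPercentile":8}`)
+	var r row
+	if err := json.Unmarshal(data, &r); err != nil {
+		panic(err)
+	}
+	if r.AgePercentile != nil && r.TrendPercentile != nil {
+		fmt.Printf("%s %s (age P%d, trend P%d)\n", r.Path, r.Label, *r.AgePercentile, *r.TrendPercentile)
+	}
+}
+```
+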
The composite `risk_score` field (`recent_churn / bus_factor`) is still emitted for CI gate back-compat.
+**The `(age PXX, trend PYY)` suffix** reports where the file sits in this repo's distribution: `age P90` = older than 90% of tracked files, `trend P8` = declining more sharply than 92% of them. Classification thresholds are not absolute — they adapt to each dataset (P75 age and P25 trend, with a fallback to fixed constants for repos under 8 files). A `legacy-hotspot` with `(age P76, trend P24)` barely qualifies; one at `(age P98, trend P3)` is the real alarm. Distance from the boundary is now visible instead of hidden. See `docs/METRICS.md` for the adaptive-thresholds section.
+
 `--churn-half-life` controls how fast old changes lose weight (default 90 days = changes lose half their weight every 90 days).

 ### Working patterns

diff --git a/cmd/gitcortex/main.go b/cmd/gitcortex/main.go
index 87bba2c..f1e7b46 100644
--- a/cmd/gitcortex/main.go
+++ b/cmd/gitcortex/main.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"io"
 	"os"
 	"os/signal"
 	"path/filepath"
@@ -180,6 +181,16 @@ func statsCmd() *cobra.Command {
 			fmt.Fprintf(os.Stderr, "Loaded %d commits, %d files, %d devs\n\n",
 				ds.CommitCount, ds.UniqueFileCount, ds.DevCount)

+			// Suspect vendor/generated warning only fires when the
+			// aggregate matched churn exceeds suspectWarningMinChurnRatio
+			// of the total. Text-format stats only: JSON/CSV consumers
+			// typically pipe the output and don't want a chatter prefix.
+			if sf.format == "" || sf.format == "table" {
+				if buckets, worth := stats.DetectSuspectFiles(ds); worth {
+					printSuspectWarning(os.Stderr, buckets)
+				}
+			}
+
 			return renderStats(ds, &sf)
 		},
 	}
@@ -188,6 +199,43 @@ func statsCmd() *cobra.Command {
 	return cmd
 }

+// printSuspectWarning emits a stderr block listing likely vendor/generated
+// patterns that matched, with a copy-pasteable --ignore suggestion. Called
+// only when DetectSuspectFiles reports the matched churn crosses the
+// noise floor, so repos with one incidental .lock file don't get spammed.
+func printSuspectWarning(w io.Writer, buckets []stats.SuspectBucket) {
+	if len(buckets) == 0 {
+		return
+	}
+	// Top 6 buckets — enough to be useful, not enough to drown the prompt.
+	const maxShown = 6
+	shown := buckets
+	if len(shown) > maxShown {
+		shown = shown[:maxShown]
+	}
+	fmt.Fprintln(w, "⚠ Suspect vendor/generated paths detected — they inflate churn and bus factor")
+	fmt.Fprintln(w, "  without reflecting hand-authored code. Top matches:")
+	for _, b := range shown {
+		fmt.Fprintf(w, "  %-22s %4d files, %8d churn (%s)\n",
+			b.Pattern.Glob, len(b.Paths), b.Churn, b.Pattern.Reason)
+	}
+	if len(buckets) > len(shown) {
+		fmt.Fprintf(w, "  ... and %d more bucket(s) — see suggestion below for full set\n",
+			len(buckets)-len(shown))
+	}
+	// Suggestions cover ALL buckets, not just the shown subset — the
+	// warning threshold is computed over every bucket, so a remediation
+	// that skips unshown ones would leave the warning firing after the
+	// suggested fix.
+	suggestions := stats.CollectAllSuggestions(buckets)
+	fmt.Fprint(w, "  Rerun extract with --ignore to drop them, e.g.:\n    gitcortex extract --repo .")
+	for _, s := range suggestions {
+		fmt.Fprintf(w, " --ignore %s", stats.ShellQuoteSingle(s))
+	}
+	fmt.Fprintln(w)
+	fmt.Fprintln(w)
+}
+
 func renderStats(ds *stats.Dataset, sf *statsFlags) error {
 	showAll := sf.stat == ""
 	f := stats.NewFormatter(os.Stdout, sf.format)
diff --git a/docs/METRICS.md b/docs/METRICS.md
index 55dbbac..1b91bc4 100644
--- a/docs/METRICS.md
+++ b/docs/METRICS.md
@@ -134,14 +134,26 @@ rows implicitly assume the earlier rows didn't match.
 |---|-------|------|--------|
 | 1 | **cold** | `recent_churn ≤ 0.5 × median(recent_churn)` | Ignore. |
 | 2 | **active** | `bus_factor ≥ 3` | Healthy, shared. |
-| 3 | **active-core** | `bus_factor ≤ 2` and `age < 180 days` | New code, single author is expected. |
-| 4 | **legacy-hotspot** | `bus_factor ≤ 2`, `age ≥ 180 days`, and `trend < 0.5` | **Urgent.** Old + concentrated + declining. |
+| 3 | **active-core** | `bus_factor ≤ 2` and `age < oldAgeThreshold` | New code, single author is expected. |
+| 4 | **legacy-hotspot** | `bus_factor ≤ 2`, `age ≥ oldAgeThreshold`, and `trend < decliningTrendThreshold` | **Urgent.** Old + concentrated + declining. |
 | 5 | **silo** | default (everything the rules above didn't catch) | Knowledge bottleneck — plan transfer. |

 Where:
 - `age = days between firstChange and latest commit in dataset`
 - `trend = churn_last_3_months / churn_earlier`. Edge cases: empty history returns 1 (no signal); recent-only history returns 2 (grew from nothing); earlier-only history returns 0 (declined to nothing — the strongest `legacy-hotspot` signal); short-span datasets whose entire window fits inside the trend window return 1 to avoid false "growing" reports

+### Adaptive thresholds (per-dataset calibration)
+
+`oldAgeThreshold` and `decliningTrendThreshold` are not fixed constants: they are derived from the dataset's own distribution each run. With at least `classifyMinSample` (8) files present:
+- `oldAgeThreshold` = **P75** of file ages in this dataset
+- `decliningTrendThreshold` = **P25** of file trends in this dataset, clamped to at least `adaptiveDecliningTrendFloor` (0.01). The floor matters on mature repos where ≥25% of files are dormant (trend=0 via the earlier-only path): P25 would otherwise collapse to 0 and the strict `trend < threshold` check would never fire, silently misclassifying every dormant concentrated file as `silo` instead of `legacy-hotspot`. The floor keeps the threshold strictly positive so the trend=0 signal — the strongest legacy-hotspot alarm — still reaches the rule.
+
+This makes "old" mean "older than 75% of tracked files in this repo" instead of an absolute 180 days. A 4-year-old file in a 12-year-old codebase was previously tagged `legacy-hotspot` even though it was newer than most of the repo — now the same file lands in `active-core`. Below the sample threshold, the absolute fallbacks `classifyOldAgeDays` and `classifyDecliningTrend` apply so tiny repos still produce labels.
+
+Each `ChurnRiskResult` also exposes `AgePercentile` and `TrendPercentile` (0-100) showing where the file sits in the distribution. The fields are nil (omitted from JSON, empty in CSV) when the fallback path ran. The CLI and HTML surface these alongside the label — `legacy-hotspot (age P92, trend P8)` tells you the file is both old and sharply declining relative to peers; `legacy-hotspot (age P76, trend P24)` barely qualifies.
Distance from the classification boundary is now readable, not hidden. + +> **Degenerate trend distribution.** When every file's entire history fits inside the trend window (e.g. a repo with <3 months of commits), `churnTrend` returns the flat-signal sentinel `1.0` for all of them. The adaptive P25 then lands on `1.0` too, and the `trend < P25` predicate matches nobody — no file reaches `legacy-hotspot` through the trend check. Old + concentrated files fall through to `silo` instead. This is mathematically correct (there's no variation to classify on) but can surprise readers of short-lived repos. Pinned by `TestChurnRiskAdaptiveDegenerateTrendDistribution` so future refactors don't silently flip it. + > **Sensitivity note.** Files touched a single time long ago and never again correctly route to `legacy-hotspot` via the earlier-only trend=0 path. On large mature repos this pattern is the common case, not the exception — e.g. validation on a kubernetes snapshot classified ~29k files this way. If the label distribution looks heavy on `legacy-hotspot` for a long-lived codebase, that is usually diagnosing real dormant code, not a bug. ### Additional columns @@ -264,8 +276,11 @@ Every classification boundary is a named constant in `internal/stats/stats.go`. |----------|---------|----------| | `classifyColdChurnRatio` | `0.5` | A file is `cold` when `recent_churn ≤ ratio × median(recent_churn)`. | | `classifyActiveBusFactor` | `3` | A file is `active` (shared, healthy) when `bus_factor ≥ this`. | -| `classifyOldAgeDays` | `180` | Age cutoff for `active-core` vs `silo`/`legacy-hotspot`. | -| `classifyDecliningTrend` | `0.5` | Trend ratio below this marks `legacy-hotspot` (old + declining). | +| `classifyOldAgeDays` | `180` | **Fallback only** (dataset < `classifyMinSample` files). Adaptive path uses P75 of the dataset's own age distribution. | +| `classifyDecliningTrend` | `0.5` | **Fallback only**. Adaptive path uses P25 of the dataset's own trend distribution. | +| `classifyMinSample` | `8` | Below this many files, percentile estimates are too noisy to trust and the two thresholds above revert to absolutes. | +| `adaptiveDecliningTrendFloor` | `0.01` | Minimum value for the adaptive `decliningTrendThreshold`. Prevents P25 from collapsing to 0 on mature repos where dormant files dominate, which would hide every legacy-hotspot. | +| `suspectWarningMinChurnRatio` | `0.10` | Vendor/generated path warning fires only when matched paths together exceed this fraction of total repo churn — prevents a single incidental `.lock` file from triggering noise. | | `classifyTrendWindowMonths` | `3` | Window (months, relative to latest commit) for the recent vs earlier split in `trend`. | | `contribRefactorRatio` | `0.8` | `del/add ≥ this` → dev profile `contribType = refactor`. | | `contribBalancedRatio` | `0.4` | `0.4 ≤ del/add < 0.8` → `balanced`; below 0.4 → `growth`. | @@ -299,6 +314,18 @@ A third-level tiebreaker on path/sha/email asc is applied where primary and seco Inside `busfactor`, the per-file `TopDevs` list is sorted by lines desc with an email asc tiebreaker. Without it, binary assets and small files where two devs contribute equal lines (e.g. `.gif`, `.png`, one-line configs) produced a different `TopDevs` email order on every run. 
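+
+To make the nearest-rank rule behind the P75/P25 thresholds concrete, here is a minimal, self-contained sketch (illustrative only: standalone names, not the `internal/stats` implementation):
+
+```go
+package main
+
+import (
+	"fmt"
+	"sort"
+)
+
+// nearestRank returns the p-th percentile of sorted values using the
+// nearest-rank method: idx = ceil(p*n/100) - 1, clamped to [0, n-1].
+func nearestRank(sorted []int, p int) int {
+	n := len(sorted)
+	idx := (p*n + 99) / 100 // integer ceiling of p*n/100
+	if idx > 0 {
+		idx--
+	}
+	if idx >= n {
+		idx = n - 1
+	}
+	return sorted[idx]
+}
+
+func main() {
+	ages := []int{10, 20, 30, 40, 50, 60, 70, 80, 90, 1000}
+	sort.Ints(ages)
+	// P75 of 10 values: index ceil(7.5)-1 = 7, value 80. The outlier at
+	// 1000 days does not drag the cutoff upward the way a mean would.
+	fmt.Println(nearestRank(ages, 75)) // 80
+}
+```
+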
+### Vendor/generated path warning + +When `stats` loads a dataset in table format, it scans for paths matching a conservative list of vendor/generated heuristics: `vendor/`, `node_modules/`, `dist/`, `build/`, `third_party/`, `*.min.js`, `*.min.css`, `*.lock`, language-specific lockfiles (`package-lock.json`, `go.sum`, `Cargo.lock`, `poetry.lock`, `yarn.lock`, `pnpm-lock.yaml`), and common generated extensions (`*.pb.go`, `*_pb2.py`, `*.generated.*`). + +If the matched paths together account for at least `suspectWarningMinChurnRatio` (10%) of total repo churn, a warning is emitted to stderr listing the top-6 buckets with a copy-pasteable `extract --ignore` invocation. Below the floor, no warning — a single incidental `.lock` file in an otherwise clean repo stays silent. + +Directory-segment heuristics (`vendor`, `node_modules`, `dist`, `build`, `third_party`) match the segment wherever it appears in the path, but `extract --ignore` treats a bare `dist/*` glob as a repo-root prefix. To avoid suggesting a fix that wouldn't actually remove the matched files, each bucket carries a `Suggestions` list of the specific parent prefixes it matched (e.g. `wp-includes/js/dist/*`, `services/auth/vendor/*`), and the warning emits every unique prefix so the copy-pasteable command covers every source of distortion. Suffix and basename patterns (`*.min.js`, `package-lock.json`, etc.) collapse to a single glob because extract's basename match already handles them at any depth. + +The warning is advisory. Nothing is auto-filtered; the user decides whether to re-extract. Matches do not affect computed stats in that run. JSON/CSV output paths skip the warning since they're typically piped. + +Statistical heuristics (very high churn-per-commit, single-author bulk updates) are deliberately out of scope — their false-positive rate on hand-authored code is higher than the path-based list and we'd rather stay quiet than cry wolf. + ### `--mailmap` off by default `extract` does not apply `.mailmap` unless you pass `--mailmap`. Without it, the same person with two emails (e.g. `alice@work.com` and `alice@personal.com`) splits into two contributors. Affected metrics: `contributors`, `bus factor`, `dev network`, `profiles`, churn-risk label (via bus factor). diff --git a/internal/extract/extract.go b/internal/extract/extract.go index ad14bf9..47e2bfa 100644 --- a/internal/extract/extract.go +++ b/internal/extract/extract.go @@ -244,7 +244,7 @@ func emitCommit(writer *bufio.Writer, commit *git.StreamCommit, sizeMap map[stri if path == "" { path = entry.PathOld } - if shouldIgnore(path, ignorePatterns) { + if ShouldIgnore(path, ignorePatterns) { continue } filteredRaw = append(filteredRaw, entry) @@ -389,7 +389,12 @@ func loadDevEmails(path string) (map[string]struct{}, error) { return cache, scanner.Err() } -func shouldIgnore(path string, patterns []string) bool { +// ShouldIgnore reports whether path matches any of the ignore patterns. +// Exported so downstream packages (e.g. the stats suspect-warning) +// can verify that the globs they emit actually match the paths they +// describe — without a shared predicate the two surfaces can drift +// and users end up with --ignore suggestions that don't do anything. 
+func ShouldIgnore(path string, patterns []string) bool { if len(patterns) == 0 { return false } diff --git a/internal/extract/extract_test.go b/internal/extract/extract_test.go index 94a0973..a2a4fc9 100644 --- a/internal/extract/extract_test.go +++ b/internal/extract/extract_test.go @@ -127,9 +127,9 @@ func TestShouldIgnore(t *testing.T) { } for _, tt := range tests { - got := shouldIgnore(tt.path, tt.patterns) + got := ShouldIgnore(tt.path, tt.patterns) if got != tt.want { - t.Errorf("shouldIgnore(%q, %v) = %v, want %v", tt.path, tt.patterns, got, tt.want) + t.Errorf("ShouldIgnore(%q, %v) = %v, want %v", tt.path, tt.patterns, got, tt.want) } } } diff --git a/internal/report/report.go b/internal/report/report.go index 4d01516..aeff14f 100644 --- a/internal/report/report.go +++ b/internal/report/report.go @@ -394,10 +394,22 @@ var funcMap = template.FuncMap{ "joinDevs": stats.JoinDevs, "seq": seq, "list": list, - "int64": toInt64, - "actColor": actColor, - "pctRatio": pctRatio, - "plusInt": plusInt, + "int64": toInt64, + "actColor": actColor, + "pctRatio": pctRatio, + "plusInt": plusInt, + "derefInt": derefInt, +} + +// derefInt returns the value behind an *int, or 0 if nil. Template-side +// helper for optional percentile fields on ChurnRiskResult: nil becomes a +// safe zero so `{{derefInt .AgePercentile}}` never panics or prints a +// pointer address (which is what %d on *int would do). +func derefInt(p *int) int { + if p == nil { + return 0 + } + return *p } var tmpl = template.Must(template.New("report").Funcs(funcMap).Parse(reportHTML)) diff --git a/internal/report/template.go b/internal/report/template.go index a9cec66..46a1a08 100644 --- a/internal/report/template.go +++ b/internal/report/template.go @@ -181,14 +181,14 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col {{if .ChurnRisk}}
Files ranked by recent churn. Label classifies context so you can judge action: legacy-hotspot (old code + concentrated + declining) is the urgent alarm; silo suggests knowledge transfer; active-core is young code with a single author (often fine); active is shared healthy work; cold is quiet.
+Files ranked by recent churn. Label classifies context so you can judge action: legacy-hotspot (old code + concentrated + declining) is the urgent alarm; silo suggests knowledge transfer; active-core is young code with a single author (often fine); active is shared healthy work; cold is quiet.{{if (index .ChurnRisk 0).AgePercentile}} Age P__ / Trend P__ under the label show where this file sits in the repo's distribution: age P90 means older than 90% of tracked files; trend P10 means declining more sharply than 90%. Classification boundaries are the P75 age and P25 trend of this dataset (see METRICS.md).{{end}}
| Path | Label | Recent Churn | BF | Age | Trend | Last Change | |
|---|---|---|---|---|---|---|---|
| {{.Path}} | -{{if eq .Label "legacy-hotspot"}}🔴 {{.Label}}{{else if eq .Label "silo"}}🟡 {{.Label}}{{else if eq .Label "active-core"}}{{.Label}}{{else if eq .Label "active"}}{{.Label}}{{else}}{{.Label}}{{end}} | +{{if eq .Label "legacy-hotspot"}}🔴 {{.Label}}{{else if eq .Label "silo"}}🟡 {{.Label}}{{else if eq .Label "active-core"}}{{.Label}}{{else if eq .Label "active"}}{{.Label}}{{else}}{{.Label}}{{end}}{{if .AgePercentile}} age P{{derefInt .AgePercentile}} · trend P{{derefInt .TrendPercentile}} {{end}} |
{{printf "%.1f" .RecentChurn}} | {{.BusFactor}} | diff --git a/internal/stats/format.go b/internal/stats/format.go index 54dc853..ad61d0f 100644 --- a/internal/stats/format.go +++ b/internal/stats/format.go @@ -237,6 +237,26 @@ func (f *Formatter) PrintCoupling(results []CouplingResult) error { } } +// LabelWithPercentile decorates a churn-risk label with the age and trend +// percentile ranks when they are available, so readers can tell a barely +// classified file from a strongly classified one. Returns the bare label +// when either percentile is nil (dataset below classifyMinSample). +func LabelWithPercentile(label string, agePercentile, trendPercentile *int) string { + if agePercentile == nil || trendPercentile == nil { + return label + } + return fmt.Sprintf("%s (age P%d, trend P%d)", label, *agePercentile, *trendPercentile) +} + +// formatPercentile renders a percentile pointer for CSV: empty string +// when nil so the column stays machine-parseable without a magic sentinel. +func formatPercentile(p *int) string { + if p == nil { + return "" + } + return fmt.Sprintf("%d", *p) +} + func (f *Formatter) PrintChurnRisk(results []ChurnRiskResult) error { switch f.format { case "json": @@ -250,19 +270,24 @@ func (f *Formatter) PrintChurnRisk(results []ChurnRiskResult) error { fmt.Sprintf("%.1f", r.RecentChurn), fmt.Sprintf("%d", r.BusFactor), fmt.Sprintf("%d", r.AgeDays), + formatPercentile(r.AgePercentile), fmt.Sprintf("%.2f", r.Trend), + formatPercentile(r.TrendPercentile), fmt.Sprintf("%d", r.TotalChanges), r.FirstChangeDate, r.LastChangeDate, } } - return f.writeCSV([]string{"path", "label", "recent_churn", "bus_factor", "age_days", "trend", "total_changes", "first_change", "last_change"}, rows) + return f.writeCSV([]string{"path", "label", "recent_churn", "bus_factor", "age_days", "age_percentile", "trend", "trend_percentile", "total_changes", "first_change", "last_change"}, rows) default: tw := tabwriter.NewWriter(f.w, 0, 4, 2, ' ', 0) fmt.Fprintf(tw, "PATH\tLABEL\tRECENT CHURN\tBF\tAGE\tTREND\tLAST CHANGE\n") fmt.Fprintf(tw, "----\t-----\t------------\t--\t---\t-----\t-----------\n") for _, r := range results { - fmt.Fprintf(tw, "%s\t%s\t%.1f\t%d\t%dd\t%.2f\t%s\n", r.Path, r.Label, r.RecentChurn, r.BusFactor, r.AgeDays, r.Trend, r.LastChangeDate) + fmt.Fprintf(tw, "%s\t%s\t%.1f\t%d\t%dd\t%.2f\t%s\n", + r.Path, + LabelWithPercentile(r.Label, r.AgePercentile, r.TrendPercentile), + r.RecentChurn, r.BusFactor, r.AgeDays, r.Trend, r.LastChangeDate) } return tw.Flush() } diff --git a/internal/stats/stats.go b/internal/stats/stats.go index 2f589a5..0121aed 100644 --- a/internal/stats/stats.go +++ b/internal/stats/stats.go @@ -15,9 +15,21 @@ const ( // Churn Risk classification classifyColdChurnRatio = 0.5 // recent_churn ≤ 0.5 × median → cold classifyActiveBusFactor = 3 // bf ≥ 3 → active (shared) - classifyOldAgeDays = 180 // age ≥ 180d is "old" - classifyDecliningTrend = 0.5 // trend < 0.5 = declining + classifyOldAgeDays = 180 // fallback "old" threshold when dataset < classifyMinSample files; adaptive mode uses the P75 of file ages in the dataset + classifyDecliningTrend = 0.5 // fallback "declining" threshold under the same condition; adaptive mode uses the P25 of file trends classifyTrendWindowMonths = 3 // recent vs earlier split + classifyMinSample = 8 // below this, percentile estimates are noisy and we fall back to the absolute constants above + // adaptiveDecliningTrendFloor keeps the P25-derived "declining" + // threshold strictly positive. 
churnTrend clamps its output at 0 + // for files with earlier-only history (the strongest + // legacy-hotspot signal). In mature repos where ≥25% of files are + // dormant, P25 collapses to 0; without this floor, `trend < 0` + // never fires and every dormant concentrated file is misrouted to + // silo instead of legacy-hotspot — the exact signal the rule is + // supposed to surface. Epsilon is small enough not to widen the + // declining band past the fallback (0.5) even in the pathological + // case, large enough that 0.0 ≠ threshold under float compare. + adaptiveDecliningTrendFloor = 0.01 // Developer profile contribution type (del/add ratio) contribRefactorRatio = 0.8 // ratio ≥ 0.8 → refactor @@ -120,6 +132,19 @@ type ChurnRiskResult struct { AgeDays int Trend float64 // recent 3mo churn / earlier churn; 1 = flat, <0.5 declining, >1.5 growing Label string // "cold" | "active" | "active-core" | "silo" | "legacy-hotspot" + // AgePercentile and TrendPercentile report where this file lands in the + // per-dataset distribution (0-100). Nil when the fallback path ran + // (dataset below classifyMinSample) so JSON consumers see the field + // omitted rather than a `-1` sentinel. Surfacing these alongside the + // label makes the distance from the classification boundary visible: + // `legacy-hotspot (age P92, trend P08)` vs a file that barely crossed. + // Tag form `json:",omitempty"` (with the leading comma) keeps Go's + // default PascalCase name — AgePercentile / TrendPercentile — so the + // field names match every other field on this struct. Without it + // the explicit snake_case name would make the emitted schema mix + // two conventions in the same object. + AgePercentile *int `json:",omitempty"` + TrendPercentile *int `json:",omitempty"` } type WorkingPattern struct { @@ -556,9 +581,142 @@ func churnTrend(monthChurn map[string]int64, earliest, latest time.Time) float64 return float64(recent) / float64(earlier) } +// classifyBands holds the per-dataset calibrated thresholds used by +// classifyFile. When the dataset has fewer than classifyMinSample files, +// percentile estimates would be noisy, so defaultBands() returns the +// absolute fallback constants and nil sorted slices (callers that want +// to surface a percentile rank should check via HasPercentiles()). +type classifyBands struct { + OldAgeDays int + DecliningTrend float64 + sortedAges []int // ascending; nil in fallback mode + sortedTrends []float64 // ascending; nil in fallback mode +} + +func defaultBands() classifyBands { + return classifyBands{ + OldAgeDays: classifyOldAgeDays, + DecliningTrend: classifyDecliningTrend, + } +} + +// HasPercentiles reports whether the bands carry the population data +// needed to answer agePercentile/trendPercentile queries. +func (b classifyBands) HasPercentiles() bool { + return b.sortedAges != nil && b.sortedTrends != nil +} + +// computeBands gathers ages and trends across the dataset and returns +// the P75 age / P25 trend as calibrated thresholds. Falls back to the +// absolute constants when the sample is too small. 
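+// For example, with 12 files the P75 age lands on sorted index
+// ceil(75*12/100)-1 = 8, so only the oldest third of the repo clears
+// the "old" cutoff.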
+func computeBands(ages []int, trends []float64) classifyBands { + if len(ages) < classifyMinSample || len(trends) < classifyMinSample { + return defaultBands() + } + sortedAges := make([]int, len(ages)) + copy(sortedAges, ages) + sort.Ints(sortedAges) + sortedTrends := make([]float64, len(trends)) + copy(sortedTrends, trends) + sort.Float64s(sortedTrends) + + declining := percentileFloat(sortedTrends, 25) + if declining < adaptiveDecliningTrendFloor { + declining = adaptiveDecliningTrendFloor + } + return classifyBands{ + OldAgeDays: percentileInt(sortedAges, 75), + DecliningTrend: declining, + sortedAges: sortedAges, + sortedTrends: sortedTrends, + } +} + +// nearestRankIndex returns the 0-indexed slice position for the p-th +// percentile of an N-element sorted sequence using the textbook +// nearest-rank method: idx = ceil(p * N / 100) - 1. Tie-rules, +// clamping, and edge cases (p=0, p=100, N=1) are handled here so +// percentileInt/percentileFloat stay a single line each. +// +// A previous implementation used (p * (N-1)) / 100, which is actually +// truncated linear-position addressing — it systematically under-picks +// the cutoff for many {p, N} pairs (e.g. P75 of 10 → idx 6 instead of +// 7) and lowered oldAgeThreshold / decliningTrendThreshold by one +// sorted position. Files near the quartile boundary were labelled +// inconsistently with the stated P75/P25 semantics. Using nearest-rank +// aligns the implementation with the documentation. +func nearestRankIndex(p, n int) int { + if n <= 0 { + return 0 + } + num := p * n + idx := num / 100 + if num%100 != 0 { + idx++ + } + idx-- + if idx < 0 { + idx = 0 + } + if idx >= n { + idx = n - 1 + } + return idx +} + +// percentileInt returns the p-th percentile of a sorted int slice using +// the nearest-rank method. Assumes len(sorted) >= 1 (callers guard via +// classifyMinSample). +func percentileInt(sorted []int, p int) int { + if len(sorted) == 0 { + return 0 + } + return sorted[nearestRankIndex(p, len(sorted))] +} + +func percentileFloat(sorted []float64, p int) float64 { + if len(sorted) == 0 { + return 0 + } + return sorted[nearestRankIndex(p, len(sorted))] +} + +// rankInt returns the percentile rank (0-100) of v within a sorted +// slice — i.e. the percentage of entries strictly below v. Ties count +// as "at" rather than "below", so the rank of the minimum is 0 and the +// rank of a value present in the slice reports how many are strictly +// less. +func rankInt(sorted []int, v int) int { + // binary search for first index i where sorted[i] >= v + lo, hi := 0, len(sorted) + for lo < hi { + mid := (lo + hi) / 2 + if sorted[mid] < v { + lo = mid + 1 + } else { + hi = mid + } + } + return lo * 100 / len(sorted) +} + +func rankFloat(sorted []float64, v float64) int { + lo, hi := 0, len(sorted) + for lo < hi { + mid := (lo + hi) / 2 + if sorted[mid] < v { + lo = mid + 1 + } else { + hi = mid + } + } + return lo * 100 / len(sorted) +} + // classifyFile assigns an actionable label based on churn, ownership, age, -// and trend. Thresholds are package constants (classify*). -func classifyFile(recentChurn, lowChurn float64, bf, ageDays int, trend float64) string { +// and trend. Thresholds come from bands, which are either dataset-derived +// percentiles or the fallback absolute constants. 
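+// Rule order matters: cold and active short-circuit first, so the age
+// and trend thresholds only ever see concentrated (bf ≤ 2) files.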
+func classifyFile(recentChurn, lowChurn float64, bf, ageDays int, trend float64, bands classifyBands) string { if recentChurn <= lowChurn { return "cold" } @@ -566,10 +724,10 @@ func classifyFile(recentChurn, lowChurn float64, bf, ageDays int, trend float64) return "active" // shared, healthy } // Concentrated ownership (bf 1-2) with meaningful churn. - if ageDays < classifyOldAgeDays { + if ageDays < bands.OldAgeDays { return "active-core" // new code, single author is expected } - if trend < classifyDecliningTrend { + if trend < bands.DecliningTrend { return "legacy-hotspot" // old + concentrated + declining → urgent } return "silo" // old + concentrated + stable/growing → knowledge bottleneck @@ -590,9 +748,34 @@ func ChurnRisk(ds *Dataset, n int) []ChurnRiskResult { lowChurn = median * classifyColdChurnRatio } + // Pass 1: gather per-file age and trend so bands can be calibrated + // from the whole-dataset distribution before classification. + type perFile struct { + path string + fe *fileEntry + ageDays int + trend float64 + } + items := make([]perFile, 0, len(ds.files)) + var ages []int + var trends []float64 + for path, fe := range ds.files { + ageDays := 0 + if !fe.firstChange.IsZero() && !ds.Latest.IsZero() { + ageDays = int(ds.Latest.Sub(fe.firstChange).Hours() / 24) + } + trend := churnTrend(fe.monthChurn, ds.Earliest, ds.Latest) + items = append(items, perFile{path: path, fe: fe, ageDays: ageDays, trend: trend}) + ages = append(ages, ageDays) + trends = append(trends, trend) + } + + bands := computeBands(ages, trends) + var results []ChurnRiskResult - for path, fe := range ds.files { + for _, it := range items { + path, fe := it.path, it.fe // Compute bus factor (80% threshold), same as BusFactor stat. type dl struct{ lines int64 } devs := make([]dl, 0, len(fe.devLines)) @@ -627,13 +810,15 @@ func ChurnRisk(ds *Dataset, n int) []ChurnRiskResult { firstDate = fe.firstChange.UTC().Format("2006-01-02") } - ageDays := 0 - if !fe.firstChange.IsZero() && !ds.Latest.IsZero() { - ageDays = int(ds.Latest.Sub(fe.firstChange).Hours() / 24) - } + label := classifyFile(fe.recentChurn, lowChurn, bf, it.ageDays, it.trend, bands) - trend := churnTrend(fe.monthChurn, ds.Earliest, ds.Latest) - label := classifyFile(fe.recentChurn, lowChurn, bf, ageDays, trend) + var agePct, trendPct *int + if bands.HasPercentiles() { + a := rankInt(bands.sortedAges, it.ageDays) + tr := rankFloat(bands.sortedTrends, it.trend) + agePct = &a + trendPct = &tr + } results = append(results, ChurnRiskResult{ Path: path, @@ -643,9 +828,11 @@ func ChurnRisk(ds *Dataset, n int) []ChurnRiskResult { TotalChanges: fe.commits, LastChangeDate: lastDate, FirstChangeDate: firstDate, - AgeDays: ageDays, - Trend: math.Round(trend*100) / 100, + AgeDays: it.ageDays, + Trend: math.Round(it.trend*100) / 100, Label: label, + AgePercentile: agePct, + TrendPercentile: trendPct, }) } diff --git a/internal/stats/stats_test.go b/internal/stats/stats_test.go index a68dee3..0716fa2 100644 --- a/internal/stats/stats_test.go +++ b/internal/stats/stats_test.go @@ -639,8 +639,348 @@ func TestClassifyFile(t *testing.T) { {"legacy-hotspot: old + concentrated + declining", 200, 50, 1, 365, 0.3, "legacy-hotspot"}, {"cold wins over everything when churn low", 10, 50, 1, 365, 0.1, "cold"}, } + // Use defaultBands so the old absolute constants (180d age, 0.5 trend) + // still drive classification — this test covers the fallback path. 
+ bands := defaultBands() for _, c := range cases { - got := classifyFile(c.recentChurn, c.lowChurn, c.bf, c.ageDays, c.trend) + got := classifyFile(c.recentChurn, c.lowChurn, c.bf, c.ageDays, c.trend, bands) + if got != c.want { + t.Errorf("%s: got %q, want %q", c.name, got, c.want) + } + } +} + +func TestComputeBandsFallsBackBelowMinSample(t *testing.T) { + // Fewer than classifyMinSample files → percentiles are unreliable, + // so bands must fall back to the absolute constants and report + // HasPercentiles() == false. + ages := []int{10, 200, 400} + trends := []float64{0.2, 1.0, 2.0} + b := computeBands(ages, trends) + if b.OldAgeDays != classifyOldAgeDays { + t.Errorf("fallback OldAgeDays = %d, want %d", b.OldAgeDays, classifyOldAgeDays) + } + if b.DecliningTrend != classifyDecliningTrend { + t.Errorf("fallback DecliningTrend = %.2f, want %.2f", b.DecliningTrend, classifyDecliningTrend) + } + if b.HasPercentiles() { + t.Errorf("HasPercentiles should be false when sample < classifyMinSample") + } +} + +func TestComputeBandsAdaptive(t *testing.T) { + // With 10 files spanning a wide range, P75 of ages and P25 of trends + // should replace the absolute constants. Nearest-rank indexing: + // idx(p, n) = ceil(p*n/100) - 1 + ages := []int{10, 20, 30, 40, 50, 60, 70, 80, 90, 1000} + // P75 over 10 items: ceil(750/100)-1 = 7 → sorted[7] = 80 + trends := []float64{0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.1, 1.2, 1.5, 2.0} + // P25 over 10 items: ceil(250/100)-1 = 2 → sorted[2] = 0.5 + b := computeBands(ages, trends) + if !b.HasPercentiles() { + t.Fatal("HasPercentiles should be true for 10-item sample") + } + if b.OldAgeDays != 80 { + t.Errorf("adaptive OldAgeDays = %d, want 80 (P75 of 10 values, nearest-rank)", b.OldAgeDays) + } + if b.DecliningTrend != 0.5 { + t.Errorf("adaptive DecliningTrend = %.2f, want 0.5 (P25 nearest-rank)", b.DecliningTrend) + } +} + +func TestNearestRankIndex(t *testing.T) { + cases := []struct { + p, n, want int + }{ + // Exact percentile positions (no ceiling kicks in). + {25, 4, 0}, // ceil(1.0)-1 = 0 + {50, 4, 1}, // ceil(2.0)-1 = 1 + {75, 4, 2}, // ceil(3.0)-1 = 2 + {100, 4, 3}, // ceil(4.0)-1 = 3 + + // Ceiling-driven cases — the classic pitfall the previous + // implementation missed. + {75, 10, 7}, // ceil(7.5)-1 = 7 (old impl returned 6) + {25, 10, 2}, // ceil(2.5)-1 = 2 + {25, 7, 1}, // ceil(1.75)-1 = 1 + {75, 11, 8}, // ceil(8.25)-1 = 8 + + // Edges: p=0 clamps to 0, p=100 to n-1, n=1 collapses both. + {0, 10, 0}, + {100, 10, 9}, + {50, 1, 0}, + {99, 1, 0}, + } + for _, c := range cases { + if got := nearestRankIndex(c.p, c.n); got != c.want { + t.Errorf("nearestRankIndex(p=%d, n=%d) = %d, want %d", c.p, c.n, got, c.want) + } + } +} + +func TestRankInt(t *testing.T) { + sorted := []int{10, 20, 30, 40, 50, 60, 70, 80, 90, 100} + cases := []struct { + v int + want int // percentile rank (% of values strictly below v) + }{ + {5, 0}, // below min → 0/10 = 0% + {10, 0}, // equal to min → 0 below + {50, 40}, // 4 below → 40% + {55, 50}, // 5 below → 50% + {100, 90}, // 9 below → 90% + {999, 100}, + } + for _, c := range cases { + if got := rankInt(sorted, c.v); got != c.want { + t.Errorf("rankInt(%d) = %d, want %d", c.v, got, c.want) + } + } +} + +func TestChurnRiskPercentilesPopulatedOnLargeDataset(t *testing.T) { + // 10 files with varied ages and monthly churn histories so the + // adaptive path fires and percentiles are populated in results. 
+ ds := &Dataset{ + Earliest: time.Date(2023, 1, 1, 0, 0, 0, 0, time.UTC), + Latest: time.Date(2024, 6, 1, 0, 0, 0, 0, time.UTC), + files: map[string]*fileEntry{}, + } + for i := 0; i < 10; i++ { + path := fmt.Sprintf("f%02d.go", i) + firstChange := ds.Latest.AddDate(0, 0, -(30 + i*30)) + ds.files[path] = &fileEntry{ + commits: 5, + additions: int64(10 * (i + 1)), + deletions: int64(5 * (i + 1)), + recentChurn: float64(10 * (i + 1)), + firstChange: firstChange, + lastChange: ds.Latest, + devLines: map[string]int64{"a@x": int64(15 * (i + 1))}, + monthChurn: map[string]int64{"2024-05": int64(10 * (i + 1))}, + } + } + results := ChurnRisk(ds, 0) + if len(results) != 10 { + t.Fatalf("got %d results, want 10", len(results)) + } + for _, r := range results { + if r.AgePercentile == nil || r.TrendPercentile == nil { + t.Errorf("%s: percentiles should be populated on large dataset", r.Path) + continue + } + if *r.AgePercentile < 0 || *r.AgePercentile > 100 { + t.Errorf("%s: AgePercentile out of range: %d", r.Path, *r.AgePercentile) + } + if *r.TrendPercentile < 0 || *r.TrendPercentile > 100 { + t.Errorf("%s: TrendPercentile out of range: %d", r.Path, *r.TrendPercentile) + } + } +} + +func TestChurnRiskAdaptiveDormantP25ZeroFlooring(t *testing.T) { + // Mature-repo shape: half the files are dormant (trend=0 via the + // earlier-only path), half are active. Without the declining-trend + // floor, P25 collapses to 0 and `trend < 0` never fires — dormant + // concentrated files would silently be misclassified as silo + // instead of legacy-hotspot, hiding the strongest alarm. + // + // The floor guarantees the threshold is strictly positive so the + // signal survives the adaptive-mode switch. + latest := time.Date(2024, 6, 1, 0, 0, 0, 0, time.UTC) + ds := &Dataset{ + // Wide span so "earlier" window exists for the active files. + Earliest: latest.AddDate(-3, 0, 0), + Latest: latest, + files: map[string]*fileEntry{}, + } + + // 6 dormant files — single activity 2 years ago, nothing since. + // churnTrend returns 0 for these (earlier-only bucket). + for i := 0; i < 6; i++ { + path := fmt.Sprintf("dormant/old_%02d.go", i) + ds.files[path] = &fileEntry{ + commits: 3, + additions: 200, + deletions: 100, + recentChurn: 200, // above cold threshold + firstChange: latest.AddDate(-2, 0, 0), + lastChange: latest.AddDate(-2, 0, 0), + devLines: map[string]int64{"solo@x": 300}, // bf=1 + monthChurn: map[string]int64{"2022-06": 300}, + } + } + // 6 active files — current activity, fresh history. + for i := 0; i < 6; i++ { + path := fmt.Sprintf("active/new_%02d.go", i) + ds.files[path] = &fileEntry{ + commits: 3, + additions: 200, + deletions: 100, + recentChurn: 200, + firstChange: latest.AddDate(-2, 0, 0), + lastChange: latest, + devLines: map[string]int64{"a@x": 100, "b@x": 100, "c@x": 100}, // bf=3 + monthChurn: map[string]int64{"2024-05": 300}, + } + } + + results := ChurnRisk(ds, 0) + var legacyCount int + var dormantLabels []string + for _, r := range results { + if r.Label == "legacy-hotspot" { + legacyCount++ + } + if strings.HasPrefix(r.Path, "dormant/") { + dormantLabels = append(dormantLabels, r.Label) + } + } + if legacyCount == 0 { + t.Errorf("expected dormant+concentrated files to be flagged legacy-hotspot; got 0 "+ + "(dormant labels: %v). P25 likely collapsed to 0 without the floor.", dormantLabels) + } + // Sanity: every dormant file (bf=1, old, trend=0) should be legacy-hotspot. 
+	for _, lbl := range dormantLabels {
+		if lbl != "legacy-hotspot" {
+			t.Errorf("dormant file got label %q, want legacy-hotspot", lbl)
+		}
+	}
+}
+
+func TestClassifyFileFloorBoundary(t *testing.T) {
+	// Strict-less-than check with the adaptive floor means files whose
+	// trend is exactly at the floor are NOT declining; files strictly
+	// below it are. This pins the comparison semantics so a future
+	// refactor doesn't silently flip < to <=.
+	bands := classifyBands{
+		OldAgeDays:     100,
+		DecliningTrend: adaptiveDecliningTrendFloor, // 0.01
+	}
+	// All cases share old age + concentrated ownership + non-cold churn
+	// so the trend predicate drives the label.
+	cases := []struct {
+		name  string
+		trend float64
+		want  string
+	}{
+		{"trend 0 (dormant, earlier-only) → declining", 0.0, "legacy-hotspot"},
+		{"trend 0.001 below floor → declining", 0.001, "legacy-hotspot"},
+		{"trend exactly at floor 0.01 → NOT declining", 0.01, "silo"},
+		{"trend 0.011 above floor → NOT declining", 0.011, "silo"},
+		{"trend 1.0 flat → NOT declining", 1.0, "silo"},
+	}
+	for _, c := range cases {
+		got := classifyFile(100, 50, 1, 200, c.trend, bands)
+		if got != c.want {
+			t.Errorf("%s: got %q, want %q", c.name, got, c.want)
+		}
+	}
+}
+
+func TestChurnRiskAdaptiveDegenerateTrendDistribution(t *testing.T) {
+	// Degenerate trend case: every file's history fits entirely inside the
+	// trend window (earlier bucket is empty), so churnTrend returns the
+	// sentinel 1.0 for all of them. The adaptive P25 then collapses onto
+	// 1.0 and the "declining" check (`trend < 1.0`) matches nobody — no
+	// file can reach legacy-hotspot via the trend predicate. Old +
+	// concentrated files fall through to silo.
+	//
+	// This test pins that behavior so future refactors don't silently
+	// flip it. The degenerate shape is a known limitation documented in
+	// METRICS.md; the code is not a bug, but the outcome is surprising
+	// enough that a regression test is worth the 30 lines.
+	latest := time.Date(2024, 6, 1, 0, 0, 0, 0, time.UTC)
+	ds := &Dataset{
+		Earliest: latest.AddDate(0, -2, 0), // 2-month span — fits entirely in the 3-month trend window
+		Latest:   latest,
+		files:    map[string]*fileEntry{},
+	}
+	for i := 0; i < 12; i++ {
+		path := fmt.Sprintf("old/file_%02d.go", i)
+		ds.files[path] = &fileEntry{
+			commits:     3,
+			additions:   200,
+			deletions:   100,
+			recentChurn: 500, // well above cold threshold for all
+			// Staggered ages (roughly 730-1060d): every file would trip the
+			// absolute classifyOldAgeDays fallback, and the oldest third sits
+			// above this dataset's adaptive P75, so the old + concentrated
+			// branch is reached on either path.
+			firstChange: latest.AddDate(-2, 0, -(i * 30)),
+			lastChange:  latest,
+			devLines:    map[string]int64{"solo@x": 300}, // bf=1 for everyone
+			// Entire history fits inside the trend window → churnTrend
+			// returns 1.0 for every file (its flat-signal sentinel).
+			monthChurn: map[string]int64{"2024-05": 300},
+		}
+	}
+	results := ChurnRisk(ds, 0)
+	if len(results) != 12 {
+		t.Fatalf("got %d results, want 12", len(results))
+	}
+	legacyCount, siloCount := 0, 0
+	for _, r := range results {
+		switch r.Label {
+		case "legacy-hotspot":
+			legacyCount++
+		case "silo":
+			siloCount++
+		}
+		// Trend of 1.0 means P25 of a constant-1 distribution is also 1,
+		// so `trend < 1.0` never fires and no file is declining.
+ if r.Label == "legacy-hotspot" { + t.Errorf("%s: unexpected legacy-hotspot — trend distribution is degenerate (all 1.0), "+ + "no file should be classified as declining", r.Path) + } + } + if siloCount == 0 { + t.Errorf("expected at least one silo (old + concentrated + not declining), got legacy=%d silo=%d", + legacyCount, siloCount) + } +} + +func TestChurnRiskPercentilesNotSetOnSmallDataset(t *testing.T) { + // 3 files → below classifyMinSample. Percentile pointers should be nil + // so JSON consumers see omitted fields, not a `-1` sentinel. + ds := &Dataset{ + Earliest: time.Date(2023, 1, 1, 0, 0, 0, 0, time.UTC), + Latest: time.Date(2024, 6, 1, 0, 0, 0, 0, time.UTC), + files: map[string]*fileEntry{}, + } + for i, p := range []string{"a.go", "b.go", "c.go"} { + ds.files[p] = &fileEntry{ + commits: 1, recentChurn: float64(10 * (i + 1)), + firstChange: ds.Latest.AddDate(0, 0, -100), + lastChange: ds.Latest, + devLines: map[string]int64{"a@x": 10}, + monthChurn: map[string]int64{"2024-05": 10}, + } + } + results := ChurnRisk(ds, 0) + for _, r := range results { + if r.AgePercentile != nil || r.TrendPercentile != nil { + t.Errorf("%s: percentiles should be nil for small dataset, got age=%v trend=%v", + r.Path, r.AgePercentile, r.TrendPercentile) + } + } +} + +func TestLabelWithPercentile(t *testing.T) { + p := func(v int) *int { return &v } + cases := []struct { + name, label string + age, trend *int + want string + }{ + {"both nil → bare label", "legacy-hotspot", nil, nil, "legacy-hotspot"}, + {"age nil → bare label", "silo", nil, p(10), "silo"}, + {"trend nil → bare label", "active", p(75), nil, "active"}, + {"both set → suffix", "legacy-hotspot", p(92), p(8), "legacy-hotspot (age P92, trend P8)"}, + {"zero values render", "active-core", p(0), p(0), "active-core (age P0, trend P0)"}, + {"three-digit value renders unpadded", "active", p(100), p(100), "active (age P100, trend P100)"}, + } + for _, c := range cases { + got := LabelWithPercentile(c.label, c.age, c.trend) if got != c.want { t.Errorf("%s: got %q, want %q", c.name, got, c.want) } diff --git a/internal/stats/suspect.go b/internal/stats/suspect.go new file mode 100644 index 0000000..33a4b5f --- /dev/null +++ b/internal/stats/suspect.go @@ -0,0 +1,227 @@ +package stats + +import ( + "sort" + "strings" +) + +// suspectWarningMinChurnRatio gates the suspect-path warning: unless the +// matched paths together account for at least this fraction of total repo +// churn, no warning is emitted. Without this guard, a single vendored +// file in a large repo would trigger noise every run. +const suspectWarningMinChurnRatio = 0.10 + +// SuspectPattern is a path heuristic for likely vendor/generated content +// that would distort hotspots, churn-risk, and bus factor. Glob is the +// bucket label (short, always the same); Suggest returns the actual +// --ignore glob that matches the path supplied, which may differ for +// nested occurrences (e.g. vendor appearing under pkg/, subproject/ ...) +// because extract.shouldIgnore treats directory patterns as repo-root +// prefixes. +type SuspectPattern struct { + Glob string + Reason string + Match func(path string) bool + Suggest func(path string) string +} + +// SuspectBucket is one pattern's match set aggregated across the dataset. +// Suggestions lists the unique extract --ignore globs needed to cover +// every matched path — typically one entry for suffix/basename patterns, +// possibly several for directory patterns that matched at multiple +// depths (e.g. 
"vendor/*" and "pkg/vendor/*" for the same bucket). +type SuspectBucket struct { + Pattern SuspectPattern + Paths []string + Churn int64 + Suggestions []string +} + +// defaultSuspectPatterns covers high-confidence vendor/generated paths. +// Kept conservative so the warning doesn't cry wolf; statistical signals +// (churn/commit outliers, single-author bulk updates) are deliberately +// out of scope until we have evidence they help rather than create +// false positives. +var defaultSuspectPatterns = []SuspectPattern{ + {Glob: "vendor/*", Reason: "vendored third-party code", Match: hasPathSegment("vendor"), Suggest: suggestDirGlob("vendor")}, + {Glob: "node_modules/*", Reason: "npm dependencies", Match: hasPathSegment("node_modules"), Suggest: suggestDirGlob("node_modules")}, + {Glob: "dist/*", Reason: "build artifacts", Match: hasPathSegment("dist"), Suggest: suggestDirGlob("dist")}, + {Glob: "build/*", Reason: "build output", Match: hasPathSegment("build"), Suggest: suggestDirGlob("build")}, + {Glob: "third_party/*", Reason: "third-party code", Match: hasPathSegment("third_party"), Suggest: suggestDirGlob("third_party")}, + {Glob: "*.min.js", Reason: "minified JS", Match: hasSuffixOf(".min.js"), Suggest: constantSuggest("*.min.js")}, + {Glob: "*.min.css", Reason: "minified CSS", Match: hasSuffixOf(".min.css"), Suggest: constantSuggest("*.min.css")}, + {Glob: "*.lock", Reason: "generic lock file", Match: hasSuffixOf(".lock"), Suggest: constantSuggest("*.lock")}, + {Glob: "package-lock.json", Reason: "npm lockfile", Match: basenameEquals("package-lock.json"), Suggest: constantSuggest("package-lock.json")}, + {Glob: "yarn.lock", Reason: "yarn lockfile", Match: basenameEquals("yarn.lock"), Suggest: constantSuggest("yarn.lock")}, + {Glob: "pnpm-lock.yaml", Reason: "pnpm lockfile", Match: basenameEquals("pnpm-lock.yaml"), Suggest: constantSuggest("pnpm-lock.yaml")}, + {Glob: "go.sum", Reason: "go module hashes", Match: basenameEquals("go.sum"), Suggest: constantSuggest("go.sum")}, + {Glob: "Cargo.lock", Reason: "cargo lockfile", Match: basenameEquals("Cargo.lock"), Suggest: constantSuggest("Cargo.lock")}, + {Glob: "poetry.lock", Reason: "poetry lockfile", Match: basenameEquals("poetry.lock"), Suggest: constantSuggest("poetry.lock")}, + {Glob: "*.pb.go", Reason: "protobuf generated (Go)", Match: hasSuffixOf(".pb.go"), Suggest: constantSuggest("*.pb.go")}, + {Glob: "*_pb2.py", Reason: "protobuf generated (Python)", Match: hasSuffixOf("_pb2.py"), Suggest: constantSuggest("*_pb2.py")}, + {Glob: "*.generated.*", Reason: "generated code", Match: basenameContains(".generated."), Suggest: constantSuggest("*.generated.*")}, +} + +func hasPathSegment(seg string) func(string) bool { + return func(p string) bool { + return strings.HasPrefix(p, seg+"/") || strings.Contains(p, "/"+seg+"/") + } +} + +// suggestDirGlob returns, for the path that matched seg somewhere in its +// directory chain, the specific "...parent/.../seg/*" glob that extract +// --ignore will actually act on. extract.shouldIgnore reads "dist/*" as +// a repo-root prefix, so emitting just that for a match on +// "pkg/dist/foo.js" would tell the user to run a fix that doesn't +// remove the offending paths and the warning would keep firing. +func suggestDirGlob(seg string) func(string) string { + return func(p string) string { + if strings.HasPrefix(p, seg+"/") { + return seg + "/*" + } + marker := "/" + seg + "/" + if i := strings.Index(p, marker); i >= 0 { + // p[:i] is everything before "/seg/", add "/seg/*" back. 
+ return p[:i] + "/" + seg + "/*" + } + return seg + "/*" + } +} + +func constantSuggest(glob string) func(string) string { + // Suffix and basename patterns work at any depth via extract's + // basename match, so a single glob suffices regardless of where the + // file lives. + return func(string) string { return glob } +} + +func hasSuffixOf(suf string) func(string) bool { + return func(p string) bool { return strings.HasSuffix(p, suf) } +} + +func basenameEquals(name string) func(string) bool { + return func(p string) bool { + if i := strings.LastIndex(p, "/"); i >= 0 { + return p[i+1:] == name + } + return p == name + } +} + +// basenameContains matches when seg appears in the final path segment +// (the filename), not in any intermediate directory. Aligns with how +// extract.ShouldIgnore applies `*.generated.*` style globs — if the +// detector matches on a directory but the Suggest glob only targets +// basenames, the user's copy-pasted --ignore would leave the file in. +func basenameContains(seg string) func(string) bool { + return func(p string) bool { + base := p + if i := strings.LastIndex(p, "/"); i >= 0 { + base = p[i+1:] + } + return strings.Contains(base, seg) + } +} + +// DetectSuspectFiles scans the dataset for paths matching known +// vendor/generated heuristics. Returns buckets sorted by churn desc. +// Second return value is true when the aggregate suspect churn crosses +// suspectWarningMinChurnRatio of total repo churn — i.e. the warning is +// worth showing. Callers that want to surface findings regardless (e.g. +// a --explain flag) can ignore the boolean. +func DetectSuspectFiles(ds *Dataset) ([]SuspectBucket, bool) { + if ds == nil || len(ds.files) == 0 { + return nil, false + } + + buckets := make(map[string]*SuspectBucket) + // Per-bucket suggestion sets so identical globs aren't listed twice. + suggSets := make(map[string]map[string]struct{}) + var totalChurn, suspectChurn int64 + + for path, fe := range ds.files { + fileChurn := fe.additions + fe.deletions + totalChurn += fileChurn + // A file might match multiple patterns (e.g. vendor/*.pb.go). + // Attribute it to the first match so totals don't double-count; + // the first-match wins keeps output predictable. + for _, pat := range defaultSuspectPatterns { + if !pat.Match(path) { + continue + } + b, ok := buckets[pat.Glob] + if !ok { + b = &SuspectBucket{Pattern: pat} + buckets[pat.Glob] = b + suggSets[pat.Glob] = map[string]struct{}{} + } + b.Paths = append(b.Paths, path) + b.Churn += fileChurn + suspectChurn += fileChurn + if pat.Suggest != nil { + suggSets[pat.Glob][pat.Suggest(path)] = struct{}{} + } + break + } + } + + if totalChurn == 0 || len(buckets) == 0 { + return nil, false + } + + out := make([]SuspectBucket, 0, len(buckets)) + for _, b := range buckets { + sort.Strings(b.Paths) // deterministic inner order + if set := suggSets[b.Pattern.Glob]; set != nil { + b.Suggestions = make([]string, 0, len(set)) + for s := range set { + b.Suggestions = append(b.Suggestions, s) + } + sort.Strings(b.Suggestions) + } + out = append(out, *b) + } + sort.Slice(out, func(i, j int) bool { + if out[i].Churn != out[j].Churn { + return out[i].Churn > out[j].Churn + } + return out[i].Pattern.Glob < out[j].Pattern.Glob + }) + + worth := float64(suspectChurn)/float64(totalChurn) >= suspectWarningMinChurnRatio + return out, worth +} + +// ShellQuoteSingle wraps s in POSIX single quotes, escaping any +// embedded single quotes via the `'\''` sequence. 
Git paths can +// legally contain single quotes (and every other shell metacharacter), +// so suggestion globs derived from real repo paths must not be +// interpolated into copy-paste commands with naive `'%s'` formatting — +// one stray `'` in a subdirectory name would split the command into +// multiple words and break everything after it. +func ShellQuoteSingle(s string) string { + return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'" +} + +// CollectAllSuggestions returns deduplicated --ignore globs across +// every bucket, preserving bucket order followed by in-bucket order. +// Callers surface a subset of buckets in their UI (e.g. top-N display) +// but must emit remediation suggestions over the full set — the +// warning's noise-floor check is computed against every bucket, so +// trimming suggestions to just the displayed subset would leave +// unshown suspects untouched and cause the warning to persist after +// the suggested fix. +func CollectAllSuggestions(buckets []SuspectBucket) []string { + seen := map[string]struct{}{} + var out []string + for _, b := range buckets { + for _, s := range b.Suggestions { + if _, ok := seen[s]; ok { + continue + } + seen[s] = struct{}{} + out = append(out, s) + } + } + return out +} diff --git a/internal/stats/suspect_test.go b/internal/stats/suspect_test.go new file mode 100644 index 0000000..a83ea2c --- /dev/null +++ b/internal/stats/suspect_test.go @@ -0,0 +1,424 @@ +package stats + +import ( + "testing" + + "github.com/lex0c/gitcortex/internal/extract" +) + +func TestDetectSuspectFilesPaths(t *testing.T) { + ds := &Dataset{ + files: map[string]*fileEntry{ + "src/main.go": {additions: 100, deletions: 50}, + "vendor/github.com/x/y/util.go": {additions: 500, deletions: 200}, + "node_modules/react/index.js": {additions: 300, deletions: 100}, + "package-lock.json": {additions: 5000, deletions: 4000}, + "static/app.min.js": {additions: 200, deletions: 100}, + "proto/foo.pb.go": {additions: 800, deletions: 700}, + "subproject/package-lock.json": {additions: 1000, deletions: 800}, + "clean/file.txt": {additions: 20, deletions: 5}, + }, + } + buckets, worth := DetectSuspectFiles(ds) + if !worth { + t.Fatal("expected worth=true with heavy vendor + lock churn") + } + + wantGlobs := map[string]bool{ + "vendor/*": true, + "node_modules/*": true, + "package-lock.json": true, + "*.min.js": true, + "*.pb.go": true, + } + gotGlobs := map[string]bool{} + for _, b := range buckets { + gotGlobs[b.Pattern.Glob] = true + } + for g := range wantGlobs { + if !gotGlobs[g] { + t.Errorf("expected bucket for %q, missing. Got: %v", g, gotGlobs) + } + } + // package-lock.json should match both root and subproject/... via the + // basenameEquals matcher + for _, b := range buckets { + if b.Pattern.Glob == "package-lock.json" && len(b.Paths) != 2 { + t.Errorf("package-lock.json matched %d files, want 2 (both roots)", len(b.Paths)) + } + } +} + +func TestDetectSuspectFilesBelowNoiseFloor(t *testing.T) { + // One small lock file in an otherwise clean repo — below 10% churn + // ratio, so worth=false. 
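+	// Suspect churn here is 80 of 3480 total (≈2.3%), well under the 10% floor.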
+ ds := &Dataset{ + files: map[string]*fileEntry{ + "src/a.go": {additions: 1000, deletions: 500}, + "src/b.go": {additions: 800, deletions: 300}, + "src/c.go": {additions: 600, deletions: 200}, + "package-lock.json": {additions: 50, deletions: 30}, + }, + } + _, worth := DetectSuspectFiles(ds) + if worth { + t.Errorf("tiny lock file should not trigger warning; got worth=true") + } +} + +func TestDetectSuspectFilesEmpty(t *testing.T) { + _, worth := DetectSuspectFiles(&Dataset{files: map[string]*fileEntry{}}) + if worth { + t.Error("empty dataset should not trigger warning") + } + _, worth = DetectSuspectFiles(nil) + if worth { + t.Error("nil dataset should not trigger warning") + } +} + +func TestDetectSuspectFilesNestedDirSuggestions(t *testing.T) { + // Directory-segment matcher catches nested occurrences (pkg/vendor/..., + // subproject/node_modules/...), but the Glob label is only a short + // bucket header. The Suggestions list must include the *specific* + // parent paths so extract --ignore actually removes the matched + // files — extract treats "vendor/*" as a repo-root prefix and will + // not match "pkg/vendor/foo.go". + ds := &Dataset{ + files: map[string]*fileEntry{ + "vendor/a.go": {additions: 100, deletions: 50}, + "pkg/vendor/b.go": {additions: 100, deletions: 50}, + "pkg/vendor/c.go": {additions: 100, deletions: 50}, + "services/auth/vendor/d.go": {additions: 100, deletions: 50}, + "node_modules/e.js": {additions: 50, deletions: 25}, + "sub/node_modules/f.js": {additions: 50, deletions: 25}, + }, + } + buckets, _ := DetectSuspectFiles(ds) + byGlob := map[string]SuspectBucket{} + for _, b := range buckets { + byGlob[b.Pattern.Glob] = b + } + + vb, ok := byGlob["vendor/*"] + if !ok { + t.Fatal("vendor/* bucket missing") + } + wantVendor := map[string]bool{"vendor/*": true, "pkg/vendor/*": true, "services/auth/vendor/*": true} + if len(vb.Suggestions) != len(wantVendor) { + t.Errorf("vendor Suggestions = %v, want keys %v", vb.Suggestions, keys(wantVendor)) + } + for _, s := range vb.Suggestions { + if !wantVendor[s] { + t.Errorf("unexpected vendor suggestion %q", s) + } + } + + nm, ok := byGlob["node_modules/*"] + if !ok { + t.Fatal("node_modules/* bucket missing") + } + wantNM := map[string]bool{"node_modules/*": true, "sub/node_modules/*": true} + for _, s := range nm.Suggestions { + if !wantNM[s] { + t.Errorf("unexpected node_modules suggestion %q", s) + } + } + if len(nm.Suggestions) != len(wantNM) { + t.Errorf("node_modules Suggestions = %v, want %v", nm.Suggestions, keys(wantNM)) + } +} + +func TestDetectSuspectFilesGeneratedMatchesBasenameOnly(t *testing.T) { + // *.generated.* in the detector used to match on the full path via + // strings.Contains, catching paths where `.generated.` appeared in + // a directory name (src/foo.generated.v1/bar.go). The emitted + // suggestion is the literal `*.generated.*` glob, and extract's + // basename match would not ignore bar.go, so the warning would + // keep firing on every subsequent run. + // + // The matcher is now a basename check so directory-name occurrences + // don't get flagged in the first place. This test pins both sides: + // the flagged path IS matched by the suggestion, and the dir-only + // path is NOT flagged. 
+ ds := &Dataset{
+ files: map[string]*fileEntry{
+ "src/main.generated.go": {additions: 500, deletions: 500}, // real generated file → should flag
+ "src/foo.generated.v1/bar.go": {additions: 500, deletions: 500}, // `.generated.` is in dir name → should NOT flag
+ "pkg/util/types.generated.proto": {additions: 500, deletions: 500}, // basename has it → should flag
+ },
+ }
+ buckets, _ := DetectSuspectFiles(ds)
+ var gen *SuspectBucket
+ for i := range buckets {
+ if buckets[i].Pattern.Glob == "*.generated.*" {
+ gen = &buckets[i]
+ break
+ }
+ }
+ if gen == nil {
+ t.Fatal("*.generated.* bucket missing — expected at least the two basename matches")
+ }
+ // Must contain the two basename matches, must NOT contain the dir-only one.
+ wantIn := map[string]bool{
+ "src/main.generated.go": true,
+ "pkg/util/types.generated.proto": true,
+ }
+ wantOut := "src/foo.generated.v1/bar.go"
+ if len(gen.Paths) != len(wantIn) {
+ t.Errorf("*.generated.* bucket has %d paths, want %d: %v", len(gen.Paths), len(wantIn), gen.Paths)
+ }
+ for _, p := range gen.Paths {
+ if !wantIn[p] && p != wantOut {
+ t.Errorf("unexpected path in *.generated.* bucket: %q", p)
+ }
+ if p == wantOut {
+ t.Errorf("path %q has `.generated.` only in its directory — should not be flagged because "+
+ "`--ignore '*.generated.*'` would not cover it (shouldIgnore matches basename)", p)
+ }
+ }
+}
+
+func TestDetectSuspectFilesSuffixSuggestionsUnchanged(t *testing.T) {
+ // Suffix/basename matchers already work at any depth via extract's
+ // basename match path, so their Suggestions collapse to a single
+ // canonical glob regardless of how deeply nested the matches are.
+ ds := &Dataset{
+ files: map[string]*fileEntry{
+ "app.min.js": {additions: 500, deletions: 500},
+ "static/jquery.min.js": {additions: 500, deletions: 500},
+ "sub/dist/foo/bar.min.js": {additions: 500, deletions: 500}, // also matches dist/*, first match wins
+ "pkg/sub/package-lock.json": {additions: 500, deletions: 500},
+ "package-lock.json": {additions: 500, deletions: 500},
+ },
+ }
+ buckets, _ := DetectSuspectFiles(ds)
+ for _, b := range buckets {
+ switch b.Pattern.Glob {
+ case "*.min.js":
+ if len(b.Suggestions) != 1 || b.Suggestions[0] != "*.min.js" {
+ t.Errorf("*.min.js Suggestions = %v, want [*.min.js]", b.Suggestions)
+ }
+ case "package-lock.json":
+ if len(b.Suggestions) != 1 || b.Suggestions[0] != "package-lock.json" {
+ t.Errorf("package-lock.json Suggestions = %v, want [package-lock.json]", b.Suggestions)
+ }
+ }
+ }
+}
+
+func keys(m map[string]bool) []string {
+ out := make([]string, 0, len(m))
+ for k := range m {
+ out = append(out, k)
+ }
+ return out
+}
+
+// TestShellQuoteSingle pins the exact POSIX form ShellQuoteSingle emits
+// for representative globs, including the `'\''` escape for embedded
+// single quotes.
+func TestShellQuoteSingle(t *testing.T) {
+ cases := []struct {
+ in, want string
+ }{
+ {"vendor/*", `'vendor/*'`},
+ {"*.min.js", `'*.min.js'`},
+ {"package-lock.json", `'package-lock.json'`},
+ {"", `''`},
+ // The whole reason this helper exists: paths carrying a single
+ // quote would break naive '%s' interpolation. The expected POSIX
+ // quoting closes the quote, appends an escaped `'`, then
+ // reopens the quote.
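+ // For "foo's vendor/*" below, the emitted form is three
+ // concatenated shell tokens — 'foo' + \' + 's vendor/*' —
+ // which the shell rejoins into the original single word.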
+ {"foo's vendor/*", `'foo'\''s vendor/*'`}, + {"a'b'c", `'a'\''b'\''c'`}, + {"'", `''\'''`}, + // Other shell metacharacters stay safe inside single quotes. + {"$VAR`cmd`/*", `'$VAR` + "`cmd`" + `/*'`}, + {`has "double" quotes/*`, `'has "double" quotes/*'`}, + } + for _, c := range cases { + got := ShellQuoteSingle(c.in) + if got != c.want { + t.Errorf("ShellQuoteSingle(%q) = %q, want %q", c.in, got, c.want) + } + } +} + +// TestShellQuoteSingleRoundTrip sanity-checks that the quoted form +// actually parses back to the original string under POSIX shell +// single-quote rules. Implemented by a hand-rolled mini-parser rather +// than shelling out, so the test is deterministic and hermetic. +func TestShellQuoteSingleRoundTrip(t *testing.T) { + inputs := []string{ + "vendor/*", + "foo's dir/*.go", + "it's a/b's c", + "'", + "''", + "a'b", + "plain/path", + } + for _, in := range inputs { + quoted := ShellQuoteSingle(in) + got := posixSingleQuoteUnquote(t, quoted) + if got != in { + t.Errorf("round-trip of %q via %q → %q", in, quoted, got) + } + } +} + +// posixSingleQuoteUnquote parses a string composed only of single-quoted +// segments and `\'` escapes (the exact shape ShellQuoteSingle emits). +// Anything else is a test failure. +func posixSingleQuoteUnquote(t *testing.T, s string) string { + t.Helper() + var out []byte + i := 0 + for i < len(s) { + if s[i] != '\'' { + t.Fatalf("expected opening ' at index %d in %q", i, s) + } + i++ // consume opening ' + for i < len(s) && s[i] != '\'' { + out = append(out, s[i]) + i++ + } + if i >= len(s) { + t.Fatalf("unterminated single quote in %q", s) + } + i++ // consume closing ' + // Optional `\'` escape bridging two single-quoted segments + if i+1 < len(s) && s[i] == '\\' && s[i+1] == '\'' { + out = append(out, '\'') + i += 2 + } + } + return string(out) +} + +func TestCollectAllSuggestionsCoversUnshownBuckets(t *testing.T) { + // Construct a dataset that triggers every directory-segment and + // common lockfile/suffix bucket, more than the CLI's display limit + // of 6 buckets. The aggregated suggestion list must include every + // bucket's glob so that a user copy-pasting the --ignore command + // covers the whole suspect set — not just the top-6 displayed. + ds := &Dataset{ + files: map[string]*fileEntry{ + "vendor/a.go": {additions: 500, deletions: 500}, + "node_modules/b.js": {additions: 500, deletions: 500}, + "dist/c.js": {additions: 500, deletions: 500}, + "build/d.out": {additions: 500, deletions: 500}, + "third_party/e.go": {additions: 500, deletions: 500}, + "foo.min.js": {additions: 500, deletions: 500}, + "bar.min.css": {additions: 500, deletions: 500}, + "package-lock.json": {additions: 500, deletions: 500}, + "yarn.lock": {additions: 500, deletions: 500}, + "go.sum": {additions: 500, deletions: 500}, + "proto/thing.pb.go": {additions: 500, deletions: 500}, + }, + } + buckets, worth := DetectSuspectFiles(ds) + if !worth { + t.Fatal("expected warning-worthy dataset") + } + if len(buckets) <= 6 { + t.Fatalf("this test needs > 6 buckets to exercise the unshown-bucket path; got %d", len(buckets)) + } + + suggestions := CollectAllSuggestions(buckets) + seen := map[string]bool{} + for _, s := range suggestions { + seen[s] = true + } + // Every bucket's Suggestions must appear in the aggregated list. 
+ for _, b := range buckets {
+ for _, want := range b.Suggestions {
+ if !seen[want] {
+ t.Errorf("bucket %q suggestion %q missing from aggregated list — "+
+ "user copy-paste would leave this bucket's paths in place",
+ b.Pattern.Glob, want)
+ }
+ }
+ }
+ // Dedup sanity: no duplicates in the output.
+ counts := map[string]int{}
+ for _, s := range suggestions {
+ counts[s]++
+ if counts[s] > 1 {
+ t.Errorf("suggestion %q appears %d times; expected dedup", s, counts[s])
+ }
+ }
+}
+
+// TestSuspectSuggestionsMatchExtractShouldIgnore is the end-to-end
+// invariant: every --ignore glob the warning emits must cause
+// extract.ShouldIgnore to return true for the paths that caused that
+// glob to be emitted. Without this test the two surfaces (detector
+// and ignore matcher) can drift silently — e.g. a future refactor of
+// ShouldIgnore that tightens prefix semantics would break the
+// suspect warning's entire value proposition without any stats test
+// firing.
+func TestSuspectSuggestionsMatchExtractShouldIgnore(t *testing.T) {
+ // Mix root-level and nested occurrences of every pattern class:
+ // directory segments (vendor, dist), suffix (*.min.js), basename
+ // (package-lock.json), generated extensions (*.pb.go).
+ ds := &Dataset{
+ files: map[string]*fileEntry{
+ "vendor/a.go": {additions: 100, deletions: 50},
+ "pkg/vendor/deep/b.go": {additions: 100, deletions: 50},
+ "services/auth/vendor/c.go": {additions: 100, deletions: 50},
+ "dist/bundle.js": {additions: 200, deletions: 200},
+ "wp-includes/js/dist/editor.js": {additions: 200, deletions: 200},
+ "wp-admin/dist/app/chunk.js": {additions: 200, deletions: 200},
+ "node_modules/foo.js": {additions: 80, deletions: 40},
+ "sub/proj/node_modules/bar.js": {additions: 80, deletions: 40},
+ "app.min.js": {additions: 60, deletions: 30},
+ "static/vendor.min.js": {additions: 60, deletions: 30}, // also matches vendor, first match wins
+ "deep/nested/path/thing.min.js": {additions: 60, deletions: 30},
+ "package-lock.json": {additions: 500, deletions: 400},
+ "projects/x/package-lock.json": {additions: 500, deletions: 400},
+ "proto/foo.pb.go": {additions: 300, deletions: 200},
+ },
+ }
+ buckets, _ := DetectSuspectFiles(ds)
+ if len(buckets) == 0 {
+ t.Fatal("no buckets produced; test inputs should trigger at least vendor/dist/min.js buckets")
+ }
+
+ // For each bucket, feed its Suggestions into ShouldIgnore and
+ // assert every Path in that bucket is matched by at least one
+ // suggestion. A false here means the warning's suggested fix
+ // would leave the file untouched — exactly the regression we're
+ // guarding against.
+ for _, b := range buckets {
+ if len(b.Suggestions) == 0 {
+ t.Errorf("bucket %q has paths %v but no suggestions", b.Pattern.Glob, b.Paths)
+ continue
+ }
+ for _, p := range b.Paths {
+ if !extract.ShouldIgnore(p, b.Suggestions) {
+ t.Errorf("bucket %q: path %q is not matched by any of its suggestions %v — "+
+ "user would copy-paste a fix that doesn't drop this file",
+ b.Pattern.Glob, p, b.Suggestions)
+ }
+ }
+ }
+}
+
+func TestDetectSuspectFilesOrdering(t *testing.T) {
+ // Buckets must be sorted by churn desc; ties by glob asc. Determinism
+ // matters because the warning output lists top-N.
+ ds := &Dataset{
+ files: map[string]*fileEntry{
+ "vendor/a.go": {additions: 100, deletions: 100}, // 200 churn
+ "node_modules/x.js": {additions: 500, deletions: 500}, // 1000 churn
+ "app.min.js": {additions: 100, deletions: 100}, // 200 churn
+ },
+ }
+ buckets, _ := DetectSuspectFiles(ds)
+ if len(buckets) < 3 {
+ t.Fatalf("want 3 buckets, got %d", len(buckets))
+ }
+ if buckets[0].Pattern.Glob != "node_modules/*" {
+ t.Errorf("top bucket = %q, want %q", buckets[0].Pattern.Glob, "node_modules/*")
+ }
+ // vendor/* and *.min.js tied at 200 — tiebreak by glob asc → *.min.js first.
+ if buckets[1].Pattern.Glob != "*.min.js" {
+ t.Errorf("second bucket = %q, want %q (tiebreak asc)", buckets[1].Pattern.Glob, "*.min.js")
+ }
+}
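+
+// TestSuggestionCommandAssembly is an illustrative sketch, not shipped
+// CLI behavior: it shows how the exported pieces compose into the
+// copy-pasteable command the warning prints. CollectAllSuggestions
+// flattens and dedups across buckets; ShellQuoteSingle makes each glob
+// shell-safe. The hand-built SuspectBucket fixtures and the assembled
+// format string are assumptions for the example — the real wiring
+// lives in printSuspectWarning in cmd/gitcortex.
+func TestSuggestionCommandAssembly(t *testing.T) {
+ buckets := []SuspectBucket{
+ {Suggestions: []string{"vendor/*", "pkg/vendor/*"}},
+ {Suggestions: []string{"package-lock.json", "vendor/*"}}, // duplicate vendor/* collapses
+ }
+ cmd := "gitcortex extract --repo ."
+ for _, g := range CollectAllSuggestions(buckets) {
+ cmd += " --ignore " + ShellQuoteSingle(g)
+ }
+ want := "gitcortex extract --repo . --ignore 'vendor/*' --ignore 'pkg/vendor/*' --ignore 'package-lock.json'"
+ if cmd != want {
+ t.Errorf("assembled command = %q, want %q", cmd, want)
+ }
+}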