From 5143f4d487cee71d14744f88e71b655e0126645b Mon Sep 17 00:00:00 2001 From: lex0c Date: Sun, 19 Apr 2026 20:49:20 -0300 Subject: [PATCH 1/7] Add extensions stat: file extensions ranked by recent churn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Historical lens on language distribution — "which extension is the team spending effort on now", not "what exists in the tree" (which cloc/tokei answer from the filesystem). ExtensionStats aggregates ds.files by extension bucket with files, churn, recent_churn, unique_devs, first_seen, last_seen. Sort is recent_churn desc so dormant extensions with high lifetime churn can't displace active ones. extractExtension policy: last segment after the final dot, single-dot dotfiles kept verbatim (".gitignore" as its own bucket), multi-dot takes the final segment (.env.local → .local), extensionless and degenerate inputs collapse into "(none)". Included in CLI --stat default sweep (output is compact), new HTML section below Directories, new Extensions section in METRICS.md documenting the policy + reading signals + explicit non-goals (no language-family grouping). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 1 + cmd/gitcortex/main.go | 17 +++- docs/METRICS.md | 25 ++++++ docs/RUNBOOK.md | 1 + internal/report/report.go | 2 + internal/report/template.go | 21 +++++ internal/stats/extension_test.go | 136 ++++++++++++++++++++++++++++++ internal/stats/format.go | 30 +++++++ internal/stats/stats.go | 138 +++++++++++++++++++++++++++++++ 9 files changed, 367 insertions(+), 4 deletions(-) create mode 100644 internal/stats/extension_test.go diff --git a/README.md b/README.md index ff5ce09..30e82d6 100644 --- a/README.md +++ b/README.md @@ -211,6 +211,7 @@ Available stats: | `top-commits` | Largest commits ranked by lines changed (includes message if extracted with `--include-commit-messages`) | | `pareto` | Concentration (80% threshold) across files, devs (two lenses: commits and churn), and directories | | `structure` | Repo layout as a `tree(1)`-style view, dirs sorted by aggregate churn, capped by `--tree-depth` (default 3) | +| `extensions` | File extensions ranked by recent churn, with file count, unique devs, and first/last-seen — the historical lens on language distribution | Output formats: `table` (default, human-readable), `csv` (single clean table per `--stat`, header row on line 1), `json` (unified object with all sections). 
diff --git a/cmd/gitcortex/main.go b/cmd/gitcortex/main.go index 0aee55b..d4da8ab 100644 --- a/cmd/gitcortex/main.go +++ b/cmd/gitcortex/main.go @@ -105,8 +105,8 @@ func isValidGranularity(s string) bool { func isValidStat(s string) bool { switch s { - case "summary", "contributors", "hotspots", "directories", "activity", - "busfactor", "coupling", "churn-risk", "working-patterns", + case "summary", "contributors", "hotspots", "directories", "extensions", + "activity", "busfactor", "coupling", "churn-risk", "working-patterns", "dev-network", "profile", "top-commits", "pareto", "structure": return true } @@ -133,7 +133,7 @@ func addStatsFlags(cmd *cobra.Command, sf *statsFlags) { cmd.Flags().StringVar(&sf.format, "format", "table", "Output format: table, csv, json") cmd.Flags().IntVar(&sf.topN, "top", 10, "Number of top entries to show (0 = all)") cmd.Flags().StringVar(&sf.granularity, "granularity", "month", "Activity granularity: day, week, month, year") - cmd.Flags().StringVar(&sf.stat, "stat", "", "Show a specific stat: summary, contributors, hotspots, directories, activity, busfactor, coupling, churn-risk, working-patterns, dev-network, profile, top-commits, pareto, structure") + cmd.Flags().StringVar(&sf.stat, "stat", "", "Show a specific stat: summary, contributors, hotspots, directories, extensions, activity, busfactor, coupling, churn-risk, working-patterns, dev-network, profile, top-commits, pareto, structure") cmd.Flags().IntVar(&sf.couplingMaxFiles, "coupling-max-files", 50, "Max files per commit for coupling analysis") cmd.Flags().IntVar(&sf.couplingMinChanges, "coupling-min-changes", 5, "Min co-changes for coupling results") cmd.Flags().IntVar(&sf.churnHalfLife, "churn-half-life", 90, "Half-life in days for churn decay (churn-risk)") @@ -151,7 +151,7 @@ func validateStatsFlags(sf *statsFlags) error { return fmt.Errorf("invalid --granularity %q; must be one of: day, week, month, year", sf.granularity) } if sf.stat != "" && !isValidStat(sf.stat) { - 
return fmt.Errorf("invalid --stat %q; valid: summary, contributors, hotspots, directories, activity, busfactor, coupling, churn-risk, working-patterns, dev-network, profile, top-commits, pareto, structure", sf.stat) + return fmt.Errorf("invalid --stat %q; valid: summary, contributors, hotspots, directories, extensions, activity, busfactor, coupling, churn-risk, working-patterns, dev-network, profile, top-commits, pareto, structure", sf.stat) } return nil } @@ -271,6 +271,12 @@ func renderStats(ds *stats.Dataset, sf *statsFlags) error { return err } } + if showAll || sf.stat == "extensions" { + fmt.Fprintf(os.Stderr, "\n=== Top %d Extensions ===\n", sf.topN) + if err := f.PrintExtensions(stats.ExtensionStats(ds, sf.topN)); err != nil { + return err + } + } if showAll || sf.stat == "activity" { fmt.Fprintf(os.Stderr, "\n=== Activity (%s) ===\n", sf.granularity) if err := f.PrintActivity(stats.ActivityOverTime(ds, sf.granularity)); err != nil { @@ -369,6 +375,9 @@ func renderStatsJSON(f *stats.Formatter, ds *stats.Dataset, sf *statsFlags) erro if showAll || sf.stat == "directories" { report["directories"] = stats.DirectoryStats(ds, sf.topN) } + if showAll || sf.stat == "extensions" { + report["extensions"] = stats.ExtensionStats(ds, sf.topN) + } if showAll || sf.stat == "activity" { report["activity"] = stats.ActivityOverTime(ds, sf.granularity) } diff --git a/docs/METRICS.md b/docs/METRICS.md index 1b7a0f6..5a2aefd 100644 --- a/docs/METRICS.md +++ b/docs/METRICS.md @@ -254,6 +254,31 @@ Two dev lenses are surfaced because commit count alone is a flawed proxy for con **How to interpret**: "20 files concentrate 80% of all churn" describes where change lands — it can indicate a healthy core module under active development, or a bottleneck if combined with low bus factor. Cross-reference with the Churn Risk section before drawing conclusions. 
+## Extensions + +File extensions aggregated from `ds.files`, ranked by **recent churn** (decay-weighted — see "Recent churn" below). The historical lens is the point: `cloc`/`tokei` answer "what languages exist on disk"; this answers "which extensions is the team spending effort on right now". + +**Extraction policy** (`extractExtension`): +- Last path segment (after the final `/`). +- Multi-dot names report the final segment: `foo.tar.gz` → `.gz`, `.eslintrc.json` → `.json`. +- Single-dot dotfiles keep their full name: `.gitignore` → `.gitignore`, `.env` → `.env`. Merging these into "(none)" would erase a meaningful group. +- No-dot names collapse into the `(none)` bucket: `Makefile`, `LICENSE`, `bin/run`. +- Extensions lowercased so `.PNG` and `.png` aggregate. + +**Per-bucket fields**: +- `files` — distinct file paths. +- `churn` — lifetime additions + deletions. +- `recent_churn` — decay-weighted aggregate (same half-life as other stats, set at load time). Leads the sort so a dormant extension with high lifetime churn won't displace an active one. +- `unique_devs` — distinct emails that contributed any line. +- `first_seen` / `last_seen` — min/max across the bucket's files (UTC date). + +**Reading signals**: +- `.yaml` recent churn high + unique_devs low → config owned by one person; schedule handoff before they leave. +- `.md` recent churn high → docs-heavy phase (release prep?) or churn-heavy README thrash. +- Cross-read with Directories: `.yaml` concentrated in one dir is config-as-code; `.yaml` spread across many dirs is config sprawl. + +**What it does not do**: no language-family grouping (`.js`+`.ts`+`.tsx` stay distinct). Aggregate downstream if you need "frontend vs backend"; the tool does not prescribe the taxonomy. Generated-file buckets (`.lock`, `.pb.go`, `.min.js`) will dominate unless filtered via `--ignore` at extract time — the suspect-paths warning flags these. 
+ ## Repo Structure A `tree(1)`-style view of the repository's directory layout, built from paths seen in history (`FileHotspots`), not from the filesystem at HEAD. Deleted files are included — the view answers "what shaped the codebase", not "what is present today". diff --git a/docs/RUNBOOK.md b/docs/RUNBOOK.md index 4cc90c8..8bf5256 100644 --- a/docs/RUNBOOK.md +++ b/docs/RUNBOOK.md @@ -156,6 +156,7 @@ Section headers go to stderr, data to stdout. To capture only data: ./gitcortex stats --input data.jsonl --stat profile --email alice@company.com ./gitcortex stats --input data.jsonl --stat top-commits --top 20 ./gitcortex stats --input data.jsonl --stat structure --tree-depth 3 +./gitcortex stats --input data.jsonl --stat extensions --top 15 ``` ### Time filtering diff --git a/internal/report/report.go b/internal/report/report.go index cddc3a1..9ac870b 100644 --- a/internal/report/report.go +++ b/internal/report/report.go @@ -26,6 +26,7 @@ type ReportData struct { Contributors []stats.ContributorStat Hotspots []stats.FileStat Directories []stats.DirStat + Extensions []stats.ExtensionStat ActivityRaw []stats.ActivityBucket ActivityYears []string ActivityGrid [][]ActivityCell // [year][month 0-11] @@ -341,6 +342,7 @@ func Generate(w io.Writer, ds *stats.Dataset, repoName string, topN int, sf stat Contributors: stats.TopContributors(ds, topN), Hotspots: stats.FileHotspots(ds, topN), Directories: stats.DirectoryStats(ds, topN), + Extensions: stats.ExtensionStats(ds, topN), ActivityRaw: actRaw, ActivityYears: actYears, ActivityGrid: actGrid, diff --git a/internal/report/template.go b/internal/report/template.go index 0947812..fc4108a 100644 --- a/internal/report/template.go +++ b/internal/report/template.go @@ -239,6 +239,27 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col {{end}} +{{if .Extensions}} +

Extensions

+

File extensions ranked by recent churn — "where is the team spending effort now", not "what exists at HEAD". Cross-read with Directories: a repo with high .yaml recent churn concentrated in one dir is config-as-code; spread across many dirs is config sprawl. · {{docRef "extensions"}}

+ + +{{$maxRecent := 0.0}}{{range .Extensions}}{{if gt .RecentChurn $maxRecent}}{{$maxRecent = .RecentChurn}}{{end}}{{end}} +{{range .Extensions}} + + + + + + + + + + +{{end}} +
ExtFilesChurnRecent ChurnDevsFirst SeenLast Seen
{{.Ext}}{{thousands .Files}}{{thousands .Churn}}{{printf "%.1f" .RecentChurn}}
{{.UniqueDevs}}{{.FirstSeen}}{{.LastSeen}}
+{{end}} + {{if .ChurnRisk}}

Churn Risk

Files ranked by recent churn. Label classifies context so you can judge action: legacy-hotspot (old code + concentrated + declining) is the urgent alarm; silo suggests knowledge transfer; active-core is young code with a single author (often fine); active is shared healthy work; cold is quiet.{{if (index .ChurnRisk 0).AgePercentile}} Age P__ / Trend P__ under the label show where this file sits in the repo's distribution: age P90 means older than 90% of tracked files; trend P10 means declining more sharply than 90%. Classification boundaries are the P75 age and P25 trend of this dataset (see {{docRef "churn-risk"}}).{{end}}

diff --git a/internal/stats/extension_test.go b/internal/stats/extension_test.go new file mode 100644 index 0000000..52b9e88 --- /dev/null +++ b/internal/stats/extension_test.go @@ -0,0 +1,136 @@ +package stats + +import ( + "testing" + "time" +) + +func TestExtractExtensionPolicy(t *testing.T) { + cases := []struct { + path string + want string + }{ + // Happy path + {"main.go", ".go"}, + {"src/pkg/util.go", ".go"}, + {"foo/bar/Baz.PNG", ".png"}, // lowercased + // Multi-dot: last segment wins. + {"archive.tar.gz", ".gz"}, + {"src/.eslintrc.json", ".json"}, // dotfile with a real ext splits on last + // Single-dot dotfiles keep full name — ".gitignore" carries + // meaning as a group; merging with "(none)" would confuse a + // Makefile-only repo with a dotfile-only one. + {".gitignore", ".gitignore"}, + {".env", ".env"}, + {"project/.env", ".env"}, + // Extensionless files. + {"Makefile", "(none)"}, + {"LICENSE", "(none)"}, + {"bin/run", "(none)"}, + // Degenerate. + {"", "(none)"}, + {"weird.", "(none)"}, + {".", "(none)"}, // single dot is not a dotfile, just noise + {"..", "(none)"}, // two dots → trailing-dot rule collapses + {"/", "(none)"}, // just a separator + {"foo/", "(none)"}, // trailing slash, empty basename + } + for _, c := range cases { + got := extractExtension(c.path) + if got != c.want { + t.Errorf("extractExtension(%q) = %q, want %q", c.path, got, c.want) + } + } +} + +func TestExtensionStatsAggregation(t *testing.T) { + // Hand-built dataset so aggregation is inspectable: two .go files + // with distinct devs, one .yaml shared by both, and a Makefile + // (extensionless) owned by one dev. First/last dates differ so the + // aggregator must track min/max across files, not last-seen only. 
+ ds := &Dataset{ + Latest: time.Date(2024, 6, 1, 0, 0, 0, 0, time.UTC), + files: map[string]*fileEntry{ + "cmd/main.go": { + additions: 100, deletions: 20, recentChurn: 80, + devLines: map[string]int64{"alice@x": 90, "bob@x": 30}, + firstChange: time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC), + lastChange: time.Date(2024, 5, 1, 0, 0, 0, 0, time.UTC), + }, + "internal/util.go": { + additions: 40, deletions: 10, recentChurn: 30, + devLines: map[string]int64{"alice@x": 50}, + firstChange: time.Date(2024, 2, 1, 0, 0, 0, 0, time.UTC), + lastChange: time.Date(2024, 4, 1, 0, 0, 0, 0, time.UTC), + }, + "deploy/prod.yaml": { + additions: 20, deletions: 5, recentChurn: 15, + devLines: map[string]int64{"alice@x": 15, "bob@x": 10}, + firstChange: time.Date(2024, 3, 1, 0, 0, 0, 0, time.UTC), + lastChange: time.Date(2024, 6, 1, 0, 0, 0, 0, time.UTC), + }, + "Makefile": { + additions: 5, deletions: 1, recentChurn: 2, + devLines: map[string]int64{"bob@x": 6}, + firstChange: time.Date(2023, 12, 1, 0, 0, 0, 0, time.UTC), + lastChange: time.Date(2024, 1, 15, 0, 0, 0, 0, time.UTC), + }, + }, + } + + result := ExtensionStats(ds, 0) + if len(result) != 3 { + t.Fatalf("got %d buckets, want 3 (.go, .yaml, (none))", len(result)) + } + + // Sort order: recent churn desc → .go (110) > .yaml (15) > (none) (2). 
+ if result[0].Ext != ".go" { + t.Errorf("result[0] = %q, want .go", result[0].Ext) + } + if result[0].Files != 2 { + t.Errorf(".go files = %d, want 2", result[0].Files) + } + if result[0].Churn != 170 { // 120 + 50 + t.Errorf(".go churn = %d, want 170", result[0].Churn) + } + if result[0].RecentChurn != 110 { + t.Errorf(".go recentChurn = %.1f, want 110", result[0].RecentChurn) + } + if result[0].UniqueDevs != 2 { // alice, bob across both .go files + t.Errorf(".go devs = %d, want 2", result[0].UniqueDevs) + } + if result[0].FirstSeen != "2024-01-01" { + t.Errorf(".go firstSeen = %q, want 2024-01-01", result[0].FirstSeen) + } + if result[0].LastSeen != "2024-05-01" { + t.Errorf(".go lastSeen = %q, want 2024-05-01 (max across .go files)", result[0].LastSeen) + } + + // (none) bucket is last (lowest recent churn). Confirm Makefile + // collapsed there and the FirstSeen predates the .go range — the + // aggregator must take the earliest firstChange across bucket files. + last := result[len(result)-1] + if last.Ext != "(none)" { + t.Errorf("last bucket = %q, want (none)", last.Ext) + } + if last.Files != 1 || last.FirstSeen != "2023-12-01" { + t.Errorf("(none) bucket = %+v", last) + } +} + +func TestExtensionStatsTopN(t *testing.T) { + ds := &Dataset{ + files: map[string]*fileEntry{ + "a.go": {recentChurn: 100, devLines: map[string]int64{"a": 1}}, + "b.py": {recentChurn: 50, devLines: map[string]int64{"a": 1}}, + "c.rs": {recentChurn: 10, devLines: map[string]int64{"a": 1}}, + }, + } + result := ExtensionStats(ds, 2) + if len(result) != 2 { + t.Fatalf("top 2 len = %d", len(result)) + } + if result[0].Ext != ".go" || result[1].Ext != ".py" { + t.Errorf("top 2 = [%s, %s], want [.go, .py]", result[0].Ext, result[1].Ext) + } +} diff --git a/internal/stats/format.go b/internal/stats/format.go index ad61d0f..e0d409e 100644 --- a/internal/stats/format.go +++ b/internal/stats/format.go @@ -159,6 +159,36 @@ func (f *Formatter) PrintDirectories(dirs []DirStat) error { } } +func 
(f *Formatter) PrintExtensions(exts []ExtensionStat) error { + switch f.format { + case "json": + return f.writeJSON(exts) + case "csv": + rows := make([][]string, len(exts)) + for i, e := range exts { + rows[i] = []string{ + e.Ext, + fmt.Sprintf("%d", e.Files), + fmt.Sprintf("%d", e.Churn), + fmt.Sprintf("%.1f", e.RecentChurn), + fmt.Sprintf("%d", e.UniqueDevs), + e.FirstSeen, + e.LastSeen, + } + } + return f.writeCSV([]string{"ext", "files", "churn", "recent_churn", "unique_devs", "first_seen", "last_seen"}, rows) + default: + tw := tabwriter.NewWriter(f.w, 0, 4, 2, ' ', 0) + fmt.Fprintf(tw, "EXT\tFILES\tCHURN\tRECENT CHURN\tDEVS\tFIRST SEEN\tLAST SEEN\n") + fmt.Fprintf(tw, "---\t-----\t-----\t------------\t----\t----------\t---------\n") + for _, e := range exts { + fmt.Fprintf(tw, "%s\t%d\t%d\t%.1f\t%d\t%s\t%s\n", + e.Ext, e.Files, e.Churn, e.RecentChurn, e.UniqueDevs, e.FirstSeen, e.LastSeen) + } + return tw.Flush() + } +} + func (f *Formatter) PrintActivity(buckets []ActivityBucket) error { switch f.format { case "json": diff --git a/internal/stats/stats.go b/internal/stats/stats.go index 3064a69..e31cd75 100644 --- a/internal/stats/stats.go +++ b/internal/stats/stats.go @@ -376,6 +376,144 @@ func DirectoryStats(ds *Dataset, n int) []DirStat { return result } +// ExtensionStat rolls history up per file extension. The historical +// lens is the point: "which extension is the team spending effort on" +// answers a different question than "which extension exists in the +// repo" — the latter is what cloc/tokei do from the filesystem. +// +// Ext is the normalized extension (leading dot, lowercased) — e.g. +// ".go", ".yaml", ".gitignore". Files without a conventional extension +// collapse into the bucket "(none)"; see extractExtension for the +// policy (dotfiles kept verbatim, last-segment rule for the rest). 
+// +// RecentChurn is the decay-weighted aggregate from the same pipeline +// as FileStat.RecentChurn (half-life set at Dataset load time), so a +// dropped-yesterday extension won't outrank a still-active one just +// because it accumulated more lifetime churn. +type ExtensionStat struct { + Ext string + Files int + Churn int64 + RecentChurn float64 + UniqueDevs int + FirstSeen string + LastSeen string +} + +// extractExtension returns the extension bucket for a path. +// Policy: +// - Basename after the final "/" is the subject; directory prefix is +// ignored. +// - Single-dot dotfiles (".gitignore", ".env") keep their full name — +// they carry meaning as a group, and reducing them to "" would +// merge them with extension-less files (Makefile, LICENSE). +// - Multi-segment dotfiles (".env.local", ".eslintrc.json") report +// the last segment (".local", ".json"). Imperfect but keeps the +// rule "last segment after the final dot" consistent and explicit. +// - Files with no dot (or a trailing dot) collapse into "(none)". +// - Extensions are lowercased so ".PNG" and ".png" aggregate. +func extractExtension(path string) string { + if i := strings.LastIndex(path, "/"); i >= 0 { + path = path[i+1:] + } + if path == "" { + return "(none)" + } + lastDot := strings.LastIndex(path, ".") + if lastDot <= 0 { + // No dot at all, or a name that begins with "." and has no + // second dot. Dotfiles get their full name (".gitignore" is a + // meaningful group); "." alone is degenerate and collapses + // like Makefile / LICENSE. + if len(path) > 1 && strings.HasPrefix(path, ".") { + return strings.ToLower(path) + } + return "(none)" + } + ext := path[lastDot:] + if ext == "." { + return "(none)" // trailing dot like "foo." — pathological but defined + } + return strings.ToLower(ext) +} + +// ExtensionStats aggregates ds.files by extension bucket. 
Complexity +// is O(F + E log E) where F is the number of tracked files and E is +// the number of distinct extensions — trivial compared to the stats +// that compute Herfindahl indices or per-commit coupling pairs. +func ExtensionStats(ds *Dataset, n int) []ExtensionStat { + type extAcc struct { + files int + churn int64 + recentChurn float64 + devs map[string]struct{} + first, last time.Time + } + + buckets := make(map[string]*extAcc) + for path, fe := range ds.files { + ext := extractExtension(path) + b, ok := buckets[ext] + if !ok { + b = &extAcc{devs: make(map[string]struct{})} + buckets[ext] = b + } + b.files++ + b.churn += fe.additions + fe.deletions + b.recentChurn += fe.recentChurn + for email := range fe.devLines { + b.devs[email] = struct{}{} + } + // Mirror-shape for first/last: only adopt non-zero inputs so + // we never stamp a zero-value time onto a previously-valid + // aggregate. Before/After both behave as expected across zero + // times but the explicit IsZero guard reads cleaner. + if !fe.firstChange.IsZero() && (b.first.IsZero() || fe.firstChange.Before(b.first)) { + b.first = fe.firstChange + } + if !fe.lastChange.IsZero() && fe.lastChange.After(b.last) { + b.last = fe.lastChange + } + } + + result := make([]ExtensionStat, 0, len(buckets)) + for ext, b := range buckets { + es := ExtensionStat{ + Ext: ext, + Files: b.files, + Churn: b.churn, + RecentChurn: math.Round(b.recentChurn*10) / 10, + UniqueDevs: len(b.devs), + } + if !b.first.IsZero() { + es.FirstSeen = b.first.UTC().Format("2006-01-02") + } + if !b.last.IsZero() { + es.LastSeen = b.last.UTC().Format("2006-01-02") + } + result = append(result, es) + } + + // Deterministic ordering under ties: recent churn desc, files desc, + // ext asc. Recent churn leads so that "where is effort landing now" + // is what the reader sees first; files breaks ties between similar + // recent-churn levels by established footprint. 
+ sort.Slice(result, func(i, j int) bool { + if result[i].RecentChurn != result[j].RecentChurn { + return result[i].RecentChurn > result[j].RecentChurn + } + if result[i].Files != result[j].Files { + return result[i].Files > result[j].Files + } + return result[i].Ext < result[j].Ext + }) + + if n > 0 && n < len(result) { + result = result[:n] + } + return result +} + func ActivityOverTime(ds *Dataset, granularity string) []ActivityBucket { buckets := make(map[string]*ActivityBucket) var order []string From 3f61643f3ff9da6014d755a934511e448296276f Mon Sep 17 00:00:00 2001 From: lex0c Date: Sun, 19 Apr 2026 21:19:55 -0300 Subject: [PATCH 2/7] Fix bar width rendering in Extensions and ChurnRisk tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pct template helper returns a string, so wrapping its output in printf "%.0f" produced %!f(string=...) which html/template then replaced with the ZgotmplZ safe-escape sentinel in CSS contexts. The rendered width was never a valid length and the bars collapsed. Drop the printf wrapper in both places (Extensions table just shipped, ChurnRisk inherited the same pattern) and pass pct straight through — its "68.6" output is already a valid CSS length. Hotspots already did this and was the only bar-churn surface rendering correctly. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/report/template.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/report/template.go b/internal/report/template.go index fc4108a..2af01c7 100644 --- a/internal/report/template.go +++ b/internal/report/template.go @@ -251,7 +251,7 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col {{thousands .Files}} {{thousands .Churn}} {{printf "%.1f" .RecentChurn}} -
+
{{.UniqueDevs}} {{.FirstSeen}} {{.LastSeen}} @@ -279,7 +279,7 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col {{.Path}} {{if eq .Label "legacy-hotspot"}}🔴 {{.Label}}{{else if eq .Label "silo"}}🟡 {{.Label}}{{else if eq .Label "active-core"}}{{.Label}}{{else if eq .Label "active"}}{{.Label}}{{else}}{{.Label}}{{end}}{{if .AgePercentile}}
age P{{derefInt .AgePercentile}} · trend P{{derefInt .TrendPercentile}}
{{end}} {{printf "%.1f" .RecentChurn}} -
+
{{.BusFactor}} {{.AgeDays}}d {{if lt .Trend 0.5}}↓ {{printf "%.2f" .Trend}}{{else if gt .Trend 1.5}}↑ {{printf "%.2f" .Trend}}{{else}}→ {{printf "%.2f" .Trend}}{{end}} From f111e731f133d22137af659808707a9b3e094270 Mon Sep 17 00:00:00 2001 From: lex0c Date: Sun, 19 Apr 2026 21:38:20 -0300 Subject: [PATCH 3/7] Attribute extension churn per era, not by canonical path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ds.files is keyed by canonical (post-rename) path, so bucketing on extractExtension(path) lumped all of a foo.js → foo.ts lineage onto .ts and left .js at zero. Migration-heavy repos were misattributed. Fix: capture the extension at each change time (pre-rename) into a new fileEntry.byExt map populated in lockstep with additions / deletions / recentChurn. mergeFileEntry folds byExt across rename collapse. ExtensionStats consumes the per-era split, falling back to the canonical path's extension when byExt is nil (hand-built test dataset). Caveats documented in METRICS.md: "files" counts once per extension the lineage ever held (total across buckets can exceed len(ds.files)); "unique_devs" is still lineage-union so a dev who only touched pre-migration appears under the post-migration extension too — fixing this would need per-ext dev tracking on extContribution. Tests: rename split on the consumer side (TestExtensionStatsHonorsPerEraSplit), and three TestMergeFileEntryByExt* variants pinning the merger so an accidental drop/clobber in the producer path can't pass unit tests. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/METRICS.md | 10 +- internal/stats/extension_test.go | 214 +++++++++++++++++++++++++++++++ internal/stats/reader.go | 63 +++++++++ internal/stats/stats.go | 80 +++++++++--- 4 files changed, 345 insertions(+), 22 deletions(-) diff --git a/docs/METRICS.md b/docs/METRICS.md index 5a2aefd..0c758da 100644 --- a/docs/METRICS.md +++ b/docs/METRICS.md @@ -266,11 +266,11 @@ File extensions aggregated from `ds.files`, ranked by **recent churn** (decay-we - Extensions lowercased so `.PNG` and `.png` aggregate. **Per-bucket fields**: -- `files` — distinct file paths. -- `churn` — lifetime additions + deletions. -- `recent_churn` — decay-weighted aggregate (same half-life as other stats, set at load time). Leads the sort so a dormant extension with high lifetime churn won't displace an active one. -- `unique_devs` — distinct emails that contributed any line. -- `first_seen` / `last_seen` — min/max across the bucket's files (UTC date). +- `files` — distinct file lineages that ever held this extension. A file renamed across extensions (foo.js → foo.ts) counts once in each bucket; totals across buckets can therefore exceed the dataset's file count in migration-heavy repos. +- `churn` — lifetime additions + deletions attributed to this extension specifically. A foo.js → foo.ts migration with 1000 lines of pre-rename churn and 500 post-rename does **not** collapse all 1500 onto `.ts`; `.js` keeps its 1000 and `.ts` gets 500. The attribution comes from capturing the path's extension at each change before `applyRenames` merges the lineage. +- `recent_churn` — same per-era semantics, decay-weighted (same half-life as other stats, set at load time). Leads the sort so a dormant extension with high lifetime churn won't displace an active one. +- `unique_devs` — distinct emails that touched any file that ever held this extension. 
**Over-counts across migrations**: a dev who only worked on `foo.js` pre-migration still appears under `.ts` if that file was migrated. Splitting devs per era would need per-commit dev tracking that `fileEntry` does not retain. Read this as "people with context on files that at some point were this extension" rather than "active contributors in this extension". +- `first_seen` / `last_seen` — min/max within the bucket's era, UTC date. For the `.js` bucket in a TypeScript migration, `last_seen` is the migration cutoff, not today's date. **Reading signals**: - `.yaml` recent churn high + unique_devs low → config owned by one person; schedule handoff before they leave. diff --git a/internal/stats/extension_test.go b/internal/stats/extension_test.go index 52b9e88..2dfe539 100644 --- a/internal/stats/extension_test.go +++ b/internal/stats/extension_test.go @@ -118,6 +118,220 @@ func TestExtensionStatsAggregation(t *testing.T) { } } +// Regression: a file renamed across extensions (foo.js → foo.ts) +// collapses onto one canonical path after applyRenames. Bucketing on +// that canonical path alone would assign ALL historical churn to .ts +// and zero to .js — an ugly skew in migration-heavy repos. The fix +// uses fileEntry.byExt (populated at per-change time) to split the +// lineage back across both buckets. +func TestExtensionStatsHonorsPerEraSplit(t *testing.T) { + ds := &Dataset{ + Latest: time.Date(2024, 6, 1, 0, 0, 0, 0, time.UTC), + files: map[string]*fileEntry{ + // Simulates what reader.go produces post-applyRenames: one + // canonical key ("foo.ts") with churn split across the two + // extensions the file held during its lifetime. 
+ "foo.ts": { + additions: 1200, deletions: 300, + recentChurn: 900, + devLines: map[string]int64{"alice@x": 900, "bob@x": 600}, + byExt: map[string]*extContribution{ + ".js": { + churn: 1000, + recentChurn: 200, // old era, decayed + firstChange: time.Date(2023, 1, 1, 0, 0, 0, 0, time.UTC), + lastChange: time.Date(2024, 2, 15, 0, 0, 0, 0, time.UTC), + }, + ".ts": { + churn: 500, + recentChurn: 700, // recent migration, less decay + firstChange: time.Date(2024, 2, 16, 0, 0, 0, 0, time.UTC), + lastChange: time.Date(2024, 5, 30, 0, 0, 0, 0, time.UTC), + }, + }, + }, + }, + } + + result := ExtensionStats(ds, 0) + if len(result) != 2 { + t.Fatalf("got %d buckets, want 2 (.js + .ts)", len(result)) + } + + var js, ts *ExtensionStat + for i := range result { + switch result[i].Ext { + case ".js": + js = &result[i] + case ".ts": + ts = &result[i] + } + } + if js == nil || ts == nil { + t.Fatalf("missing .js or .ts bucket; result=%+v", result) + } + + // Churn must reflect the per-era split, NOT be lumped into .ts. + if js.Churn != 1000 || ts.Churn != 500 { + t.Errorf("churn split wrong: .js=%d (want 1000), .ts=%d (want 500)", js.Churn, ts.Churn) + } + // RecentChurn similarly preserved per-era. + if js.RecentChurn != 200 || ts.RecentChurn != 700 { + t.Errorf("recent churn split wrong: .js=%.1f, .ts=%.1f", js.RecentChurn, ts.RecentChurn) + } + // One file lineage, but counts once per ext it held. + if js.Files != 1 || ts.Files != 1 { + t.Errorf("files per bucket: .js=%d, .ts=%d, want 1/1 (lineage counts in each bucket it held)", js.Files, ts.Files) + } + // Dates: each ext reports the range from its own era, not the whole + // file's range. .js ended when the migration started; .ts starts + // the day after. 
+ if js.LastSeen != "2024-02-15" { + t.Errorf(".js LastSeen = %q, want 2024-02-15 (migration cutoff, not post-rename activity)", js.LastSeen) + } + if ts.FirstSeen != "2024-02-16" { + t.Errorf(".ts FirstSeen = %q, want 2024-02-16 (post-rename era start)", ts.FirstSeen) + } +} + +// Regression: applyRenames calls mergeFileEntry when two fileEntries +// collapse onto the same canonical path (foo.js → foo.ts). If the +// byExt merge drops an entry, sums dates incorrectly, or clobbers an +// overlapping bucket, ExtensionStats silently loses per-era +// attribution and the TestExtensionStatsHonorsPerEraSplit consumer- +// side guard wouldn't catch it — that test hand-builds byExt and +// never exercises the merger. +func TestMergeFileEntryByExt(t *testing.T) { + // dst covers two extensions; src covers one that overlaps (.js) + // and one that's new to dst (.md). After merge: .js aggregates, + // .md is transferred, .ts is untouched. + dst := &fileEntry{ + devLines: map[string]int64{"alice@x": 10}, + devCommits: map[string]int{"alice@x": 1}, + monthChurn: map[string]int64{}, + byExt: map[string]*extContribution{ + ".js": { + churn: 300, + recentChurn: 100, + firstChange: time.Date(2023, 1, 1, 0, 0, 0, 0, time.UTC), + lastChange: time.Date(2023, 6, 1, 0, 0, 0, 0, time.UTC), + }, + ".ts": { + churn: 500, + recentChurn: 400, + firstChange: time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC), + lastChange: time.Date(2024, 5, 1, 0, 0, 0, 0, time.UTC), + }, + }, + } + src := &fileEntry{ + devLines: map[string]int64{"bob@x": 20}, + devCommits: map[string]int{"bob@x": 2}, + monthChurn: map[string]int64{}, + byExt: map[string]*extContribution{ + ".js": { + // Earlier first + later last than dst — both bounds + // should widen after merge. 
+ churn: 100, + recentChurn: 50, + firstChange: time.Date(2022, 6, 1, 0, 0, 0, 0, time.UTC), + lastChange: time.Date(2023, 12, 1, 0, 0, 0, 0, time.UTC), + }, + ".md": { + churn: 40, + recentChurn: 10, + firstChange: time.Date(2023, 3, 1, 0, 0, 0, 0, time.UTC), + lastChange: time.Date(2023, 8, 1, 0, 0, 0, 0, time.UTC), + }, + }, + } + + mergeFileEntry(dst, src) + + if len(dst.byExt) != 3 { + t.Fatalf("dst.byExt size = %d, want 3 (.js .ts .md)", len(dst.byExt)) + } + + // Overlapping bucket: churn and recentChurn add; dates widen. + js := dst.byExt[".js"] + if js.churn != 400 { + t.Errorf(".js churn = %d, want 400 (300+100)", js.churn) + } + if js.recentChurn != 150 { + t.Errorf(".js recentChurn = %.1f, want 150 (100+50)", js.recentChurn) + } + if !js.firstChange.Equal(time.Date(2022, 6, 1, 0, 0, 0, 0, time.UTC)) { + t.Errorf(".js firstChange = %v, want 2022-06-01 (src's earlier date won)", js.firstChange) + } + if !js.lastChange.Equal(time.Date(2023, 12, 1, 0, 0, 0, 0, time.UTC)) { + t.Errorf(".js lastChange = %v, want 2023-12-01 (src's later date won)", js.lastChange) + } + + // Non-overlapping bucket transferred from src: must be present + // with src's values preserved verbatim. + md := dst.byExt[".md"] + if md == nil { + t.Fatal(".md bucket missing after merge — src entry dropped") + } + if md.churn != 40 || md.recentChurn != 10 { + t.Errorf(".md = %+v, want churn=40 recentChurn=10", md) + } + + // dst-only bucket untouched. + ts := dst.byExt[".ts"] + if ts.churn != 500 || ts.recentChurn != 400 { + t.Errorf(".ts = %+v, want churn=500 recentChurn=400 (dst value preserved)", ts) + } +} + +// mergeFileEntry must lazily create dst.byExt when dst started nil — +// covers the case where a long-lived file without extension history +// collides with a newly-seen one that carries a byExt map. 
+func TestMergeFileEntryByExtNilDst(t *testing.T) { + dst := &fileEntry{ + devLines: map[string]int64{}, + devCommits: map[string]int{}, + monthChurn: map[string]int64{}, + } + src := &fileEntry{ + devLines: map[string]int64{}, + devCommits: map[string]int{}, + monthChurn: map[string]int64{}, + byExt: map[string]*extContribution{ + ".go": {churn: 10, recentChurn: 5}, + }, + } + mergeFileEntry(dst, src) + if dst.byExt == nil { + t.Fatal("dst.byExt still nil after merging from non-nil src") + } + if got := dst.byExt[".go"]; got == nil || got.churn != 10 { + t.Errorf("dst.byExt[.go] = %+v, want churn=10", got) + } +} + +// Inverse case: src has no byExt (e.g. legacy or hand-built). Merge +// must be a no-op on dst.byExt and not panic. +func TestMergeFileEntryByExtNilSrc(t *testing.T) { + dst := &fileEntry{ + devLines: map[string]int64{}, + devCommits: map[string]int{}, + monthChurn: map[string]int64{}, + byExt: map[string]*extContribution{ + ".rs": {churn: 7}, + }, + } + src := &fileEntry{ + devLines: map[string]int64{}, + devCommits: map[string]int{}, + monthChurn: map[string]int64{}, + } + mergeFileEntry(dst, src) + if len(dst.byExt) != 1 || dst.byExt[".rs"].churn != 7 { + t.Errorf("dst.byExt mutated by nil-src merge: %+v", dst.byExt) + } +} + func TestExtensionStatsTopN(t *testing.T) { ds := &Dataset{ files: map[string]*fileEntry{ diff --git a/internal/stats/reader.go b/internal/stats/reader.go index d75952c..6c395d8 100644 --- a/internal/stats/reader.go +++ b/internal/stats/reader.go @@ -34,6 +34,21 @@ type fileEntry struct { firstChange time.Time lastChange time.Time monthChurn map[string]int64 // key: "YYYY-MM"; used for trend classification + // byExt splits this file's churn across extensions as they appeared + // at each change, so a rename-across-extensions (foo.js → foo.ts) + // keeps per-era attribution correct even after applyRenames merges + // the lineage onto one canonical path. 
Populated at ingest in + // lockstep with additions/deletions/recentChurn; consumed only by + // ExtensionStats. nil for hand-built fileEntries in tests — the + // aggregator falls back to the canonical path's extension when so. + byExt map[string]*extContribution +} + +type extContribution struct { + churn int64 + recentChurn float64 + firstChange time.Time + lastChange time.Time } type filePair struct{ a, b string } @@ -334,6 +349,22 @@ func streamLoadInto(ds *Dataset, r io.Reader, opt LoadOptions, pathPrefix string fe.additions += cf.Additions fe.deletions += cf.Deletions + // byExt: attribute this change to the extension the path had + // AT THIS COMMIT, not the canonical post-rename extension. + // This is the only place the pre-rename path is still + // available; after applyRenames all entries collapse onto + // the final path and the split would be lost. + changeExt := extractExtension(path) + if fe.byExt == nil { + fe.byExt = make(map[string]*extContribution) + } + ec, ok := fe.byExt[changeExt] + if !ok { + ec = &extContribution{} + fe.byExt[changeExt] = ec + } + ec.churn += cf.Additions + cf.Deletions + cm := ds.commits[cf.Commit] if cm != nil { fe.devLines[cm.email] += cf.Additions + cf.Deletions @@ -349,12 +380,19 @@ func streamLoadInto(ds *Dataset, r io.Reader, opt LoadOptions, pathPrefix string days := ds.Latest.Sub(cm.date).Hours() / 24 weight := math.Exp(-lambda * days) fe.recentChurn += float64(cf.Additions+cf.Deletions) * weight + ec.recentChurn += float64(cf.Additions+cf.Deletions) * weight if cm.date.After(fe.lastChange) { fe.lastChange = cm.date } if fe.firstChange.IsZero() || cm.date.Before(fe.firstChange) { fe.firstChange = cm.date } + if cm.date.After(ec.lastChange) { + ec.lastChange = cm.date + } + if ec.firstChange.IsZero() || cm.date.Before(ec.firstChange) { + ec.firstChange = cm.date + } fe.monthChurn[cm.date.UTC().Format("2006-01")] += cf.Additions + cf.Deletions } } @@ -598,6 +636,31 @@ func mergeFileEntry(dst, src *fileEntry) { if 
src.lastChange.After(dst.lastChange) { dst.lastChange = src.lastChange } + + // byExt: merge per-ext buckets. When rename collapses foo.js → foo.ts, + // this preserves ".js" churn on the ts-keyed entry so ExtensionStats + // can split it back. Same bucket on both sides = add; new bucket = + // transfer the pointer (src is discarded afterwards so ownership + // transfer is safe). + if src.byExt != nil { + if dst.byExt == nil { + dst.byExt = make(map[string]*extContribution, len(src.byExt)) + } + for ext, srcEC := range src.byExt { + if dstEC, ok := dst.byExt[ext]; ok { + dstEC.churn += srcEC.churn + dstEC.recentChurn += srcEC.recentChurn + if !srcEC.firstChange.IsZero() && (dstEC.firstChange.IsZero() || srcEC.firstChange.Before(dstEC.firstChange)) { + dstEC.firstChange = srcEC.firstChange + } + if srcEC.lastChange.After(dstEC.lastChange) { + dstEC.lastChange = srcEC.lastChange + } + } else { + dst.byExt[ext] = srcEC + } + } + } } // isMechanicalRefactor returns true when a commit's shape matches a likely diff --git a/internal/stats/stats.go b/internal/stats/stats.go index e31cd75..4bba40f 100644 --- a/internal/stats/stats.go +++ b/internal/stats/stats.go @@ -441,6 +441,26 @@ func extractExtension(path string) string { // is O(F + E log E) where F is the number of tracked files and E is // the number of distinct extensions — trivial compared to the stats // that compute Herfindahl indices or per-commit coupling pairs. +// +// Attribution uses fileEntry.byExt, which tracks churn per extension +// as it existed at each change. A file renamed across extensions +// (foo.js → foo.ts) contributes to both buckets proportionally to +// its pre- and post-rename history; bucketing on the canonical path's +// current extension alone would misattribute the entire lineage to +// the final extension. A file that touched two extensions counts once +// in each — "files" is "distinct lineages that ever held this +// extension". 
Totals across buckets therefore exceed len(ds.files) in +// repos with migrations. +// +// UniqueDevs is per-file-union: any dev who touched a file contributes +// to every extension bucket that file ever held. This over-counts on +// migration: a dev who only touched foo.js pre-rename will also +// appear under ".ts" if the file was migrated. Accepting this as a +// bounded caveat — splitting devs per era would require per-commit +// dev tracking that doesn't exist on fileEntry. +// +// Falls back to the canonical path's extension when fe.byExt is nil +// (hand-built fileEntries in unit tests never populate it). func ExtensionStats(ds *Dataset, n int) []ExtensionStat { type extAcc struct { files int @@ -450,29 +470,55 @@ func ExtensionStats(ds *Dataset, n int) []ExtensionStat { first, last time.Time } - buckets := make(map[string]*extAcc) - for path, fe := range ds.files { - ext := extractExtension(path) + getBucket := func(buckets map[string]*extAcc, ext string) *extAcc { b, ok := buckets[ext] if !ok { b = &extAcc{devs: make(map[string]struct{})} buckets[ext] = b } - b.files++ - b.churn += fe.additions + fe.deletions - b.recentChurn += fe.recentChurn - for email := range fe.devLines { - b.devs[email] = struct{}{} - } - // Mirror-shape for first/last: only adopt non-zero inputs so - // we never stamp a zero-value time onto a previously-valid - // aggregate. Before/After both behave as expected across zero - // times but the explicit IsZero guard reads cleaner. - if !fe.firstChange.IsZero() && (b.first.IsZero() || fe.firstChange.Before(b.first)) { - b.first = fe.firstChange + return b + } + + buckets := make(map[string]*extAcc) + for path, fe := range ds.files { + // Real path: use the rename-aware per-ext split captured at + // ingest. Fallback: if byExt is nil (legacy data / test + // dataset), synthesize a single-bucket split from the + // canonical path's extension + current file aggregate. 
+ if fe.byExt == nil { + ext := extractExtension(path) + b := getBucket(buckets, ext) + b.files++ + b.churn += fe.additions + fe.deletions + b.recentChurn += fe.recentChurn + for email := range fe.devLines { + b.devs[email] = struct{}{} + } + if !fe.firstChange.IsZero() && (b.first.IsZero() || fe.firstChange.Before(b.first)) { + b.first = fe.firstChange + } + if !fe.lastChange.IsZero() && fe.lastChange.After(b.last) { + b.last = fe.lastChange + } + continue } - if !fe.lastChange.IsZero() && fe.lastChange.After(b.last) { - b.last = fe.lastChange + for ext, ec := range fe.byExt { + b := getBucket(buckets, ext) + b.files++ + b.churn += ec.churn + b.recentChurn += ec.recentChurn + // Dev attribution uses the per-file dev set (see doc comment + // — we accept the cross-era over-count rather than a coarser + // approximation). + for email := range fe.devLines { + b.devs[email] = struct{}{} + } + if !ec.firstChange.IsZero() && (b.first.IsZero() || ec.firstChange.Before(b.first)) { + b.first = ec.firstChange + } + if !ec.lastChange.IsZero() && ec.lastChange.After(b.last) { + b.last = ec.lastChange + } } } From e3b14e40b5aa263f011b868ebb300bba065d3ede Mon Sep 17 00:00:00 2001 From: lex0c Date: Sun, 19 Apr 2026 21:55:39 -0300 Subject: [PATCH 4/7] Preserve fractional churn in bar widths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pct cast RecentChurn through int64 before scaling, so any dataset where every bucket's RecentChurn was below 1 (heavy decay, aggressive --since, or --churn-half-life shrunk to a day or two) collapsed every bar to 0% — the visualization became useless precisely when the table still carried meaningful relative differences. Add pctFloat(val, max float64) and route the Extensions and ChurnRisk bars through it. Verified end-to-end with gitcortex report --churn-half-life 1 ... where Extensions buckets ranged 0.4 → 0.1 RecentChurn and bars now render at 100% / 25% / 0% (proportional) instead of flat-zero. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/report/report.go | 14 ++++++++++++++ internal/report/report_test.go | 29 +++++++++++++++++++++++++++++ internal/report/template.go | 4 ++-- 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/internal/report/report.go b/internal/report/report.go index 9ac870b..6b1b741 100644 --- a/internal/report/report.go +++ b/internal/report/report.go @@ -394,6 +394,19 @@ func pctInt(val, max int) string { return fmt.Sprintf("%.1f", float64(val)/float64(max)*100) } +// pctFloat is the float-domain sibling of pct. Needed for metrics like +// RecentChurn that carry sub-1 fractional values after heavy decay +// (small repos, or --since restricting the window): casting through +// int64 truncates every bucket to 0 and the bar reads 0% across the +// board even though the table shows non-zero churn. Accepting +// float64 straight through preserves the relative scale. +func pctFloat(val, max float64) string { + if max == 0 { + return "0" + } + return fmt.Sprintf("%.1f", val/max*100) +} + func heatColor(val, max int) string { if max == 0 || val == 0 { return "#f0f0f0" @@ -453,6 +466,7 @@ func actColor(commits, max int) string { var funcMap = template.FuncMap{ "pct": pct, "pctInt": pctInt, + "pctFloat": pctFloat, "heatColor": heatColor, "joinDevs": stats.JoinDevs, "seq": seq, diff --git a/internal/report/report_test.go b/internal/report/report_test.go index e6d114e..2b4c826 100644 --- a/internal/report/report_test.go +++ b/internal/report/report_test.go @@ -436,6 +436,35 @@ func TestBuildLabelCountListOmitsEmpty(t *testing.T) { } } +// Regression: pct(int64(x), int64(y)) collapsed every sub-1 float to +// 0 before this helper existed, so extension/churn-risk bars all +// rendered as 0% on datasets with heavily decayed RecentChurn (small +// repos, aggressive --since filters). pctFloat preserves the relative +// scale. 
+func TestPctFloat(t *testing.T) { + cases := []struct { + val, max float64 + want string + }{ + // Sub-1 values: relative scale preserved (would all be 0 under int64 cast). + {0.5, 1.0, "50.0"}, + {0.25, 0.5, "50.0"}, + {0.1, 0.9, "11.1"}, + // Mixed small + large. + {50.0, 200.0, "25.0"}, + // max at zero → safe zero string, no NaN or division by zero. + {5.0, 0.0, "0"}, + {0.0, 0.0, "0"}, + // val > max (can happen under rounding noise in sort+display). + {10.0, 5.0, "200.0"}, + } + for _, c := range cases { + if got := pctFloat(c.val, c.max); got != c.want { + t.Errorf("pctFloat(%v, %v) = %q, want %q", c.val, c.max, got, c.want) + } + } +} + func TestThousands(t *testing.T) { cases := []struct { in interface{} diff --git a/internal/report/template.go b/internal/report/template.go index 2af01c7..4f5ef3a 100644 --- a/internal/report/template.go +++ b/internal/report/template.go @@ -251,7 +251,7 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col {{thousands .Files}} {{thousands .Churn}} {{printf "%.1f" .RecentChurn}} -
+
{{.UniqueDevs}} {{.FirstSeen}} {{.LastSeen}} @@ -279,7 +279,7 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col {{.Path}} {{if eq .Label "legacy-hotspot"}}🔴 {{.Label}}{{else if eq .Label "silo"}}🟡 {{.Label}}{{else if eq .Label "active-core"}}{{.Label}}{{else if eq .Label "active"}}{{.Label}}{{else}}{{.Label}}{{end}}{{if .AgePercentile}}
age P{{derefInt .AgePercentile}} · trend P{{derefInt .TrendPercentile}}
{{end}} {{printf "%.1f" .RecentChurn}} -
+
{{.BusFactor}} {{.AgeDays}}d {{if lt .Trend 0.5}}↓ {{printf "%.2f" .Trend}}{{else if gt .Trend 1.5}}↑ {{printf "%.2f" .Trend}}{{else}}→ {{printf "%.2f" .Trend}}{{end}} From da43deba443694395b854f78037b29ef182d0b4d Mon Sep 17 00:00:00 2001 From: lex0c Date: Sun, 19 Apr 2026 22:13:47 -0300 Subject: [PATCH 5/7] Strip multi-input stem prefix before parsing extension MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LoadMultiJSONL prepends ":" to every tracked path so multi-repo reports can disambiguate colliding filenames. For nested paths the slash-split in extractExtension already discards the prefix, but a root-level extensionless file (Makefile, LICENSE) keeps it — and if the stem contains a dot (repo.v1, project.2024), LastIndex(".") picked the stem's dot and emitted a bogus bucket like ".v1:makefile" instead of "(none)". Silent corruption of counts and ranking in multi-repo mode. Fix in extractExtension so the producer (reader.go ingest) and the fallback in ExtensionStats both benefit from one change. Stripping on the first ":" accepts a theoretical false positive for filenames that genuinely contain ":" — rare on POSIX/Windows and absent from every ds.files key except multi-input prefixes in practice. Verified end-to-end: reproducing the case by copying a fixture to pi-hole.v1.jsonl and running multi-input stats showed zero ".v1:*" buckets and a correctly populated "(none)" bucket. Unit test covers 7 prefix patterns including nested paths (where the fix is a no-op) and empty basenames. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/stats/extension_test.go | 12 ++++++++++++ internal/stats/stats.go | 20 ++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/internal/stats/extension_test.go b/internal/stats/extension_test.go index 2dfe539..6860562 100644 --- a/internal/stats/extension_test.go +++ b/internal/stats/extension_test.go @@ -34,6 +34,18 @@ func TestExtractExtensionPolicy(t *testing.T) { {"..", "(none)"}, // two dots → trailing-dot rule collapses {"/", "(none)"}, // just a separator {"foo/", "(none)"}, // trailing slash, empty basename + // Multi-input stem prefix. LoadMultiJSONL prepends ":" + // to root-level paths, and the stem may legitimately contain + // dots; without stripping, those dots would be mistaken for + // extensions. Only reaches the basename for root-level files — + // nested paths already discard the prefix via the slash split. + {"repo.v1:Makefile", "(none)"}, + {"repo.v1:LICENSE", "(none)"}, + {"repo.v1:foo.go", ".go"}, // real ext still wins after prefix strip + {"repo:Makefile", "(none)"}, // stem with no dots — same rule + {"repo.v1:.gitignore", ".gitignore"}, // dotfile survives prefix + {"repo.v1:src/foo.go", ".go"}, // nested path: slash strips prefix first + {"repo.v1:", "(none)"}, // prefix with empty basename } for _, c := range cases { got := extractExtension(c.path) diff --git a/internal/stats/stats.go b/internal/stats/stats.go index 4bba40f..a5fc8d8 100644 --- a/internal/stats/stats.go +++ b/internal/stats/stats.go @@ -404,6 +404,15 @@ type ExtensionStat struct { // Policy: // - Basename after the final "/" is the subject; directory prefix is // ignored. +// - Multi-input prefix (":") on root-level files is stripped +// first, so repo.v1:Makefile collapses to (none) instead of +// picking the dot inside the stem and emitting ".v1:makefile". 
+// LoadMultiJSONL is the only code path that injects ":" into +// tracked paths, so the presence of ":" in a basename is a strong +// signal of this prefix; the alternative — a real filename with +// ":" — is rare enough on POSIX/Windows that we accept the +// false-positive risk in exchange for correct multi-repo +// behaviour. // - Single-dot dotfiles (".gitignore", ".env") keep their full name — // they carry meaning as a group, and reducing them to "" would // merge them with extension-less files (Makefile, LICENSE). @@ -419,6 +428,17 @@ func extractExtension(path string) string { if path == "" { return "(none)" } + // Strip multi-input stem prefix. LoadMultiJSONL prepends ":" + // to paths; for nested files the slash-split above already + // discarded it, but root-level files still carry it and the dots + // inside a stem name (e.g. "repo.v1") would otherwise be mistaken + // for a real extension. + if i := strings.IndexByte(path, ':'); i >= 0 { + path = path[i+1:] + if path == "" { + return "(none)" + } + } lastDot := strings.LastIndex(path, ".") if lastDot <= 0 { // No dot at all, or a name that begins with "." and has no From 6ff4e1ce4802bd0dbff6d4e3fe58b101f705b9e2 Mon Sep 17 00:00:00 2001 From: lex0c Date: Sun, 19 Apr 2026 22:22:25 -0300 Subject: [PATCH 6/7] Document what gitcortex collects from git MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Consolidates the "what data leaves the repo" story that was previously scattered across README (JSONL example), RUNBOOK (per-record schema), and METRICS.md (data-flow diagram). Readers evaluating adoption on a sensitive repo had to stitch three docs together to answer "does this ever read source code" and "what's opt-in vs default". 
New section covers: the two git commands run, a per-field table for commit and commit_file records showing source + which stat consumes each field, an explicit "not collected" list (file contents, messages off by default, refs other than the branch, zero network), and the four opt-in flags that change what ships. Verified the "consumed by" column against the source — committer_* feeds emitDev (so committers appear in the dev registry) but no other stat reads it from the commit record; old_hash/new_hash/old/new_size are written by extract but never read by stats. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/README.md b/README.md index 30e82d6..37ffb99 100644 --- a/README.md +++ b/README.md @@ -143,6 +143,47 @@ The default branch is auto-detected from `origin/HEAD`, falling back to `main`, The `--mailmap` flag uses git's built-in `.mailmap` support to unify developer identities. Without it, the same person with different emails (e.g., `alice@work.com` and `alice@personal.com`) appears as separate contributors. +### What gitcortex collects from git + +Extraction runs two git commands against the local repository and streams their output. No source-code bytes are read. 
+ +``` +git log -M --raw --numstat --format= → commits, parents, per-file diffs (counts only) +git cat-file --batch-check → blob sizes (old/new) for each file change +``` + +Per-commit metadata (populates the `commit` record): + +| Field | Source | Used by | +|---|---|---| +| `sha`, `tree`, `parents` | `git log --format` | commit graph, merge detection | +| `author_name`, `author_email`, `author_date` | `git log --format` | contributors, activity, working patterns, bus factor | +| `committer_name`, `committer_email`, `committer_date` | `git log --format` | committer identity feeds the `dev` registry (so a committer who is never an author still appears as a known developer); no other stat consumes these fields | +| `additions`, `deletions`, `files_changed` | summed from `--numstat` | summary totals, hotspots, churn-risk | +| `message` | `git log --format` | opt-in only (`--include-commit-messages`); truncated to 80 chars in `top-commits` when present | + +Per-file-change metadata (populates the `commit_file` record): + +| Field | Source | Used by | +|---|---|---| +| `path_current`, `path_previous`, `status` | `git log --raw` | hotspots, directories, extensions, rename tracking (`R100` / `C075` trigger merges) | +| `additions`, `deletions` | `git log --numstat` | per-file churn, recent churn, coupling | +| `old_hash`, `new_hash`, `old_size`, `new_size` | `git cat-file --batch-check` | retained but not currently used in stats | + +**Not collected:** +- File contents / diff hunks — only line counts from `--numstat`. +- Commit messages (unless `--include-commit-messages` is passed). +- Tags, refs other than the traversed branch, reflog, notes. +- Any network traffic — extraction is 100% local to the git directory. + +**Opt-ins that change what ships in the JSONL:** +- `--include-commit-messages` — adds the commit subject to each `commit` record (off by default). 
+- `--mailmap` — normalizes author/committer names+emails via git's `.mailmap` before recording (off by default; warned when a `.mailmap` exists but the flag is omitted). +- `--ignore ` — drops matching `commit_file` records entirely at extract time (counts in the `commit` record are recomputed so totals remain consistent). +- `--first-parent` — traverses only the first-parent chain, skipping merged branch history. + +Full per-record schema (every field, types, enums): see [`docs/RUNBOOK.md`](docs/RUNBOOK.md#jsonl-format). + Output is a JSONL file with one record per line. Four record types: ```jsonl From a6224f925180ce3e11e4329b59c31ba530e7d292 Mon Sep 17 00:00:00 2001 From: lex0c Date: Sun, 19 Apr 2026 22:39:39 -0300 Subject: [PATCH 7/7] Add extension footprint to developer profiles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DevProfile now surfaces the top 5 extensions a dev has worked on — their language/skill fingerprint. Answers questions the rest of the profile can't: "who on the team writes .rs?", "is this dev .yaml-heavy (infra) or .go-heavy (app code)?", "did Carol stop touching .sql after the migration?". Aggregation reuses the devFiles map already built for Scope and TopFiles — no new ingest cost. Sort is Files desc (tiebreak Churn desc, Ext asc) so the displayed Pct is monotonic in both CLI and HTML bar widths, matching Scope's UX. Churn is still exposed on the DevExtContrib struct for JSON consumers who want a churn-ranked view. Rendered in three surfaces: CLI PrintProfiles adds an Extensions line after Scope; main report profile cards add a row in the grid; the dedicated profile page (gitcortex report --email ...) gets a full block with a proportional horizontal bar mirroring the Scope widget. 
Caveats documented in METRICS.md: bucket is derived from the file's canonical (post-rename) path so cross-extension renames credit pre-rename work to the new extension; Pct values may sum < 100% when the dev's contribution to a file was pure rename with no line change. Per-era per-dev attribution would require byExt to carry a dev dimension, which isn't tracked. Tests cover the sort discrimination (churn-first vs files-first diverges on a hand-built case), top-5 truncation, all-(none) edge, and empty-Extensions guarding. Verified on pi-hole: CLI monotonic, main report 20/20 cards monotonic by script, dedicated profile bars render 32→32→5→5→3. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/METRICS.md | 1 + internal/report/profile_template.go | 13 +++ internal/report/template.go | 5 + internal/stats/extension_test.go | 164 ++++++++++++++++++++++++++++ internal/stats/format.go | 10 ++ internal/stats/stats.go | 77 ++++++++++++- 6 files changed, 269 insertions(+), 1 deletion(-) diff --git a/docs/METRICS.md b/docs/METRICS.md index 0c758da..1161e36 100644 --- a/docs/METRICS.md +++ b/docs/METRICS.md @@ -215,6 +215,7 @@ Per-developer report combining multiple metrics. | Pace | commits / active_days (smooths bursts — a dev with 100 commits on 2 days and silence for 28 shows pace=50, which reads as a steady rate but isn't) | | Weekend % | commits on Saturday+Sunday / total commits × 100 | | Scope | Top 5 directories by unique file count, as % of total files touched | +| Extensions | Top 5 file extensions the dev touched, sorted by **files desc** (tiebreak churn desc, then ext asc) so the displayed `Pct` is monotonic with the sort order and HTML bar widths read correctly. `Pct` is `Files/FilesTouched * 100`; the raw dev-attributable `Churn` (sum of `devLines[email]` across bucket files) is kept on the struct for JSON consumers who want a churn-ranked view. 
Answers the "language/skill fingerprint" question (`.go` + `.yaml` → backend+infra; `.tsx` + `.ts` + `.css` → frontend). **Caveats:** (1) bucket is derived from the file's canonical (post-rename) path — a dev who worked on `foo.js` pre-migration still shows up under `.ts` if it was later renamed; per-era per-dev attribution would need `byExt` to carry a dev dimension, which isn't tracked. (2) `Pct` values may sum to less than 100% when the dev appears as a contributor on files without adding lines (pure-rename contributions), since the extension aggregation only walks files with non-zero `devLines[email]`. | | Specialization | Herfindahl index over the **full** per-directory file-count distribution: Σ pᵢ² where pᵢ is the share of the dev's files in directory i. 1 = all files in one directory (narrow specialist); 1/N for a uniform spread across N directories; approaches 0 as the distribution widens. Computed before the top-5 Scope truncation so it reflects actual breadth. Labels (see `specBroadGeneralistMax`, `specBalancedMax`, `specFocusedMax` constants): `< 0.15` broad generalist, `< 0.35` balanced, `< 0.7` focused specialist, `≥ 0.7` narrow specialist. Herfindahl, not Gini, because Gini would collapse "1 file in 1 dir" and "1 file in each of 5 dirs" to the same value (both have zero inequality among buckets), which misses the specialization distinction. **Measures file distribution, not domain expertise** — see caveat below. **Display vs raw:** CLI and HTML show the value rounded to 3 decimals (`%.3f`) for readability; JSON output preserves the full float64. Band classification runs against the raw float, so a value like 0.149 lands in `broad generalist` even though %.2f would have rounded it to `0.15`. JSON consumers that reproduce the banding must use the raw value, not a rounded version. | | Contribution type | Based on del/add ratio: growth (<0.4), balanced (0.4-0.8), refactor (>0.8) | | Collaborators | Top 5 devs sharing code with this dev. 
Ranked by `shared_lines` (Σ min(linesA, linesB) across shared files), tiebreak `shared_files`, then email. Same `shared_lines` semantics as the Developer Network metric — discounts trivial one-line touches so "collaborator" reflects real overlap. | diff --git a/internal/report/profile_template.go b/internal/report/profile_template.go index a540f63..74e61ef 100644 --- a/internal/report/profile_template.go +++ b/internal/report/profile_template.go @@ -91,6 +91,19 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col +{{if .Profile.Extensions}} +
+
Extensions
+
The dev's language/skill fingerprint by share of files touched. Extension attribution uses the file's current canonical path, so cross-extension renames (e.g. .js → .ts) credit pre-rename work to the new extension. · {{docRef "profile"}}
+
+ {{range $i, $e := .Profile.Extensions}}
{{if gt $e.Pct 8.0}}{{$e.Ext}} {{printf "%.0f" $e.Pct}}%{{end}}
{{end}} +
+
+ {{range $i, $e := .Profile.Extensions}} {{$e.Ext}} ({{printf "%.0f" $e.Pct}}%){{end}} +
+
+{{end}} +
Contribution diff --git a/internal/report/template.go b/internal/report/template.go index 4f5ef3a..ed67f6c 100644 --- a/internal/report/template.go +++ b/internal/report/template.go @@ -383,6 +383,11 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col Scope {{range $i, $s := .Scope}}{{if $i}}, {{end}}{{$s.Dir}} ({{printf "%.0f" $s.Pct}}%){{end}} + {{if .Extensions}} + Extensions + {{range $i, $e := .Extensions}}{{if $i}}, {{end}}{{$e.Ext}} ({{printf "%.0f" $e.Pct}}%){{end}} + {{end}} + Specialization {{printf "%.3f" .Specialization}} ({{if lt .Specialization 0.15}}broad generalist{{else if lt .Specialization 0.35}}balanced{{else if lt .Specialization 0.7}}focused specialist{{else}}narrow specialist{{end}}) diff --git a/internal/stats/extension_test.go b/internal/stats/extension_test.go index 6860562..e315c5d 100644 --- a/internal/stats/extension_test.go +++ b/internal/stats/extension_test.go @@ -344,6 +344,170 @@ func TestMergeFileEntryByExtNilSrc(t *testing.T) { } } +// DevProfile extensions: verifies that a dev's extension footprint is +// aggregated from the files they touched, sorted files-desc (tiebreak churn-desc), with Pct +// equal to files/FilesTouched*100. Uses a hand-built dataset so the +// expected distribution is deterministic.
+func TestDevProfileExtensions(t *testing.T) { + ds := &Dataset{ + Latest: time.Date(2024, 6, 1, 0, 0, 0, 0, time.UTC), + contributors: map[string]*ContributorStat{ + "alice@x": { + Name: "Alice", Email: "alice@x", + Commits: 4, FilesTouched: 4, ActiveDays: 2, + FirstDate: "2024-01-01", LastDate: "2024-05-01", + }, + }, + files: map[string]*fileEntry{ + "cmd/main.go": {devLines: map[string]int64{"alice@x": 100}, devCommits: map[string]int{"alice@x": 2}, additions: 80, deletions: 20}, + "internal/util.go": {devLines: map[string]int64{"alice@x": 60}, devCommits: map[string]int{"alice@x": 1}, additions: 50, deletions: 10}, + "deploy/prod.yaml": {devLines: map[string]int64{"alice@x": 20}, devCommits: map[string]int{"alice@x": 1}, additions: 15, deletions: 5}, + "Makefile": {devLines: map[string]int64{"alice@x": 5}, devCommits: map[string]int{"alice@x": 1}, additions: 5, deletions: 0}, + }, + commits: map[string]*commitEntry{}, + workGrid: [7][24]int{}, + } + + profiles := DevProfiles(ds, "alice@x", 0) + if len(profiles) != 1 { + t.Fatalf("got %d profiles, want 1", len(profiles)) + } + p := profiles[0] + + if len(p.Extensions) != 3 { + t.Fatalf("alice.Extensions len = %d, want 3 (.go, .yaml, (none))", len(p.Extensions)) + } + // .go dominates (160 churn across 2 files), .yaml next (20), (none) + // last (5). Pct based on files/FilesTouched = files/4. 
+ if p.Extensions[0].Ext != ".go" || p.Extensions[0].Files != 2 || p.Extensions[0].Churn != 160 { + t.Errorf("[0] = %+v, want {.go, 2, 160}", p.Extensions[0]) + } + if p.Extensions[0].Pct != 50.0 { + t.Errorf(".go pct = %.1f, want 50.0 (2/4)", p.Extensions[0].Pct) + } + if p.Extensions[1].Ext != ".yaml" || p.Extensions[1].Files != 1 { + t.Errorf("[1] = %+v, want .yaml/1", p.Extensions[1]) + } + if p.Extensions[2].Ext != "(none)" { + t.Errorf("[2] = %+v, want (none)", p.Extensions[2]) + } +} + +// Regression: sort MUST be files desc (not churn desc) so the +// displayed Pct — computed from files — is monotonic in both CLI and +// HTML bar widths. Here .py has MORE churn (one huge commit) but +// FEWER files than .go. Under the previous churn-first sort, .py +// would lead and the Pct column (25% for .py, then 75% for .go) would +// no longer be monotonically decreasing down the list. Under the +// corrected files-first sort, .go leads as it should. +func TestDevProfileExtensionsSortedByFiles(t *testing.T) { + ds := &Dataset{ + contributors: map[string]*ContributorStat{ + "alice@x": {Email: "alice@x", Commits: 2, FilesTouched: 4, ActiveDays: 1}, + }, + files: map[string]*fileEntry{ + "a.go": {devLines: map[string]int64{"alice@x": 10}, devCommits: map[string]int{"alice@x": 1}}, + "b.go": {devLines: map[string]int64{"alice@x": 10}, devCommits: map[string]int{"alice@x": 1}}, + "c.go": {devLines: map[string]int64{"alice@x": 10}, devCommits: map[string]int{"alice@x": 1}}, + // One .py file with way more dev-lines than each .go. 
+ "big.py": {devLines: map[string]int64{"alice@x": 500}, devCommits: map[string]int{"alice@x": 1}}, + }, + commits: map[string]*commitEntry{}, + workGrid: [7][24]int{}, + } + p := DevProfiles(ds, "alice@x", 0)[0] + if p.Extensions[0].Ext != ".go" { + t.Errorf("[0] = %q, want .go (3 files beats .py's 1 file under files-first sort)", p.Extensions[0].Ext) + } + if p.Extensions[0].Pct != 75.0 { + t.Errorf(".go Pct = %.1f, want 75.0", p.Extensions[0].Pct) + } + if p.Extensions[1].Ext != ".py" || p.Extensions[1].Pct != 25.0 { + t.Errorf("[1] = %+v, want .py @ 25%%", p.Extensions[1]) + } + // The .py churn (500) is surfaced on the field for JSON consumers + // even though it ranks second by file count. + if p.Extensions[1].Churn != 500 { + t.Errorf(".py Churn = %d, want 500 (raw value still exposed)", p.Extensions[1].Churn) + } +} + +// Edge case: a dev whose only touches are root-level extensionless +// files (Makefile, LICENSE) collapses into a single "(none)" bucket +// at 100% — no crash, no fallthrough. +func TestDevProfileExtensionsAllNone(t *testing.T) { + ds := &Dataset{ + contributors: map[string]*ContributorStat{ + "ops@x": {Email: "ops@x", Commits: 2, FilesTouched: 2, ActiveDays: 1}, + }, + files: map[string]*fileEntry{ + "Makefile": {devLines: map[string]int64{"ops@x": 30}, devCommits: map[string]int{"ops@x": 1}}, + "LICENSE": {devLines: map[string]int64{"ops@x": 5}, devCommits: map[string]int{"ops@x": 1}}, + }, + commits: map[string]*commitEntry{}, + workGrid: [7][24]int{}, + } + p := DevProfiles(ds, "ops@x", 0)[0] + if len(p.Extensions) != 1 || p.Extensions[0].Ext != "(none)" { + t.Fatalf("Extensions = %+v, want single (none) bucket", p.Extensions) + } + if p.Extensions[0].Pct != 100.0 { + t.Errorf("(none) Pct = %.1f, want 100.0", p.Extensions[0].Pct) + } +} + +// Edge case: a dev whose commits never touch any file (all commits +// had files_changed = 0, so no commit_file records reached fe.devLines). 
+// devFiles[email] is absent; Extensions must be nil — both HTML +// templates guard on truthiness so a nil slice renders as nothing. +func TestDevProfileExtensionsEmpty(t *testing.T) { + ds := &Dataset{ + contributors: map[string]*ContributorStat{ + "ghost@x": {Email: "ghost@x", Commits: 1, FilesTouched: 0, ActiveDays: 1}, + }, + files: map[string]*fileEntry{}, + commits: map[string]*commitEntry{}, + workGrid: [7][24]int{}, + } + p := DevProfiles(ds, "ghost@x", 0)[0] + if len(p.Extensions) != 0 { + t.Errorf("Extensions = %+v, want empty", p.Extensions) + } +} + +// Truncate to top-5 when a dev's extension set is larger. Under the +// files-first sort, ties on file count (all 1 each here) fall through +// to churn desc, so the top 5 by churn still win. +func TestDevProfileExtensionsTopFive(t *testing.T) { + ds := &Dataset{ + Latest: time.Date(2024, 6, 1, 0, 0, 0, 0, time.UTC), + contributors: map[string]*ContributorStat{ + "alice@x": {Email: "alice@x", Commits: 7, FilesTouched: 7, ActiveDays: 1}, + }, + files: map[string]*fileEntry{ + "a.go": {devLines: map[string]int64{"alice@x": 100}, devCommits: map[string]int{"alice@x": 1}}, + "a.py": {devLines: map[string]int64{"alice@x": 80}, devCommits: map[string]int{"alice@x": 1}}, + "a.rs": {devLines: map[string]int64{"alice@x": 60}, devCommits: map[string]int{"alice@x": 1}}, + "a.ts": {devLines: map[string]int64{"alice@x": 40}, devCommits: map[string]int{"alice@x": 1}}, + "a.md": {devLines: map[string]int64{"alice@x": 20}, devCommits: map[string]int{"alice@x": 1}}, + "a.sh": {devLines: map[string]int64{"alice@x": 5}, devCommits: map[string]int{"alice@x": 1}}, + "a.yml": {devLines: map[string]int64{"alice@x": 3}, devCommits: map[string]int{"alice@x": 1}}, + }, + commits: map[string]*commitEntry{}, + workGrid: [7][24]int{}, + } + p := DevProfiles(ds, "alice@x", 0)[0] + if len(p.Extensions) != 5 { + t.Fatalf("Extensions len = %d, want top-5 truncation", len(p.Extensions)) + } + // Top 5 by churn: .go .py .rs .ts .md. 
.sh and .yml excluded. + for i, want := range []string{".go", ".py", ".rs", ".ts", ".md"} { + if p.Extensions[i].Ext != want { + t.Errorf("[%d] = %q, want %q", i, p.Extensions[i].Ext, want) + } + } +} + func TestExtensionStatsTopN(t *testing.T) { ds := &Dataset{ files: map[string]*fileEntry{ diff --git a/internal/stats/format.go b/internal/stats/format.go index e0d409e..66ca240 100644 --- a/internal/stats/format.go +++ b/internal/stats/format.go @@ -468,6 +468,16 @@ func (f *Formatter) PrintProfiles(profiles []DevProfile) error { fmt.Fprintf(f.w, "%s (%.0f%%)", s.Dir, s.Pct) } fmt.Fprintln(f.w) + if len(p.Extensions) > 0 { + fmt.Fprintf(f.w, " Extensions: ") + for j, e := range p.Extensions { + if j > 0 { + fmt.Fprintf(f.w, ", ") + } + fmt.Fprintf(f.w, "%s (%.0f%%)", e.Ext, e.Pct) + } + fmt.Fprintln(f.w) + } // %.3f (not %.2f): labels are assigned at thresholds 0.15 / 0.35 // / 0.7 using the unrounded float. With %.2f a value like // 0.149 displays as "0.15" and the "broad generalist" label diff --git a/internal/stats/stats.go b/internal/stats/stats.go index a5fc8d8..1e85bf2 100644 --- a/internal/stats/stats.go +++ b/internal/stats/stats.go @@ -1264,6 +1264,7 @@ type DevProfile struct { LastDate string TopFiles []DevFileContrib Scope []DirScope + Extensions []DevExtContrib Specialization float64 // Gini over dir file-count distribution: 0 = broad generalist, 1 = single-dir specialist ContribRatio float64 // del/add — 0=growth, ~1=rewrite, >1=cleanup ContribType string // "growth", "balanced", "refactor" @@ -1292,6 +1293,26 @@ type DevFileContrib struct { Churn int64 } +// DevExtContrib is a dev's footprint in a single extension bucket. +// Churn is the summed per-file dev-lines (from fe.devLines), so it +// reflects lines the dev personally added/removed across files that +// currently carry this extension — NOT the file's lifetime churn. 
+// Pct is the share of the dev's files (by count) that land in this +// bucket, matching DirScope.Pct's semantics so the two read +// consistently side by side. +// +// Caveat: the bucket is derived from the file's canonical post-rename +// path. A dev who worked on foo.js pre-migration still shows up under +// ".ts" if that file was later renamed. Per-era per-dev attribution +// would need byExt to carry a dev dimension, which isn't tracked; see +// METRICS.md for the full rationale. +type DevExtContrib struct { + Ext string + Files int + Churn int64 + Pct float64 +} + // DevProfiles returns a profile for each developer (or a specific one if filterEmail is set). // DevProfiles builds per-developer profile structs. Pass n > 0 to // limit output to the top-N contributors by commit count — the cap is @@ -1562,6 +1583,60 @@ func DevProfiles(ds *Dataset, filterEmail string, n int) []DevProfile { scope = scope[:5] } + // Extensions: the dev's language/skill fingerprint. Aggregated + // from the same devFiles map used for Scope + TopFiles — each + // file contributes its dev-attributable churn (devLines[email]) + // to the bucket picked via the file's canonical-path + // extension. The canonical-path simplification is documented + // in METRICS.md; per-era per-dev would need byExt to carry a + // dev dimension. 
+ type extAccForDev struct { + files int + churn int64 + } + extCount := make(map[string]*extAccForDev) + if files, ok := devFiles[email]; ok { + for path, fa := range files { + ext := extractExtension(path) + acc, ok := extCount[ext] + if !ok { + acc = &extAccForDev{} + extCount[ext] = acc + } + acc.files++ + acc.churn += fa.churn + } + } + var extensions []DevExtContrib + for ext, acc := range extCount { + pct := 0.0 + if cs.FilesTouched > 0 { + pct = math.Round(float64(acc.files)/float64(cs.FilesTouched)*1000) / 10 + } + extensions = append(extensions, DevExtContrib{ + Ext: ext, Files: acc.files, Churn: acc.churn, Pct: pct, + }) + } + // Sort mirrors Scope: files desc first, so the displayed Pct + // (computed from files) is monotonic in CLI and the HTML bar + // widths — Pct-sorted = visually sorted. Tiebreak on churn + // desc keeps the "more investment wins" signal when two + // buckets hold the same number of files, then ext asc for + // determinism. The Churn field on each entry is still + // available for JSON consumers who want a churn-ranked view. 
+ sort.Slice(extensions, func(i, j int) bool { + if extensions[i].Files != extensions[j].Files { + return extensions[i].Files > extensions[j].Files + } + if extensions[i].Churn != extensions[j].Churn { + return extensions[i].Churn > extensions[j].Churn + } + return extensions[i].Ext < extensions[j].Ext + }) + if len(extensions) > 5 { + extensions = extensions[:5] + } + // Contribution type contribRatio := 0.0 contribType := "growth" @@ -1608,7 +1683,7 @@ func DevProfiles(ds *Dataset, filterEmail string, n int) []DevProfile { Commits: cs.Commits, Additions: cs.Additions, Deletions: cs.Deletions, LinesChanged: cs.Additions + cs.Deletions, FilesTouched: cs.FilesTouched, ActiveDays: cs.ActiveDays, FirstDate: cs.FirstDate, LastDate: cs.LastDate, - TopFiles: topFiles, Scope: scope, Specialization: specialization, + TopFiles: topFiles, Scope: scope, Extensions: extensions, Specialization: specialization, ContribRatio: contribRatio, ContribType: contribType, Pace: pace, Collaborators: collabs, MonthlyActivity: monthly, WorkGrid: grid, WeekendPct: wpct,