diff --git a/README.md b/README.md index ff5ce09..37ffb99 100644 --- a/README.md +++ b/README.md @@ -143,6 +143,47 @@ The default branch is auto-detected from `origin/HEAD`, falling back to `main`, The `--mailmap` flag uses git's built-in `.mailmap` support to unify developer identities. Without it, the same person with different emails (e.g., `alice@work.com` and `alice@personal.com`) appears as separate contributors. +### What gitcortex collects from git + +Extraction runs two git commands against the local repository and streams their output. No source-code bytes are read. + +``` +git log -M --raw --numstat --format= → commits, parents, per-file diffs (counts only) +git cat-file --batch-check → blob sizes (old/new) for each file change +``` + +Per-commit metadata (populates the `commit` record): + +| Field | Source | Used by | +|---|---|---| +| `sha`, `tree`, `parents` | `git log --format` | commit graph, merge detection | +| `author_name`, `author_email`, `author_date` | `git log --format` | contributors, activity, working patterns, bus factor | +| `committer_name`, `committer_email`, `committer_date` | `git log --format` | committer identity feeds the `dev` registry (so a committer who is never an author still appears as a known developer); no other stat consumes these fields | +| `additions`, `deletions`, `files_changed` | summed from `--numstat` | summary totals, hotspots, churn-risk | +| `message` | `git log --format` | opt-in only (`--include-commit-messages`); truncated to 80 chars in `top-commits` when present | + +Per-file-change metadata (populates the `commit_file` record): + +| Field | Source | Used by | +|---|---|---| +| `path_current`, `path_previous`, `status` | `git log --raw` | hotspots, directories, extensions, rename tracking (`R100` / `C075` trigger merges) | +| `additions`, `deletions` | `git log --numstat` | per-file churn, recent churn, coupling | +| `old_hash`, `new_hash`, `old_size`, `new_size` | `git cat-file --batch-check` | retained 
but not currently used in stats | + +**Not collected:** +- File contents / diff hunks — only line counts from `--numstat`. +- Commit messages (unless `--include-commit-messages` is passed). +- Tags, refs other than the traversed branch, reflog, notes. +- Any network traffic — extraction is 100% local to the git directory. + +**Opt-ins that change what ships in the JSONL:** +- `--include-commit-messages` — adds the commit subject to each `commit` record (off by default). +- `--mailmap` — normalizes author/committer names+emails via git's `.mailmap` before recording (off by default; warned when a `.mailmap` exists but the flag is omitted). +- `--ignore <pattern>` — drops matching `commit_file` records entirely at extract time (counts in the `commit` record are recomputed so totals remain consistent). +- `--first-parent` — traverses only the first-parent chain, skipping merged branch history. + +Full per-record schema (every field, types, enums): see [`docs/RUNBOOK.md`](docs/RUNBOOK.md#jsonl-format). + Output is a JSONL file with one record per line. Four record types: ```jsonl @@ -211,6 +252,7 @@ Available stats: | `top-commits` | Largest commits ranked by lines changed (includes message if extracted with `--include-commit-messages`) | | `pareto` | Concentration (80% threshold) across files, devs (two lenses: commits and churn), and directories | | `structure` | Repo layout as a `tree(1)`-style view, dirs sorted by aggregate churn, capped by `--tree-depth` (default 3) | +| `extensions` | File extensions ranked by recent churn, with file count, unique devs, and first/last-seen — the historical lens on language distribution | Output formats: `table` (default, human-readable), `csv` (single clean table per `--stat`, header row on line 1), `json` (unified object with all sections). 
diff --git a/cmd/gitcortex/main.go b/cmd/gitcortex/main.go index 0aee55b..d4da8ab 100644 --- a/cmd/gitcortex/main.go +++ b/cmd/gitcortex/main.go @@ -105,8 +105,8 @@ func isValidGranularity(s string) bool { func isValidStat(s string) bool { switch s { - case "summary", "contributors", "hotspots", "directories", "activity", - "busfactor", "coupling", "churn-risk", "working-patterns", + case "summary", "contributors", "hotspots", "directories", "extensions", + "activity", "busfactor", "coupling", "churn-risk", "working-patterns", "dev-network", "profile", "top-commits", "pareto", "structure": return true } @@ -133,7 +133,7 @@ func addStatsFlags(cmd *cobra.Command, sf *statsFlags) { cmd.Flags().StringVar(&sf.format, "format", "table", "Output format: table, csv, json") cmd.Flags().IntVar(&sf.topN, "top", 10, "Number of top entries to show (0 = all)") cmd.Flags().StringVar(&sf.granularity, "granularity", "month", "Activity granularity: day, week, month, year") - cmd.Flags().StringVar(&sf.stat, "stat", "", "Show a specific stat: summary, contributors, hotspots, directories, activity, busfactor, coupling, churn-risk, working-patterns, dev-network, profile, top-commits, pareto, structure") + cmd.Flags().StringVar(&sf.stat, "stat", "", "Show a specific stat: summary, contributors, hotspots, directories, extensions, activity, busfactor, coupling, churn-risk, working-patterns, dev-network, profile, top-commits, pareto, structure") cmd.Flags().IntVar(&sf.couplingMaxFiles, "coupling-max-files", 50, "Max files per commit for coupling analysis") cmd.Flags().IntVar(&sf.couplingMinChanges, "coupling-min-changes", 5, "Min co-changes for coupling results") cmd.Flags().IntVar(&sf.churnHalfLife, "churn-half-life", 90, "Half-life in days for churn decay (churn-risk)") @@ -151,7 +151,7 @@ func validateStatsFlags(sf *statsFlags) error { return fmt.Errorf("invalid --granularity %q; must be one of: day, week, month, year", sf.granularity) } if sf.stat != "" && !isValidStat(sf.stat) { - 
return fmt.Errorf("invalid --stat %q; valid: summary, contributors, hotspots, directories, activity, busfactor, coupling, churn-risk, working-patterns, dev-network, profile, top-commits, pareto, structure", sf.stat) + return fmt.Errorf("invalid --stat %q; valid: summary, contributors, hotspots, directories, extensions, activity, busfactor, coupling, churn-risk, working-patterns, dev-network, profile, top-commits, pareto, structure", sf.stat) } return nil } @@ -271,6 +271,12 @@ func renderStats(ds *stats.Dataset, sf *statsFlags) error { return err } } + if showAll || sf.stat == "extensions" { + fmt.Fprintf(os.Stderr, "\n=== Top %d Extensions ===\n", sf.topN) + if err := f.PrintExtensions(stats.ExtensionStats(ds, sf.topN)); err != nil { + return err + } + } if showAll || sf.stat == "activity" { fmt.Fprintf(os.Stderr, "\n=== Activity (%s) ===\n", sf.granularity) if err := f.PrintActivity(stats.ActivityOverTime(ds, sf.granularity)); err != nil { @@ -369,6 +375,9 @@ func renderStatsJSON(f *stats.Formatter, ds *stats.Dataset, sf *statsFlags) erro if showAll || sf.stat == "directories" { report["directories"] = stats.DirectoryStats(ds, sf.topN) } + if showAll || sf.stat == "extensions" { + report["extensions"] = stats.ExtensionStats(ds, sf.topN) + } if showAll || sf.stat == "activity" { report["activity"] = stats.ActivityOverTime(ds, sf.granularity) } diff --git a/docs/METRICS.md b/docs/METRICS.md index 1b7a0f6..1161e36 100644 --- a/docs/METRICS.md +++ b/docs/METRICS.md @@ -215,6 +215,7 @@ Per-developer report combining multiple metrics. 
| Pace | commits / active_days (smooths bursts — a dev with 100 commits on 2 days and silence for 28 shows pace=50, which reads as a steady rate but isn't) | | Weekend % | commits on Saturday+Sunday / total commits × 100 | | Scope | Top 5 directories by unique file count, as % of total files touched | +| Extensions | Top 5 file extensions the dev touched, sorted by **files desc** (tiebreak churn desc, then ext asc) so the displayed `Pct` is monotonic with the sort order and HTML bar widths read correctly. `Pct` is `Files/FilesTouched * 100`; the raw dev-attributable `Churn` (sum of `devLines[email]` across bucket files) is kept on the struct for JSON consumers who want a churn-ranked view. Answers the "language/skill fingerprint" question (`.go` + `.yaml` → backend+infra; `.tsx` + `.ts` + `.css` → frontend). **Caveats:** (1) bucket is derived from the file's canonical (post-rename) path — a dev who worked on `foo.js` pre-migration still shows up under `.ts` if it was later renamed; per-era per-dev attribution would need `byExt` to carry a dev dimension, which isn't tracked. (2) `Pct` values may sum to less than 100% when the dev appears as a contributor on files without adding lines (pure-rename contributions), since the extension aggregation only walks files with non-zero `devLines[email]`. | | Specialization | Herfindahl index over the **full** per-directory file-count distribution: Σ pᵢ² where pᵢ is the share of the dev's files in directory i. 1 = all files in one directory (narrow specialist); 1/N for a uniform spread across N directories; approaches 0 as the distribution widens. Computed before the top-5 Scope truncation so it reflects actual breadth. Labels (see `specBroadGeneralistMax`, `specBalancedMax`, `specFocusedMax` constants): `< 0.15` broad generalist, `< 0.35` balanced, `< 0.7` focused specialist, `≥ 0.7` narrow specialist. 
Herfindahl, not Gini, because Gini would collapse "1 file in 1 dir" and "1 file in each of 5 dirs" to the same value (both have zero inequality among buckets), which misses the specialization distinction. **Measures file distribution, not domain expertise** — see caveat below. **Display vs raw:** CLI and HTML show the value rounded to 3 decimals (`%.3f`) for readability; JSON output preserves the full float64. Band classification runs against the raw float, so a value like 0.149 lands in `broad generalist` even though %.2f would have rounded it to `0.15`. JSON consumers that reproduce the banding must use the raw value, not a rounded version. | | Contribution type | Based on del/add ratio: growth (<0.4), balanced (0.4-0.8), refactor (>0.8) | | Collaborators | Top 5 devs sharing code with this dev. Ranked by `shared_lines` (Σ min(linesA, linesB) across shared files), tiebreak `shared_files`, then email. Same `shared_lines` semantics as the Developer Network metric — discounts trivial one-line touches so "collaborator" reflects real overlap. | @@ -254,6 +255,31 @@ Two dev lenses are surfaced because commit count alone is a flawed proxy for con **How to interpret**: "20 files concentrate 80% of all churn" describes where change lands — it can indicate a healthy core module under active development, or a bottleneck if combined with low bus factor. Cross-reference with the Churn Risk section before drawing conclusions. +## Extensions + +File extensions aggregated from `ds.files`, ranked by **recent churn** (decay-weighted — see "Recent churn" below). The historical lens is the point: `cloc`/`tokei` answer "what languages exist on disk"; this answers "which extensions is the team spending effort on right now". + +**Extraction policy** (`extractExtension`): +- Last path segment (after the final `/`). +- Multi-dot names report the final segment: `foo.tar.gz` → `.gz`, `.eslintrc.json` → `.json`. 
+- Single-dot dotfiles keep their full name: `.gitignore` → `.gitignore`, `.env` → `.env`. Merging these into "(none)" would erase a meaningful group. +- No-dot names collapse into the `(none)` bucket: `Makefile`, `LICENSE`, `bin/run`. +- Extensions lowercased so `.PNG` and `.png` aggregate. + +**Per-bucket fields**: +- `files` — distinct file lineages that ever held this extension. A file renamed across extensions (foo.js → foo.ts) counts once in each bucket; totals across buckets can therefore exceed the dataset's file count in migration-heavy repos. +- `churn` — lifetime additions + deletions attributed to this extension specifically. A foo.js → foo.ts migration with 1000 lines of pre-rename churn and 500 post-rename does **not** collapse all 1500 onto `.ts`; `.js` keeps its 1000 and `.ts` gets 500. The attribution comes from capturing the path's extension at each change before `applyRenames` merges the lineage. +- `recent_churn` — same per-era semantics, decay-weighted (same half-life as other stats, set at load time). Leads the sort so a dormant extension with high lifetime churn won't displace an active one. +- `unique_devs` — distinct emails that touched any file that ever held this extension. **Over-counts across migrations**: a dev who only worked on `foo.js` pre-migration still appears under `.ts` if that file was migrated. Splitting devs per era would need per-commit dev tracking that `fileEntry` does not retain. Read this as "people with context on files that at some point were this extension" rather than "active contributors in this extension". +- `first_seen` / `last_seen` — min/max within the bucket's era, UTC date. For the `.js` bucket in a TypeScript migration, `last_seen` is the migration cutoff, not today's date. + +**Reading signals**: +- `.yaml` recent churn high + unique_devs low → config owned by one person; schedule handoff before they leave. +- `.md` recent churn high → docs-heavy phase (release prep?) or churn-heavy README thrash. 
+- Cross-read with Directories: `.yaml` concentrated in one dir is config-as-code; `.yaml` spread across many dirs is config sprawl. + +**What it does not do**: no language-family grouping (`.js`+`.ts`+`.tsx` stay distinct). Aggregate downstream if you need "frontend vs backend"; the tool does not prescribe the taxonomy. Generated-file buckets (`.lock`, `.pb.go`, `.min.js`) will dominate unless filtered via `--ignore` at extract time — the suspect-paths warning flags these. + ## Repo Structure A `tree(1)`-style view of the repository's directory layout, built from paths seen in history (`FileHotspots`), not from the filesystem at HEAD. Deleted files are included — the view answers "what shaped the codebase", not "what is present today". diff --git a/docs/RUNBOOK.md b/docs/RUNBOOK.md index 4cc90c8..8bf5256 100644 --- a/docs/RUNBOOK.md +++ b/docs/RUNBOOK.md @@ -156,6 +156,7 @@ Section headers go to stderr, data to stdout. To capture only data: ./gitcortex stats --input data.jsonl --stat profile --email alice@company.com ./gitcortex stats --input data.jsonl --stat top-commits --top 20 ./gitcortex stats --input data.jsonl --stat structure --tree-depth 3 +./gitcortex stats --input data.jsonl --stat extensions --top 15 ``` ### Time filtering diff --git a/internal/report/profile_template.go b/internal/report/profile_template.go index a540f63..74e61ef 100644 --- a/internal/report/profile_template.go +++ b/internal/report/profile_template.go @@ -91,6 +91,19 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col +{{if .Profile.Extensions}} +
+
Extensions
+
The dev's language/skill fingerprint by share of files touched. Extension attribution uses the file's current canonical path, so cross-extension renames (e.g. .js → .ts) credit pre-rename work to the new extension. · {{docRef "profile"}}
+
+ {{range $i, $e := .Profile.Extensions}}
{{if gt $e.Pct 8.0}}{{$e.Ext}} {{printf "%.0f" $e.Pct}}%{{end}}
{{end}} +
+
+ {{range $i, $e := .Profile.Extensions}} {{$e.Ext}} ({{printf "%.0f" $e.Pct}}%){{end}} +
+
+{{end}} +
Contribution diff --git a/internal/report/report.go b/internal/report/report.go index cddc3a1..6b1b741 100644 --- a/internal/report/report.go +++ b/internal/report/report.go @@ -26,6 +26,7 @@ type ReportData struct { Contributors []stats.ContributorStat Hotspots []stats.FileStat Directories []stats.DirStat + Extensions []stats.ExtensionStat ActivityRaw []stats.ActivityBucket ActivityYears []string ActivityGrid [][]ActivityCell // [year][month 0-11] @@ -341,6 +342,7 @@ func Generate(w io.Writer, ds *stats.Dataset, repoName string, topN int, sf stat Contributors: stats.TopContributors(ds, topN), Hotspots: stats.FileHotspots(ds, topN), Directories: stats.DirectoryStats(ds, topN), + Extensions: stats.ExtensionStats(ds, topN), ActivityRaw: actRaw, ActivityYears: actYears, ActivityGrid: actGrid, @@ -392,6 +394,19 @@ func pctInt(val, max int) string { return fmt.Sprintf("%.1f", float64(val)/float64(max)*100) } +// pctFloat is the float-domain sibling of pct. Needed for metrics like +// RecentChurn that carry sub-1 fractional values after heavy decay +// (small repos, or --since restricting the window): casting through +// int64 truncates every bucket to 0 and the bar reads 0% across the +// board even though the table shows non-zero churn. Accepting +// float64 straight through preserves the relative scale. 
+func pctFloat(val, max float64) string { + if max == 0 { + return "0" + } + return fmt.Sprintf("%.1f", val/max*100) +} + func heatColor(val, max int) string { if max == 0 || val == 0 { return "#f0f0f0" @@ -451,6 +466,7 @@ func actColor(commits, max int) string { var funcMap = template.FuncMap{ "pct": pct, "pctInt": pctInt, + "pctFloat": pctFloat, "heatColor": heatColor, "joinDevs": stats.JoinDevs, "seq": seq, diff --git a/internal/report/report_test.go b/internal/report/report_test.go index e6d114e..2b4c826 100644 --- a/internal/report/report_test.go +++ b/internal/report/report_test.go @@ -436,6 +436,35 @@ func TestBuildLabelCountListOmitsEmpty(t *testing.T) { } } +// Regression: pct(int64(x), int64(y)) collapsed every sub-1 float to +// 0 before this helper existed, so extension/churn-risk bars all +// rendered as 0% on datasets with heavily decayed RecentChurn (small +// repos, aggressive --since filters). pctFloat preserves the relative +// scale. +func TestPctFloat(t *testing.T) { + cases := []struct { + val, max float64 + want string + }{ + // Sub-1 values: relative scale preserved (would all be 0 under int64 cast). + {0.5, 1.0, "50.0"}, + {0.25, 0.5, "50.0"}, + {0.1, 0.9, "11.1"}, + // Mixed small + large. + {50.0, 200.0, "25.0"}, + // max at zero → safe zero string, no NaN or division by zero. + {5.0, 0.0, "0"}, + {0.0, 0.0, "0"}, + // val > max (can happen under rounding noise in sort+display). + {10.0, 5.0, "200.0"}, + } + for _, c := range cases { + if got := pctFloat(c.val, c.max); got != c.want { + t.Errorf("pctFloat(%v, %v) = %q, want %q", c.val, c.max, got, c.want) + } + } +} + func TestThousands(t *testing.T) { cases := []struct { in interface{} diff --git a/internal/report/template.go b/internal/report/template.go index 0947812..ed67f6c 100644 --- a/internal/report/template.go +++ b/internal/report/template.go @@ -239,6 +239,27 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col {{end}} +{{if .Extensions}} +

Extensions

+

File extensions ranked by recent churn — "where is the team spending effort now", not "what exists at HEAD". Cross-read with Directories: a repo with high .yaml recent churn concentrated in one dir is config-as-code; spread across many dirs is config sprawl. · {{docRef "extensions"}}

+ + +{{$maxRecent := 0.0}}{{range .Extensions}}{{if gt .RecentChurn $maxRecent}}{{$maxRecent = .RecentChurn}}{{end}}{{end}} +{{range .Extensions}} + + + + + + + + + + +{{end}} +
ExtFilesChurnRecent ChurnDevsFirst SeenLast Seen
{{.Ext}}{{thousands .Files}}{{thousands .Churn}}{{printf "%.1f" .RecentChurn}}
{{.UniqueDevs}}{{.FirstSeen}}{{.LastSeen}}
+{{end}} + {{if .ChurnRisk}}

Churn Risk

Files ranked by recent churn. Label classifies context so you can judge action: legacy-hotspot (old code + concentrated + declining) is the urgent alarm; silo suggests knowledge transfer; active-core is young code with a single author (often fine); active is shared healthy work; cold is quiet.{{if (index .ChurnRisk 0).AgePercentile}} Age P__ / Trend P__ under the label show where this file sits in the repo's distribution: age P90 means older than 90% of tracked files; trend P10 means declining more sharply than 90%. Classification boundaries are the P75 age and P25 trend of this dataset (see {{docRef "churn-risk"}}).{{end}}

@@ -258,7 +279,7 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col {{.Path}} {{if eq .Label "legacy-hotspot"}}🔴 {{.Label}}{{else if eq .Label "silo"}}🟡 {{.Label}}{{else if eq .Label "active-core"}}{{.Label}}{{else if eq .Label "active"}}{{.Label}}{{else}}{{.Label}}{{end}}{{if .AgePercentile}}
age P{{derefInt .AgePercentile}} · trend P{{derefInt .TrendPercentile}}
{{end}} {{printf "%.1f" .RecentChurn}} -
+
{{.BusFactor}} {{.AgeDays}}d {{if lt .Trend 0.5}}↓ {{printf "%.2f" .Trend}}{{else if gt .Trend 1.5}}↑ {{printf "%.2f" .Trend}}{{else}}→ {{printf "%.2f" .Trend}}{{end}} @@ -362,6 +383,11 @@ footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #d0d7de; col Scope {{range $i, $s := .Scope}}{{if $i}}, {{end}}{{$s.Dir}} ({{printf "%.0f" $s.Pct}}%){{end}} + {{if .Extensions}} + Extensions + {{range $i, $e := .Extensions}}{{if $i}}, {{end}}{{$e.Ext}} ({{printf "%.0f" $e.Pct}}%){{end}} + {{end}} + Specialization {{printf "%.3f" .Specialization}} ({{if lt .Specialization 0.15}}broad generalist{{else if lt .Specialization 0.35}}balanced{{else if lt .Specialization 0.7}}focused specialist{{else}}narrow specialist{{end}}) diff --git a/internal/stats/extension_test.go b/internal/stats/extension_test.go new file mode 100644 index 0000000..e315c5d --- /dev/null +++ b/internal/stats/extension_test.go @@ -0,0 +1,526 @@ +package stats + +import ( + "testing" + "time" +) + +func TestExtractExtensionPolicy(t *testing.T) { + cases := []struct { + path string + want string + }{ + // Happy path + {"main.go", ".go"}, + {"src/pkg/util.go", ".go"}, + {"foo/bar/Baz.PNG", ".png"}, // lowercased + // Multi-dot: last segment wins. + {"archive.tar.gz", ".gz"}, + {"src/.eslintrc.json", ".json"}, // dotfile with a real ext splits on last + // Single-dot dotfiles keep full name — ".gitignore" carries + // meaning as a group; merging with "(none)" would confuse a + // Makefile-only repo with a dotfile-only one. + {".gitignore", ".gitignore"}, + {".env", ".env"}, + {"project/.env", ".env"}, + // Extensionless files. + {"Makefile", "(none)"}, + {"LICENSE", "(none)"}, + {"bin/run", "(none)"}, + // Degenerate. 
+ {"", "(none)"}, + {"weird.", "(none)"}, + {".", "(none)"}, // single dot is not a dotfile, just noise + {"..", "(none)"}, // two dots → trailing-dot rule collapses + {"/", "(none)"}, // just a separator + {"foo/", "(none)"}, // trailing slash, empty basename + // Multi-input stem prefix. LoadMultiJSONL prepends ":" + // to root-level paths, and the stem may legitimately contain + // dots; without stripping, those dots would be mistaken for + // extensions. Only reaches the basename for root-level files — + // nested paths already discard the prefix via the slash split. + {"repo.v1:Makefile", "(none)"}, + {"repo.v1:LICENSE", "(none)"}, + {"repo.v1:foo.go", ".go"}, // real ext still wins after prefix strip + {"repo:Makefile", "(none)"}, // stem with no dots — same rule + {"repo.v1:.gitignore", ".gitignore"}, // dotfile survives prefix + {"repo.v1:src/foo.go", ".go"}, // nested path: slash strips prefix first + {"repo.v1:", "(none)"}, // prefix with empty basename + } + for _, c := range cases { + got := extractExtension(c.path) + if got != c.want { + t.Errorf("extractExtension(%q) = %q, want %q", c.path, got, c.want) + } + } +} + +func TestExtensionStatsAggregation(t *testing.T) { + // Hand-built dataset so aggregation is inspectable: two .go files + // with distinct devs, one .yaml shared by both, and a Makefile + // (extensionless) owned by one dev. First/last dates differ so the + // aggregator must track min/max across files, not last-seen only. 
+ ds := &Dataset{ + Latest: time.Date(2024, 6, 1, 0, 0, 0, 0, time.UTC), + files: map[string]*fileEntry{ + "cmd/main.go": { + additions: 100, deletions: 20, recentChurn: 80, + devLines: map[string]int64{"alice@x": 90, "bob@x": 30}, + firstChange: time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC), + lastChange: time.Date(2024, 5, 1, 0, 0, 0, 0, time.UTC), + }, + "internal/util.go": { + additions: 40, deletions: 10, recentChurn: 30, + devLines: map[string]int64{"alice@x": 50}, + firstChange: time.Date(2024, 2, 1, 0, 0, 0, 0, time.UTC), + lastChange: time.Date(2024, 4, 1, 0, 0, 0, 0, time.UTC), + }, + "deploy/prod.yaml": { + additions: 20, deletions: 5, recentChurn: 15, + devLines: map[string]int64{"alice@x": 15, "bob@x": 10}, + firstChange: time.Date(2024, 3, 1, 0, 0, 0, 0, time.UTC), + lastChange: time.Date(2024, 6, 1, 0, 0, 0, 0, time.UTC), + }, + "Makefile": { + additions: 5, deletions: 1, recentChurn: 2, + devLines: map[string]int64{"bob@x": 6}, + firstChange: time.Date(2023, 12, 1, 0, 0, 0, 0, time.UTC), + lastChange: time.Date(2024, 1, 15, 0, 0, 0, 0, time.UTC), + }, + }, + } + + result := ExtensionStats(ds, 0) + if len(result) != 3 { + t.Fatalf("got %d buckets, want 3 (.go, .yaml, (none))", len(result)) + } + + // Sort order: recent churn desc → .go (110) > .yaml (15) > (none) (2). 
+ if result[0].Ext != ".go" { + t.Errorf("result[0] = %q, want .go", result[0].Ext) + } + if result[0].Files != 2 { + t.Errorf(".go files = %d, want 2", result[0].Files) + } + if result[0].Churn != 170 { // 120 + 50 + t.Errorf(".go churn = %d, want 170", result[0].Churn) + } + if result[0].RecentChurn != 110 { + t.Errorf(".go recentChurn = %.1f, want 110", result[0].RecentChurn) + } + if result[0].UniqueDevs != 2 { // alice, bob across both .go files + t.Errorf(".go devs = %d, want 2", result[0].UniqueDevs) + } + if result[0].FirstSeen != "2024-01-01" { + t.Errorf(".go firstSeen = %q, want 2024-01-01", result[0].FirstSeen) + } + if result[0].LastSeen != "2024-05-01" { + t.Errorf(".go lastSeen = %q, want 2024-05-01 (max across .go files)", result[0].LastSeen) + } + + // (none) bucket is last (lowest recent churn). Confirm Makefile + // collapsed there and the FirstSeen predates the .go range — the + // aggregator must take the earliest firstChange across bucket files. + last := result[len(result)-1] + if last.Ext != "(none)" { + t.Errorf("last bucket = %q, want (none)", last.Ext) + } + if last.Files != 1 || last.FirstSeen != "2023-12-01" { + t.Errorf("(none) bucket = %+v", last) + } +} + +// Regression: a file renamed across extensions (foo.js → foo.ts) +// collapses onto one canonical path after applyRenames. Bucketing on +// that canonical path alone would assign ALL historical churn to .ts +// and zero to .js — an ugly skew in migration-heavy repos. The fix +// uses fileEntry.byExt (populated at per-change time) to split the +// lineage back across both buckets. +func TestExtensionStatsHonorsPerEraSplit(t *testing.T) { + ds := &Dataset{ + Latest: time.Date(2024, 6, 1, 0, 0, 0, 0, time.UTC), + files: map[string]*fileEntry{ + // Simulates what reader.go produces post-applyRenames: one + // canonical key ("foo.ts") with churn split across the two + // extensions the file held during its lifetime. 
+ "foo.ts": { + additions: 1200, deletions: 300, + recentChurn: 900, + devLines: map[string]int64{"alice@x": 900, "bob@x": 600}, + byExt: map[string]*extContribution{ + ".js": { + churn: 1000, + recentChurn: 200, // old era, decayed + firstChange: time.Date(2023, 1, 1, 0, 0, 0, 0, time.UTC), + lastChange: time.Date(2024, 2, 15, 0, 0, 0, 0, time.UTC), + }, + ".ts": { + churn: 500, + recentChurn: 700, // recent migration, less decay + firstChange: time.Date(2024, 2, 16, 0, 0, 0, 0, time.UTC), + lastChange: time.Date(2024, 5, 30, 0, 0, 0, 0, time.UTC), + }, + }, + }, + }, + } + + result := ExtensionStats(ds, 0) + if len(result) != 2 { + t.Fatalf("got %d buckets, want 2 (.js + .ts)", len(result)) + } + + var js, ts *ExtensionStat + for i := range result { + switch result[i].Ext { + case ".js": + js = &result[i] + case ".ts": + ts = &result[i] + } + } + if js == nil || ts == nil { + t.Fatalf("missing .js or .ts bucket; result=%+v", result) + } + + // Churn must reflect the per-era split, NOT be lumped into .ts. + if js.Churn != 1000 || ts.Churn != 500 { + t.Errorf("churn split wrong: .js=%d (want 1000), .ts=%d (want 500)", js.Churn, ts.Churn) + } + // RecentChurn similarly preserved per-era. + if js.RecentChurn != 200 || ts.RecentChurn != 700 { + t.Errorf("recent churn split wrong: .js=%.1f, .ts=%.1f", js.RecentChurn, ts.RecentChurn) + } + // One file lineage, but counts once per ext it held. + if js.Files != 1 || ts.Files != 1 { + t.Errorf("files per bucket: .js=%d, .ts=%d, want 1/1 (lineage counts in each bucket it held)", js.Files, ts.Files) + } + // Dates: each ext reports the range from its own era, not the whole + // file's range. .js ended when the migration started; .ts starts + // the day after. 
+ if js.LastSeen != "2024-02-15" { + t.Errorf(".js LastSeen = %q, want 2024-02-15 (migration cutoff, not post-rename activity)", js.LastSeen) + } + if ts.FirstSeen != "2024-02-16" { + t.Errorf(".ts FirstSeen = %q, want 2024-02-16 (post-rename era start)", ts.FirstSeen) + } +} + +// Regression: applyRenames calls mergeFileEntry when two fileEntries +// collapse onto the same canonical path (foo.js → foo.ts). If the +// byExt merge drops an entry, sums dates incorrectly, or clobbers an +// overlapping bucket, ExtensionStats silently loses per-era +// attribution and the TestExtensionStatsHonorsPerEraSplit consumer- +// side guard wouldn't catch it — that test hand-builds byExt and +// never exercises the merger. +func TestMergeFileEntryByExt(t *testing.T) { + // dst covers two extensions; src covers one that overlaps (.js) + // and one that's new to dst (.md). After merge: .js aggregates, + // .md is transferred, .ts is untouched. + dst := &fileEntry{ + devLines: map[string]int64{"alice@x": 10}, + devCommits: map[string]int{"alice@x": 1}, + monthChurn: map[string]int64{}, + byExt: map[string]*extContribution{ + ".js": { + churn: 300, + recentChurn: 100, + firstChange: time.Date(2023, 1, 1, 0, 0, 0, 0, time.UTC), + lastChange: time.Date(2023, 6, 1, 0, 0, 0, 0, time.UTC), + }, + ".ts": { + churn: 500, + recentChurn: 400, + firstChange: time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC), + lastChange: time.Date(2024, 5, 1, 0, 0, 0, 0, time.UTC), + }, + }, + } + src := &fileEntry{ + devLines: map[string]int64{"bob@x": 20}, + devCommits: map[string]int{"bob@x": 2}, + monthChurn: map[string]int64{}, + byExt: map[string]*extContribution{ + ".js": { + // Earlier first + later last than dst — both bounds + // should widen after merge. 
+ churn: 100, + recentChurn: 50, + firstChange: time.Date(2022, 6, 1, 0, 0, 0, 0, time.UTC), + lastChange: time.Date(2023, 12, 1, 0, 0, 0, 0, time.UTC), + }, + ".md": { + churn: 40, + recentChurn: 10, + firstChange: time.Date(2023, 3, 1, 0, 0, 0, 0, time.UTC), + lastChange: time.Date(2023, 8, 1, 0, 0, 0, 0, time.UTC), + }, + }, + } + + mergeFileEntry(dst, src) + + if len(dst.byExt) != 3 { + t.Fatalf("dst.byExt size = %d, want 3 (.js .ts .md)", len(dst.byExt)) + } + + // Overlapping bucket: churn and recentChurn add; dates widen. + js := dst.byExt[".js"] + if js.churn != 400 { + t.Errorf(".js churn = %d, want 400 (300+100)", js.churn) + } + if js.recentChurn != 150 { + t.Errorf(".js recentChurn = %.1f, want 150 (100+50)", js.recentChurn) + } + if !js.firstChange.Equal(time.Date(2022, 6, 1, 0, 0, 0, 0, time.UTC)) { + t.Errorf(".js firstChange = %v, want 2022-06-01 (src's earlier date won)", js.firstChange) + } + if !js.lastChange.Equal(time.Date(2023, 12, 1, 0, 0, 0, 0, time.UTC)) { + t.Errorf(".js lastChange = %v, want 2023-12-01 (src's later date won)", js.lastChange) + } + + // Non-overlapping bucket transferred from src: must be present + // with src's values preserved verbatim. + md := dst.byExt[".md"] + if md == nil { + t.Fatal(".md bucket missing after merge — src entry dropped") + } + if md.churn != 40 || md.recentChurn != 10 { + t.Errorf(".md = %+v, want churn=40 recentChurn=10", md) + } + + // dst-only bucket untouched. + ts := dst.byExt[".ts"] + if ts.churn != 500 || ts.recentChurn != 400 { + t.Errorf(".ts = %+v, want churn=500 recentChurn=400 (dst value preserved)", ts) + } +} + +// mergeFileEntry must lazily create dst.byExt when dst started nil — +// covers the case where a long-lived file without extension history +// collides with a newly-seen one that carries a byExt map. 
+func TestMergeFileEntryByExtNilDst(t *testing.T) {
+	dst := &fileEntry{
+		devLines:   map[string]int64{},
+		devCommits: map[string]int{},
+		monthChurn: map[string]int64{},
+	}
+	src := &fileEntry{
+		devLines:   map[string]int64{},
+		devCommits: map[string]int{},
+		monthChurn: map[string]int64{},
+		byExt: map[string]*extContribution{
+			".go": {churn: 10, recentChurn: 5},
+		},
+	}
+	mergeFileEntry(dst, src) // dst.byExt starts nil — merger must allocate it, not panic
+	if dst.byExt == nil {
+		t.Fatal("dst.byExt still nil after merging from non-nil src")
+	}
+	if got := dst.byExt[".go"]; got == nil || got.churn != 10 {
+		t.Errorf("dst.byExt[.go] = %+v, want churn=10", got)
+	}
+}
+
+// Inverse case: src has no byExt (e.g. legacy or hand-built). Merge
+// must be a no-op on dst.byExt and not panic.
+func TestMergeFileEntryByExtNilSrc(t *testing.T) {
+	dst := &fileEntry{
+		devLines:   map[string]int64{},
+		devCommits: map[string]int{},
+		monthChurn: map[string]int64{},
+		byExt: map[string]*extContribution{
+			".rs": {churn: 7},
+		},
+	}
+	src := &fileEntry{
+		devLines:   map[string]int64{},
+		devCommits: map[string]int{},
+		monthChurn: map[string]int64{},
+	}
+	mergeFileEntry(dst, src) // src.byExt is nil — dst's buckets must survive untouched
+	if len(dst.byExt) != 1 || dst.byExt[".rs"].churn != 7 {
+		t.Errorf("dst.byExt mutated by nil-src merge: %+v", dst.byExt)
+	}
+}
+
+// DevProfile extensions: verifies that a dev's extension footprint is
+// aggregated from the files they touched, sorted files-desc with churn
+// as the tiebreak (see TestDevProfileExtensionsSortedByFiles), with Pct
+// equal to files/FilesTouched*100. Uses a hand-built dataset so the
+// expected distribution is deterministic.
+func TestDevProfileExtensions(t *testing.T) {
+	ds := &Dataset{
+		Latest: time.Date(2024, 6, 1, 0, 0, 0, 0, time.UTC),
+		contributors: map[string]*ContributorStat{
+			"alice@x": {
+				Name: "Alice", Email: "alice@x",
+				Commits: 4, FilesTouched: 4, ActiveDays: 2,
+				FirstDate: "2024-01-01", LastDate: "2024-05-01",
+			},
+		},
+		files: map[string]*fileEntry{
+			"cmd/main.go":      {devLines: map[string]int64{"alice@x": 100}, devCommits: map[string]int{"alice@x": 2}, additions: 80, deletions: 20},
+			"internal/util.go": {devLines: map[string]int64{"alice@x": 60}, devCommits: map[string]int{"alice@x": 1}, additions: 50, deletions: 10},
+			"deploy/prod.yaml": {devLines: map[string]int64{"alice@x": 20}, devCommits: map[string]int{"alice@x": 1}, additions: 15, deletions: 5},
+			"Makefile":         {devLines: map[string]int64{"alice@x": 5}, devCommits: map[string]int{"alice@x": 1}, additions: 5, deletions: 0},
+		},
+		commits:  map[string]*commitEntry{},
+		workGrid: [7][24]int{},
+	}
+
+	profiles := DevProfiles(ds, "alice@x", 0) // n=0: no top-N cap on returned profiles
+	if len(profiles) != 1 {
+		t.Fatalf("got %d profiles, want 1", len(profiles))
+	}
+	p := profiles[0]
+
+	if len(p.Extensions) != 3 {
+		t.Fatalf("alice.Extensions len = %d, want 3 (.go, .yaml, (none))", len(p.Extensions))
+	}
+	// .go dominates (160 churn across 2 files), .yaml next (20), (none)
+	// last (5). Pct based on files/FilesTouched = files/4.
+	if p.Extensions[0].Ext != ".go" || p.Extensions[0].Files != 2 || p.Extensions[0].Churn != 160 {
+		t.Errorf("[0] = %+v, want {.go, 2, 160}", p.Extensions[0])
+	}
+	if p.Extensions[0].Pct != 50.0 {
+		t.Errorf(".go pct = %.1f, want 50.0 (2/4)", p.Extensions[0].Pct)
+	}
+	if p.Extensions[1].Ext != ".yaml" || p.Extensions[1].Files != 1 {
+		t.Errorf("[1] = %+v, want .yaml/1", p.Extensions[1])
+	}
+	if p.Extensions[2].Ext != "(none)" {
+		t.Errorf("[2] = %+v, want (none)", p.Extensions[2])
+	}
+}
+
+// Regression: sort MUST be files desc (not churn desc) so the
+// displayed Pct — computed from files — is monotonic in both CLI and
+// HTML bar widths. Here .py has MORE churn (one huge commit) but
+// FEWER files than .go. Under the previous churn-first sort, .py
+// would lead and the Pct column (25% for .py, 75% for .go) would
+// decrease non-monotonically in a files-sorted visual. Under the
+// corrected files-first sort, .go leads as it should.
+func TestDevProfileExtensionsSortedByFiles(t *testing.T) {
+	ds := &Dataset{
+		contributors: map[string]*ContributorStat{
+			"alice@x": {Email: "alice@x", Commits: 2, FilesTouched: 4, ActiveDays: 1},
+		},
+		files: map[string]*fileEntry{
+			"a.go": {devLines: map[string]int64{"alice@x": 10}, devCommits: map[string]int{"alice@x": 1}},
+			"b.go": {devLines: map[string]int64{"alice@x": 10}, devCommits: map[string]int{"alice@x": 1}},
+			"c.go": {devLines: map[string]int64{"alice@x": 10}, devCommits: map[string]int{"alice@x": 1}},
+			// One .py file with way more dev-lines than each .go.
+			"big.py": {devLines: map[string]int64{"alice@x": 500}, devCommits: map[string]int{"alice@x": 1}},
+		},
+		commits:  map[string]*commitEntry{},
+		workGrid: [7][24]int{},
+	}
+	p := DevProfiles(ds, "alice@x", 0)[0] // single contributor, so [0] is safe
+	if p.Extensions[0].Ext != ".go" {
+		t.Errorf("[0] = %q, want .go (3 files beats .py's 1 file under files-first sort)", p.Extensions[0].Ext)
+	}
+	if p.Extensions[0].Pct != 75.0 {
+		t.Errorf(".go Pct = %.1f, want 75.0", p.Extensions[0].Pct)
+	}
+	if p.Extensions[1].Ext != ".py" || p.Extensions[1].Pct != 25.0 {
+		t.Errorf("[1] = %+v, want .py @ 25%%", p.Extensions[1])
+	}
+	// The .py churn (500) is surfaced on the field for JSON consumers
+	// even though it ranks second by file count.
+	if p.Extensions[1].Churn != 500 {
+		t.Errorf(".py Churn = %d, want 500 (raw value still exposed)", p.Extensions[1].Churn)
+	}
+}
+
+// Edge case: a dev whose only touches are root-level extensionless
+// files (Makefile, LICENSE) collapses into a single "(none)" bucket
+// at 100% — no crash, no fallthrough.
+func TestDevProfileExtensionsAllNone(t *testing.T) {
+	ds := &Dataset{
+		contributors: map[string]*ContributorStat{
+			"ops@x": {Email: "ops@x", Commits: 2, FilesTouched: 2, ActiveDays: 1},
+		},
+		files: map[string]*fileEntry{
+			"Makefile": {devLines: map[string]int64{"ops@x": 30}, devCommits: map[string]int{"ops@x": 1}},
+			"LICENSE":  {devLines: map[string]int64{"ops@x": 5}, devCommits: map[string]int{"ops@x": 1}},
+		},
+		commits:  map[string]*commitEntry{},
+		workGrid: [7][24]int{},
+	}
+	p := DevProfiles(ds, "ops@x", 0)[0]
+	if len(p.Extensions) != 1 || p.Extensions[0].Ext != "(none)" {
+		t.Fatalf("Extensions = %+v, want single (none) bucket", p.Extensions)
+	}
+	if p.Extensions[0].Pct != 100.0 {
+		t.Errorf("(none) Pct = %.1f, want 100.0", p.Extensions[0].Pct)
+	}
+}
+
+// Edge case: a dev whose commits never touch any file (all commits
+// had files_changed = 0, so no commit_file records reached fe.devLines).
+// devFiles[email] is absent; Extensions must be nil — both HTML
+// templates guard on truthiness so a nil slice renders as nothing.
+func TestDevProfileExtensionsEmpty(t *testing.T) {
+	ds := &Dataset{
+		contributors: map[string]*ContributorStat{
+			"ghost@x": {Email: "ghost@x", Commits: 1, FilesTouched: 0, ActiveDays: 1},
+		},
+		files:    map[string]*fileEntry{},
+		commits:  map[string]*commitEntry{},
+		workGrid: [7][24]int{},
+	}
+	p := DevProfiles(ds, "ghost@x", 0)[0]
+	if len(p.Extensions) != 0 {
+		t.Errorf("Extensions = %+v, want empty", p.Extensions)
+	}
+}
+
+// Truncate to top-5 when a dev's extension set is larger. Under the
+// files-first sort, ties on file count (all 1 each here) fall through
+// to churn desc, so the top 5 by churn still win.
+func TestDevProfileExtensionsTopFive(t *testing.T) {
+	ds := &Dataset{
+		Latest: time.Date(2024, 6, 1, 0, 0, 0, 0, time.UTC),
+		contributors: map[string]*ContributorStat{
+			"alice@x": {Email: "alice@x", Commits: 7, FilesTouched: 7, ActiveDays: 1},
+		},
+		files: map[string]*fileEntry{
+			"a.go":  {devLines: map[string]int64{"alice@x": 100}, devCommits: map[string]int{"alice@x": 1}},
+			"a.py":  {devLines: map[string]int64{"alice@x": 80}, devCommits: map[string]int{"alice@x": 1}},
+			"a.rs":  {devLines: map[string]int64{"alice@x": 60}, devCommits: map[string]int{"alice@x": 1}},
+			"a.ts":  {devLines: map[string]int64{"alice@x": 40}, devCommits: map[string]int{"alice@x": 1}},
+			"a.md":  {devLines: map[string]int64{"alice@x": 20}, devCommits: map[string]int{"alice@x": 1}},
+			"a.sh":  {devLines: map[string]int64{"alice@x": 5}, devCommits: map[string]int{"alice@x": 1}},
+			"a.yml": {devLines: map[string]int64{"alice@x": 3}, devCommits: map[string]int{"alice@x": 1}},
+		},
+		commits:  map[string]*commitEntry{},
+		workGrid: [7][24]int{},
+	}
+	p := DevProfiles(ds, "alice@x", 0)[0]
+	if len(p.Extensions) != 5 {
+		t.Fatalf("Extensions len = %d, want top-5 truncation", len(p.Extensions))
+	}
+	// Top 5 by churn: .go .py .rs .ts .md. .sh and .yml excluded.
+	for i, want := range []string{".go", ".py", ".rs", ".ts", ".md"} {
+		if p.Extensions[i].Ext != want {
+			t.Errorf("[%d] = %q, want %q", i, p.Extensions[i].Ext, want)
+		}
+	}
+}
+
+func TestExtensionStatsTopN(t *testing.T) {
+	ds := &Dataset{
+		files: map[string]*fileEntry{
+			"a.go": {recentChurn: 100, devLines: map[string]int64{"a": 1}},
+			"b.py": {recentChurn: 50, devLines: map[string]int64{"a": 1}},
+			"c.rs": {recentChurn: 10, devLines: map[string]int64{"a": 1}},
+		},
+	}
+	result := ExtensionStats(ds, 2) // n=2 caps the result after the recent-churn sort
+	if len(result) != 2 {
+		t.Fatalf("top 2 len = %d", len(result))
+	}
+	if result[0].Ext != ".go" || result[1].Ext != ".py" {
+		t.Errorf("top 2 = [%s, %s], want [.go, .py]", result[0].Ext, result[1].Ext)
+	}
+}
diff --git a/internal/stats/format.go b/internal/stats/format.go
index ad61d0f..66ca240 100644
--- a/internal/stats/format.go
+++ b/internal/stats/format.go
@@ -159,6 +159,36 @@ func (f *Formatter) PrintDirectories(dirs []DirStat) error {
 	}
 }
 
+func (f *Formatter) PrintExtensions(exts []ExtensionStat) error {
+	switch f.format {
+	case "json":
+		return f.writeJSON(exts)
+	case "csv":
+		rows := make([][]string, len(exts))
+		for i, e := range exts {
+			rows[i] = []string{
+				e.Ext,
+				fmt.Sprintf("%d", e.Files),
+				fmt.Sprintf("%d", e.Churn),
+				fmt.Sprintf("%.1f", e.RecentChurn),
+				fmt.Sprintf("%d", e.UniqueDevs),
+				e.FirstSeen,
+				e.LastSeen,
+			}
+		}
+		return f.writeCSV([]string{"ext", "files", "churn", "recent_churn", "unique_devs", "first_seen", "last_seen"}, rows)
+	default:
+		tw := tabwriter.NewWriter(f.w, 0, 4, 2, ' ', 0) // default: human-readable aligned table
+		fmt.Fprintf(tw, "EXT\tFILES\tCHURN\tRECENT CHURN\tDEVS\tFIRST SEEN\tLAST SEEN\n")
+		fmt.Fprintf(tw, "---\t-----\t-----\t------------\t----\t----------\t---------\n")
+		for _, e := range exts {
+			fmt.Fprintf(tw, "%s\t%d\t%d\t%.1f\t%d\t%s\t%s\n",
+				e.Ext, e.Files, e.Churn, e.RecentChurn, e.UniqueDevs, e.FirstSeen, e.LastSeen)
+		}
+		return tw.Flush()
+	}
+}
+
 func (f *Formatter) PrintActivity(buckets []ActivityBucket) error {
 	switch f.format {
 	case "json":
@@ -438,6 +468,16 @@ func (f *Formatter) PrintProfiles(profiles []DevProfile) error {
 			fmt.Fprintf(f.w, "%s (%.0f%%)", s.Dir, s.Pct)
 		}
 		fmt.Fprintln(f.w)
+		if len(p.Extensions) > 0 {
+			fmt.Fprintf(f.w, "  Extensions: ")
+			for j, e := range p.Extensions {
+				if j > 0 {
+					fmt.Fprintf(f.w, ", ")
+				}
+				fmt.Fprintf(f.w, "%s (%.0f%%)", e.Ext, e.Pct)
+			}
+			fmt.Fprintln(f.w)
+		}
 		// %.3f (not %.2f): labels are assigned at thresholds 0.15 / 0.35
 		// / 0.7 using the unrounded float. With %.2f a value like
 		// 0.149 displays as "0.15" and the "broad generalist" label
diff --git a/internal/stats/reader.go b/internal/stats/reader.go
index d75952c..6c395d8 100644
--- a/internal/stats/reader.go
+++ b/internal/stats/reader.go
@@ -34,6 +34,21 @@ type fileEntry struct {
 	firstChange time.Time
 	lastChange  time.Time
 	monthChurn  map[string]int64 // key: "YYYY-MM"; used for trend classification
+	// byExt splits this file's churn across extensions as they appeared
+	// at each change, so a rename-across-extensions (foo.js → foo.ts)
+	// keeps per-era attribution correct even after applyRenames merges
+	// the lineage onto one canonical path. Populated at ingest in
+	// lockstep with additions/deletions/recentChurn; consumed only by
+	// ExtensionStats. nil for hand-built fileEntries in tests — the
+	// aggregator falls back to the canonical path's extension when so.
+	byExt map[string]*extContribution
+}
+
+type extContribution struct {
+	churn       int64     // lifetime additions+deletions attributed to this extension
+	recentChurn float64   // decay-weighted churn; same weight as fileEntry.recentChurn
+	firstChange time.Time // earliest change seen while the file held this extension
+	lastChange  time.Time // latest such change
 }
 
 type filePair struct{ a, b string }
@@ -334,6 +349,22 @@ func streamLoadInto(ds *Dataset, r io.Reader, opt LoadOptions, pathPrefix string
 		fe.additions += cf.Additions
 		fe.deletions += cf.Deletions
 
+		// byExt: attribute this change to the extension the path had
+		// AT THIS COMMIT, not the canonical post-rename extension.
+		// This is the only place the pre-rename path is still
+		// available; after applyRenames all entries collapse onto
+		// the final path and the split would be lost.
+		changeExt := extractExtension(path)
+		if fe.byExt == nil {
+			fe.byExt = make(map[string]*extContribution)
+		}
+		ec, ok := fe.byExt[changeExt]
+		if !ok {
+			ec = &extContribution{}
+			fe.byExt[changeExt] = ec
+		}
+		ec.churn += cf.Additions + cf.Deletions
+
 		cm := ds.commits[cf.Commit]
 		if cm != nil {
 			fe.devLines[cm.email] += cf.Additions + cf.Deletions
@@ -349,12 +380,19 @@
 			days := ds.Latest.Sub(cm.date).Hours() / 24
 			weight := math.Exp(-lambda * days)
 			fe.recentChurn += float64(cf.Additions+cf.Deletions) * weight
+			ec.recentChurn += float64(cf.Additions+cf.Deletions) * weight // same decay weight as the fe aggregate above
 			if cm.date.After(fe.lastChange) {
 				fe.lastChange = cm.date
 			}
 			if fe.firstChange.IsZero() || cm.date.Before(fe.firstChange) {
 				fe.firstChange = cm.date
 			}
+			if cm.date.After(ec.lastChange) {
+				ec.lastChange = cm.date
+			}
+			if ec.firstChange.IsZero() || cm.date.Before(ec.firstChange) {
+				ec.firstChange = cm.date
+			}
 			fe.monthChurn[cm.date.UTC().Format("2006-01")] += cf.Additions + cf.Deletions
 		}
 	}
@@ -598,6 +636,31 @@ func mergeFileEntry(dst, src *fileEntry) {
 	if src.lastChange.After(dst.lastChange) {
 		dst.lastChange = src.lastChange
 	}
+
+	// byExt: merge per-ext buckets. When rename collapses foo.js → foo.ts,
+	// this preserves ".js" churn on the ts-keyed entry so ExtensionStats
+	// can split it back. Same bucket on both sides = add; new bucket =
+	// transfer the pointer (src is discarded afterwards so ownership
+	// transfer is safe).
+	if src.byExt != nil {
+		if dst.byExt == nil {
+			dst.byExt = make(map[string]*extContribution, len(src.byExt))
+		}
+		for ext, srcEC := range src.byExt {
+			if dstEC, ok := dst.byExt[ext]; ok {
+				dstEC.churn += srcEC.churn
+				dstEC.recentChurn += srcEC.recentChurn
+				if !srcEC.firstChange.IsZero() && (dstEC.firstChange.IsZero() || srcEC.firstChange.Before(dstEC.firstChange)) {
+					dstEC.firstChange = srcEC.firstChange
+				}
+				if srcEC.lastChange.After(dstEC.lastChange) {
+					dstEC.lastChange = srcEC.lastChange
+				}
+			} else {
+				dst.byExt[ext] = srcEC
+			}
+		}
+	}
 }
 
 // isMechanicalRefactor returns true when a commit's shape matches a likely
diff --git a/internal/stats/stats.go b/internal/stats/stats.go
index 3064a69..1e85bf2 100644
--- a/internal/stats/stats.go
+++ b/internal/stats/stats.go
@@ -376,6 +376,210 @@ func DirectoryStats(ds *Dataset, n int) []DirStat {
 	return result
 }
 
+// ExtensionStat rolls history up per file extension. The historical
+// lens is the point: "which extension is the team spending effort on"
+// answers a different question than "which extension exists in the
+// repo" — the latter is what cloc/tokei do from the filesystem.
+//
+// Ext is the normalized extension (leading dot, lowercased) — e.g.
+// ".go", ".yaml", ".gitignore". Files without a conventional extension
+// collapse into the bucket "(none)"; see extractExtension for the
+// policy (dotfiles kept verbatim, last-segment rule for the rest).
+//
+// RecentChurn is the decay-weighted aggregate from the same pipeline
+// as FileStat.RecentChurn (half-life set at Dataset load time), so a
+// dropped-yesterday extension won't outrank a still-active one just
+// because it accumulated more lifetime churn.
+type ExtensionStat struct {
+	Ext         string
+	Files       int
+	Churn       int64
+	RecentChurn float64
+	UniqueDevs  int
+	FirstSeen   string
+	LastSeen    string
+}
+
+// extractExtension returns the extension bucket for a path.
+// Policy:
+// - Basename after the final "/" is the subject; directory prefix is
+//   ignored.
+// - Multi-input prefix (":") on root-level files is stripped
+//   first, so repo.v1:Makefile collapses to (none) instead of
+//   picking the dot inside the stem and emitting ".v1:makefile".
+//   LoadMultiJSONL is the only code path that injects ":" into
+//   tracked paths, so the presence of ":" in a basename is a strong
+//   signal of this prefix; the alternative — a real filename with
+//   ":" — is rare enough on POSIX/Windows that we accept the
+//   false-positive risk in exchange for correct multi-repo
+//   behaviour.
+// - Single-dot dotfiles (".gitignore", ".env") keep their full name —
+//   they carry meaning as a group, and reducing them to "" would
+//   merge them with extension-less files (Makefile, LICENSE).
+// - Multi-segment dotfiles (".env.local", ".eslintrc.json") report
+//   the last segment (".local", ".json"). Imperfect but keeps the
+//   rule "last segment after the final dot" consistent and explicit.
+// - Files with no dot (or a trailing dot) collapse into "(none)".
+// - Extensions are lowercased so ".PNG" and ".png" aggregate.
+func extractExtension(path string) string {
+	if i := strings.LastIndex(path, "/"); i >= 0 {
+		path = path[i+1:]
+	}
+	if path == "" {
+		return "(none)"
+	}
+	// Strip multi-input stem prefix. LoadMultiJSONL prepends ":"
+	// to paths; for nested files the slash-split above already
+	// discarded it, but root-level files still carry it and the dots
+	// inside a stem name (e.g. "repo.v1") would otherwise be mistaken
+	// for a real extension.
+	if i := strings.IndexByte(path, ':'); i >= 0 {
+		path = path[i+1:]
+		if path == "" {
+			return "(none)"
+		}
+	}
+	lastDot := strings.LastIndex(path, ".") // -1 = no dot; 0 = leading-dot name (dotfile)
+	if lastDot <= 0 {
+		// No dot at all, or a name that begins with "." and has no
+		// second dot. Dotfiles get their full name (".gitignore" is a
+		// meaningful group); "." alone is degenerate and collapses
+		// like Makefile / LICENSE.
+		if len(path) > 1 && strings.HasPrefix(path, ".") {
+			return strings.ToLower(path)
+		}
+		return "(none)"
+	}
+	ext := path[lastDot:]
+	if ext == "." {
+		return "(none)" // trailing dot like "foo." — pathological but defined
+	}
+	return strings.ToLower(ext)
+}
+
+// ExtensionStats aggregates ds.files by extension bucket. Complexity
+// is O(F + E log E) where F is the number of tracked files and E is
+// the number of distinct extensions — trivial compared to the stats
+// that compute Herfindahl indices or per-commit coupling pairs.
+//
+// Attribution uses fileEntry.byExt, which tracks churn per extension
+// as it existed at each change. A file renamed across extensions
+// (foo.js → foo.ts) contributes to both buckets proportionally to
+// its pre- and post-rename history; bucketing on the canonical path's
+// current extension alone would misattribute the entire lineage to
+// the final extension. A file that touched two extensions counts once
+// in each — "files" is "distinct lineages that ever held this
+// extension". Totals across buckets therefore exceed len(ds.files) in
+// repos with migrations.
+//
+// UniqueDevs is per-file-union: any dev who touched a file contributes
+// to every extension bucket that file ever held. This over-counts on
+// migration: a dev who only touched foo.js pre-rename will also
+// appear under ".ts" if the file was migrated. Accepting this as a
+// bounded caveat — splitting devs per era would require per-commit
+// dev tracking that doesn't exist on fileEntry.
+//
+// Falls back to the canonical path's extension when fe.byExt is nil
+// (hand-built fileEntries in unit tests never populate it).
+func ExtensionStats(ds *Dataset, n int) []ExtensionStat {
+	type extAcc struct {
+		files       int
+		churn       int64
+		recentChurn float64
+		devs        map[string]struct{}
+		first, last time.Time
+	}
+
+	getBucket := func(buckets map[string]*extAcc, ext string) *extAcc {
+		b, ok := buckets[ext]
+		if !ok {
+			b = &extAcc{devs: make(map[string]struct{})}
+			buckets[ext] = b
+		}
+		return b
+	}
+
+	buckets := make(map[string]*extAcc)
+	for path, fe := range ds.files {
+		// Real path: use the rename-aware per-ext split captured at
+		// ingest. Fallback: if byExt is nil (legacy data / test
+		// dataset), synthesize a single-bucket split from the
+		// canonical path's extension + current file aggregate.
+		if fe.byExt == nil {
+			ext := extractExtension(path)
+			b := getBucket(buckets, ext)
+			b.files++
+			b.churn += fe.additions + fe.deletions
+			b.recentChurn += fe.recentChurn
+			for email := range fe.devLines {
+				b.devs[email] = struct{}{}
+			}
+			if !fe.firstChange.IsZero() && (b.first.IsZero() || fe.firstChange.Before(b.first)) {
+				b.first = fe.firstChange
+			}
+			if !fe.lastChange.IsZero() && fe.lastChange.After(b.last) {
+				b.last = fe.lastChange
+			}
+			continue
+		}
+		for ext, ec := range fe.byExt {
+			b := getBucket(buckets, ext)
+			b.files++ // one count per (lineage, ext) pair — bucket totals can exceed len(ds.files)
+			b.churn += ec.churn
+			b.recentChurn += ec.recentChurn
+			// Dev attribution uses the per-file dev set (see doc comment
+			// — we accept the cross-era over-count rather than a coarser
+			// approximation).
+			for email := range fe.devLines {
+				b.devs[email] = struct{}{}
+			}
+			if !ec.firstChange.IsZero() && (b.first.IsZero() || ec.firstChange.Before(b.first)) {
+				b.first = ec.firstChange
+			}
+			if !ec.lastChange.IsZero() && ec.lastChange.After(b.last) {
+				b.last = ec.lastChange
+			}
+		}
+	}
+
+	result := make([]ExtensionStat, 0, len(buckets))
+	for ext, b := range buckets {
+		es := ExtensionStat{
+			Ext:         ext,
+			Files:       b.files,
+			Churn:       b.churn,
+			RecentChurn: math.Round(b.recentChurn*10) / 10,
+			UniqueDevs:  len(b.devs),
+		}
+		if !b.first.IsZero() {
+			es.FirstSeen = b.first.UTC().Format("2006-01-02")
+		}
+		if !b.last.IsZero() {
+			es.LastSeen = b.last.UTC().Format("2006-01-02")
+		}
+		result = append(result, es)
+	}
+
+	// Deterministic ordering under ties: recent churn desc, files desc,
+	// ext asc. Recent churn leads so that "where is effort landing now"
+	// is what the reader sees first; files breaks ties between similar
+	// recent-churn levels by established footprint.
+	sort.Slice(result, func(i, j int) bool {
+		if result[i].RecentChurn != result[j].RecentChurn {
+			return result[i].RecentChurn > result[j].RecentChurn
+		}
+		if result[i].Files != result[j].Files {
+			return result[i].Files > result[j].Files
+		}
+		return result[i].Ext < result[j].Ext
+	})
+
+	if n > 0 && n < len(result) {
+		result = result[:n]
+	}
+	return result
+}
+
 func ActivityOverTime(ds *Dataset, granularity string) []ActivityBucket {
 	buckets := make(map[string]*ActivityBucket)
 	var order []string
@@ -1060,6 +1264,7 @@ type DevProfile struct {
 	LastDate string
 	TopFiles []DevFileContrib
 	Scope    []DirScope
+	Extensions []DevExtContrib
 	Specialization float64 // Gini over dir file-count distribution: 0 = broad generalist, 1 = single-dir specialist
 	ContribRatio   float64 // del/add — 0=growth, ~1=rewrite, >1=cleanup
 	ContribType    string  // "growth", "balanced", "refactor"
@@ -1088,6 +1293,26 @@ type DevFileContrib struct {
 	Churn int64
 }
 
+// DevExtContrib is a dev's footprint in a single extension bucket.
+// Churn is the summed per-file dev-lines (from fe.devLines), so it
+// reflects lines the dev personally added/removed across files that
+// currently carry this extension — NOT the file's lifetime churn.
+// Pct is the share of the dev's files (by count) that land in this
+// bucket, matching DirScope.Pct's semantics so the two read
+// consistently side by side.
+//
+// Caveat: the bucket is derived from the file's canonical post-rename
+// path. A dev who worked on foo.js pre-migration still shows up under
+// ".ts" if that file was later renamed. Per-era per-dev attribution
+// would need byExt to carry a dev dimension, which isn't tracked; see
+// METRICS.md for the full rationale.
+type DevExtContrib struct {
+	Ext   string
+	Files int
+	Churn int64
+	Pct   float64
+}
+
 // DevProfiles returns a profile for each developer (or a specific one if filterEmail is set).
 // DevProfiles builds per-developer profile structs. Pass n > 0 to
 // limit output to the top-N contributors by commit count — the cap is
@@ -1358,6 +1583,60 @@ func DevProfiles(ds *Dataset, filterEmail string, n int) []DevProfile {
 		scope = scope[:5]
 	}
 
+	// Extensions: the dev's language/skill fingerprint. Aggregated
+	// from the same devFiles map used for Scope + TopFiles — each
+	// file contributes its dev-attributable churn (devLines[email])
+	// to the bucket picked via the file's canonical-path
+	// extension. The canonical-path simplification is documented
+	// in METRICS.md; per-era per-dev would need byExt to carry a
+	// dev dimension.
+	type extAccForDev struct {
+		files int
+		churn int64
+	}
+	extCount := make(map[string]*extAccForDev)
+	if files, ok := devFiles[email]; ok {
+		for path, fa := range files {
+			ext := extractExtension(path)
+			acc, ok := extCount[ext]
+			if !ok {
+				acc = &extAccForDev{}
+				extCount[ext] = acc
+			}
+			acc.files++
+			acc.churn += fa.churn
+		}
+	}
+	var extensions []DevExtContrib
+	for ext, acc := range extCount {
+		pct := 0.0
+		if cs.FilesTouched > 0 {
+			pct = math.Round(float64(acc.files)/float64(cs.FilesTouched)*1000) / 10 // one-decimal share of the dev's files
+		}
+		extensions = append(extensions, DevExtContrib{
+			Ext: ext, Files: acc.files, Churn: acc.churn, Pct: pct,
+		})
+	}
+	// Sort mirrors Scope: files desc first, so the displayed Pct
+	// (computed from files) is monotonic in CLI and the HTML bar
+	// widths — Pct-sorted = visually sorted. Tiebreak on churn
+	// desc keeps the "more investment wins" signal when two
+	// buckets hold the same number of files, then ext asc for
+	// determinism. The Churn field on each entry is still
+	// available for JSON consumers who want a churn-ranked view.
+	sort.Slice(extensions, func(i, j int) bool {
+		if extensions[i].Files != extensions[j].Files {
+			return extensions[i].Files > extensions[j].Files
+		}
+		if extensions[i].Churn != extensions[j].Churn {
+			return extensions[i].Churn > extensions[j].Churn
+		}
+		return extensions[i].Ext < extensions[j].Ext
+	})
+	if len(extensions) > 5 {
+		extensions = extensions[:5] // cap mirrors the Scope top-5 truncation above
+	}
+
 	// Contribution type
 	contribRatio := 0.0
 	contribType := "growth"
@@ -1404,7 +1683,7 @@
 		Commits: cs.Commits, Additions: cs.Additions, Deletions: cs.Deletions,
 		LinesChanged: cs.Additions + cs.Deletions, FilesTouched: cs.FilesTouched,
 		ActiveDays: cs.ActiveDays, FirstDate: cs.FirstDate, LastDate: cs.LastDate,
-		TopFiles: topFiles, Scope: scope, Specialization: specialization,
+		TopFiles: topFiles, Scope: scope, Extensions: extensions, Specialization: specialization,
 		ContribRatio: contribRatio, ContribType: contribType, Pace: pace,
 		Collaborators: collabs, MonthlyActivity: monthly, WorkGrid: grid, WeekendPct: wpct,