"""Collaborative network from Git history.

Builds two views of contributor collaboration from a GitHub repository:

1. Network graph — nodes are contributors, edges come from:
   - PR collaboration: both participated (author / reviewer) in the same pull request
   - Temporal file overlap: both touched the same code area AND their
     activity windows overlap in time
2. Code Archaeology heatmap — for each (contributor, code area) pair, measures
   how many days before the contributor arrived that area was last touched by
   someone else.
"""

import time
from collections import defaultdict
from itertools import combinations

import requests
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

# --- Configuration -----------------------------------------------------------
REPO = "torvalds/linux"   # change to any owner/repo
GITHUB_TOKEN = ""         # set a personal access token to raise rate limits
MAX_PRS = 100             # merged PRs to analyse (increase for deeper history)
MAX_COMMIT_PAGES = 5      # x100 commits each, for activity-window computation
MIN_CONTRIBUTIONS = 2     # exclude contributors below this threshold
FILE_GROUP_DEPTH = 1      # directory depth for code-area grouping (1 = top-level dir)


# --- Fetch data from GitHub ---------------------------------------------------

def make_headers(token=''):
    """Build GitHub REST API headers; adds a Bearer token when one is given."""
    h = {"Accept": "application/vnd.github+json"}
    if token:
        h["Authorization"] = f"Bearer {token}"
    return h


def paginate(url, headers, params=None, max_pages=None):
    """Fetch all pages of a GitHub list endpoint (100 items per page).

    Raises PermissionError on HTTP 403 (typically rate limiting) and
    ValueError on HTTP 404; any other error status raises via
    raise_for_status(). Stops on an empty page, when `max_pages` is
    reached, or when the Link header has no "next" relation.
    """
    results, page = [], 1
    while True:
        p = {**(params or {}), 'page': page, 'per_page': 100}
        r = requests.get(url, headers=headers, params=p, timeout=30)
        if r.status_code == 403:
            raise PermissionError("Rate limit hit — set GITHUB_TOKEN.")
        if r.status_code == 404:
            raise ValueError(f"Not found: {url}")
        r.raise_for_status()
        data = r.json()
        if not data:
            break
        results.extend(data)
        if (max_pages and page >= max_pages) or "next" not in r.links:
            break
        page += 1
    return results


def fetch_pr_data(repo, headers, max_prs=100):
    """Fetch up to `max_prs` merged PRs with their authors, reviewers and files.

    Returns a list of dicts: {'pr', 'author', 'reviewers', 'files', 'date'}.
    `date` is the merge timestamp. Reviewers exclude the author and any
    review whose user record is missing.
    """
    print(f"Fetching PRs from {repo}...")
    raw = paginate(
        f"https://api.github.com/repos/{repo}/pulls",
        headers, {"state": "closed"},
        # closed PRs include unmerged ones, so fetch a page beyond max_prs
        max_pages=(max_prs // 100 + 1) if max_prs else None,
    )
    merged = [pr for pr in raw if pr.get('merged_at')][:max_prs]

    records = []
    for i, pr in enumerate(merged):
        num = pr['number']
        author = (pr.get('user') or {}).get('login')
        date = pd.to_datetime(pr['merged_at'])

        reviews = paginate(
            f"https://api.github.com/repos/{repo}/pulls/{num}/reviews", headers
        )
        reviewers = list(
            {(r.get('user') or {}).get('login') for r in reviews} - {None, author}
        )

        files_data = paginate(
            f"https://api.github.com/repos/{repo}/pulls/{num}/files", headers
        )
        files = [f['filename'] for f in files_data]

        records.append({
            'pr': num, 'author': author,
            'reviewers': reviewers, 'files': files, 'date': date,
        })
        if (i + 1) % 10 == 0:
            print(f"  {i + 1}/{len(merged)} PRs")
        time.sleep(0.05)  # be gentle with the API

    print(f"Done — {len(records)} merged PRs loaded")
    return records


def fetch_commit_activity(repo, headers, max_pages=5):
    """Collect commit timestamps per contributor login.

    Returns {login: [Timestamp, ...]}. Commits without a linked GitHub
    account (c['author'] is None) are skipped.
    """
    print("Fetching commit history for activity windows...")
    commits = paginate(
        f"https://api.github.com/repos/{repo}/commits", headers, max_pages=max_pages
    )
    activity = defaultdict(list)
    for c in commits:
        login = (c.get('author') or {}).get('login')
        date_str = (c.get('commit', {}).get('author') or {}).get('date')
        if login and date_str:
            activity[login].append(pd.to_datetime(date_str))
    print(f"Found activity for {len(activity)} contributors")
    return dict(activity)


# --- Build collaboration network ----------------------------------------------

def build_activity_windows(pr_records, commit_activity, min_contributions=2):
    """Merge PR and commit dates into per-contributor (start, end) windows.

    Contributors with fewer than `min_contributions` dated events are dropped.
    """
    dates = defaultdict(list)
    for pr in pr_records:
        for person in [pr['author']] + pr['reviewers']:
            if person:
                dates[person].append(pr['date'])
    for person, ds in commit_activity.items():
        dates[person].extend(ds)
    return {
        p: (min(ds), max(ds))
        for p, ds in dates.items()
        if len(ds) >= min_contributions
    }


def overlaps(w1, w2):
    """True when two (start, end) windows intersect (boundaries inclusive)."""
    return w1[0] <= w2[1] and w2[0] <= w1[1]


def file_group(path, depth=1):
    """Map a file path to its code area: the first `depth` path components.

    Fix: the original returned only parts[0] whenever the path had <= depth
    components, so e.g. file_group('a/b', depth=2) yielded 'a' instead of
    'a/b'. Slicing handles both cases uniformly and is unchanged for the
    default depth=1.
    """
    return '/'.join(path.split('/')[:depth])


def build_network(pr_records, activity_windows, file_group_depth=1):
    """Build the contributor graph with 'pr_weight' and 'file_weight' edges.

    - pr_weight: number of PRs both people participated in
    - file_weight: number of shared code areas touched while both were active
    Only contributors present in `activity_windows` become nodes.
    """
    G = nx.Graph()
    G.add_nodes_from(activity_windows)

    # PR edges: both participated in the same pull request
    for pr in pr_records:
        participants = list({
            p for p in [pr['author']] + pr['reviewers']
            if p in activity_windows
        })
        for a, b in combinations(participants, 2):
            if G.has_edge(a, b):
                G[a][b]['pr_weight'] = G[a][b].get('pr_weight', 0) + 1
            else:
                G.add_edge(a, b, pr_weight=1, file_weight=0)

    # File + temporal-overlap edges: touched the same code area while both active
    area_contribs = defaultdict(set)
    for pr in pr_records:
        participants = [p for p in [pr['author']] + pr['reviewers'] if p in activity_windows]
        for f in pr['files']:
            area_contribs[file_group(f, file_group_depth)].update(participants)

    for area, people in area_contribs.items():
        for a, b in combinations(people, 2):
            if overlaps(activity_windows[a], activity_windows[b]):
                if G.has_edge(a, b):
                    G[a][b]['file_weight'] = G[a][b].get('file_weight', 0) + 1
                else:
                    G.add_edge(a, b, pr_weight=0, file_weight=1)

    return G


# --- Network visualisation ----------------------------------------------------

def plot_network(G, repo):
    """Draw the collaboration graph; edge colour encodes the edge origin."""
    fig, ax = plt.subplots(figsize=(14, 10))
    pos = nx.spring_layout(G, k=1.5, seed=42)  # fixed seed for a stable layout

    pr_only = [(u, v) for u, v, d in G.edges(data=True) if d.get('pr_weight', 0) > 0 and d.get('file_weight', 0) == 0]
    file_only = [(u, v) for u, v, d in G.edges(data=True) if d.get('pr_weight', 0) == 0 and d.get('file_weight', 0) > 0]
    both = [(u, v) for u, v, d in G.edges(data=True) if d.get('pr_weight', 0) > 0 and d.get('file_weight', 0) > 0]

    node_sizes = [200 + 80 * G.degree(n) for n in G.nodes()]
    nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color='steelblue', alpha=0.85, ax=ax)
    nx.draw_networkx_labels(G, pos, font_size=7, ax=ax)
    nx.draw_networkx_edges(G, pos, edgelist=pr_only, edge_color='#e07b39', width=1.5, alpha=0.7, ax=ax)
    nx.draw_networkx_edges(G, pos, edgelist=file_only, edge_color='#4caf50', width=1.0, alpha=0.6, ax=ax)
    nx.draw_networkx_edges(G, pos, edgelist=both, edge_color='#9c27b0', width=2.5, alpha=0.8, ax=ax)

    legend = [
        mpatches.Patch(color='#e07b39', label='PR collaboration only'),
        mpatches.Patch(color='#4caf50', label='Shared code area (temporal overlap) only'),
        mpatches.Patch(color='#9c27b0', label='Both'),
    ]
    ax.legend(handles=legend, loc='upper left', fontsize=9)
    ax.set_title(f'Collaboration Network — {repo}', fontsize=13)
    ax.axis('off')
    plt.tight_layout()
    plt.show()


# --- Code Archaeology ---------------------------------------------------------
# For each (contributor, code area) pair, the archaeology depth is the number
# of days between when a contributor first became active and when that area
# was last touched by someone else before them. Large value => contributor
# arrived long after others left; NaN => fully contemporaneous.

def build_archaeology_matrix(pr_records, activity_windows, file_group_depth=1,
                             top_files=30, top_contributors=25):
    """Build the (code area x contributor) archaeology-depth matrix in days.

    Rows are the `top_files` most-touched code areas, columns the
    `top_contributors` most active contributors. A cell is NaN when nobody
    else touched the area before the contributor's activity window started.
    """
    area_timeline = defaultdict(list)
    for pr in pr_records:
        participants = [p for p in [pr['author']] + pr['reviewers'] if p in activity_windows]
        for f in pr['files']:
            g = file_group(f, file_group_depth)
            for p in participants:
                area_timeline[g].append((pr['date'], p))
    for g in area_timeline:
        area_timeline[g].sort()

    contrib_count = defaultdict(int)
    for pr in pr_records:
        for p in [pr['author']] + pr['reviewers']:
            if p in activity_windows:
                contrib_count[p] += 1
    top_contribs = sorted(contrib_count, key=contrib_count.get, reverse=True)[:top_contributors]

    top_areas = sorted(
        area_timeline, key=lambda g: len(area_timeline[g]), reverse=True
    )[:top_files]

    matrix = pd.DataFrame(index=top_areas, columns=top_contribs, dtype=float)

    for area in top_areas:
        for c in top_contribs:
            c_start = activity_windows[c][0]
            prior = [
                date for date, person in area_timeline[area]
                if person != c and date < c_start
            ]
            if prior:
                # clamp at 0 in case of same-day touches
                matrix.loc[area, c] = float(max((c_start - max(prior)).days, 0))

    return matrix


def plot_archaeology(matrix, repo):
    """Render the archaeology matrix as a heatmap; NaN cells are masked."""
    data = matrix.dropna(how='all').dropna(axis=1, how='all')
    if data.empty:
        print('No archaeology data: all contributors were fully contemporaneous.')
        return

    # scale figure with matrix dimensions so labels stay readable
    h = max(6, len(data.index) * 0.35)
    w = max(10, len(data.columns) * 0.55)
    fig, ax = plt.subplots(figsize=(w, h))
    sns.heatmap(
        data.astype(float),
        cmap='YlOrRd',
        ax=ax,
        linewidths=0.3,
        cbar_kws={'label': 'Archaeology depth (days)'},
        mask=data.isna(),
    )
    ax.set_title(
        f'Code Archaeology — {repo}\n'
        'Days since others last touched this code area before a contributor arrived',
        fontsize=11,
    )
    ax.set_xlabel('Contributor')
    ax.set_ylabel('Code Area')
    plt.xticks(rotation=45, ha='right', fontsize=8)
    plt.yticks(fontsize=7)
    plt.tight_layout()
    plt.show()


# --- Run ----------------------------------------------------------------------

HEADERS = make_headers(GITHUB_TOKEN)

pr_records = fetch_pr_data(REPO, HEADERS, max_prs=MAX_PRS)
commit_activity = fetch_commit_activity(REPO, HEADERS, max_pages=MAX_COMMIT_PAGES)

activity_windows = build_activity_windows(pr_records, commit_activity,
                                          min_contributions=MIN_CONTRIBUTIONS)
print(f'Active contributors (>={MIN_CONTRIBUTIONS} contributions): {len(activity_windows)}')

G = build_network(pr_records, activity_windows, file_group_depth=FILE_GROUP_DEPTH)
print(f'Network: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges')

plt.style.use('fivethirtyeight')

plot_network(G, REPO)

arch_matrix = build_archaeology_matrix(
    pr_records, activity_windows,
    file_group_depth=FILE_GROUP_DEPTH,
)
plot_archaeology(arch_matrix, REPO)
+ "Fetches contributor statistics from a GitHub repository and fits the nonlinear maintenance model\n", + "\n", + "$$w(t) = \\frac{\\text{amplitude}}{\\mu}\\left(1 - e^{-\\mu t}\\right)$$\n", + "\n", + "to each contributor's cumulative activity over time." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuration\n", + "\n", + "Set `REPO` to `\"owner/repo\"`. Optionally set `GITHUB_TOKEN` to avoid rate limiting (required for private repos)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "REPO = \"torvalds/linux\" # change to any public repo, e.g. \"pandas-dev/pandas\"\n", + "GITHUB_TOKEN = \"\" # optional: set a personal access token to raise rate limits" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import requests\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "from scipy.optimize import minimize" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetch contributor statistics\n", + "\n", + "GitHub may return HTTP 202 while it computes statistics for the first time — the cell retries until data is ready." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def fetch_contributors(repo, token=\"\", max_retries=6, retry_delay=5):\n", + " \"\"\"Fetch contributor weekly stats from the GitHub API.\n", + "\n", + " Returns a DataFrame where each row is a contributor with a 'weeks' column\n", + " containing a list of {w, a, d, c} dicts (matching linux_contributors.json format).\n", + " \"\"\"\n", + " url = f\"https://api.github.com/repos/{repo}/stats/contributors\"\n", + " headers = {\"Accept\": \"application/vnd.github+json\"}\n", + " if token:\n", + " headers[\"Authorization\"] = f\"Bearer {token}\"\n", + "\n", + " for attempt in range(1, max_retries + 1):\n", + " response = requests.get(url, headers=headers, timeout=30)\n", + " if response.status_code == 200:\n", + " data = response.json()\n", + " if data: # empty list means no contributors\n", + " return pd.DataFrame(data)\n", + " raise ValueError(f\"Repository '{repo}' has no contributor data.\")\n", + " if response.status_code == 202:\n", + " print(f\"GitHub is computing stats, retrying in {retry_delay}s (attempt {attempt}/{max_retries})...\")\n", + " time.sleep(retry_delay)\n", + " continue\n", + " if response.status_code == 404:\n", + " raise ValueError(f\"Repository '{repo}' not found. Check the owner/repo spelling.\")\n", + " if response.status_code == 403:\n", + " raise PermissionError(\"Rate limit exceeded or access denied. 
Set GITHUB_TOKEN to a valid token.\")\n", + " response.raise_for_status()\n", + "\n", + " raise TimeoutError(f\"GitHub did not return contributor stats after {max_retries} attempts.\")\n", + "\n", + "\n", + "contributors = fetch_contributors(REPO, token=GITHUB_TOKEN)\n", + "print(f\"Fetched {len(contributors)} contributors from '{REPO}'\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def fun(t, amplitude, mu):\n", + " # t is in weeks; divide by 52 to convert to years (mu is a per-year rate)\n", + " return amplitude / mu * (1 - np.exp(-mu * t / 52))\n", + "\n", + "\n", + "def lsq(x_observed, params):\n", + " t = np.arange(len(x_observed))\n", + " x_model = fun(t, *params)\n", + " err = x_model - x_observed\n", + " return np.dot(err, err) / len(err)\n", + "\n", + "\n", + "def model_fit(x, title=\"\"):\n", + " x = np.array(x, dtype=np.float64)\n", + " x_max = x.max()\n", + " if x_max == 0:\n", + " return None\n", + " x = x / x_max\n", + " res = minimize(\n", + " lambda params: lsq(x, params),\n", + " (1 / len(x), 0.1),\n", + " method='SLSQP',\n", + " bounds=((0, None), (0, None))\n", + " )\n", + " amplitude, mu = res.x\n", + " # effective maintenance ratio: mu normalized by amplitude scale factor\n", + " mu_effective = mu / amplitude\n", + " mx = fun(np.arange(len(x)), amplitude, mu)\n", + " label = f\"{title} — \" if title else \"\"\n", + " pd.Series(mx).plot(title=r\"{}$\\mu$ = {:3.2f}\".format(label, mu_effective))\n", + " pd.Series(x).plot()\n", + " plt.show()\n", + " return res" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def main(df):\n", + " for _, entry in df.iterrows():\n", + " rm = dict()\n", + " for week in entry.weeks:\n", + " d = 
dict(week)\n", + " w = d['w']\n", + " if w in rm:\n", + " rm[w] += d['a'] + d['d']\n", + " else:\n", + " rm[w] = d['a'] + d['d']\n", + " sorted_items = sorted(rm.items())\n", + " t = pd.Series([v for _, v in sorted_items]).cumsum()\n", + " t = t[t != 0] # Remove zero-activity weeks\n", + " login = entry.get('author', {}) or {}\n", + " if isinstance(login, dict):\n", + " login = login.get('login', '')\n", + " model_fit(t, title=login)\n", + "\n", + "\n", + "plt.rcParams[\"figure.figsize\"] = (9, 4)\n", + "plt.style.use('fivethirtyeight')\n", + "main(contributors)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}