diff --git a/collaboration-network-growth.ipynb b/collaboration-network-growth.ipynb
new file mode 100644
index 0000000..e14ef49
--- /dev/null
+++ b/collaboration-network-growth.ipynb
@@ -0,0 +1,436 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Collaboration Network Growth Animation\n",
+ "\n",
+ "Renders the history of a GitHub project as a **movie**: the collaboration network grows from a single contributor outward, frame by frame, one time period at a time.\n",
+ "\n",
+ "**Two outputs:**\n",
+ "- `OUTPUT_FILE` \u2014 GIF or MP4 (dark background, glow nodes, suitable for sharing)\n",
+ "- Plotly figure \u2014 interactive version with a play button and time scrubber (runs in the notebook)\n",
+ "\n",
+ "**Layout:** spring layout computed once on the final graph, so well-connected (core) contributors cluster at the centre and peripheral contributors radiate outward. Nodes are revealed progressively; new arrivals flash brightly on entry.\n",
+ "\n",
+ "**Node colour** follows the `plasma` colourmap: early contributors are purple/blue, later arrivals are orange/yellow."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Configuration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "REPO = \"facebook/react\"\n",
+ "GITHUB_TOKEN = \"\"\n",
+ "MAX_PRS = 300 # merged PRs to analyse \u2014 more = longer movie\n",
+ "TIME_RESOLUTION = 'Q' # 'M' monthly 'Q' quarterly 'Y' yearly\n",
+ "OUTPUT_FILE = 'collaboration_growth.gif' # .gif (Pillow) or .mp4 (ffmpeg)\n",
+ "FPS = 4 # frames per second"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import time, requests\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import networkx as nx\n",
+ "import matplotlib.pyplot as plt\n",
+ "import matplotlib.colors as mcolors\n",
+ "from matplotlib.animation import FuncAnimation, PillowWriter\n",
+ "from collections import defaultdict\n",
+ "from itertools import combinations\n",
+ "try:\n",
+ " import plotly.graph_objects as go\n",
+ " HAS_PLOTLY = True\n",
+ "except ImportError:\n",
+ " HAS_PLOTLY = False\n",
+ " print('plotly not installed \u2014 skipping interactive output')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Fetch PR data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def make_headers(token=''):\n",
+ " h = {\"Accept\": \"application/vnd.github+json\"}\n",
+ " if token:\n",
+ " h[\"Authorization\"] = f\"Bearer {token}\"\n",
+ " return h\n",
+ "\n",
+ "\n",
+ "def paginate(url, headers, params=None, max_pages=None):\n",
+ " results, page = [], 1\n",
+ " while True:\n",
+ " p = {**(params or {}), 'page': page, 'per_page': 100}\n",
+ " r = requests.get(url, headers=headers, params=p, timeout=30)\n",
+ " if r.status_code == 403:\n",
+ " raise PermissionError(\"Rate limit hit \u2014 set GITHUB_TOKEN.\")\n",
+ " r.raise_for_status()\n",
+ " data = r.json()\n",
+ " if not data:\n",
+ " break\n",
+ " results.extend(data)\n",
+ " if (max_pages and page >= max_pages) or 'next' not in r.links:\n",
+ " break\n",
+ " page += 1\n",
+ " return results\n",
+ "\n",
+ "\n",
+ "def fetch_pr_data(repo, headers, max_prs=300):\n",
+ " print(f\"Fetching PRs from {repo}...\")\n",
+ " raw = paginate(\n",
+ " f\"https://api.github.com/repos/{repo}/pulls\",\n",
+ " headers, {\"state\": \"closed\"},\n",
+ " max_pages=(max_prs // 100 + 1) if max_prs else None,\n",
+ " )\n",
+ " merged = [pr for pr in raw if pr.get('merged_at')][:max_prs]\n",
+ "\n",
+ " records = []\n",
+ " for i, pr in enumerate(merged):\n",
+ " num = pr['number']\n",
+ " author = (pr.get('user') or {}).get('login')\n",
+ " date = pd.to_datetime(pr['merged_at'])\n",
+ " reviews = paginate(\n",
+ " f\"https://api.github.com/repos/{repo}/pulls/{num}/reviews\", headers\n",
+ " )\n",
+ " reviewers = list(\n",
+ " {(r.get('user') or {}).get('login') for r in reviews} - {None, author}\n",
+ " )\n",
+ " records.append({'pr': num, 'author': author, 'reviewers': reviewers, 'date': date})\n",
+ " if (i + 1) % 50 == 0:\n",
+ " print(f\" {i + 1}/{len(merged)} PRs\")\n",
+ " time.sleep(0.05)\n",
+ "\n",
+ " print(f\"Done \u2014 {len(records)} merged PRs\")\n",
+ " return records"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Build temporal snapshots"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def build_snapshots(pr_records, freq='Q'):\n",
+ " \"\"\"Cumulative network state at each time period.\"\"\"\n",
+ " for pr in pr_records:\n",
+ " pr['period'] = pr['date'].to_period(freq)\n",
+ " periods = sorted({pr['period'] for pr in pr_records})\n",
+ "\n",
+ " cum_nodes = {} # node -> {'first_seen': period, 'pr_count': int}\n",
+ " cum_edges = {} # (a,b) -> {'first_seen': period, 'weight': int}\n",
+ " snapshots = []\n",
+ "\n",
+ " for period in periods:\n",
+ " for pr in (p for p in pr_records if p['period'] == period):\n",
+ " participants = list({p for p in [pr['author']] + pr['reviewers'] if p})\n",
+ " for p in participants:\n",
+ " if p not in cum_nodes:\n",
+ " cum_nodes[p] = {'first_seen': period, 'pr_count': 0}\n",
+ " cum_nodes[p]['pr_count'] += 1\n",
+ " for a, b in combinations(participants, 2):\n",
+ " key = tuple(sorted([a, b]))\n",
+ " if key not in cum_edges:\n",
+ " cum_edges[key] = {'first_seen': period, 'weight': 0}\n",
+ " cum_edges[key]['weight'] += 1\n",
+ "\n",
+ " snapshots.append({\n",
+ " 'period': period,\n",
+ " 'nodes': dict(cum_nodes),\n",
+ " 'edges': dict(cum_edges),\n",
+ " })\n",
+ "\n",
+ " print(f'{len(snapshots)} time periods, '\n",
+ " f'{len(cum_nodes)} contributors, {len(cum_edges)} collaborations total')\n",
+ " return snapshots"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Compute layout"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def compute_layout(snapshots):\n",
+ " \"\"\"Spring layout on the final graph \u2014 core contributors cluster at the centre.\"\"\"\n",
+ " final = snapshots[-1]\n",
+ " G = nx.Graph()\n",
+ " G.add_nodes_from(final['nodes'])\n",
+ " G.add_edges_from(final['edges'])\n",
+ " pos = nx.spring_layout(G, k=2.0, iterations=120, seed=42)\n",
+ " # Normalise to [-1, 1]\n",
+ " coords = np.array(list(pos.values()))\n",
+ " centre = coords.mean(axis=0)\n",
+ " scale = np.abs(coords - centre).max()\n",
+ " return {n: tuple((np.array(xy) - centre) / (scale or 1)) for n, xy in pos.items()}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Matplotlib animation (GIF / MP4)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "_BG = '#0d1117'\n",
+ "_CMAP = plt.cm.plasma\n",
+ "\n",
+ "\n",
+ "def _draw_frame(ax, snapshot, pos, period_idx, n_periods, prev_nodes):\n",
+ " ax.clear()\n",
+ " ax.set_facecolor(_BG)\n",
+ " ax.axis('off')\n",
+ " ax.set_xlim(-1.3, 1.3)\n",
+ " ax.set_ylim(-1.3, 1.3)\n",
+ "\n",
+ " nodes = snapshot['nodes']\n",
+ " edges = snapshot['edges']\n",
+ " new_nodes = set(nodes) - prev_nodes\n",
+ "\n",
+ " # --- edges ---\n",
+ " for (a, b), ed in edges.items():\n",
+ " if a not in pos or b not in pos:\n",
+ " continue\n",
+ " age = period_idx[snapshot['period']] - period_idx[ed['first_seen']]\n",
+ " alpha = min(0.55, 0.08 + 0.06 * age)\n",
+ " lw = 0.3 + 0.12 * min(ed['weight'], 6)\n",
+ " ax.plot([pos[a][0], pos[b][0]], [pos[a][1], pos[b][1]],\n",
+ " '-', color='#58a6ff', alpha=alpha, linewidth=lw, zorder=1)\n",
+ "\n",
+ " # --- nodes ---\n",
+ " top_nodes = {n for n, d in sorted(nodes.items(),\n",
+ " key=lambda x: x[1]['pr_count'], reverse=True)[:12]}\n",
+ " for node, nd in nodes.items():\n",
+ " if node not in pos:\n",
+ " continue\n",
+ " x, y = pos[node]\n",
+ " c_val = period_idx[nd['first_seen']] / max(1, n_periods - 1)\n",
+ " color = _CMAP(c_val)\n",
+ " size = 25 + 8 * min(nd['pr_count'], 30)\n",
+ " is_new = node in new_nodes\n",
+ " # glow ring\n",
+ " ax.scatter(x, y, s=size * (5 if is_new else 3),\n",
+ " color=color, alpha=0.45 if is_new else 0.12, zorder=2)\n",
+ " # core dot\n",
+ " ax.scatter(x, y, s=size, color=color, alpha=0.95, zorder=3)\n",
+ " # label for prominent contributors\n",
+ " if node in top_nodes:\n",
+ " ax.text(x, y + 0.09, node, ha='center', fontsize=5,\n",
+ " color='#e6edf3', alpha=0.85, zorder=4)\n",
+ "\n",
+ " # --- overlay ---\n",
+ " ax.text(0.02, 0.97, str(snapshot['period']),\n",
+ " transform=ax.transAxes, color='white', fontsize=14,\n",
+ " va='top', fontweight='bold')\n",
+ " ax.text(0.02, 0.91,\n",
+ " f\"{len(nodes)} contributors \u2022 {len(edges)} collaborations\",\n",
+ " transform=ax.transAxes, color='#8b949e', fontsize=8, va='top')\n",
+ "\n",
+ "\n",
+ "def make_animation(snapshots, pos, fps=4, output='collaboration_growth.gif'):\n",
+ " period_idx = {s['period']: i for i, s in enumerate(snapshots)}\n",
+ " n_periods = len(snapshots)\n",
+ " prev_list = [set()] + [set(snapshots[i - 1]['nodes']) for i in range(1, n_periods)]\n",
+ "\n",
+ " fig, ax = plt.subplots(figsize=(10, 10), facecolor=_BG)\n",
+ " fig.tight_layout(pad=0)\n",
+ "\n",
+ " def update(i):\n",
+ " _draw_frame(ax, snapshots[i], pos, period_idx, n_periods, prev_list[i])\n",
+ "\n",
+ " anim = FuncAnimation(fig, update, frames=n_periods,\n",
+ " interval=1000 // fps, repeat=True)\n",
+ "\n",
+ " if output.endswith('.gif'):\n",
+ " anim.save(output, writer=PillowWriter(fps=fps), dpi=120)\n",
+ " else:\n",
+ " from matplotlib.animation import FFMpegWriter\n",
+ " anim.save(output, writer=FFMpegWriter(fps=fps, bitrate=2000), dpi=150)\n",
+ " plt.close()\n",
+ " print(f'Saved \u2192 {output}')\n",
+ " return anim"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Plotly interactive animation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def make_plotly_animation(snapshots, pos, repo=''):\n",
+ " period_idx = {s['period']: i for i, s in enumerate(snapshots)}\n",
+ " n_periods = len(snapshots)\n",
+ " cmap = plt.cm.plasma\n",
+ "\n",
+ " def rgba(period):\n",
+ " r, g, b, _ = cmap(period_idx[period] / max(1, n_periods - 1))\n",
+ " return f'rgb({int(r*255)},{int(g*255)},{int(b*255)})'\n",
+ "\n",
+ " def snapshot_traces(snapshot):\n",
+ " nodes, edges = snapshot['nodes'], snapshot['edges']\n",
+ " ex, ey = [], []\n",
+ " for (a, b) in edges:\n",
+ " if a in pos and b in pos:\n",
+ " ex += [pos[a][0], pos[b][0], None]\n",
+ " ey += [pos[a][1], pos[b][1], None]\n",
+ " present = [(n, d) for n, d in nodes.items() if n in pos]\n",
+ " nx_c = [pos[n][0] for n, _ in present]\n",
+ " ny_c = [pos[n][1] for n, _ in present]\n",
+    "    tips = [f'{n}<br>since {d[\"first_seen\"]}<br>{d[\"pr_count\"]} PRs'\n",
+ " for n, d in present]\n",
+ " colors = [rgba(d['first_seen']) for _, d in present]\n",
+ " sizes = [7 + 2 * min(d['pr_count'], 20) for _, d in present]\n",
+ " return [\n",
+ " go.Scatter(x=ex, y=ey, mode='lines',\n",
+ " line=dict(color='#4a90d9', width=0.6), opacity=0.35,\n",
+ " hoverinfo='skip'),\n",
+ " go.Scatter(x=nx_c, y=ny_c, mode='markers',\n",
+ " text=[n for n, _ in present],\n",
+ " hovertext=tips, hoverinfo='text',\n",
+ " marker=dict(size=sizes, color=colors, opacity=0.92,\n",
+ " line=dict(width=0))),\n",
+ " ]\n",
+ "\n",
+ " frames = [go.Frame(name=str(s['period']), data=snapshot_traces(s))\n",
+ " for s in snapshots]\n",
+ "\n",
+ " fig = go.Figure(\n",
+ " data=frames[0].data,\n",
+ " frames=frames,\n",
+ " layout=go.Layout(\n",
+ " paper_bgcolor=_BG, plot_bgcolor=_BG,\n",
+ " showlegend=False,\n",
+ " xaxis=dict(visible=False, range=[-1.4, 1.4]),\n",
+ " yaxis=dict(visible=False, range=[-1.4, 1.4], scaleanchor='x'),\n",
+ " margin=dict(l=10, r=10, t=50, b=60),\n",
+ " title=dict(text=f'Collaboration Network Growth \u2014 {repo}',\n",
+ " font=dict(color='white', size=15)),\n",
+ " updatemenus=[dict(\n",
+ " type='buttons', showactive=False,\n",
+ " y=-0.08, x=0.5, xanchor='center',\n",
+ " buttons=[\n",
+ " dict(label='\u25b6 Play', method='animate',\n",
+ " args=[None, dict(frame=dict(duration=700, redraw=True),\n",
+ " fromcurrent=True)]),\n",
+ " dict(label='\u23f8 Pause', method='animate',\n",
+ " args=[[None], dict(frame=dict(duration=0),\n",
+ " mode='immediate')]),\n",
+ " ]\n",
+ " )],\n",
+ " sliders=[dict(\n",
+ " currentvalue=dict(prefix='Period: ', font=dict(color='white')),\n",
+ " font=dict(color='white'),\n",
+ " pad=dict(t=10),\n",
+ " steps=[dict(\n",
+ " args=[[str(s['period'])],\n",
+ " dict(frame=dict(duration=300), mode='immediate')],\n",
+ " label=str(s['period']),\n",
+ " method='animate',\n",
+ " ) for s in snapshots],\n",
+ " )],\n",
+ " )\n",
+ " )\n",
+ " return fig"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Run"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HEADERS = make_headers(GITHUB_TOKEN)\n",
+ "pr_records = fetch_pr_data(REPO, HEADERS, max_prs=MAX_PRS)\n",
+ "\n",
+ "snapshots = build_snapshots(pr_records, freq=TIME_RESOLUTION)\n",
+ "pos = compute_layout(snapshots)\n",
+ "\n",
+ "# --- Movie ---\n",
+ "make_animation(snapshots, pos, fps=FPS, output=OUTPUT_FILE)\n",
+ "\n",
+ "# Display GIF inline if in Jupyter\n",
+ "try:\n",
+ " from IPython.display import Image, display\n",
+ " if OUTPUT_FILE.endswith('.gif'):\n",
+ " display(Image(OUTPUT_FILE))\n",
+ "except ImportError:\n",
+ " pass\n",
+ "\n",
+ "# --- Interactive Plotly ---\n",
+ "if HAS_PLOTLY:\n",
+ " fig = make_plotly_animation(snapshots, pos, repo=REPO)\n",
+ " fig.show()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.9.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file
diff --git a/git-collaboration-network.ipynb b/git-collaboration-network.ipynb
new file mode 100644
index 0000000..fad3795
--- /dev/null
+++ b/git-collaboration-network.ipynb
@@ -0,0 +1,404 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Collaborative Network from Git History\n",
+ "\n",
+ "Builds two views of contributor collaboration from a GitHub repository:\n",
+ "\n",
+ "1. **Network graph** \u2014 nodes are contributors, edges come from:\n",
+ " - *PR collaboration*: both participated (author / reviewer) in the same pull request\n",
+ " - *Temporal file overlap*: both touched the same code area **and** their activity windows overlap in time\n",
+ "2. **Code Archaeology heatmap** \u2014 for each (contributor, code area) pair, measures how many days before the contributor arrived that area was last touched by someone else"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Configuration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "REPO = \"torvalds/linux\" # change to any owner/repo\n",
+ "GITHUB_TOKEN = \"\" # set a personal access token to raise rate limits\n",
+ "MAX_PRS = 100 # merged PRs to analyse (increase for deeper history)\n",
+ "MAX_COMMIT_PAGES = 5 # x100 commits each, for activity-window computation\n",
+ "MIN_CONTRIBUTIONS = 2 # exclude contributors below this threshold\n",
+ "FILE_GROUP_DEPTH = 1 # directory depth for code-area grouping (1 = top-level dir)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import time\n",
+ "import requests\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import networkx as nx\n",
+ "import matplotlib.pyplot as plt\n",
+ "import matplotlib.patches as mpatches\n",
+ "import seaborn as sns\n",
+ "from collections import defaultdict\n",
+ "from itertools import combinations"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Fetch data from GitHub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def make_headers(token=''):\n",
+ " h = {\"Accept\": \"application/vnd.github+json\"}\n",
+ " if token:\n",
+ " h[\"Authorization\"] = f\"Bearer {token}\"\n",
+ " return h\n",
+ "\n",
+ "\n",
+ "def paginate(url, headers, params=None, max_pages=None):\n",
+ " results, page = [], 1\n",
+ " while True:\n",
+ " p = {**(params or {}), 'page': page, 'per_page': 100}\n",
+ " r = requests.get(url, headers=headers, params=p, timeout=30)\n",
+ " if r.status_code == 403:\n",
+ " raise PermissionError(\"Rate limit hit \u2014 set GITHUB_TOKEN.\")\n",
+ " if r.status_code == 404:\n",
+ " raise ValueError(f\"Not found: {url}\")\n",
+ " r.raise_for_status()\n",
+ " data = r.json()\n",
+ " if not data:\n",
+ " break\n",
+ " results.extend(data)\n",
+ " if (max_pages and page >= max_pages) or \"next\" not in r.links:\n",
+ " break\n",
+ " page += 1\n",
+ " return results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def fetch_pr_data(repo, headers, max_prs=100):\n",
+ " print(f\"Fetching PRs from {repo}...\")\n",
+ " raw = paginate(\n",
+ " f\"https://api.github.com/repos/{repo}/pulls\",\n",
+ " headers, {\"state\": \"closed\"},\n",
+ " max_pages=(max_prs // 100 + 1) if max_prs else None,\n",
+ " )\n",
+ " merged = [pr for pr in raw if pr.get('merged_at')][:max_prs]\n",
+ "\n",
+ " records = []\n",
+ " for i, pr in enumerate(merged):\n",
+ " num = pr['number']\n",
+ " author = (pr.get('user') or {}).get('login')\n",
+ " date = pd.to_datetime(pr['merged_at'])\n",
+ "\n",
+ " reviews = paginate(\n",
+ " f\"https://api.github.com/repos/{repo}/pulls/{num}/reviews\", headers\n",
+ " )\n",
+ " reviewers = list(\n",
+ " {(r.get('user') or {}).get('login') for r in reviews} - {None, author}\n",
+ " )\n",
+ "\n",
+ " files_data = paginate(\n",
+ " f\"https://api.github.com/repos/{repo}/pulls/{num}/files\", headers\n",
+ " )\n",
+ " files = [f['filename'] for f in files_data]\n",
+ "\n",
+ " records.append({\n",
+ " 'pr': num, 'author': author,\n",
+ " 'reviewers': reviewers, 'files': files, 'date': date,\n",
+ " })\n",
+ " if (i + 1) % 10 == 0:\n",
+ " print(f\" {i + 1}/{len(merged)} PRs\")\n",
+ " time.sleep(0.05)\n",
+ "\n",
+ " print(f\"Done \u2014 {len(records)} merged PRs loaded\")\n",
+ " return records\n",
+ "\n",
+ "\n",
+ "def fetch_commit_activity(repo, headers, max_pages=5):\n",
+ " print(\"Fetching commit history for activity windows...\")\n",
+ " commits = paginate(\n",
+ " f\"https://api.github.com/repos/{repo}/commits\", headers, max_pages=max_pages\n",
+ " )\n",
+ " activity = defaultdict(list)\n",
+ " for c in commits:\n",
+ " login = (c.get('author') or {}).get('login')\n",
+ " date_str = (c.get('commit', {}).get('author') or {}).get('date')\n",
+ " if login and date_str:\n",
+ " activity[login].append(pd.to_datetime(date_str))\n",
+ " print(f\"Found activity for {len(activity)} contributors\")\n",
+ " return dict(activity)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Build collaboration network"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def build_activity_windows(pr_records, commit_activity, min_contributions=2):\n",
+ " \"\"\"Merge PR and commit dates into per-contributor (start, end) windows.\"\"\"\n",
+ " dates = defaultdict(list)\n",
+ " for pr in pr_records:\n",
+ " for person in [pr['author']] + pr['reviewers']:\n",
+ " if person:\n",
+ " dates[person].append(pr['date'])\n",
+ " for person, ds in commit_activity.items():\n",
+ " dates[person].extend(ds)\n",
+ " return {\n",
+ " p: (min(ds), max(ds))\n",
+ " for p, ds in dates.items()\n",
+ " if len(ds) >= min_contributions\n",
+ " }\n",
+ "\n",
+ "\n",
+ "def overlaps(w1, w2):\n",
+ " return w1[0] <= w2[1] and w2[0] <= w1[1]\n",
+ "\n",
+ "\n",
+ "def file_group(path, depth=1):\n",
+ " parts = path.split('/')\n",
+ " return '/'.join(parts[:depth]) if len(parts) > depth else parts[0]\n",
+ "\n",
+ "\n",
+ "def build_network(pr_records, activity_windows, file_group_depth=1):\n",
+ " G = nx.Graph()\n",
+ " G.add_nodes_from(activity_windows)\n",
+ "\n",
+ " # PR edges: both participated in the same pull request\n",
+ " for pr in pr_records:\n",
+ " participants = list({\n",
+ " p for p in [pr['author']] + pr['reviewers']\n",
+ " if p in activity_windows\n",
+ " })\n",
+ " for a, b in combinations(participants, 2):\n",
+ " if G.has_edge(a, b):\n",
+ " G[a][b]['pr_weight'] = G[a][b].get('pr_weight', 0) + 1\n",
+ " else:\n",
+ " G.add_edge(a, b, pr_weight=1, file_weight=0)\n",
+ "\n",
+ " # File + temporal-overlap edges: touched the same code area while both active\n",
+ " area_contribs = defaultdict(set)\n",
+ " for pr in pr_records:\n",
+ " participants = [p for p in [pr['author']] + pr['reviewers'] if p in activity_windows]\n",
+ " for f in pr['files']:\n",
+ " area_contribs[file_group(f, file_group_depth)].update(participants)\n",
+ "\n",
+ " for area, people in area_contribs.items():\n",
+ " for a, b in combinations(people, 2):\n",
+ " if overlaps(activity_windows[a], activity_windows[b]):\n",
+ " if G.has_edge(a, b):\n",
+ " G[a][b]['file_weight'] = G[a][b].get('file_weight', 0) + 1\n",
+ " else:\n",
+ " G.add_edge(a, b, pr_weight=0, file_weight=1)\n",
+ "\n",
+ " return G"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Network visualisation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def plot_network(G, repo):\n",
+ " fig, ax = plt.subplots(figsize=(14, 10))\n",
+ " pos = nx.spring_layout(G, k=1.5, seed=42)\n",
+ "\n",
+ " pr_only = [(u, v) for u, v, d in G.edges(data=True) if d.get('pr_weight', 0) > 0 and d.get('file_weight', 0) == 0]\n",
+ " file_only = [(u, v) for u, v, d in G.edges(data=True) if d.get('pr_weight', 0) == 0 and d.get('file_weight', 0) > 0]\n",
+ " both = [(u, v) for u, v, d in G.edges(data=True) if d.get('pr_weight', 0) > 0 and d.get('file_weight', 0) > 0]\n",
+ "\n",
+ " node_sizes = [200 + 80 * G.degree(n) for n in G.nodes()]\n",
+ " nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color='steelblue', alpha=0.85, ax=ax)\n",
+ " nx.draw_networkx_labels(G, pos, font_size=7, ax=ax)\n",
+ " nx.draw_networkx_edges(G, pos, edgelist=pr_only, edge_color='#e07b39', width=1.5, alpha=0.7, ax=ax)\n",
+ " nx.draw_networkx_edges(G, pos, edgelist=file_only, edge_color='#4caf50', width=1.0, alpha=0.6, ax=ax)\n",
+ " nx.draw_networkx_edges(G, pos, edgelist=both, edge_color='#9c27b0', width=2.5, alpha=0.8, ax=ax)\n",
+ "\n",
+ " legend = [\n",
+ " mpatches.Patch(color='#e07b39', label='PR collaboration only'),\n",
+ " mpatches.Patch(color='#4caf50', label='Shared code area (temporal overlap) only'),\n",
+ " mpatches.Patch(color='#9c27b0', label='Both'),\n",
+ " ]\n",
+ " ax.legend(handles=legend, loc='upper left', fontsize=9)\n",
+ " ax.set_title(f'Collaboration Network \u2014 {repo}', fontsize=13)\n",
+ " ax.axis('off')\n",
+ " plt.tight_layout()\n",
+ " plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Code Archaeology\n",
+ "\n",
+ "For each *(contributor, code area)* pair, the **archaeology depth** is the number of days between when a contributor first became active and when that code area was last touched by someone else before them.\n",
+ "\n",
+ "- **Large value** \u2192 contributor arrived long after others left; understanding the code requires digging deep into history\n",
+ "- **Zero / NaN** \u2192 contributor was contemporary with others who touched that area (no archaeology needed)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def build_archaeology_matrix(pr_records, activity_windows, file_group_depth=1,\n",
+ " top_files=30, top_contributors=25):\n",
+ " area_timeline = defaultdict(list)\n",
+ " for pr in pr_records:\n",
+ " participants = [p for p in [pr['author']] + pr['reviewers'] if p in activity_windows]\n",
+ " for f in pr['files']:\n",
+ " g = file_group(f, file_group_depth)\n",
+ " for p in participants:\n",
+ " area_timeline[g].append((pr['date'], p))\n",
+ " for g in area_timeline:\n",
+ " area_timeline[g].sort()\n",
+ "\n",
+ " contrib_count = defaultdict(int)\n",
+ " for pr in pr_records:\n",
+ " for p in [pr['author']] + pr['reviewers']:\n",
+ " if p in activity_windows:\n",
+ " contrib_count[p] += 1\n",
+ " top_contribs = sorted(contrib_count, key=contrib_count.get, reverse=True)[:top_contributors]\n",
+ "\n",
+ " top_areas = sorted(\n",
+ " area_timeline, key=lambda g: len(area_timeline[g]), reverse=True\n",
+ " )[:top_files]\n",
+ "\n",
+ " matrix = pd.DataFrame(index=top_areas, columns=top_contribs, dtype=float)\n",
+ "\n",
+ " for area in top_areas:\n",
+ " for c in top_contribs:\n",
+ " c_start = activity_windows[c][0]\n",
+ " prior = [\n",
+ " date for date, person in area_timeline[area]\n",
+ " if person != c and date < c_start\n",
+ " ]\n",
+ " if prior:\n",
+ " matrix.loc[area, c] = float(max((c_start - max(prior)).days, 0))\n",
+ "\n",
+ " return matrix\n",
+ "\n",
+ "\n",
+ "def plot_archaeology(matrix, repo):\n",
+ " data = matrix.dropna(how='all').dropna(axis=1, how='all')\n",
+ " if data.empty:\n",
+ " print('No archaeology data: all contributors were fully contemporaneous.')\n",
+ " return\n",
+ "\n",
+ " h = max(6, len(data.index) * 0.35)\n",
+ " w = max(10, len(data.columns) * 0.55)\n",
+ " fig, ax = plt.subplots(figsize=(w, h))\n",
+ " sns.heatmap(\n",
+ " data.astype(float),\n",
+ " cmap='YlOrRd',\n",
+ " ax=ax,\n",
+ " linewidths=0.3,\n",
+ " cbar_kws={'label': 'Archaeology depth (days)'},\n",
+ " mask=data.isna(),\n",
+ " )\n",
+ " ax.set_title(\n",
+ " f'Code Archaeology \u2014 {repo}\\n'\n",
+ " 'Days since others last touched this code area before a contributor arrived',\n",
+ " fontsize=11,\n",
+ " )\n",
+ " ax.set_xlabel('Contributor')\n",
+ " ax.set_ylabel('Code Area')\n",
+ " plt.xticks(rotation=45, ha='right', fontsize=8)\n",
+ " plt.yticks(fontsize=7)\n",
+ " plt.tight_layout()\n",
+ " plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Run"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HEADERS = make_headers(GITHUB_TOKEN)\n",
+ "\n",
+ "pr_records = fetch_pr_data(REPO, HEADERS, max_prs=MAX_PRS)\n",
+ "commit_activity = fetch_commit_activity(REPO, HEADERS, max_pages=MAX_COMMIT_PAGES)\n",
+ "\n",
+ "activity_windows = build_activity_windows(pr_records, commit_activity,\n",
+ " min_contributions=MIN_CONTRIBUTIONS)\n",
+ "print(f'Active contributors (>={MIN_CONTRIBUTIONS} contributions): {len(activity_windows)}')\n",
+ "\n",
+ "G = build_network(pr_records, activity_windows, file_group_depth=FILE_GROUP_DEPTH)\n",
+ "print(f'Network: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges')\n",
+ "\n",
+ "plt.style.use('fivethirtyeight')\n",
+ "\n",
+ "plot_network(G, REPO)\n",
+ "\n",
+ "arch_matrix = build_archaeology_matrix(\n",
+ " pr_records, activity_windows,\n",
+ " file_group_depth=FILE_GROUP_DEPTH,\n",
+ ")\n",
+ "plot_archaeology(arch_matrix, REPO)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.9.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file
diff --git a/github-contributor-analysis.ipynb b/github-contributor-analysis.ipynb
new file mode 100644
index 0000000..f7908b7
--- /dev/null
+++ b/github-contributor-analysis.ipynb
@@ -0,0 +1,198 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# GitHub Contributor Nonlinear Performance Analysis\n",
+ "\n",
+ "Fetches contributor statistics from a GitHub repository and fits the nonlinear maintenance model\n",
+ "\n",
+ "$$w(t) = \\frac{\\text{amplitude}}{\\mu}\\left(1 - e^{-\\mu t}\\right)$$\n",
+ "\n",
+ "to each contributor's cumulative activity over time."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Configuration\n",
+ "\n",
+ "Set `REPO` to `\"owner/repo\"`. Optionally set `GITHUB_TOKEN` to avoid rate limiting (required for private repos)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "REPO = \"torvalds/linux\" # change to any public repo, e.g. \"pandas-dev/pandas\"\n",
+ "GITHUB_TOKEN = \"\" # optional: set a personal access token to raise rate limits"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import time\n",
+ "import requests\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "from scipy.optimize import minimize"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Fetch contributor statistics\n",
+ "\n",
+ "GitHub may return HTTP 202 while it computes statistics for the first time — the cell retries until data is ready."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def fetch_contributors(repo, token=\"\", max_retries=6, retry_delay=5):\n",
+ " \"\"\"Fetch contributor weekly stats from the GitHub API.\n",
+ "\n",
+ " Returns a DataFrame where each row is a contributor with a 'weeks' column\n",
+ " containing a list of {w, a, d, c} dicts (matching linux_contributors.json format).\n",
+ " \"\"\"\n",
+ " url = f\"https://api.github.com/repos/{repo}/stats/contributors\"\n",
+ " headers = {\"Accept\": \"application/vnd.github+json\"}\n",
+ " if token:\n",
+ " headers[\"Authorization\"] = f\"Bearer {token}\"\n",
+ "\n",
+ " for attempt in range(1, max_retries + 1):\n",
+ " response = requests.get(url, headers=headers, timeout=30)\n",
+ " if response.status_code == 200:\n",
+ " data = response.json()\n",
+ " if data: # empty list means no contributors\n",
+ " return pd.DataFrame(data)\n",
+ " raise ValueError(f\"Repository '{repo}' has no contributor data.\")\n",
+ " if response.status_code == 202:\n",
+ " print(f\"GitHub is computing stats, retrying in {retry_delay}s (attempt {attempt}/{max_retries})...\")\n",
+ " time.sleep(retry_delay)\n",
+ " continue\n",
+ " if response.status_code == 404:\n",
+ " raise ValueError(f\"Repository '{repo}' not found. Check the owner/repo spelling.\")\n",
+ " if response.status_code == 403:\n",
+ " raise PermissionError(\"Rate limit exceeded or access denied. Set GITHUB_TOKEN to a valid token.\")\n",
+ " response.raise_for_status()\n",
+ "\n",
+ " raise TimeoutError(f\"GitHub did not return contributor stats after {max_retries} attempts.\")\n",
+ "\n",
+ "\n",
+ "contributors = fetch_contributors(REPO, token=GITHUB_TOKEN)\n",
+ "print(f\"Fetched {len(contributors)} contributors from '{REPO}'\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def fun(t, amplitude, mu):\n",
+ " # t is in weeks; divide by 52 to convert to years (mu is a per-year rate)\n",
+ " return amplitude / mu * (1 - np.exp(-mu * t / 52))\n",
+ "\n",
+ "\n",
+ "def lsq(x_observed, params):\n",
+ " t = np.arange(len(x_observed))\n",
+ " x_model = fun(t, *params)\n",
+ " err = x_model - x_observed\n",
+ " return np.dot(err, err) / len(err)\n",
+ "\n",
+ "\n",
+ "def model_fit(x, title=\"\"):\n",
+ " x = np.array(x, dtype=np.float64)\n",
+ " x_max = x.max()\n",
+ " if x_max == 0:\n",
+ " return None\n",
+ " x = x / x_max\n",
+ " res = minimize(\n",
+ " lambda params: lsq(x, params),\n",
+ " (1 / len(x), 0.1),\n",
+ " method='SLSQP',\n",
+ " bounds=((0, None), (0, None))\n",
+ " )\n",
+ " amplitude, mu = res.x\n",
+ " # effective maintenance ratio: mu normalized by amplitude scale factor\n",
+ " mu_effective = mu / amplitude\n",
+ " mx = fun(np.arange(len(x)), amplitude, mu)\n",
+ " label = f\"{title} — \" if title else \"\"\n",
+ " pd.Series(mx).plot(title=r\"{}$\\mu$ = {:3.2f}\".format(label, mu_effective))\n",
+ " pd.Series(x).plot()\n",
+ " plt.show()\n",
+ " return res"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Run analysis"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def main(df):\n",
+ " for _, entry in df.iterrows():\n",
+ " rm = dict()\n",
+ " for week in entry.weeks:\n",
+ " d = dict(week)\n",
+ " w = d['w']\n",
+ " if w in rm:\n",
+ " rm[w] += d['a'] + d['d']\n",
+ " else:\n",
+ " rm[w] = d['a'] + d['d']\n",
+ " sorted_items = sorted(rm.items())\n",
+ " t = pd.Series([v for _, v in sorted_items]).cumsum()\n",
+ " t = t[t != 0] # Remove zero-activity weeks\n",
+ " login = entry.get('author', {}) or {}\n",
+ " if isinstance(login, dict):\n",
+ " login = login.get('login', '')\n",
+ " model_fit(t, title=login)\n",
+ "\n",
+ "\n",
+ "plt.rcParams[\"figure.figsize\"] = (9, 4)\n",
+ "plt.style.use('fivethirtyeight')\n",
+ "main(contributors)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.9.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/github-core-network-stats.ipynb b/github-core-network-stats.ipynb
new file mode 100644
index 0000000..9eb3892
--- /dev/null
+++ b/github-core-network-stats.ipynb
@@ -0,0 +1,411 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Core Collaboration Network \u2014 Top 1000 GitHub Repos\n",
+ "\n",
+ "**Hypothesis (from JIRA analysis):** the _core_ collaboration network of a project is very small relative to its total contributor count.\n",
+ "\n",
+ "This notebook fetches the top 1000 GitHub repositories by stars, pulls contributor statistics for each, and computes:\n",
+ "\n",
+ "| Metric | What it measures |\n",
+ "|---|---|\n",
+ "| **80/20 core size** | minimum contributors accounting for 80% of all commits |\n",
+ "| **Gini coefficient** | inequality of contribution distribution (0 = equal, 1 = one person does everything) |\n",
+ "| **Top-1 share** | fraction of commits by the single most active contributor |\n",
+ "| **Temporal core** | contributors whose activity windows overlap with at least one other (potential collaborators) |\n",
+ "| **Core fraction** | core size / total contributors |\n",
+ "\n",
+ "Results are cached to disk so re-runs are instant."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Configuration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "GITHUB_TOKEN = \"\" # required \u2014 5000 req/hr authenticated vs 60 req/hr anonymous\n",
+ "MAX_REPOS = 1000 # top N repos by stars\n",
+ "CACHE_FILE = \"top_repos_cache.json\" # intermediate results saved here"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json, time\n",
+ "import requests\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import matplotlib.ticker as mticker\n",
+ "from pathlib import Path\n",
+ "from itertools import combinations\n",
+ "from collections import defaultdict"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Fetch top repositories"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def make_headers(token=''):\n",
+ " h = {\"Accept\": \"application/vnd.github+json\"}\n",
+ " if token:\n",
+ " h[\"Authorization\"] = f\"Bearer {token}\"\n",
+ " return h\n",
+ "\n",
+ "\n",
+ "def _get(url, headers, params=None, retries=4):\n",
+ " delay = 2\n",
+ " for attempt in range(retries):\n",
+ " r = requests.get(url, headers=headers, params=params, timeout=30)\n",
+ " if r.status_code == 200:\n",
+ " return r\n",
+ " if r.status_code in (202, 429, 500, 502, 503):\n",
+ " time.sleep(delay)\n",
+ " delay *= 2\n",
+ " continue\n",
+ " if r.status_code == 403:\n",
+ " raise PermissionError(\"Rate limit hit \u2014 set GITHUB_TOKEN.\")\n",
+ " return None # 404, 451, etc.\n",
+ " return None\n",
+ "\n",
+ "\n",
+ "def fetch_top_repos(headers, n=1000):\n",
+ " print(f\"Fetching top {n} repos by stars...\")\n",
+ " repos, page = [], 1\n",
+ " while len(repos) < n:\n",
+ " r = _get(\n",
+ " \"https://api.github.com/search/repositories\",\n",
+ " headers,\n",
+ " {\"q\": \"stars:>100\", \"sort\": \"stars\", \"order\": \"desc\", \"per_page\": 100, \"page\": page},\n",
+ " )\n",
+ " if r is None:\n",
+ " break\n",
+ " items = r.json().get('items', [])\n",
+ " if not items:\n",
+ " break\n",
+ " repos.extend(items)\n",
+ " if \"next\" not in r.links:\n",
+ " break\n",
+ " page += 1\n",
+ " time.sleep(0.3)\n",
+ " repos = repos[:n]\n",
+ " print(f\"Got {len(repos)} repos\")\n",
+ " return repos"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Fetch contributor stats (with disk cache)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def fetch_contributor_stats(owner, repo, headers):\n",
+ " \"\"\"Fetch /stats/contributors, retrying on 202 (GitHub is computing).\"\"\"\n",
+ " url = f'https://api.github.com/repos/{owner}/{repo}/stats/contributors'\n",
+ " r = _get(url, headers)\n",
+ " return r.json() if r and isinstance(r.json(), list) else None\n",
+ "\n",
+ "\n",
+ "def fetch_all_stats(repos, headers, cache_file='top_repos_cache.json'):\n",
+ " cache = {}\n",
+ " p = Path(cache_file)\n",
+ " if p.exists():\n",
+ " cache = json.loads(p.read_text())\n",
+ " print(f\"Cache loaded: {len(cache)} repos\")\n",
+ "\n",
+ " results = []\n",
+ " for i, repo in enumerate(repos):\n",
+ " full_name = repo['full_name']\n",
+ " owner, name = full_name.split('/', 1)\n",
+ "\n",
+ " if full_name not in cache:\n",
+ " cache[full_name] = fetch_contributor_stats(owner, name, headers)\n",
+ " time.sleep(0.15)\n",
+ "\n",
+ " stats = cache[full_name]\n",
+ " if stats:\n",
+ " results.append({\n",
+ " 'repo': full_name,\n",
+ " 'stars': repo['stargazers_count'],\n",
+ " 'forks': repo['forks_count'],\n",
+ " 'language': repo.get('language'),\n",
+ " 'contributors': stats,\n",
+ " })\n",
+ "\n",
+ " if (i + 1) % 100 == 0:\n",
+ " p.write_text(json.dumps(cache)) # checkpoint\n",
+ " print(f\" {i + 1}/{len(repos)} repos processed ({len(results)} with data)\")\n",
+ "\n",
+ " p.write_text(json.dumps(cache))\n",
+ " print(f\"Done \u2014 {len(results)}/{len(repos)} repos returned contributor data\")\n",
+ " return results"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Compute per-repo metrics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def gini(values):\n",
+ " \"\"\"Gini coefficient: 0 = perfect equality, 1 = maximum inequality.\"\"\"\n",
+ " v = np.sort(np.array(values, dtype=float))\n",
+ " n = len(v)\n",
+ " if n == 0 or v.sum() == 0:\n",
+ " return 0.0\n",
+ " ix = np.arange(1, n + 1)\n",
+ " return float((2 * ix.dot(v) / (n * v.sum())) - (n + 1) / n)\n",
+ "\n",
+ "\n",
+ "def pareto_core(totals, threshold=0.8):\n",
+ " \"\"\"Minimum contributors accounting for `threshold` fraction of total commits.\"\"\"\n",
+ " desc = sorted(totals, reverse=True)\n",
+ " total = sum(desc)\n",
+ " if total == 0:\n",
+ " return 0\n",
+ " cumsum = 0\n",
+ " for i, t in enumerate(desc):\n",
+ " cumsum += t\n",
+ " if cumsum >= threshold * total:\n",
+ " return i + 1\n",
+ " return len(totals)\n",
+ "\n",
+ "\n",
+ "def activity_window(weeks):\n",
+ " active = [w['w'] for w in weeks if w.get('c', 0) + w.get('a', 0) + w.get('d', 0) > 0]\n",
+ " return (min(active), max(active)) if active else None\n",
+ "\n",
+ "\n",
+ "def temporal_core_size(contributors):\n",
+ " \"\"\"Contributors whose activity window overlaps with at least one other (sweep-line O(n log n)).\"\"\"\n",
+ " windows = [w for c in contributors for w in [activity_window(c.get('weeks', []))] if w]\n",
+ " if len(windows) < 2:\n",
+ " return len(windows)\n",
+ " windows.sort(key=lambda w: w[0])\n",
+ " has_overlap = [False] * len(windows)\n",
+ " active = [] # (end_time, index)\n",
+ " for i, (start, end) in enumerate(windows):\n",
+ " active = [(e, j) for e, j in active if e >= start]\n",
+ " if active:\n",
+ " has_overlap[i] = True\n",
+ " for _, j in active:\n",
+ " has_overlap[j] = True\n",
+ " active.append((end, i))\n",
+ " return sum(has_overlap)\n",
+ "\n",
+ "\n",
+ "def compute_metrics(repo_data):\n",
+ " records = []\n",
+ " for repo in repo_data:\n",
+ " contributors = repo['contributors']\n",
+ " totals = [c.get('total', 0) for c in contributors if c.get('total', 0) > 0]\n",
+ " if not totals:\n",
+ " continue\n",
+ " total_commits = sum(totals)\n",
+ " records.append({\n",
+ " 'repo': repo['repo'],\n",
+ " 'stars': repo['stars'],\n",
+ " 'forks': repo['forks'],\n",
+ " 'language': repo['language'],\n",
+ " 'n_contributors': len(totals),\n",
+ " 'total_commits': total_commits,\n",
+ " 'gini': gini(totals),\n",
+ " 'core_80': pareto_core(totals, 0.8),\n",
+ " 'core_50': pareto_core(totals, 0.5),\n",
+ " 'top1_share': max(totals) / total_commits,\n",
+ " 'temporal_core': temporal_core_size(contributors),\n",
+ " })\n",
+ " df = pd.DataFrame(records)\n",
+ " df['core_fraction'] = df['core_80'] / df['n_contributors']\n",
+ " return df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Visualise\n",
+ "\n",
+ "Six panels testing the hypothesis that the core collaboration network is small:\n",
+ "1. Distribution of 80/20 core sizes\n",
+ "2. Core size vs total contributors (log-log)\n",
+ "3. Gini coefficient distribution\n",
+ "4. CDF \u2014 fraction of repos with core \u2264 N\n",
+ "5. Top contributor's share of commits\n",
+ "6. Median core size by programming language"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def plot_stats(df):\n",
+ " fig, axes = plt.subplots(2, 3, figsize=(18, 10))\n",
+ " fig.suptitle(\n",
+ " f'Core Collaboration Network \u2014 Top {len(df)} GitHub Repos\\n'\n",
+ " f'Median contributors: {df[\"n_contributors\"].median():.0f} | '\n",
+ " f'Median 80/20 core: {df[\"core_80\"].median():.0f} | '\n",
+ " f'Median Gini: {df[\"gini\"].median():.2f}',\n",
+ " fontsize=13,\n",
+ " )\n",
+ "\n",
+ " # 1. 80/20 core size histogram\n",
+ " ax = axes[0, 0]\n",
+ " ax.hist(df['core_80'].clip(upper=50), bins=40, edgecolor='white')\n",
+ " ax.axvline(df['core_80'].median(), color='red', linestyle='--',\n",
+ " label=f\"Median: {df['core_80'].median():.0f}\")\n",
+ " ax.set_xlabel('80/20 Core Size (contributors)')\n",
+ " ax.set_ylabel('Repos')\n",
+ " ax.set_title('80/20 Core Size Distribution')\n",
+ " ax.legend()\n",
+ "\n",
+ " # 2. Core vs total (log-log scatter)\n",
+ " ax = axes[0, 1]\n",
+ " ax.scatter(df['n_contributors'], df['core_80'], alpha=0.25, s=8, color='steelblue')\n",
+ " lims = [1, df['n_contributors'].max() * 1.5]\n",
+ " ax.plot(lims, lims, 'r--', linewidth=0.8, label='core = total')\n",
+ " ax.set_xscale('log'); ax.set_yscale('log')\n",
+ " ax.set_xlabel('Total contributors (log)')\n",
+ " ax.set_ylabel('80/20 Core size (log)')\n",
+ " ax.set_title('Core vs Total Contributors')\n",
+ " ax.legend(fontsize=8)\n",
+ "\n",
+ " # 3. Gini coefficient\n",
+ " ax = axes[0, 2]\n",
+ " ax.hist(df['gini'], bins=40, edgecolor='white')\n",
+ " ax.axvline(df['gini'].median(), color='red', linestyle='--',\n",
+ " label=f\"Median: {df['gini'].median():.2f}\")\n",
+ " ax.set_xlabel('Gini Coefficient')\n",
+ " ax.set_ylabel('Repos')\n",
+ " ax.set_title('Contribution Inequality (Gini)')\n",
+ " ax.legend()\n",
+ "\n",
+ " # 4. CDF of core size\n",
+ " ax = axes[1, 0]\n",
+ " sorted_core = np.sort(df['core_80'])\n",
+ " cdf = np.arange(1, len(sorted_core) + 1) / len(sorted_core)\n",
+ " ax.plot(sorted_core, cdf, linewidth=2)\n",
+ " for n in [3, 5, 10, 20]:\n",
+ " pct = (df['core_80'] <= n).mean()\n",
+ " ax.axvline(n, color='grey', linestyle=':', linewidth=0.8)\n",
+ " ax.text(n, 0.05, f'{pct:.0%}\\n\u2264{n}', ha='center', fontsize=7)\n",
+ " ax.set_xlabel('Core Size')\n",
+ " ax.set_ylabel('Fraction of repos')\n",
+ " ax.set_title('CDF: Repos with Core \u2264 N')\n",
+ " ax.set_xlim(0, 50)\n",
+ "\n",
+ " # 5. Top-1 contributor share\n",
+ " ax = axes[1, 1]\n",
+ " ax.hist(df['top1_share'], bins=40, edgecolor='white')\n",
+ " ax.axvline(df['top1_share'].median(), color='red', linestyle='--',\n",
+ " label=f\"Median: {df['top1_share'].median():.0%}\")\n",
+ " ax.xaxis.set_major_formatter(mticker.PercentFormatter(1.0))\n",
+ " ax.set_xlabel('Top Contributor\\'s Share')\n",
+ " ax.set_ylabel('Repos')\n",
+ " ax.set_title('Top-1 Contributor Dominance')\n",
+ " ax.legend()\n",
+ "\n",
+ " # 6. Language breakdown\n",
+ " ax = axes[1, 2]\n",
+ " lang = (df.groupby('language')['core_80']\n",
+ " .agg(median='median', count='count')\n",
+ " .query('count >= 10')\n",
+ " .sort_values('median'))\n",
+ " lang['median'].plot(kind='barh', ax=ax)\n",
+ " ax.set_xlabel('Median 80/20 Core Size')\n",
+ " ax.set_title('Core Size by Language (n \u2265 10 repos)')\n",
+ "\n",
+ " plt.tight_layout()\n",
+ " plt.show()\n",
+ "\n",
+ " print('\\n=== Summary ===')\n",
+ " print(f'Repos analysed: {len(df)}')\n",
+ " print(f'Median total contributors: {df[\"n_contributors\"].median():.0f}')\n",
+ " print(f'Median 80/20 core: {df[\"core_80\"].median():.0f}')\n",
+ " print(f'Median core fraction: {df[\"core_fraction\"].median():.1%}')\n",
+ " print(f'Median Gini: {df[\"gini\"].median():.3f}')\n",
+ " print(f'Median top-1 share: {df[\"top1_share\"].median():.1%}')\n",
+ " print(f'Repos where core \u2264 3: {(df[\"core_80\"] <= 3).mean():.1%}')\n",
+ " print(f'Repos where core \u2264 5: {(df[\"core_80\"] <= 5).mean():.1%}')\n",
+ " print(f'Repos where core \u2264 10: {(df[\"core_80\"] <= 10).mean():.1%}')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Run"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HEADERS = make_headers(GITHUB_TOKEN)\n",
+ "\n",
+ "repos = fetch_top_repos(HEADERS, n=MAX_REPOS)\n",
+ "raw_data = fetch_all_stats(repos, HEADERS, cache_file=CACHE_FILE)\n",
+ "\n",
+ "df = compute_metrics(raw_data)\n",
+ "print(df.describe())\n",
+ "\n",
+ "plt.style.use('fivethirtyeight')\n",
+ "plot_stats(df)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.9.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file