diff --git a/collaboration-network-growth.ipynb b/collaboration-network-growth.ipynb
new file mode 100644
index 0000000..e14ef49
--- /dev/null
+++ b/collaboration-network-growth.ipynb
@@ -0,0 +1,436 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Collaboration Network Growth Animation\n",
+ "\n",
+ "Renders the history of a GitHub project as a **movie**: the collaboration network grows from a single contributor outward, frame by frame, one time period at a time.\n",
+ "\n",
+ "**Two outputs:**\n",
+ "- `OUTPUT_FILE` \u2014 GIF or MP4 (dark background, glow nodes, suitable for sharing)\n",
+ "- Plotly figure \u2014 interactive version with a play button and time scrubber (runs in the notebook)\n",
+ "\n",
+ "**Layout:** spring layout computed once on the final graph, so well-connected (core) contributors cluster at the centre and peripheral contributors radiate outward. Nodes are revealed progressively; new arrivals flash brightly on entry.\n",
+ "\n",
+ "**Node colour** follows the `plasma` colourmap: early contributors are purple/blue, later arrivals are orange/yellow."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Configuration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "REPO = \"facebook/react\"\n",
+ "GITHUB_TOKEN = \"\"\n",
+ "MAX_PRS = 300 # merged PRs to analyse \u2014 more = longer movie\n",
+ "TIME_RESOLUTION = 'Q' # 'M' monthly 'Q' quarterly 'Y' yearly\n",
+ "OUTPUT_FILE = 'collaboration_growth.gif' # .gif (Pillow) or .mp4 (ffmpeg)\n",
+ "FPS = 4 # frames per second"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import time, requests\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import networkx as nx\n",
+ "import matplotlib.pyplot as plt\n",
+ "import matplotlib.colors as mcolors\n",
+ "from matplotlib.animation import FuncAnimation, PillowWriter\n",
+ "from collections import defaultdict\n",
+ "from itertools import combinations\n",
+ "try:\n",
+ " import plotly.graph_objects as go\n",
+ " HAS_PLOTLY = True\n",
+ "except ImportError:\n",
+ " HAS_PLOTLY = False\n",
+ " print('plotly not installed \u2014 skipping interactive output')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Fetch PR data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def make_headers(token=''):\n",
+ " h = {\"Accept\": \"application/vnd.github+json\"}\n",
+ " if token:\n",
+ " h[\"Authorization\"] = f\"Bearer {token}\"\n",
+ " return h\n",
+ "\n",
+ "\n",
+ "def paginate(url, headers, params=None, max_pages=None):\n",
+ " results, page = [], 1\n",
+ " while True:\n",
+ " p = {**(params or {}), 'page': page, 'per_page': 100}\n",
+ " r = requests.get(url, headers=headers, params=p, timeout=30)\n",
+ " if r.status_code == 403:\n",
+ " raise PermissionError(\"Rate limit hit \u2014 set GITHUB_TOKEN.\")\n",
+ " r.raise_for_status()\n",
+ " data = r.json()\n",
+ " if not data:\n",
+ " break\n",
+ " results.extend(data)\n",
+ " if (max_pages and page >= max_pages) or 'next' not in r.links:\n",
+ " break\n",
+ " page += 1\n",
+ " return results\n",
+ "\n",
+ "\n",
+ "def fetch_pr_data(repo, headers, max_prs=300):\n",
+ " print(f\"Fetching PRs from {repo}...\")\n",
+ " raw = paginate(\n",
+ " f\"https://api.github.com/repos/{repo}/pulls\",\n",
+ " headers, {\"state\": \"closed\"},\n",
+ " max_pages=(max_prs // 100 + 1) if max_prs else None,\n",
+ " )\n",
+ " merged = [pr for pr in raw if pr.get('merged_at')][:max_prs]\n",
+ "\n",
+ " records = []\n",
+ " for i, pr in enumerate(merged):\n",
+ " num = pr['number']\n",
+ " author = (pr.get('user') or {}).get('login')\n",
+ " date = pd.to_datetime(pr['merged_at'])\n",
+ " reviews = paginate(\n",
+ " f\"https://api.github.com/repos/{repo}/pulls/{num}/reviews\", headers\n",
+ " )\n",
+ " reviewers = list(\n",
+ " {(r.get('user') or {}).get('login') for r in reviews} - {None, author}\n",
+ " )\n",
+ " records.append({'pr': num, 'author': author, 'reviewers': reviewers, 'date': date})\n",
+ " if (i + 1) % 50 == 0:\n",
+ " print(f\" {i + 1}/{len(merged)} PRs\")\n",
+ " time.sleep(0.05)\n",
+ "\n",
+ " print(f\"Done \u2014 {len(records)} merged PRs\")\n",
+ " return records"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Build temporal snapshots"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def build_snapshots(pr_records, freq='Q'):\n",
+ " \"\"\"Cumulative network state at each time period.\"\"\"\n",
+ " for pr in pr_records:\n",
+ " pr['period'] = pr['date'].to_period(freq)\n",
+ " periods = sorted({pr['period'] for pr in pr_records})\n",
+ "\n",
+ " cum_nodes = {} # node -> {'first_seen': period, 'pr_count': int}\n",
+ " cum_edges = {} # (a,b) -> {'first_seen': period, 'weight': int}\n",
+ " snapshots = []\n",
+ "\n",
+ " for period in periods:\n",
+ " for pr in (p for p in pr_records if p['period'] == period):\n",
+ " participants = list({p for p in [pr['author']] + pr['reviewers'] if p})\n",
+ " for p in participants:\n",
+ " if p not in cum_nodes:\n",
+ " cum_nodes[p] = {'first_seen': period, 'pr_count': 0}\n",
+ " cum_nodes[p]['pr_count'] += 1\n",
+ " for a, b in combinations(participants, 2):\n",
+ " key = tuple(sorted([a, b]))\n",
+ " if key not in cum_edges:\n",
+ " cum_edges[key] = {'first_seen': period, 'weight': 0}\n",
+ " cum_edges[key]['weight'] += 1\n",
+ "\n",
+ " snapshots.append({\n",
+ " 'period': period,\n",
+ " 'nodes': dict(cum_nodes),\n",
+ " 'edges': dict(cum_edges),\n",
+ " })\n",
+ "\n",
+ " print(f'{len(snapshots)} time periods, '\n",
+ " f'{len(cum_nodes)} contributors, {len(cum_edges)} collaborations total')\n",
+ " return snapshots"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Compute layout"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def compute_layout(snapshots):\n",
+ " \"\"\"Spring layout on the final graph \u2014 core contributors cluster at the centre.\"\"\"\n",
+ " final = snapshots[-1]\n",
+ " G = nx.Graph()\n",
+ " G.add_nodes_from(final['nodes'])\n",
+ " G.add_edges_from(final['edges'])\n",
+ " pos = nx.spring_layout(G, k=2.0, iterations=120, seed=42)\n",
+ " # Normalise to [-1, 1]\n",
+ " coords = np.array(list(pos.values()))\n",
+ " centre = coords.mean(axis=0)\n",
+ " scale = np.abs(coords - centre).max()\n",
+ " return {n: tuple((np.array(xy) - centre) / (scale or 1)) for n, xy in pos.items()}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Matplotlib animation (GIF / MP4)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "_BG = '#0d1117'\n",
+ "_CMAP = plt.cm.plasma\n",
+ "\n",
+ "\n",
+ "def _draw_frame(ax, snapshot, pos, period_idx, n_periods, prev_nodes):\n",
+ " ax.clear()\n",
+ " ax.set_facecolor(_BG)\n",
+ " ax.axis('off')\n",
+ " ax.set_xlim(-1.3, 1.3)\n",
+ " ax.set_ylim(-1.3, 1.3)\n",
+ "\n",
+ " nodes = snapshot['nodes']\n",
+ " edges = snapshot['edges']\n",
+ " new_nodes = set(nodes) - prev_nodes\n",
+ "\n",
+ " # --- edges ---\n",
+ " for (a, b), ed in edges.items():\n",
+ " if a not in pos or b not in pos:\n",
+ " continue\n",
+ " age = period_idx[snapshot['period']] - period_idx[ed['first_seen']]\n",
+ " alpha = min(0.55, 0.08 + 0.06 * age)\n",
+ " lw = 0.3 + 0.12 * min(ed['weight'], 6)\n",
+ " ax.plot([pos[a][0], pos[b][0]], [pos[a][1], pos[b][1]],\n",
+ " '-', color='#58a6ff', alpha=alpha, linewidth=lw, zorder=1)\n",
+ "\n",
+ " # --- nodes ---\n",
+ " top_nodes = {n for n, d in sorted(nodes.items(),\n",
+ " key=lambda x: x[1]['pr_count'], reverse=True)[:12]}\n",
+ " for node, nd in nodes.items():\n",
+ " if node not in pos:\n",
+ " continue\n",
+ " x, y = pos[node]\n",
+ " c_val = period_idx[nd['first_seen']] / max(1, n_periods - 1)\n",
+ " color = _CMAP(c_val)\n",
+ " size = 25 + 8 * min(nd['pr_count'], 30)\n",
+ " is_new = node in new_nodes\n",
+ " # glow ring\n",
+ " ax.scatter(x, y, s=size * (5 if is_new else 3),\n",
+ " color=color, alpha=0.45 if is_new else 0.12, zorder=2)\n",
+ " # core dot\n",
+ " ax.scatter(x, y, s=size, color=color, alpha=0.95, zorder=3)\n",
+ " # label for prominent contributors\n",
+ " if node in top_nodes:\n",
+ " ax.text(x, y + 0.09, node, ha='center', fontsize=5,\n",
+ " color='#e6edf3', alpha=0.85, zorder=4)\n",
+ "\n",
+ " # --- overlay ---\n",
+ " ax.text(0.02, 0.97, str(snapshot['period']),\n",
+ " transform=ax.transAxes, color='white', fontsize=14,\n",
+ " va='top', fontweight='bold')\n",
+ " ax.text(0.02, 0.91,\n",
+ " f\"{len(nodes)} contributors \u2022 {len(edges)} collaborations\",\n",
+ " transform=ax.transAxes, color='#8b949e', fontsize=8, va='top')\n",
+ "\n",
+ "\n",
+ "def make_animation(snapshots, pos, fps=4, output='collaboration_growth.gif'):\n",
+ " period_idx = {s['period']: i for i, s in enumerate(snapshots)}\n",
+ " n_periods = len(snapshots)\n",
+ " prev_list = [set()] + [set(snapshots[i - 1]['nodes']) for i in range(1, n_periods)]\n",
+ "\n",
+ " fig, ax = plt.subplots(figsize=(10, 10), facecolor=_BG)\n",
+ " fig.tight_layout(pad=0)\n",
+ "\n",
+ " def update(i):\n",
+ " _draw_frame(ax, snapshots[i], pos, period_idx, n_periods, prev_list[i])\n",
+ "\n",
+ " anim = FuncAnimation(fig, update, frames=n_periods,\n",
+ " interval=1000 // fps, repeat=True)\n",
+ "\n",
+ " if output.endswith('.gif'):\n",
+ " anim.save(output, writer=PillowWriter(fps=fps), dpi=120)\n",
+ " else:\n",
+ " from matplotlib.animation import FFMpegWriter\n",
+ " anim.save(output, writer=FFMpegWriter(fps=fps, bitrate=2000), dpi=150)\n",
+ " plt.close()\n",
+ " print(f'Saved \u2192 {output}')\n",
+ " return anim"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Plotly interactive animation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def make_plotly_animation(snapshots, pos, repo=''):\n",
+ " period_idx = {s['period']: i for i, s in enumerate(snapshots)}\n",
+ " n_periods = len(snapshots)\n",
+ " cmap = plt.cm.plasma\n",
+ "\n",
+ " def rgba(period):\n",
+ " r, g, b, _ = cmap(period_idx[period] / max(1, n_periods - 1))\n",
+ " return f'rgb({int(r*255)},{int(g*255)},{int(b*255)})'\n",
+ "\n",
+ " def snapshot_traces(snapshot):\n",
+ " nodes, edges = snapshot['nodes'], snapshot['edges']\n",
+ " ex, ey = [], []\n",
+ " for (a, b) in edges:\n",
+ " if a in pos and b in pos:\n",
+ " ex += [pos[a][0], pos[b][0], None]\n",
+ " ey += [pos[a][1], pos[b][1], None]\n",
+ " present = [(n, d) for n, d in nodes.items() if n in pos]\n",
+ " nx_c = [pos[n][0] for n, _ in present]\n",
+ " ny_c = [pos[n][1] for n, _ in present]\n",
+    "    tips = [f'{n}<br>since {d[\"first_seen\"]}<br>{d[\"pr_count\"]} PRs'\n",
+ " for n, d in present]\n",
+ " colors = [rgba(d['first_seen']) for _, d in present]\n",
+ " sizes = [7 + 2 * min(d['pr_count'], 20) for _, d in present]\n",
+ " return [\n",
+ " go.Scatter(x=ex, y=ey, mode='lines',\n",
+ " line=dict(color='#4a90d9', width=0.6), opacity=0.35,\n",
+ " hoverinfo='skip'),\n",
+ " go.Scatter(x=nx_c, y=ny_c, mode='markers',\n",
+ " text=[n for n, _ in present],\n",
+ " hovertext=tips, hoverinfo='text',\n",
+ " marker=dict(size=sizes, color=colors, opacity=0.92,\n",
+ " line=dict(width=0))),\n",
+ " ]\n",
+ "\n",
+ " frames = [go.Frame(name=str(s['period']), data=snapshot_traces(s))\n",
+ " for s in snapshots]\n",
+ "\n",
+ " fig = go.Figure(\n",
+ " data=frames[0].data,\n",
+ " frames=frames,\n",
+ " layout=go.Layout(\n",
+ " paper_bgcolor=_BG, plot_bgcolor=_BG,\n",
+ " showlegend=False,\n",
+ " xaxis=dict(visible=False, range=[-1.4, 1.4]),\n",
+ " yaxis=dict(visible=False, range=[-1.4, 1.4], scaleanchor='x'),\n",
+ " margin=dict(l=10, r=10, t=50, b=60),\n",
+ " title=dict(text=f'Collaboration Network Growth \u2014 {repo}',\n",
+ " font=dict(color='white', size=15)),\n",
+ " updatemenus=[dict(\n",
+ " type='buttons', showactive=False,\n",
+ " y=-0.08, x=0.5, xanchor='center',\n",
+ " buttons=[\n",
+ " dict(label='\u25b6 Play', method='animate',\n",
+ " args=[None, dict(frame=dict(duration=700, redraw=True),\n",
+ " fromcurrent=True)]),\n",
+ " dict(label='\u23f8 Pause', method='animate',\n",
+ " args=[[None], dict(frame=dict(duration=0),\n",
+ " mode='immediate')]),\n",
+ " ]\n",
+ " )],\n",
+ " sliders=[dict(\n",
+ " currentvalue=dict(prefix='Period: ', font=dict(color='white')),\n",
+ " font=dict(color='white'),\n",
+ " pad=dict(t=10),\n",
+ " steps=[dict(\n",
+ " args=[[str(s['period'])],\n",
+ " dict(frame=dict(duration=300), mode='immediate')],\n",
+ " label=str(s['period']),\n",
+ " method='animate',\n",
+ " ) for s in snapshots],\n",
+ " )],\n",
+ " )\n",
+ " )\n",
+ " return fig"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Run"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HEADERS = make_headers(GITHUB_TOKEN)\n",
+ "pr_records = fetch_pr_data(REPO, HEADERS, max_prs=MAX_PRS)\n",
+ "\n",
+ "snapshots = build_snapshots(pr_records, freq=TIME_RESOLUTION)\n",
+ "pos = compute_layout(snapshots)\n",
+ "\n",
+ "# --- Movie ---\n",
+ "make_animation(snapshots, pos, fps=FPS, output=OUTPUT_FILE)\n",
+ "\n",
+ "# Display GIF inline if in Jupyter\n",
+ "try:\n",
+ " from IPython.display import Image, display\n",
+ " if OUTPUT_FILE.endswith('.gif'):\n",
+ " display(Image(OUTPUT_FILE))\n",
+ "except ImportError:\n",
+ " pass\n",
+ "\n",
+ "# --- Interactive Plotly ---\n",
+ "if HAS_PLOTLY:\n",
+ " fig = make_plotly_animation(snapshots, pos, repo=REPO)\n",
+ " fig.show()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.9.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file
diff --git a/git-collaboration-network.ipynb b/git-collaboration-network.ipynb
new file mode 100644
index 0000000..fad3795
--- /dev/null
+++ b/git-collaboration-network.ipynb
@@ -0,0 +1,404 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Collaborative Network from Git History\n",
+ "\n",
+ "Builds two views of contributor collaboration from a GitHub repository:\n",
+ "\n",
+ "1. **Network graph** \u2014 nodes are contributors, edges come from:\n",
+ " - *PR collaboration*: both participated (author / reviewer) in the same pull request\n",
+ " - *Temporal file overlap*: both touched the same code area **and** their activity windows overlap in time\n",
+ "2. **Code Archaeology heatmap** \u2014 for each (contributor, code area) pair, measures how many days before the contributor arrived that area was last touched by someone else"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Configuration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "REPO = \"torvalds/linux\" # change to any owner/repo\n",
+ "GITHUB_TOKEN = \"\" # set a personal access token to raise rate limits\n",
+ "MAX_PRS = 100 # merged PRs to analyse (increase for deeper history)\n",
+ "MAX_COMMIT_PAGES = 5 # x100 commits each, for activity-window computation\n",
+ "MIN_CONTRIBUTIONS = 2 # exclude contributors below this threshold\n",
+ "FILE_GROUP_DEPTH = 1 # directory depth for code-area grouping (1 = top-level dir)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import time\n",
+ "import requests\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import networkx as nx\n",
+ "import matplotlib.pyplot as plt\n",
+ "import matplotlib.patches as mpatches\n",
+ "import seaborn as sns\n",
+ "from collections import defaultdict\n",
+ "from itertools import combinations"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Fetch data from GitHub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def make_headers(token=''):\n",
+ " h = {\"Accept\": \"application/vnd.github+json\"}\n",
+ " if token:\n",
+ " h[\"Authorization\"] = f\"Bearer {token}\"\n",
+ " return h\n",
+ "\n",
+ "\n",
+ "def paginate(url, headers, params=None, max_pages=None):\n",
+ " results, page = [], 1\n",
+ " while True:\n",
+ " p = {**(params or {}), 'page': page, 'per_page': 100}\n",
+ " r = requests.get(url, headers=headers, params=p, timeout=30)\n",
+ " if r.status_code == 403:\n",
+ " raise PermissionError(\"Rate limit hit \u2014 set GITHUB_TOKEN.\")\n",
+ " if r.status_code == 404:\n",
+ " raise ValueError(f\"Not found: {url}\")\n",
+ " r.raise_for_status()\n",
+ " data = r.json()\n",
+ " if not data:\n",
+ " break\n",
+ " results.extend(data)\n",
+ " if (max_pages and page >= max_pages) or \"next\" not in r.links:\n",
+ " break\n",
+ " page += 1\n",
+ " return results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def fetch_pr_data(repo, headers, max_prs=100):\n",
+ " print(f\"Fetching PRs from {repo}...\")\n",
+ " raw = paginate(\n",
+ " f\"https://api.github.com/repos/{repo}/pulls\",\n",
+ " headers, {\"state\": \"closed\"},\n",
+ " max_pages=(max_prs // 100 + 1) if max_prs else None,\n",
+ " )\n",
+ " merged = [pr for pr in raw if pr.get('merged_at')][:max_prs]\n",
+ "\n",
+ " records = []\n",
+ " for i, pr in enumerate(merged):\n",
+ " num = pr['number']\n",
+ " author = (pr.get('user') or {}).get('login')\n",
+ " date = pd.to_datetime(pr['merged_at'])\n",
+ "\n",
+ " reviews = paginate(\n",
+ " f\"https://api.github.com/repos/{repo}/pulls/{num}/reviews\", headers\n",
+ " )\n",
+ " reviewers = list(\n",
+ " {(r.get('user') or {}).get('login') for r in reviews} - {None, author}\n",
+ " )\n",
+ "\n",
+ " files_data = paginate(\n",
+ " f\"https://api.github.com/repos/{repo}/pulls/{num}/files\", headers\n",
+ " )\n",
+ " files = [f['filename'] for f in files_data]\n",
+ "\n",
+ " records.append({\n",
+ " 'pr': num, 'author': author,\n",
+ " 'reviewers': reviewers, 'files': files, 'date': date,\n",
+ " })\n",
+ " if (i + 1) % 10 == 0:\n",
+ " print(f\" {i + 1}/{len(merged)} PRs\")\n",
+ " time.sleep(0.05)\n",
+ "\n",
+ " print(f\"Done \u2014 {len(records)} merged PRs loaded\")\n",
+ " return records\n",
+ "\n",
+ "\n",
+ "def fetch_commit_activity(repo, headers, max_pages=5):\n",
+ " print(\"Fetching commit history for activity windows...\")\n",
+ " commits = paginate(\n",
+ " f\"https://api.github.com/repos/{repo}/commits\", headers, max_pages=max_pages\n",
+ " )\n",
+ " activity = defaultdict(list)\n",
+ " for c in commits:\n",
+ " login = (c.get('author') or {}).get('login')\n",
+ " date_str = (c.get('commit', {}).get('author') or {}).get('date')\n",
+ " if login and date_str:\n",
+ " activity[login].append(pd.to_datetime(date_str))\n",
+ " print(f\"Found activity for {len(activity)} contributors\")\n",
+ " return dict(activity)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Build collaboration network"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def build_activity_windows(pr_records, commit_activity, min_contributions=2):\n",
+ " \"\"\"Merge PR and commit dates into per-contributor (start, end) windows.\"\"\"\n",
+ " dates = defaultdict(list)\n",
+ " for pr in pr_records:\n",
+ " for person in [pr['author']] + pr['reviewers']:\n",
+ " if person:\n",
+ " dates[person].append(pr['date'])\n",
+ " for person, ds in commit_activity.items():\n",
+ " dates[person].extend(ds)\n",
+ " return {\n",
+ " p: (min(ds), max(ds))\n",
+ " for p, ds in dates.items()\n",
+ " if len(ds) >= min_contributions\n",
+ " }\n",
+ "\n",
+ "\n",
+ "def overlaps(w1, w2):\n",
+ " return w1[0] <= w2[1] and w2[0] <= w1[1]\n",
+ "\n",
+ "\n",
+ "def file_group(path, depth=1):\n",
+ " parts = path.split('/')\n",
+ " return '/'.join(parts[:depth]) if len(parts) > depth else parts[0]\n",
+ "\n",
+ "\n",
+ "def build_network(pr_records, activity_windows, file_group_depth=1):\n",
+ " G = nx.Graph()\n",
+ " G.add_nodes_from(activity_windows)\n",
+ "\n",
+ " # PR edges: both participated in the same pull request\n",
+ " for pr in pr_records:\n",
+ " participants = list({\n",
+ " p for p in [pr['author']] + pr['reviewers']\n",
+ " if p in activity_windows\n",
+ " })\n",
+ " for a, b in combinations(participants, 2):\n",
+ " if G.has_edge(a, b):\n",
+ " G[a][b]['pr_weight'] = G[a][b].get('pr_weight', 0) + 1\n",
+ " else:\n",
+ " G.add_edge(a, b, pr_weight=1, file_weight=0)\n",
+ "\n",
+ " # File + temporal-overlap edges: touched the same code area while both active\n",
+ " area_contribs = defaultdict(set)\n",
+ " for pr in pr_records:\n",
+ " participants = [p for p in [pr['author']] + pr['reviewers'] if p in activity_windows]\n",
+ " for f in pr['files']:\n",
+ " area_contribs[file_group(f, file_group_depth)].update(participants)\n",
+ "\n",
+ " for area, people in area_contribs.items():\n",
+ " for a, b in combinations(people, 2):\n",
+ " if overlaps(activity_windows[a], activity_windows[b]):\n",
+ " if G.has_edge(a, b):\n",
+ " G[a][b]['file_weight'] = G[a][b].get('file_weight', 0) + 1\n",
+ " else:\n",
+ " G.add_edge(a, b, pr_weight=0, file_weight=1)\n",
+ "\n",
+ " return G"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Network visualisation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def plot_network(G, repo):\n",
+ " fig, ax = plt.subplots(figsize=(14, 10))\n",
+ " pos = nx.spring_layout(G, k=1.5, seed=42)\n",
+ "\n",
+ " pr_only = [(u, v) for u, v, d in G.edges(data=True) if d.get('pr_weight', 0) > 0 and d.get('file_weight', 0) == 0]\n",
+ " file_only = [(u, v) for u, v, d in G.edges(data=True) if d.get('pr_weight', 0) == 0 and d.get('file_weight', 0) > 0]\n",
+ " both = [(u, v) for u, v, d in G.edges(data=True) if d.get('pr_weight', 0) > 0 and d.get('file_weight', 0) > 0]\n",
+ "\n",
+ " node_sizes = [200 + 80 * G.degree(n) for n in G.nodes()]\n",
+ " nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color='steelblue', alpha=0.85, ax=ax)\n",
+ " nx.draw_networkx_labels(G, pos, font_size=7, ax=ax)\n",
+ " nx.draw_networkx_edges(G, pos, edgelist=pr_only, edge_color='#e07b39', width=1.5, alpha=0.7, ax=ax)\n",
+ " nx.draw_networkx_edges(G, pos, edgelist=file_only, edge_color='#4caf50', width=1.0, alpha=0.6, ax=ax)\n",
+ " nx.draw_networkx_edges(G, pos, edgelist=both, edge_color='#9c27b0', width=2.5, alpha=0.8, ax=ax)\n",
+ "\n",
+ " legend = [\n",
+ " mpatches.Patch(color='#e07b39', label='PR collaboration only'),\n",
+ " mpatches.Patch(color='#4caf50', label='Shared code area (temporal overlap) only'),\n",
+ " mpatches.Patch(color='#9c27b0', label='Both'),\n",
+ " ]\n",
+ " ax.legend(handles=legend, loc='upper left', fontsize=9)\n",
+ " ax.set_title(f'Collaboration Network \u2014 {repo}', fontsize=13)\n",
+ " ax.axis('off')\n",
+ " plt.tight_layout()\n",
+ " plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Code Archaeology\n",
+ "\n",
+ "For each *(contributor, code area)* pair, the **archaeology depth** is the number of days between when a contributor first became active and when that code area was last touched by someone else before them.\n",
+ "\n",
+ "- **Large value** \u2192 contributor arrived long after others left; understanding the code requires digging deep into history\n",
+ "- **Zero / NaN** \u2192 contributor was contemporary with others who touched that area (no archaeology needed)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def build_archaeology_matrix(pr_records, activity_windows, file_group_depth=1,\n",
+ " top_files=30, top_contributors=25):\n",
+ " area_timeline = defaultdict(list)\n",
+ " for pr in pr_records:\n",
+ " participants = [p for p in [pr['author']] + pr['reviewers'] if p in activity_windows]\n",
+ " for f in pr['files']:\n",
+ " g = file_group(f, file_group_depth)\n",
+ " for p in participants:\n",
+ " area_timeline[g].append((pr['date'], p))\n",
+ " for g in area_timeline:\n",
+ " area_timeline[g].sort()\n",
+ "\n",
+ " contrib_count = defaultdict(int)\n",
+ " for pr in pr_records:\n",
+ " for p in [pr['author']] + pr['reviewers']:\n",
+ " if p in activity_windows:\n",
+ " contrib_count[p] += 1\n",
+ " top_contribs = sorted(contrib_count, key=contrib_count.get, reverse=True)[:top_contributors]\n",
+ "\n",
+ " top_areas = sorted(\n",
+ " area_timeline, key=lambda g: len(area_timeline[g]), reverse=True\n",
+ " )[:top_files]\n",
+ "\n",
+ " matrix = pd.DataFrame(index=top_areas, columns=top_contribs, dtype=float)\n",
+ "\n",
+ " for area in top_areas:\n",
+ " for c in top_contribs:\n",
+ " c_start = activity_windows[c][0]\n",
+ " prior = [\n",
+ " date for date, person in area_timeline[area]\n",
+ " if person != c and date < c_start\n",
+ " ]\n",
+ " if prior:\n",
+ " matrix.loc[area, c] = float(max((c_start - max(prior)).days, 0))\n",
+ "\n",
+ " return matrix\n",
+ "\n",
+ "\n",
+ "def plot_archaeology(matrix, repo):\n",
+ " data = matrix.dropna(how='all').dropna(axis=1, how='all')\n",
+ " if data.empty:\n",
+ " print('No archaeology data: all contributors were fully contemporaneous.')\n",
+ " return\n",
+ "\n",
+ " h = max(6, len(data.index) * 0.35)\n",
+ " w = max(10, len(data.columns) * 0.55)\n",
+ " fig, ax = plt.subplots(figsize=(w, h))\n",
+ " sns.heatmap(\n",
+ " data.astype(float),\n",
+ " cmap='YlOrRd',\n",
+ " ax=ax,\n",
+ " linewidths=0.3,\n",
+ " cbar_kws={'label': 'Archaeology depth (days)'},\n",
+ " mask=data.isna(),\n",
+ " )\n",
+ " ax.set_title(\n",
+ " f'Code Archaeology \u2014 {repo}\\n'\n",
+ " 'Days since others last touched this code area before a contributor arrived',\n",
+ " fontsize=11,\n",
+ " )\n",
+ " ax.set_xlabel('Contributor')\n",
+ " ax.set_ylabel('Code Area')\n",
+ " plt.xticks(rotation=45, ha='right', fontsize=8)\n",
+ " plt.yticks(fontsize=7)\n",
+ " plt.tight_layout()\n",
+ " plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Run"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HEADERS = make_headers(GITHUB_TOKEN)\n",
+ "\n",
+ "pr_records = fetch_pr_data(REPO, HEADERS, max_prs=MAX_PRS)\n",
+ "commit_activity = fetch_commit_activity(REPO, HEADERS, max_pages=MAX_COMMIT_PAGES)\n",
+ "\n",
+ "activity_windows = build_activity_windows(pr_records, commit_activity,\n",
+ " min_contributions=MIN_CONTRIBUTIONS)\n",
+ "print(f'Active contributors (>={MIN_CONTRIBUTIONS} contributions): {len(activity_windows)}')\n",
+ "\n",
+ "G = build_network(pr_records, activity_windows, file_group_depth=FILE_GROUP_DEPTH)\n",
+ "print(f'Network: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges')\n",
+ "\n",
+ "plt.style.use('fivethirtyeight')\n",
+ "\n",
+ "plot_network(G, REPO)\n",
+ "\n",
+ "arch_matrix = build_archaeology_matrix(\n",
+ " pr_records, activity_windows,\n",
+ " file_group_depth=FILE_GROUP_DEPTH,\n",
+ ")\n",
+ "plot_archaeology(arch_matrix, REPO)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.9.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file
diff --git a/github-contributor-analysis.ipynb b/github-contributor-analysis.ipynb
new file mode 100644
index 0000000..f7908b7
--- /dev/null
+++ b/github-contributor-analysis.ipynb
@@ -0,0 +1,198 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# GitHub Contributor Nonlinear Performance Analysis\n",
+ "\n",
+ "Fetches contributor statistics from a GitHub repository and fits the nonlinear maintenance model\n",
+ "\n",
+ "$$w(t) = \\frac{\\text{amplitude}}{\\mu}\\left(1 - e^{-\\mu t}\\right)$$\n",
+ "\n",
+ "to each contributor's cumulative activity over time."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Configuration\n",
+ "\n",
+ "Set `REPO` to `\"owner/repo\"`. Optionally set `GITHUB_TOKEN` to avoid rate limiting (required for private repos)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "REPO = \"torvalds/linux\" # change to any public repo, e.g. \"pandas-dev/pandas\"\n",
+ "GITHUB_TOKEN = \"\" # optional: set a personal access token to raise rate limits"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import time\n",
+ "import requests\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "from scipy.optimize import minimize"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Fetch contributor statistics\n",
+ "\n",
+ "GitHub may return HTTP 202 while it computes statistics for the first time — the cell retries until data is ready."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def fetch_contributors(repo, token=\"\", max_retries=6, retry_delay=5):\n",
+ " \"\"\"Fetch contributor weekly stats from the GitHub API.\n",
+ "\n",
+ " Returns a DataFrame where each row is a contributor with a 'weeks' column\n",
+ " containing a list of {w, a, d, c} dicts (matching linux_contributors.json format).\n",
+ " \"\"\"\n",
+ " url = f\"https://api.github.com/repos/{repo}/stats/contributors\"\n",
+ " headers = {\"Accept\": \"application/vnd.github+json\"}\n",
+ " if token:\n",
+ " headers[\"Authorization\"] = f\"Bearer {token}\"\n",
+ "\n",
+ " for attempt in range(1, max_retries + 1):\n",
+ " response = requests.get(url, headers=headers, timeout=30)\n",
+ " if response.status_code == 200:\n",
+ " data = response.json()\n",
+ " if data: # empty list means no contributors\n",
+ " return pd.DataFrame(data)\n",
+ " raise ValueError(f\"Repository '{repo}' has no contributor data.\")\n",
+ " if response.status_code == 202:\n",
+ " print(f\"GitHub is computing stats, retrying in {retry_delay}s (attempt {attempt}/{max_retries})...\")\n",
+ " time.sleep(retry_delay)\n",
+ " continue\n",
+ " if response.status_code == 404:\n",
+ " raise ValueError(f\"Repository '{repo}' not found. Check the owner/repo spelling.\")\n",
+ " if response.status_code == 403:\n",
+ " raise PermissionError(\"Rate limit exceeded or access denied. Set GITHUB_TOKEN to a valid token.\")\n",
+ " response.raise_for_status()\n",
+ "\n",
+ " raise TimeoutError(f\"GitHub did not return contributor stats after {max_retries} attempts.\")\n",
+ "\n",
+ "\n",
+ "contributors = fetch_contributors(REPO, token=GITHUB_TOKEN)\n",
+ "print(f\"Fetched {len(contributors)} contributors from '{REPO}'\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def fun(t, amplitude, mu):\n",
+ " # t is in weeks; divide by 52 to convert to years (mu is a per-year rate)\n",
+ " return amplitude / mu * (1 - np.exp(-mu * t / 52))\n",
+ "\n",
+ "\n",
+ "def lsq(x_observed, params):\n",
+ " t = np.arange(len(x_observed))\n",
+ " x_model = fun(t, *params)\n",
+ " err = x_model - x_observed\n",
+ " return np.dot(err, err) / len(err)\n",
+ "\n",
+ "\n",
+ "def model_fit(x, title=\"\"):\n",
+ " x = np.array(x, dtype=np.float64)\n",
+ " x_max = x.max()\n",
+ " if x_max == 0:\n",
+ " return None\n",
+ " x = x / x_max\n",
+ " res = minimize(\n",
+ " lambda params: lsq(x, params),\n",
+ " (1 / len(x), 0.1),\n",
+ " method='SLSQP',\n",
+ " bounds=((0, None), (0, None))\n",
+ " )\n",
+ " amplitude, mu = res.x\n",
+ " # effective maintenance ratio: mu normalized by amplitude scale factor\n",
+ " mu_effective = mu / amplitude\n",
+ " mx = fun(np.arange(len(x)), amplitude, mu)\n",
+ " label = f\"{title} — \" if title else \"\"\n",
+ " pd.Series(mx).plot(title=r\"{}$\\mu$ = {:3.2f}\".format(label, mu_effective))\n",
+ " pd.Series(x).plot()\n",
+ " plt.show()\n",
+ " return res"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Run analysis"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def main(df):\n",
+ " for _, entry in df.iterrows():\n",
+ " rm = dict()\n",
+ " for week in entry.weeks:\n",
+ " d = dict(week)\n",
+ " w = d['w']\n",
+ " if w in rm:\n",
+ " rm[w] += d['a'] + d['d']\n",
+ " else:\n",
+ " rm[w] = d['a'] + d['d']\n",
+ " sorted_items = sorted(rm.items())\n",
+ " t = pd.Series([v for _, v in sorted_items]).cumsum()\n",
+ " t = t[t != 0] # Remove zero-activity weeks\n",
+ " login = entry.get('author', {}) or {}\n",
+ " if isinstance(login, dict):\n",
+ " login = login.get('login', '')\n",
+ " model_fit(t, title=login)\n",
+ "\n",
+ "\n",
+ "plt.rcParams[\"figure.figsize\"] = (9, 4)\n",
+ "plt.style.use('fivethirtyeight')\n",
+ "main(contributors)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.9.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/github-core-network-stats.ipynb b/github-core-network-stats.ipynb
new file mode 100644
index 0000000..9eb3892
--- /dev/null
+++ b/github-core-network-stats.ipynb
@@ -0,0 +1,411 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Core Collaboration Network \u2014 Top 1000 GitHub Repos\n",
+ "\n",
+ "**Hypothesis (from JIRA analysis):** the _core_ collaboration network of a project is very small relative to its total contributor count.\n",
+ "\n",
+ "This notebook fetches the top 1000 GitHub repositories by stars, pulls contributor statistics for each, and computes:\n",
+ "\n",
+ "| Metric | What it measures |\n",
+ "|---|---|\n",
+ "| **80/20 core size** | minimum contributors accounting for 80% of all commits |\n",
+ "| **Gini coefficient** | inequality of contribution distribution (0 = equal, 1 = one person does everything) |\n",
+ "| **Top-1 share** | fraction of commits by the single most active contributor |\n",
+ "| **Temporal core** | contributors whose activity windows overlap with at least one other (potential collaborators) |\n",
+ "| **Core fraction** | core size / total contributors |\n",
+ "\n",
+ "Results are cached to disk so re-runs are instant."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Configuration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "GITHUB_TOKEN = \"\" # required \u2014 5000 req/hr authenticated vs 60 req/hr anonymous\n",
+ "MAX_REPOS = 1000 # top N repos by stars\n",
+ "CACHE_FILE = \"top_repos_cache.json\" # intermediate results saved here"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json, time\n",
+ "import requests\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import matplotlib.ticker as mticker\n",
+ "from pathlib import Path\n",
+ "from itertools import combinations\n",
+ "from collections import defaultdict"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Fetch top repositories"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def make_headers(token=''):\n",
+ " h = {\"Accept\": \"application/vnd.github+json\"}\n",
+ " if token:\n",
+ " h[\"Authorization\"] = f\"Bearer {token}\"\n",
+ " return h\n",
+ "\n",
+ "\n",
+ "def _get(url, headers, params=None, retries=4):\n",
+ " delay = 2\n",
+ " for attempt in range(retries):\n",
+ " r = requests.get(url, headers=headers, params=params, timeout=30)\n",
+ " if r.status_code == 200:\n",
+ " return r\n",
+ " if r.status_code in (202, 429, 500, 502, 503):\n",
+ " time.sleep(delay)\n",
+ " delay *= 2\n",
+ " continue\n",
+ " if r.status_code == 403:\n",
+ " raise PermissionError(\"Rate limit hit \u2014 set GITHUB_TOKEN.\")\n",
+ " return None # 404, 451, etc.\n",
+ " return None\n",
+ "\n",
+ "\n",
+ "def fetch_top_repos(headers, n=1000):\n",
+ " print(f\"Fetching top {n} repos by stars...\")\n",
+ " repos, page = [], 1\n",
+ " while len(repos) < n:\n",
+ " r = _get(\n",
+ " \"https://api.github.com/search/repositories\",\n",
+ " headers,\n",
+ " {\"q\": \"stars:>100\", \"sort\": \"stars\", \"order\": \"desc\", \"per_page\": 100, \"page\": page},\n",
+ " )\n",
+ " if r is None:\n",
+ " break\n",
+ " items = r.json().get('items', [])\n",
+ " if not items:\n",
+ " break\n",
+ " repos.extend(items)\n",
+ " if \"next\" not in r.links:\n",
+ " break\n",
+ " page += 1\n",
+ " time.sleep(0.3)\n",
+ " repos = repos[:n]\n",
+ " print(f\"Got {len(repos)} repos\")\n",
+ " return repos"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Fetch contributor stats (with disk cache)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def fetch_contributor_stats(owner, repo, headers):\n",
+ " \"\"\"Fetch /stats/contributors, retrying on 202 (GitHub is computing).\"\"\"\n",
+ " url = f'https://api.github.com/repos/{owner}/{repo}/stats/contributors'\n",
+ " r = _get(url, headers)\n",
+ " return r.json() if r and isinstance(r.json(), list) else None\n",
+ "\n",
+ "\n",
+ "def fetch_all_stats(repos, headers, cache_file='top_repos_cache.json'):\n",
+ " cache = {}\n",
+ " p = Path(cache_file)\n",
+ " if p.exists():\n",
+ " cache = json.loads(p.read_text())\n",
+ " print(f\"Cache loaded: {len(cache)} repos\")\n",
+ "\n",
+ " results = []\n",
+ " for i, repo in enumerate(repos):\n",
+ " full_name = repo['full_name']\n",
+ " owner, name = full_name.split('/', 1)\n",
+ "\n",
+ " if full_name not in cache:\n",
+ " cache[full_name] = fetch_contributor_stats(owner, name, headers)\n",
+ " time.sleep(0.15)\n",
+ "\n",
+ " stats = cache[full_name]\n",
+ " if stats:\n",
+ " results.append({\n",
+ " 'repo': full_name,\n",
+ " 'stars': repo['stargazers_count'],\n",
+ " 'forks': repo['forks_count'],\n",
+ " 'language': repo.get('language'),\n",
+ " 'contributors': stats,\n",
+ " })\n",
+ "\n",
+ " if (i + 1) % 100 == 0:\n",
+ " p.write_text(json.dumps(cache)) # checkpoint\n",
+ " print(f\" {i + 1}/{len(repos)} repos processed ({len(results)} with data)\")\n",
+ "\n",
+ " p.write_text(json.dumps(cache))\n",
+ " print(f\"Done \u2014 {len(results)}/{len(repos)} repos returned contributor data\")\n",
+ " return results"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Compute per-repo metrics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def gini(values):\n",
+ " \"\"\"Gini coefficient: 0 = perfect equality, 1 = maximum inequality.\"\"\"\n",
+ " v = np.sort(np.array(values, dtype=float))\n",
+ " n = len(v)\n",
+ " if n == 0 or v.sum() == 0:\n",
+ " return 0.0\n",
+ " ix = np.arange(1, n + 1)\n",
+ " return float((2 * ix.dot(v) / (n * v.sum())) - (n + 1) / n)\n",
+ "\n",
+ "\n",
+ "def pareto_core(totals, threshold=0.8):\n",
+ " \"\"\"Minimum contributors accounting for `threshold` fraction of total commits.\"\"\"\n",
+ " desc = sorted(totals, reverse=True)\n",
+ " total = sum(desc)\n",
+ " if total == 0:\n",
+ " return 0\n",
+ " cumsum = 0\n",
+ " for i, t in enumerate(desc):\n",
+ " cumsum += t\n",
+ " if cumsum >= threshold * total:\n",
+ " return i + 1\n",
+ " return len(totals)\n",
+ "\n",
+ "\n",
+ "def activity_window(weeks):\n",
+ " active = [w['w'] for w in weeks if w.get('c', 0) + w.get('a', 0) + w.get('d', 0) > 0]\n",
+ " return (min(active), max(active)) if active else None\n",
+ "\n",
+ "\n",
+ "def temporal_core_size(contributors):\n",
+ " \"\"\"Contributors whose activity window overlaps with at least one other (sweep-line O(n log n)).\"\"\"\n",
+ " windows = [w for c in contributors for w in [activity_window(c.get('weeks', []))] if w]\n",
+ " if len(windows) < 2:\n",
+ " return len(windows)\n",
+ " windows.sort(key=lambda w: w[0])\n",
+ " has_overlap = [False] * len(windows)\n",
+ " active = [] # (end_time, index)\n",
+ " for i, (start, end) in enumerate(windows):\n",
+ " active = [(e, j) for e, j in active if e >= start]\n",
+ " if active:\n",
+ " has_overlap[i] = True\n",
+ " for _, j in active:\n",
+ " has_overlap[j] = True\n",
+ " active.append((end, i))\n",
+ " return sum(has_overlap)\n",
+ "\n",
+ "\n",
+ "def compute_metrics(repo_data):\n",
+ " records = []\n",
+ " for repo in repo_data:\n",
+ " contributors = repo['contributors']\n",
+ " totals = [c.get('total', 0) for c in contributors if c.get('total', 0) > 0]\n",
+ " if not totals:\n",
+ " continue\n",
+ " total_commits = sum(totals)\n",
+ " records.append({\n",
+ " 'repo': repo['repo'],\n",
+ " 'stars': repo['stars'],\n",
+ " 'forks': repo['forks'],\n",
+ " 'language': repo['language'],\n",
+ " 'n_contributors': len(totals),\n",
+ " 'total_commits': total_commits,\n",
+ " 'gini': gini(totals),\n",
+ " 'core_80': pareto_core(totals, 0.8),\n",
+ " 'core_50': pareto_core(totals, 0.5),\n",
+ " 'top1_share': max(totals) / total_commits,\n",
+ " 'temporal_core': temporal_core_size(contributors),\n",
+ " })\n",
+ " df = pd.DataFrame(records)\n",
+ " df['core_fraction'] = df['core_80'] / df['n_contributors']\n",
+ " return df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Visualise\n",
+ "\n",
+ "Six panels testing the hypothesis that the core collaboration network is small:\n",
+ "1. Distribution of 80/20 core sizes\n",
+ "2. Core size vs total contributors (log-log)\n",
+ "3. Gini coefficient distribution\n",
+ "4. CDF \u2014 fraction of repos with core \u2264 N\n",
+ "5. Top contributor's share of commits\n",
+ "6. Median core size by programming language"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def plot_stats(df):\n",
+ " fig, axes = plt.subplots(2, 3, figsize=(18, 10))\n",
+ " fig.suptitle(\n",
+ " f'Core Collaboration Network \u2014 Top {len(df)} GitHub Repos\\n'\n",
+ " f'Median contributors: {df[\"n_contributors\"].median():.0f} | '\n",
+ " f'Median 80/20 core: {df[\"core_80\"].median():.0f} | '\n",
+ " f'Median Gini: {df[\"gini\"].median():.2f}',\n",
+ " fontsize=13,\n",
+ " )\n",
+ "\n",
+ " # 1. 80/20 core size histogram\n",
+ " ax = axes[0, 0]\n",
+ " ax.hist(df['core_80'].clip(upper=50), bins=40, edgecolor='white')\n",
+ " ax.axvline(df['core_80'].median(), color='red', linestyle='--',\n",
+ " label=f\"Median: {df['core_80'].median():.0f}\")\n",
+ " ax.set_xlabel('80/20 Core Size (contributors)')\n",
+ " ax.set_ylabel('Repos')\n",
+ " ax.set_title('80/20 Core Size Distribution')\n",
+ " ax.legend()\n",
+ "\n",
+ " # 2. Core vs total (log-log scatter)\n",
+ " ax = axes[0, 1]\n",
+ " ax.scatter(df['n_contributors'], df['core_80'], alpha=0.25, s=8, color='steelblue')\n",
+ " lims = [1, df['n_contributors'].max() * 1.5]\n",
+ " ax.plot(lims, lims, 'r--', linewidth=0.8, label='core = total')\n",
+ " ax.set_xscale('log'); ax.set_yscale('log')\n",
+ " ax.set_xlabel('Total contributors (log)')\n",
+ " ax.set_ylabel('80/20 Core size (log)')\n",
+ " ax.set_title('Core vs Total Contributors')\n",
+ " ax.legend(fontsize=8)\n",
+ "\n",
+ " # 3. Gini coefficient\n",
+ " ax = axes[0, 2]\n",
+ " ax.hist(df['gini'], bins=40, edgecolor='white')\n",
+ " ax.axvline(df['gini'].median(), color='red', linestyle='--',\n",
+ " label=f\"Median: {df['gini'].median():.2f}\")\n",
+ " ax.set_xlabel('Gini Coefficient')\n",
+ " ax.set_ylabel('Repos')\n",
+ " ax.set_title('Contribution Inequality (Gini)')\n",
+ " ax.legend()\n",
+ "\n",
+ " # 4. CDF of core size\n",
+ " ax = axes[1, 0]\n",
+ " sorted_core = np.sort(df['core_80'])\n",
+ " cdf = np.arange(1, len(sorted_core) + 1) / len(sorted_core)\n",
+ " ax.plot(sorted_core, cdf, linewidth=2)\n",
+ " for n in [3, 5, 10, 20]:\n",
+ " pct = (df['core_80'] <= n).mean()\n",
+ " ax.axvline(n, color='grey', linestyle=':', linewidth=0.8)\n",
+ " ax.text(n, 0.05, f'{pct:.0%}\\n\u2264{n}', ha='center', fontsize=7)\n",
+ " ax.set_xlabel('Core Size')\n",
+ " ax.set_ylabel('Fraction of repos')\n",
+ " ax.set_title('CDF: Repos with Core \u2264 N')\n",
+ " ax.set_xlim(0, 50)\n",
+ "\n",
+ " # 5. Top-1 contributor share\n",
+ " ax = axes[1, 1]\n",
+ " ax.hist(df['top1_share'], bins=40, edgecolor='white')\n",
+ " ax.axvline(df['top1_share'].median(), color='red', linestyle='--',\n",
+ " label=f\"Median: {df['top1_share'].median():.0%}\")\n",
+ " ax.xaxis.set_major_formatter(mticker.PercentFormatter(1.0))\n",
+ " ax.set_xlabel('Top Contributor\\'s Share')\n",
+ " ax.set_ylabel('Repos')\n",
+ " ax.set_title('Top-1 Contributor Dominance')\n",
+ " ax.legend()\n",
+ "\n",
+ " # 6. Language breakdown\n",
+ " ax = axes[1, 2]\n",
+ " lang = (df.groupby('language')['core_80']\n",
+ " .agg(median='median', count='count')\n",
+ " .query('count >= 10')\n",
+ " .sort_values('median'))\n",
+ " lang['median'].plot(kind='barh', ax=ax)\n",
+ " ax.set_xlabel('Median 80/20 Core Size')\n",
+ " ax.set_title('Core Size by Language (n \u2265 10 repos)')\n",
+ "\n",
+ " plt.tight_layout()\n",
+ " plt.show()\n",
+ "\n",
+ " print('\\n=== Summary ===')\n",
+ " print(f'Repos analysed: {len(df)}')\n",
+ " print(f'Median total contributors: {df[\"n_contributors\"].median():.0f}')\n",
+ " print(f'Median 80/20 core: {df[\"core_80\"].median():.0f}')\n",
+ " print(f'Median core fraction: {df[\"core_fraction\"].median():.1%}')\n",
+ " print(f'Median Gini: {df[\"gini\"].median():.3f}')\n",
+ " print(f'Median top-1 share: {df[\"top1_share\"].median():.1%}')\n",
+ " print(f'Repos where core \u2264 3: {(df[\"core_80\"] <= 3).mean():.1%}')\n",
+ " print(f'Repos where core \u2264 5: {(df[\"core_80\"] <= 5).mean():.1%}')\n",
+ " print(f'Repos where core \u2264 10: {(df[\"core_80\"] <= 10).mean():.1%}')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Run"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HEADERS = make_headers(GITHUB_TOKEN)\n",
+ "\n",
+ "repos = fetch_top_repos(HEADERS, n=MAX_REPOS)\n",
+ "raw_data = fetch_all_stats(repos, HEADERS, cache_file=CACHE_FILE)\n",
+ "\n",
+ "df = compute_metrics(raw_data)\n",
+ "print(df.describe())\n",
+ "\n",
+ "plt.style.use('fivethirtyeight')\n",
+ "plot_stats(df)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.9.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file