From 209ac32cc3a88dce271c54eb819484417a813a64 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 30 Mar 2026 21:11:41 +0000 Subject: [PATCH 1/4] Add github-contributor-analysis notebook Fetches contributor stats from any GitHub repo via the API and runs the same nonlinear maintenance model fitting as nonlinear-performance.ipynb. Handles 202 retry, rate limit errors, and missing/empty repos. https://claude.ai/code/session_01VAzMQrsv5yZXFmC4BxGrQ2 --- github-contributor-analysis.ipynb | 198 ++++++++++++++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 github-contributor-analysis.ipynb diff --git a/github-contributor-analysis.ipynb b/github-contributor-analysis.ipynb new file mode 100644 index 0000000..f7908b7 --- /dev/null +++ b/github-contributor-analysis.ipynb @@ -0,0 +1,198 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GitHub Contributor Nonlinear Performance Analysis\n", + "\n", + "Fetches contributor statistics from a GitHub repository and fits the nonlinear maintenance model\n", + "\n", + "$$w(t) = \\frac{\\text{amplitude}}{\\mu}\\left(1 - e^{-\\mu t}\\right)$$\n", + "\n", + "to each contributor's cumulative activity over time." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuration\n", + "\n", + "Set `REPO` to `\"owner/repo\"`. Optionally set `GITHUB_TOKEN` to avoid rate limiting (required for private repos)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "REPO = \"torvalds/linux\" # change to any public repo, e.g. 
\"pandas-dev/pandas\"\n", + "GITHUB_TOKEN = \"\" # optional: set a personal access token to raise rate limits" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import requests\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "from scipy.optimize import minimize" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetch contributor statistics\n", + "\n", + "GitHub may return HTTP 202 while it computes statistics for the first time — the cell retries until data is ready." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def fetch_contributors(repo, token=\"\", max_retries=6, retry_delay=5):\n", + " \"\"\"Fetch contributor weekly stats from the GitHub API.\n", + "\n", + " Returns a DataFrame where each row is a contributor with a 'weeks' column\n", + " containing a list of {w, a, d, c} dicts (matching linux_contributors.json format).\n", + " \"\"\"\n", + " url = f\"https://api.github.com/repos/{repo}/stats/contributors\"\n", + " headers = {\"Accept\": \"application/vnd.github+json\"}\n", + " if token:\n", + " headers[\"Authorization\"] = f\"Bearer {token}\"\n", + "\n", + " for attempt in range(1, max_retries + 1):\n", + " response = requests.get(url, headers=headers, timeout=30)\n", + " if response.status_code == 200:\n", + " data = response.json()\n", + " if data: # empty list means no contributors\n", + " return pd.DataFrame(data)\n", + " raise ValueError(f\"Repository '{repo}' has no contributor data.\")\n", + " if response.status_code == 202:\n", + " print(f\"GitHub is computing stats, retrying in {retry_delay}s (attempt {attempt}/{max_retries})...\")\n", + " time.sleep(retry_delay)\n", + " continue\n", + " if response.status_code == 404:\n", + " raise ValueError(f\"Repository '{repo}' not found. 
Check the owner/repo spelling.\")\n", + " if response.status_code == 403:\n", + " raise PermissionError(\"Rate limit exceeded or access denied. Set GITHUB_TOKEN to a valid token.\")\n", + " response.raise_for_status()\n", + "\n", + " raise TimeoutError(f\"GitHub did not return contributor stats after {max_retries} attempts.\")\n", + "\n", + "\n", + "contributors = fetch_contributors(REPO, token=GITHUB_TOKEN)\n", + "print(f\"Fetched {len(contributors)} contributors from '{REPO}'\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def fun(t, amplitude, mu):\n", + " # t is in weeks; divide by 52 to convert to years (mu is a per-year rate)\n", + " return amplitude / mu * (1 - np.exp(-mu * t / 52))\n", + "\n", + "\n", + "def lsq(x_observed, params):\n", + " t = np.arange(len(x_observed))\n", + " x_model = fun(t, *params)\n", + " err = x_model - x_observed\n", + " return np.dot(err, err) / len(err)\n", + "\n", + "\n", + "def model_fit(x, title=\"\"):\n", + " x = np.array(x, dtype=np.float64)\n", + " x_max = x.max()\n", + " if x_max == 0:\n", + " return None\n", + " x = x / x_max\n", + " res = minimize(\n", + " lambda params: lsq(x, params),\n", + " (1 / len(x), 0.1),\n", + " method='SLSQP',\n", + " bounds=((0, None), (0, None))\n", + " )\n", + " amplitude, mu = res.x\n", + " # effective maintenance ratio: mu normalized by amplitude scale factor\n", + " mu_effective = mu / amplitude\n", + " mx = fun(np.arange(len(x)), amplitude, mu)\n", + " label = f\"{title} — \" if title else \"\"\n", + " pd.Series(mx).plot(title=r\"{}$\\mu$ = {:3.2f}\".format(label, mu_effective))\n", + " pd.Series(x).plot()\n", + " plt.show()\n", + " return res" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [ + "def main(df):\n", + " for _, entry in df.iterrows():\n", + " rm = dict()\n", + " for week in entry.weeks:\n", + " d = dict(week)\n", + " w = d['w']\n", + " if w in rm:\n", + " rm[w] += d['a'] + d['d']\n", + " else:\n", + " rm[w] = d['a'] + d['d']\n", + " sorted_items = sorted(rm.items())\n", + " t = pd.Series([v for _, v in sorted_items]).cumsum()\n", + " t = t[t != 0] # Remove zero-activity weeks\n", + " login = entry.get('author', {}) or {}\n", + " if isinstance(login, dict):\n", + " login = login.get('login', '')\n", + " model_fit(t, title=login)\n", + "\n", + "\n", + "plt.rcParams[\"figure.figsize\"] = (9, 4)\n", + "plt.style.use('fivethirtyeight')\n", + "main(contributors)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From f36055b4df9f9ed7ac5b6dd481199618ae792eb8 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 18 Apr 2026 16:33:02 +0000 Subject: [PATCH 2/4] Add git-collaboration-network notebook Builds a contributor collaboration network from GitHub PR and commit history, and a code archaeology heatmap showing how deep into history a contributor must reach to understand code areas they arrived at after others had left. https://claude.ai/code/session_01VAzMQrsv5yZXFmC4BxGrQ2 --- git-collaboration-network.ipynb | 404 ++++++++++++++++++++++++++++++++ 1 file changed, 404 insertions(+) create mode 100644 git-collaboration-network.ipynb diff --git a/git-collaboration-network.ipynb b/git-collaboration-network.ipynb new file mode 100644 index 0000000..fad3795 --- /dev/null +++ b/git-collaboration-network.ipynb @@ -0,0 +1,404 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Collaborative Network from Git History\n", + "\n", + "Builds two views of contributor collaboration from a GitHub repository:\n", + "\n", + "1. 
**Network graph** \u2014 nodes are contributors, edges come from:\n", + " - *PR collaboration*: both participated (author / reviewer) in the same pull request\n", + " - *Temporal file overlap*: both touched the same code area **and** their activity windows overlap in time\n", + "2. **Code Archaeology heatmap** \u2014 for each (contributor, code area) pair, measures how many days before the contributor arrived that area was last touched by someone else" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "REPO = \"torvalds/linux\" # change to any owner/repo\n", + "GITHUB_TOKEN = \"\" # set a personal access token to raise rate limits\n", + "MAX_PRS = 100 # merged PRs to analyse (increase for deeper history)\n", + "MAX_COMMIT_PAGES = 5 # x100 commits each, for activity-window computation\n", + "MIN_CONTRIBUTIONS = 2 # exclude contributors below this threshold\n", + "FILE_GROUP_DEPTH = 1 # directory depth for code-area grouping (1 = top-level dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import requests\n", + "import pandas as pd\n", + "import numpy as np\n", + "import networkx as nx\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.patches as mpatches\n", + "import seaborn as sns\n", + "from collections import defaultdict\n", + "from itertools import combinations" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetch data from GitHub" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def make_headers(token=''):\n", + " h = {\"Accept\": \"application/vnd.github+json\"}\n", + " if token:\n", + " h[\"Authorization\"] = f\"Bearer {token}\"\n", + " return h\n", + "\n", + "\n", + "def paginate(url, headers, params=None, 
max_pages=None):\n", + " results, page = [], 1\n", + " while True:\n", + " p = {**(params or {}), 'page': page, 'per_page': 100}\n", + " r = requests.get(url, headers=headers, params=p, timeout=30)\n", + " if r.status_code == 403:\n", + " raise PermissionError(\"Rate limit hit \u2014 set GITHUB_TOKEN.\")\n", + " if r.status_code == 404:\n", + " raise ValueError(f\"Not found: {url}\")\n", + " r.raise_for_status()\n", + " data = r.json()\n", + " if not data:\n", + " break\n", + " results.extend(data)\n", + " if (max_pages and page >= max_pages) or \"next\" not in r.links:\n", + " break\n", + " page += 1\n", + " return results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def fetch_pr_data(repo, headers, max_prs=100):\n", + " print(f\"Fetching PRs from {repo}...\")\n", + " raw = paginate(\n", + " f\"https://api.github.com/repos/{repo}/pulls\",\n", + " headers, {\"state\": \"closed\"},\n", + " max_pages=(max_prs // 100 + 1) if max_prs else None,\n", + " )\n", + " merged = [pr for pr in raw if pr.get('merged_at')][:max_prs]\n", + "\n", + " records = []\n", + " for i, pr in enumerate(merged):\n", + " num = pr['number']\n", + " author = (pr.get('user') or {}).get('login')\n", + " date = pd.to_datetime(pr['merged_at'])\n", + "\n", + " reviews = paginate(\n", + " f\"https://api.github.com/repos/{repo}/pulls/{num}/reviews\", headers\n", + " )\n", + " reviewers = list(\n", + " {(r.get('user') or {}).get('login') for r in reviews} - {None, author}\n", + " )\n", + "\n", + " files_data = paginate(\n", + " f\"https://api.github.com/repos/{repo}/pulls/{num}/files\", headers\n", + " )\n", + " files = [f['filename'] for f in files_data]\n", + "\n", + " records.append({\n", + " 'pr': num, 'author': author,\n", + " 'reviewers': reviewers, 'files': files, 'date': date,\n", + " })\n", + " if (i + 1) % 10 == 0:\n", + " print(f\" {i + 1}/{len(merged)} PRs\")\n", + " time.sleep(0.05)\n", + "\n", + " print(f\"Done \u2014 
{len(records)} merged PRs loaded\")\n", + " return records\n", + "\n", + "\n", + "def fetch_commit_activity(repo, headers, max_pages=5):\n", + " print(\"Fetching commit history for activity windows...\")\n", + " commits = paginate(\n", + " f\"https://api.github.com/repos/{repo}/commits\", headers, max_pages=max_pages\n", + " )\n", + " activity = defaultdict(list)\n", + " for c in commits:\n", + " login = (c.get('author') or {}).get('login')\n", + " date_str = (c.get('commit', {}).get('author') or {}).get('date')\n", + " if login and date_str:\n", + " activity[login].append(pd.to_datetime(date_str))\n", + " print(f\"Found activity for {len(activity)} contributors\")\n", + " return dict(activity)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build collaboration network" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def build_activity_windows(pr_records, commit_activity, min_contributions=2):\n", + " \"\"\"Merge PR and commit dates into per-contributor (start, end) windows.\"\"\"\n", + " dates = defaultdict(list)\n", + " for pr in pr_records:\n", + " for person in [pr['author']] + pr['reviewers']:\n", + " if person:\n", + " dates[person].append(pr['date'])\n", + " for person, ds in commit_activity.items():\n", + " dates[person].extend(ds)\n", + " return {\n", + " p: (min(ds), max(ds))\n", + " for p, ds in dates.items()\n", + " if len(ds) >= min_contributions\n", + " }\n", + "\n", + "\n", + "def overlaps(w1, w2):\n", + " return w1[0] <= w2[1] and w2[0] <= w1[1]\n", + "\n", + "\n", + "def file_group(path, depth=1):\n", + " parts = path.split('/')\n", + " return '/'.join(parts[:depth]) if len(parts) > depth else parts[0]\n", + "\n", + "\n", + "def build_network(pr_records, activity_windows, file_group_depth=1):\n", + " G = nx.Graph()\n", + " G.add_nodes_from(activity_windows)\n", + "\n", + " # PR edges: both participated in the same pull request\n", + " for pr in 
pr_records:\n", + " participants = list({\n", + " p for p in [pr['author']] + pr['reviewers']\n", + " if p in activity_windows\n", + " })\n", + " for a, b in combinations(participants, 2):\n", + " if G.has_edge(a, b):\n", + " G[a][b]['pr_weight'] = G[a][b].get('pr_weight', 0) + 1\n", + " else:\n", + " G.add_edge(a, b, pr_weight=1, file_weight=0)\n", + "\n", + " # File + temporal-overlap edges: touched the same code area while both active\n", + " area_contribs = defaultdict(set)\n", + " for pr in pr_records:\n", + " participants = [p for p in [pr['author']] + pr['reviewers'] if p in activity_windows]\n", + " for f in pr['files']:\n", + " area_contribs[file_group(f, file_group_depth)].update(participants)\n", + "\n", + " for area, people in area_contribs.items():\n", + " for a, b in combinations(people, 2):\n", + " if overlaps(activity_windows[a], activity_windows[b]):\n", + " if G.has_edge(a, b):\n", + " G[a][b]['file_weight'] = G[a][b].get('file_weight', 0) + 1\n", + " else:\n", + " G.add_edge(a, b, pr_weight=0, file_weight=1)\n", + "\n", + " return G" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Network visualisation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_network(G, repo):\n", + " fig, ax = plt.subplots(figsize=(14, 10))\n", + " pos = nx.spring_layout(G, k=1.5, seed=42)\n", + "\n", + " pr_only = [(u, v) for u, v, d in G.edges(data=True) if d.get('pr_weight', 0) > 0 and d.get('file_weight', 0) == 0]\n", + " file_only = [(u, v) for u, v, d in G.edges(data=True) if d.get('pr_weight', 0) == 0 and d.get('file_weight', 0) > 0]\n", + " both = [(u, v) for u, v, d in G.edges(data=True) if d.get('pr_weight', 0) > 0 and d.get('file_weight', 0) > 0]\n", + "\n", + " node_sizes = [200 + 80 * G.degree(n) for n in G.nodes()]\n", + " nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color='steelblue', alpha=0.85, ax=ax)\n", + " nx.draw_networkx_labels(G, 
pos, font_size=7, ax=ax)\n", + " nx.draw_networkx_edges(G, pos, edgelist=pr_only, edge_color='#e07b39', width=1.5, alpha=0.7, ax=ax)\n", + " nx.draw_networkx_edges(G, pos, edgelist=file_only, edge_color='#4caf50', width=1.0, alpha=0.6, ax=ax)\n", + " nx.draw_networkx_edges(G, pos, edgelist=both, edge_color='#9c27b0', width=2.5, alpha=0.8, ax=ax)\n", + "\n", + " legend = [\n", + " mpatches.Patch(color='#e07b39', label='PR collaboration only'),\n", + " mpatches.Patch(color='#4caf50', label='Shared code area (temporal overlap) only'),\n", + " mpatches.Patch(color='#9c27b0', label='Both'),\n", + " ]\n", + " ax.legend(handles=legend, loc='upper left', fontsize=9)\n", + " ax.set_title(f'Collaboration Network \u2014 {repo}', fontsize=13)\n", + " ax.axis('off')\n", + " plt.tight_layout()\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Code Archaeology\n", + "\n", + "For each *(contributor, code area)* pair, the **archaeology depth** is the number of days between when a contributor first became active and when that code area was last touched by someone else before them.\n", + "\n", + "- **Large value** \u2192 contributor arrived long after others left; understanding the code requires digging deep into history\n", + "- **Zero / NaN** \u2192 contributor was contemporary with others who touched that area (no archaeology needed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def build_archaeology_matrix(pr_records, activity_windows, file_group_depth=1,\n", + " top_files=30, top_contributors=25):\n", + " area_timeline = defaultdict(list)\n", + " for pr in pr_records:\n", + " participants = [p for p in [pr['author']] + pr['reviewers'] if p in activity_windows]\n", + " for f in pr['files']:\n", + " g = file_group(f, file_group_depth)\n", + " for p in participants:\n", + " area_timeline[g].append((pr['date'], p))\n", + " for g in area_timeline:\n", + " 
area_timeline[g].sort()\n", + "\n", + " contrib_count = defaultdict(int)\n", + " for pr in pr_records:\n", + " for p in [pr['author']] + pr['reviewers']:\n", + " if p in activity_windows:\n", + " contrib_count[p] += 1\n", + " top_contribs = sorted(contrib_count, key=contrib_count.get, reverse=True)[:top_contributors]\n", + "\n", + " top_areas = sorted(\n", + " area_timeline, key=lambda g: len(area_timeline[g]), reverse=True\n", + " )[:top_files]\n", + "\n", + " matrix = pd.DataFrame(index=top_areas, columns=top_contribs, dtype=float)\n", + "\n", + " for area in top_areas:\n", + " for c in top_contribs:\n", + " c_start = activity_windows[c][0]\n", + " prior = [\n", + " date for date, person in area_timeline[area]\n", + " if person != c and date < c_start\n", + " ]\n", + " if prior:\n", + " matrix.loc[area, c] = float(max((c_start - max(prior)).days, 0))\n", + "\n", + " return matrix\n", + "\n", + "\n", + "def plot_archaeology(matrix, repo):\n", + " data = matrix.dropna(how='all').dropna(axis=1, how='all')\n", + " if data.empty:\n", + " print('No archaeology data: all contributors were fully contemporaneous.')\n", + " return\n", + "\n", + " h = max(6, len(data.index) * 0.35)\n", + " w = max(10, len(data.columns) * 0.55)\n", + " fig, ax = plt.subplots(figsize=(w, h))\n", + " sns.heatmap(\n", + " data.astype(float),\n", + " cmap='YlOrRd',\n", + " ax=ax,\n", + " linewidths=0.3,\n", + " cbar_kws={'label': 'Archaeology depth (days)'},\n", + " mask=data.isna(),\n", + " )\n", + " ax.set_title(\n", + " f'Code Archaeology \u2014 {repo}\\n'\n", + " 'Days since others last touched this code area before a contributor arrived',\n", + " fontsize=11,\n", + " )\n", + " ax.set_xlabel('Contributor')\n", + " ax.set_ylabel('Code Area')\n", + " plt.xticks(rotation=45, ha='right', fontsize=8)\n", + " plt.yticks(fontsize=7)\n", + " plt.tight_layout()\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "HEADERS = make_headers(GITHUB_TOKEN)\n", + "\n", + "pr_records = fetch_pr_data(REPO, HEADERS, max_prs=MAX_PRS)\n", + "commit_activity = fetch_commit_activity(REPO, HEADERS, max_pages=MAX_COMMIT_PAGES)\n", + "\n", + "activity_windows = build_activity_windows(pr_records, commit_activity,\n", + " min_contributions=MIN_CONTRIBUTIONS)\n", + "print(f'Active contributors (>={MIN_CONTRIBUTIONS} contributions): {len(activity_windows)}')\n", + "\n", + "G = build_network(pr_records, activity_windows, file_group_depth=FILE_GROUP_DEPTH)\n", + "print(f'Network: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges')\n", + "\n", + "plt.style.use('fivethirtyeight')\n", + "\n", + "plot_network(G, REPO)\n", + "\n", + "arch_matrix = build_archaeology_matrix(\n", + " pr_records, activity_windows,\n", + " file_group_depth=FILE_GROUP_DEPTH,\n", + ")\n", + "plot_archaeology(arch_matrix, REPO)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file From 7b0cd83e423d44e96e7d62f4e8c1e57befca4017 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 18 Apr 2026 16:39:15 +0000 Subject: [PATCH 3/4] Add github-core-network-stats notebook Tests the hypothesis that the core collaboration network is very small: fetches top 1000 GitHub repos by stars, computes 80/20 core size, Gini coefficient, temporal core, and top-contributor dominance per repo, then visualises aggregate distributions across the dataset. 
https://claude.ai/code/session_01VAzMQrsv5yZXFmC4BxGrQ2 --- github-core-network-stats.ipynb | 411 ++++++++++++++++++++++++++++++++ 1 file changed, 411 insertions(+) create mode 100644 github-core-network-stats.ipynb diff --git a/github-core-network-stats.ipynb b/github-core-network-stats.ipynb new file mode 100644 index 0000000..9eb3892 --- /dev/null +++ b/github-core-network-stats.ipynb @@ -0,0 +1,411 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Core Collaboration Network \u2014 Top 1000 GitHub Repos\n", + "\n", + "**Hypothesis (from JIRA analysis):** the _core_ collaboration network of a project is very small relative to its total contributor count.\n", + "\n", + "This notebook fetches the top 1000 GitHub repositories by stars, pulls contributor statistics for each, and computes:\n", + "\n", + "| Metric | What it measures |\n", + "|---|---|\n", + "| **80/20 core size** | minimum contributors accounting for 80% of all commits |\n", + "| **Gini coefficient** | inequality of contribution distribution (0 = equal, 1 = one person does everything) |\n", + "| **Top-1 share** | fraction of commits by the single most active contributor |\n", + "| **Temporal core** | contributors whose activity windows overlap with at least one other (potential collaborators) |\n", + "| **Core fraction** | core size / total contributors |\n", + "\n", + "Results are cached to disk so re-runs are instant." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "GITHUB_TOKEN = \"\" # required \u2014 5000 req/hr authenticated vs 60 req/hr anonymous\n", + "MAX_REPOS = 1000 # top N repos by stars\n", + "CACHE_FILE = \"top_repos_cache.json\" # intermediate results saved here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json, time\n", + "import requests\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.ticker as mticker\n", + "from pathlib import Path\n", + "from itertools import combinations\n", + "from collections import defaultdict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetch top repositories" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def make_headers(token=''):\n", + " h = {\"Accept\": \"application/vnd.github+json\"}\n", + " if token:\n", + " h[\"Authorization\"] = f\"Bearer {token}\"\n", + " return h\n", + "\n", + "\n", + "def _get(url, headers, params=None, retries=4):\n", + " delay = 2\n", + " for attempt in range(retries):\n", + " r = requests.get(url, headers=headers, params=params, timeout=30)\n", + " if r.status_code == 200:\n", + " return r\n", + " if r.status_code in (202, 429, 500, 502, 503):\n", + " time.sleep(delay)\n", + " delay *= 2\n", + " continue\n", + " if r.status_code == 403:\n", + " raise PermissionError(\"Rate limit hit \u2014 set GITHUB_TOKEN.\")\n", + " return None # 404, 451, etc.\n", + " return None\n", + "\n", + "\n", + "def fetch_top_repos(headers, n=1000):\n", + " print(f\"Fetching top {n} repos by stars...\")\n", + " repos, page = [], 1\n", + " while len(repos) < n:\n", + " r = _get(\n", + " 
\"https://api.github.com/search/repositories\",\n", + " headers,\n", + " {\"q\": \"stars:>100\", \"sort\": \"stars\", \"order\": \"desc\", \"per_page\": 100, \"page\": page},\n", + " )\n", + " if r is None:\n", + " break\n", + " items = r.json().get('items', [])\n", + " if not items:\n", + " break\n", + " repos.extend(items)\n", + " if \"next\" not in r.links:\n", + " break\n", + " page += 1\n", + " time.sleep(0.3)\n", + " repos = repos[:n]\n", + " print(f\"Got {len(repos)} repos\")\n", + " return repos" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetch contributor stats (with disk cache)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def fetch_contributor_stats(owner, repo, headers):\n", + " \"\"\"Fetch /stats/contributors, retrying on 202 (GitHub is computing).\"\"\"\n", + " url = f'https://api.github.com/repos/{owner}/{repo}/stats/contributors'\n", + " r = _get(url, headers)\n", + " return r.json() if r and isinstance(r.json(), list) else None\n", + "\n", + "\n", + "def fetch_all_stats(repos, headers, cache_file='top_repos_cache.json'):\n", + " cache = {}\n", + " p = Path(cache_file)\n", + " if p.exists():\n", + " cache = json.loads(p.read_text())\n", + " print(f\"Cache loaded: {len(cache)} repos\")\n", + "\n", + " results = []\n", + " for i, repo in enumerate(repos):\n", + " full_name = repo['full_name']\n", + " owner, name = full_name.split('/', 1)\n", + "\n", + " if full_name not in cache:\n", + " cache[full_name] = fetch_contributor_stats(owner, name, headers)\n", + " time.sleep(0.15)\n", + "\n", + " stats = cache[full_name]\n", + " if stats:\n", + " results.append({\n", + " 'repo': full_name,\n", + " 'stars': repo['stargazers_count'],\n", + " 'forks': repo['forks_count'],\n", + " 'language': repo.get('language'),\n", + " 'contributors': stats,\n", + " })\n", + "\n", + " if (i + 1) % 100 == 0:\n", + " p.write_text(json.dumps(cache)) # checkpoint\n", + " 
print(f\" {i + 1}/{len(repos)} repos processed ({len(results)} with data)\")\n", + "\n", + " p.write_text(json.dumps(cache))\n", + " print(f\"Done \u2014 {len(results)}/{len(repos)} repos returned contributor data\")\n", + " return results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compute per-repo metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def gini(values):\n", + " \"\"\"Gini coefficient: 0 = perfect equality, 1 = maximum inequality.\"\"\"\n", + " v = np.sort(np.array(values, dtype=float))\n", + " n = len(v)\n", + " if n == 0 or v.sum() == 0:\n", + " return 0.0\n", + " ix = np.arange(1, n + 1)\n", + " return float((2 * ix.dot(v) / (n * v.sum())) - (n + 1) / n)\n", + "\n", + "\n", + "def pareto_core(totals, threshold=0.8):\n", + " \"\"\"Minimum contributors accounting for `threshold` fraction of total commits.\"\"\"\n", + " desc = sorted(totals, reverse=True)\n", + " total = sum(desc)\n", + " if total == 0:\n", + " return 0\n", + " cumsum = 0\n", + " for i, t in enumerate(desc):\n", + " cumsum += t\n", + " if cumsum >= threshold * total:\n", + " return i + 1\n", + " return len(totals)\n", + "\n", + "\n", + "def activity_window(weeks):\n", + " active = [w['w'] for w in weeks if w.get('c', 0) + w.get('a', 0) + w.get('d', 0) > 0]\n", + " return (min(active), max(active)) if active else None\n", + "\n", + "\n", + "def temporal_core_size(contributors):\n", + " \"\"\"Contributors whose activity window overlaps with at least one other (sweep-line O(n log n)).\"\"\"\n", + " windows = [w for c in contributors for w in [activity_window(c.get('weeks', []))] if w]\n", + " if len(windows) < 2:\n", + " return len(windows)\n", + " windows.sort(key=lambda w: w[0])\n", + " has_overlap = [False] * len(windows)\n", + " active = [] # (end_time, index)\n", + " for i, (start, end) in enumerate(windows):\n", + " active = [(e, j) for e, j in active if e >= start]\n", + " if 
active:\n", + " has_overlap[i] = True\n", + " for _, j in active:\n", + " has_overlap[j] = True\n", + " active.append((end, i))\n", + " return sum(has_overlap)\n", + "\n", + "\n", + "def compute_metrics(repo_data):\n", + " records = []\n", + " for repo in repo_data:\n", + " contributors = repo['contributors']\n", + " totals = [c.get('total', 0) for c in contributors if c.get('total', 0) > 0]\n", + " if not totals:\n", + " continue\n", + " total_commits = sum(totals)\n", + " records.append({\n", + " 'repo': repo['repo'],\n", + " 'stars': repo['stars'],\n", + " 'forks': repo['forks'],\n", + " 'language': repo['language'],\n", + " 'n_contributors': len(totals),\n", + " 'total_commits': total_commits,\n", + " 'gini': gini(totals),\n", + " 'core_80': pareto_core(totals, 0.8),\n", + " 'core_50': pareto_core(totals, 0.5),\n", + " 'top1_share': max(totals) / total_commits,\n", + " 'temporal_core': temporal_core_size(contributors),\n", + " })\n", + " df = pd.DataFrame(records)\n", + " df['core_fraction'] = df['core_80'] / df['n_contributors']\n", + " return df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualise\n", + "\n", + "Six panels testing the hypothesis that the core collaboration network is small:\n", + "1. Distribution of 80/20 core sizes\n", + "2. Core size vs total contributors (log-log)\n", + "3. Gini coefficient distribution\n", + "4. CDF \u2014 fraction of repos with core \u2264 N\n", + "5. Top contributor's share of commits\n", + "6. 
Median core size by programming language" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_stats(df):\n", + " fig, axes = plt.subplots(2, 3, figsize=(18, 10))\n", + " fig.suptitle(\n", + " f'Core Collaboration Network \u2014 Top {len(df)} GitHub Repos\\n'\n", + " f'Median contributors: {df[\"n_contributors\"].median():.0f} | '\n", + " f'Median 80/20 core: {df[\"core_80\"].median():.0f} | '\n", + " f'Median Gini: {df[\"gini\"].median():.2f}',\n", + " fontsize=13,\n", + " )\n", + "\n", + " # 1. 80/20 core size histogram\n", + " ax = axes[0, 0]\n", + " ax.hist(df['core_80'].clip(upper=50), bins=40, edgecolor='white')\n", + " ax.axvline(df['core_80'].median(), color='red', linestyle='--',\n", + " label=f\"Median: {df['core_80'].median():.0f}\")\n", + " ax.set_xlabel('80/20 Core Size (contributors)')\n", + " ax.set_ylabel('Repos')\n", + " ax.set_title('80/20 Core Size Distribution')\n", + " ax.legend()\n", + "\n", + " # 2. Core vs total (log-log scatter)\n", + " ax = axes[0, 1]\n", + " ax.scatter(df['n_contributors'], df['core_80'], alpha=0.25, s=8, color='steelblue')\n", + " lims = [1, df['n_contributors'].max() * 1.5]\n", + " ax.plot(lims, lims, 'r--', linewidth=0.8, label='core = total')\n", + " ax.set_xscale('log'); ax.set_yscale('log')\n", + " ax.set_xlabel('Total contributors (log)')\n", + " ax.set_ylabel('80/20 Core size (log)')\n", + " ax.set_title('Core vs Total Contributors')\n", + " ax.legend(fontsize=8)\n", + "\n", + " # 3. Gini coefficient\n", + " ax = axes[0, 2]\n", + " ax.hist(df['gini'], bins=40, edgecolor='white')\n", + " ax.axvline(df['gini'].median(), color='red', linestyle='--',\n", + " label=f\"Median: {df['gini'].median():.2f}\")\n", + " ax.set_xlabel('Gini Coefficient')\n", + " ax.set_ylabel('Repos')\n", + " ax.set_title('Contribution Inequality (Gini)')\n", + " ax.legend()\n", + "\n", + " # 4. 
CDF of core size\n", + " ax = axes[1, 0]\n", + " sorted_core = np.sort(df['core_80'])\n", + " cdf = np.arange(1, len(sorted_core) + 1) / len(sorted_core)\n", + " ax.plot(sorted_core, cdf, linewidth=2)\n", + " for n in [3, 5, 10, 20]:\n", + " pct = (df['core_80'] <= n).mean()\n", + " ax.axvline(n, color='grey', linestyle=':', linewidth=0.8)\n", + " ax.text(n, 0.05, f'{pct:.0%}\\n\u2264{n}', ha='center', fontsize=7)\n", + " ax.set_xlabel('Core Size')\n", + " ax.set_ylabel('Fraction of repos')\n", + " ax.set_title('CDF: Repos with Core \u2264 N')\n", + " ax.set_xlim(0, 50)\n", + "\n", + " # 5. Top-1 contributor share\n", + " ax = axes[1, 1]\n", + " ax.hist(df['top1_share'], bins=40, edgecolor='white')\n", + " ax.axvline(df['top1_share'].median(), color='red', linestyle='--',\n", + " label=f\"Median: {df['top1_share'].median():.0%}\")\n", + " ax.xaxis.set_major_formatter(mticker.PercentFormatter(1.0))\n", + " ax.set_xlabel('Top Contributor\\'s Share')\n", + " ax.set_ylabel('Repos')\n", + " ax.set_title('Top-1 Contributor Dominance')\n", + " ax.legend()\n", + "\n", + " # 6. 
Language breakdown\n", + " ax = axes[1, 2]\n", + " lang = (df.groupby('language')['core_80']\n", + " .agg(median='median', count='count')\n", + " .query('count >= 10')\n", + " .sort_values('median'))\n", + " lang['median'].plot(kind='barh', ax=ax)\n", + " ax.set_xlabel('Median 80/20 Core Size')\n", + " ax.set_title('Core Size by Language (n \u2265 10 repos)')\n", + "\n", + " plt.tight_layout()\n", + " plt.show()\n", + "\n", + " print('\\n=== Summary ===')\n", + " print(f'Repos analysed: {len(df)}')\n", + " print(f'Median total contributors: {df[\"n_contributors\"].median():.0f}')\n", + " print(f'Median 80/20 core: {df[\"core_80\"].median():.0f}')\n", + " print(f'Median core fraction: {df[\"core_fraction\"].median():.1%}')\n", + " print(f'Median Gini: {df[\"gini\"].median():.3f}')\n", + " print(f'Median top-1 share: {df[\"top1_share\"].median():.1%}')\n", + " print(f'Repos where core \u2264 3: {(df[\"core_80\"] <= 3).mean():.1%}')\n", + " print(f'Repos where core \u2264 5: {(df[\"core_80\"] <= 5).mean():.1%}')\n", + " print(f'Repos where core \u2264 10: {(df[\"core_80\"] <= 10).mean():.1%}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "HEADERS = make_headers(GITHUB_TOKEN)\n", + "\n", + "repos = fetch_top_repos(HEADERS, n=MAX_REPOS)\n", + "raw_data = fetch_all_stats(repos, HEADERS, cache_file=CACHE_FILE)\n", + "\n", + "df = compute_metrics(raw_data)\n", + "print(df.describe())\n", + "\n", + "plt.style.use('fivethirtyeight')\n", + "plot_stats(df)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file From 2b8881855fd56ffcf640bfa0164600ab3e27c4eb Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 18 Apr 2026 
16:47:46 +0000 Subject: [PATCH 4/4] Add collaboration-network-growth animation notebook Renders GitHub PR collaboration history as a growing-network movie: dark background, plasma-coloured nodes (purple=early, orange=late), glow flash on new arrivals. Outputs a GIF/MP4 via matplotlib and an interactive Plotly animation with play button + time scrubber. https://claude.ai/code/session_01VAzMQrsv5yZXFmC4BxGrQ2 --- collaboration-network-growth.ipynb | 436 +++++++++++++++++++++++++++++ 1 file changed, 436 insertions(+) create mode 100644 collaboration-network-growth.ipynb diff --git a/collaboration-network-growth.ipynb b/collaboration-network-growth.ipynb new file mode 100644 index 0000000..e14ef49 --- /dev/null +++ b/collaboration-network-growth.ipynb @@ -0,0 +1,436 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Collaboration Network Growth Animation\n", + "\n", + "Renders the history of a GitHub project as a **movie**: the collaboration network grows from a single contributor outward, frame by frame, one time period at a time.\n", + "\n", + "**Two outputs:**\n", + "- `OUTPUT_FILE` \u2014 GIF or MP4 (dark background, glow nodes, suitable for sharing)\n", + "- Plotly figure \u2014 interactive version with a play button and time scrubber (runs in the notebook)\n", + "\n", + "**Layout:** spring layout computed once on the final graph, so well-connected (core) contributors cluster at the centre and peripheral contributors radiate outward. Nodes are revealed progressively; new arrivals flash brightly on entry.\n", + "\n", + "**Node colour** follows the `plasma` colourmap: early contributors are purple/blue, later arrivals are orange/yellow." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "REPO = \"facebook/react\"\n", + "GITHUB_TOKEN = \"\"\n", + "MAX_PRS = 300 # merged PRs to analyse \u2014 more = longer movie\n", + "TIME_RESOLUTION = 'Q' # 'M' monthly 'Q' quarterly 'Y' yearly\n", + "OUTPUT_FILE = 'collaboration_growth.gif' # .gif (Pillow) or .mp4 (ffmpeg)\n", + "FPS = 4 # frames per second" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time, requests\n", + "import numpy as np\n", + "import pandas as pd\n", + "import networkx as nx\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.colors as mcolors\n", + "from matplotlib.animation import FuncAnimation, PillowWriter\n", + "from collections import defaultdict\n", + "from itertools import combinations\n", + "try:\n", + " import plotly.graph_objects as go\n", + " HAS_PLOTLY = True\n", + "except ImportError:\n", + " HAS_PLOTLY = False\n", + " print('plotly not installed \u2014 skipping interactive output')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetch PR data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def make_headers(token=''):\n", + " h = {\"Accept\": \"application/vnd.github+json\"}\n", + " if token:\n", + " h[\"Authorization\"] = f\"Bearer {token}\"\n", + " return h\n", + "\n", + "\n", + "def paginate(url, headers, params=None, max_pages=None):\n", + " results, page = [], 1\n", + " while True:\n", + " p = {**(params or {}), 'page': page, 'per_page': 100}\n", + " r = requests.get(url, headers=headers, params=p, timeout=30)\n", + " if r.status_code == 403:\n", + " raise PermissionError(\"Rate limit hit \u2014 set GITHUB_TOKEN.\")\n", + " r.raise_for_status()\n", + " data = r.json()\n", + " if not 
data:\n", + " break\n", + " results.extend(data)\n", + " if (max_pages and page >= max_pages) or 'next' not in r.links:\n", + " break\n", + " page += 1\n", + " return results\n", + "\n", + "\n", + "def fetch_pr_data(repo, headers, max_prs=300):\n", + " print(f\"Fetching PRs from {repo}...\")\n", + " raw = paginate(\n", + " f\"https://api.github.com/repos/{repo}/pulls\",\n", + " headers, {\"state\": \"closed\"},\n", + " max_pages=(max_prs // 100 + 1) if max_prs else None,\n", + " )\n", + " merged = [pr for pr in raw if pr.get('merged_at')][:max_prs]\n", + "\n", + " records = []\n", + " for i, pr in enumerate(merged):\n", + " num = pr['number']\n", + " author = (pr.get('user') or {}).get('login')\n", + " date = pd.to_datetime(pr['merged_at'])\n", + " reviews = paginate(\n", + " f\"https://api.github.com/repos/{repo}/pulls/{num}/reviews\", headers\n", + " )\n", + " reviewers = list(\n", + " {(r.get('user') or {}).get('login') for r in reviews} - {None, author}\n", + " )\n", + " records.append({'pr': num, 'author': author, 'reviewers': reviewers, 'date': date})\n", + " if (i + 1) % 50 == 0:\n", + " print(f\" {i + 1}/{len(merged)} PRs\")\n", + " time.sleep(0.05)\n", + "\n", + " print(f\"Done \u2014 {len(records)} merged PRs\")\n", + " return records" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build temporal snapshots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def build_snapshots(pr_records, freq='Q'):\n", + " \"\"\"Cumulative network state at each time period.\"\"\"\n", + " for pr in pr_records:\n", + " pr['period'] = pr['date'].to_period(freq)\n", + " periods = sorted({pr['period'] for pr in pr_records})\n", + "\n", + " cum_nodes = {} # node -> {'first_seen': period, 'pr_count': int}\n", + " cum_edges = {} # (a,b) -> {'first_seen': period, 'weight': int}\n", + " snapshots = []\n", + "\n", + " for period in periods:\n", + " for pr in (p for p in pr_records if 
p['period'] == period):\n", + " participants = list({p for p in [pr['author']] + pr['reviewers'] if p})\n", + " for p in participants:\n", + " if p not in cum_nodes:\n", + " cum_nodes[p] = {'first_seen': period, 'pr_count': 0}\n", + " cum_nodes[p]['pr_count'] += 1\n", + " for a, b in combinations(participants, 2):\n", + " key = tuple(sorted([a, b]))\n", + " if key not in cum_edges:\n", + " cum_edges[key] = {'first_seen': period, 'weight': 0}\n", + " cum_edges[key]['weight'] += 1\n", + "\n", + " snapshots.append({\n", + " 'period': period,\n", + " 'nodes': dict(cum_nodes),\n", + " 'edges': dict(cum_edges),\n", + " })\n", + "\n", + " print(f'{len(snapshots)} time periods, '\n", + " f'{len(cum_nodes)} contributors, {len(cum_edges)} collaborations total')\n", + " return snapshots" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compute layout" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_layout(snapshots):\n", + " \"\"\"Spring layout on the final graph \u2014 core contributors cluster at the centre.\"\"\"\n", + " final = snapshots[-1]\n", + " G = nx.Graph()\n", + " G.add_nodes_from(final['nodes'])\n", + " G.add_edges_from(final['edges'])\n", + " pos = nx.spring_layout(G, k=2.0, iterations=120, seed=42)\n", + " # Normalise to [-1, 1]\n", + " coords = np.array(list(pos.values()))\n", + " centre = coords.mean(axis=0)\n", + " scale = np.abs(coords - centre).max()\n", + " return {n: tuple((np.array(xy) - centre) / (scale or 1)) for n, xy in pos.items()}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Matplotlib animation (GIF / MP4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "_BG = '#0d1117'\n", + "_CMAP = plt.cm.plasma\n", + "\n", + "\n", + "def _draw_frame(ax, snapshot, pos, period_idx, n_periods, prev_nodes):\n", + " ax.clear()\n", + " 
ax.set_facecolor(_BG)\n", + " ax.axis('off')\n", + " ax.set_xlim(-1.3, 1.3)\n", + " ax.set_ylim(-1.3, 1.3)\n", + "\n", + " nodes = snapshot['nodes']\n", + " edges = snapshot['edges']\n", + " new_nodes = set(nodes) - prev_nodes\n", + "\n", + " # --- edges ---\n", + " for (a, b), ed in edges.items():\n", + " if a not in pos or b not in pos:\n", + " continue\n", + " age = period_idx[snapshot['period']] - period_idx[ed['first_seen']]\n", + " alpha = min(0.55, 0.08 + 0.06 * age)\n", + " lw = 0.3 + 0.12 * min(ed['weight'], 6)\n", + " ax.plot([pos[a][0], pos[b][0]], [pos[a][1], pos[b][1]],\n", + " '-', color='#58a6ff', alpha=alpha, linewidth=lw, zorder=1)\n", + "\n", + " # --- nodes ---\n", + " top_nodes = {n for n, d in sorted(nodes.items(),\n", + " key=lambda x: x[1]['pr_count'], reverse=True)[:12]}\n", + " for node, nd in nodes.items():\n", + " if node not in pos:\n", + " continue\n", + " x, y = pos[node]\n", + " c_val = period_idx[nd['first_seen']] / max(1, n_periods - 1)\n", + " color = _CMAP(c_val)\n", + " size = 25 + 8 * min(nd['pr_count'], 30)\n", + " is_new = node in new_nodes\n", + " # glow ring\n", + " ax.scatter(x, y, s=size * (5 if is_new else 3),\n", + " color=color, alpha=0.45 if is_new else 0.12, zorder=2)\n", + " # core dot\n", + " ax.scatter(x, y, s=size, color=color, alpha=0.95, zorder=3)\n", + " # label for prominent contributors\n", + " if node in top_nodes:\n", + " ax.text(x, y + 0.09, node, ha='center', fontsize=5,\n", + " color='#e6edf3', alpha=0.85, zorder=4)\n", + "\n", + " # --- overlay ---\n", + " ax.text(0.02, 0.97, str(snapshot['period']),\n", + " transform=ax.transAxes, color='white', fontsize=14,\n", + " va='top', fontweight='bold')\n", + " ax.text(0.02, 0.91,\n", + " f\"{len(nodes)} contributors \u2022 {len(edges)} collaborations\",\n", + " transform=ax.transAxes, color='#8b949e', fontsize=8, va='top')\n", + "\n", + "\n", + "def make_animation(snapshots, pos, fps=4, output='collaboration_growth.gif'):\n", + " period_idx = {s['period']: i 
for i, s in enumerate(snapshots)}\n", + " n_periods = len(snapshots)\n", + " prev_list = [set()] + [set(snapshots[i - 1]['nodes']) for i in range(1, n_periods)]\n", + "\n", + " fig, ax = plt.subplots(figsize=(10, 10), facecolor=_BG)\n", + " fig.tight_layout(pad=0)\n", + "\n", + " def update(i):\n", + " _draw_frame(ax, snapshots[i], pos, period_idx, n_periods, prev_list[i])\n", + "\n", + " anim = FuncAnimation(fig, update, frames=n_periods,\n", + " interval=1000 // fps, repeat=True)\n", + "\n", + " if output.endswith('.gif'):\n", + " anim.save(output, writer=PillowWriter(fps=fps), dpi=120)\n", + " else:\n", + " from matplotlib.animation import FFMpegWriter\n", + " anim.save(output, writer=FFMpegWriter(fps=fps, bitrate=2000), dpi=150)\n", + " plt.close()\n", + " print(f'Saved \u2192 {output}')\n", + " return anim" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plotly interactive animation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def make_plotly_animation(snapshots, pos, repo=''):\n", + " period_idx = {s['period']: i for i, s in enumerate(snapshots)}\n", + " n_periods = len(snapshots)\n", + " cmap = plt.cm.plasma\n", + "\n", + " def rgba(period):\n", + " r, g, b, _ = cmap(period_idx[period] / max(1, n_periods - 1))\n", + " return f'rgb({int(r*255)},{int(g*255)},{int(b*255)})'\n", + "\n", + " def snapshot_traces(snapshot):\n", + " nodes, edges = snapshot['nodes'], snapshot['edges']\n", + " ex, ey = [], []\n", + " for (a, b) in edges:\n", + " if a in pos and b in pos:\n", + " ex += [pos[a][0], pos[b][0], None]\n", + " ey += [pos[a][1], pos[b][1], None]\n", + " present = [(n, d) for n, d in nodes.items() if n in pos]\n", + " nx_c = [pos[n][0] for n, _ in present]\n", + " ny_c = [pos[n][1] for n, _ in present]\n", + " tips = [f'{n}
<br>since {d[\"first_seen\"]}<br>
{d[\"pr_count\"]} PRs'\n", + " for n, d in present]\n", + " colors = [rgba(d['first_seen']) for _, d in present]\n", + " sizes = [7 + 2 * min(d['pr_count'], 20) for _, d in present]\n", + " return [\n", + " go.Scatter(x=ex, y=ey, mode='lines',\n", + " line=dict(color='#4a90d9', width=0.6), opacity=0.35,\n", + " hoverinfo='skip'),\n", + " go.Scatter(x=nx_c, y=ny_c, mode='markers',\n", + " text=[n for n, _ in present],\n", + " hovertext=tips, hoverinfo='text',\n", + " marker=dict(size=sizes, color=colors, opacity=0.92,\n", + " line=dict(width=0))),\n", + " ]\n", + "\n", + " frames = [go.Frame(name=str(s['period']), data=snapshot_traces(s))\n", + " for s in snapshots]\n", + "\n", + " fig = go.Figure(\n", + " data=frames[0].data,\n", + " frames=frames,\n", + " layout=go.Layout(\n", + " paper_bgcolor=_BG, plot_bgcolor=_BG,\n", + " showlegend=False,\n", + " xaxis=dict(visible=False, range=[-1.4, 1.4]),\n", + " yaxis=dict(visible=False, range=[-1.4, 1.4], scaleanchor='x'),\n", + " margin=dict(l=10, r=10, t=50, b=60),\n", + " title=dict(text=f'Collaboration Network Growth \u2014 {repo}',\n", + " font=dict(color='white', size=15)),\n", + " updatemenus=[dict(\n", + " type='buttons', showactive=False,\n", + " y=-0.08, x=0.5, xanchor='center',\n", + " buttons=[\n", + " dict(label='\u25b6 Play', method='animate',\n", + " args=[None, dict(frame=dict(duration=700, redraw=True),\n", + " fromcurrent=True)]),\n", + " dict(label='\u23f8 Pause', method='animate',\n", + " args=[[None], dict(frame=dict(duration=0),\n", + " mode='immediate')]),\n", + " ]\n", + " )],\n", + " sliders=[dict(\n", + " currentvalue=dict(prefix='Period: ', font=dict(color='white')),\n", + " font=dict(color='white'),\n", + " pad=dict(t=10),\n", + " steps=[dict(\n", + " args=[[str(s['period'])],\n", + " dict(frame=dict(duration=300), mode='immediate')],\n", + " label=str(s['period']),\n", + " method='animate',\n", + " ) for s in snapshots],\n", + " )],\n", + " )\n", + " )\n", + " return fig" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "HEADERS = make_headers(GITHUB_TOKEN)\n", + "pr_records = fetch_pr_data(REPO, HEADERS, max_prs=MAX_PRS)\n", + "\n", + "snapshots = build_snapshots(pr_records, freq=TIME_RESOLUTION)\n", + "pos = compute_layout(snapshots)\n", + "\n", + "# --- Movie ---\n", + "make_animation(snapshots, pos, fps=FPS, output=OUTPUT_FILE)\n", + "\n", + "# Display GIF inline if in Jupyter\n", + "try:\n", + " from IPython.display import Image, display\n", + " if OUTPUT_FILE.endswith('.gif'):\n", + " display(Image(OUTPUT_FILE))\n", + "except ImportError:\n", + " pass\n", + "\n", + "# --- Interactive Plotly ---\n", + "if HAS_PLOTLY:\n", + " fig = make_plotly_animation(snapshots, pos, repo=REPO)\n", + " fig.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file