diff --git a/.gitignore b/.gitignore index ec9d4efcc3..fae43151ef 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,6 @@ wled-update.sh /wled00/Release /wled00/wled00.ino.cpp /wled00/html_*.h + +# Temporary fork statistics results +tempresults.json diff --git a/tools/README_fork_stats.md b/tools/README_fork_stats.md new file mode 100644 index 0000000000..c4453bc0b9 --- /dev/null +++ b/tools/README_fork_stats.md @@ -0,0 +1,201 @@ +# Fork Statistics Analysis Tool + +This tool analyzes GitHub repository forks to provide insights into fork activity and health for the WLED project. + +## Features + +The script analyzes and reports on: + +- **Branch Analysis**: Which forks have branches that do not exist in the main repo +- **Recency Analysis**: Which forks have recent versions of main vs outdated forks +- **Contribution Analysis**: Which fork repos have been the source of PRs into the main repo +- **Activity Detection**: Which forks have active development but haven't contributed PRs +- **Owner Commit Analysis**: Statistics about commits made by fork owners to their own repositories +- **Age Statistics**: Distribution of how far behind forks are (1 month, 3 months, 6 months, 1 year, 2+ years) +- **Incremental Saving**: Automatically saves intermediate results every 10 forks to prevent data loss + +## Requirements + +- Python 3.7+ +- `requests` library (included in WLED requirements.txt) +- GitHub personal access token (recommended for analyzing large numbers of forks) + +## Usage + +### Quick Demo + +To see what the output looks like with sample data: + +```bash +python3 tools/fork_stats.py --demo +``` + +### Basic Analysis (Rate Limited) + +Analyze the first 10 forks without a token (uses GitHub's unauthenticated API with 60 requests/hour limit): + +```bash +python3 tools/fork_stats.py --max-forks 10 +``` + +### Full Analysis with Token + +For comprehensive analysis, create a GitHub personal access token: + +1. Go to GitHub Settings > Developer settings > Personal access tokens > Tokens (classic) +2. Generate a new token with `public_repo` scope +3. 
Set the token as an environment variable: + +```bash +export GITHUB_TOKEN="your_token_here" +python3 tools/fork_stats.py +``` + +Or pass it directly: + +```bash +python3 tools/fork_stats.py --token "your_token_here" +``` + +### Advanced Options + +```bash +# Analyze specific repository +python3 tools/fork_stats.py --repo owner/repo + +# Limit number of forks analyzed +python3 tools/fork_stats.py --max-forks 50 + +# Fast mode: skip detailed analysis of very old forks for better performance +python3 tools/fork_stats.py --fast --max-forks 100 + +# Save detailed JSON results +python3 tools/fork_stats.py --output results.json + +# Check what would be analyzed without making API calls +python3 tools/fork_stats.py --dry-run + +# Different output format +python3 tools/fork_stats.py --format json +``` + +## Output + +### Summary Format (Default) + +The tool provides a human-readable summary including: + +- Repository statistics (total forks, stars, watchers) +- Fork age distribution showing staleness +- Activity analysis showing contribution patterns +- Key insights about fork health + +### JSON Format + +Detailed machine-readable output including: + +- Complete fork metadata for each analyzed fork +- Branch information and unique branches +- Contribution history and activity metrics +- Owner commit statistics for each fork +- Full statistical breakdown +- Intermediate results are automatically saved to `tempresults.json` every 10 forks to prevent data loss on interruption + +### Visualization + +For advanced visualization and analysis of the JSON results, use the companion visualizer tool: + +```bash +# Generate visualizations from collected data +python3 tools/fork_stats_visualizer.py results.json --save-plots + +# Text-only statistics (no graphs) +python3 tools/fork_stats_visualizer.py results.json --no-graphs +``` + +See [README_fork_stats_visualizer.md](README_fork_stats_visualizer.md) for complete documentation. 
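+### Post-processing the JSON Results
+
+The results file can also be post-processed directly. Below is a minimal sketch (not part of the tool itself) that assumes the field names `fork_stats.py` writes: `statistics`, `analyzed_forks`, and per-fork keys such as `full_name` and `owner_commits`. The same approach works for the intermediate `tempresults.json`:
+
+```python
+import json
+
+# Load a completed (or intermediate) results file
+with open("results.json") as f:
+    data = json.load(f)
+
+stats = data.get("statistics", {})
+print("Forks analyzed:", stats.get("total_analyzed", 0))
+
+# Five forks whose owners committed the most to their own copy
+forks = data.get("analyzed_forks", [])
+for fork in sorted(forks, key=lambda fk: fk.get("owner_commits", 0), reverse=True)[:5]:
+    print(f"{fork.get('full_name')}: {fork.get('owner_commits')} commits")
+```
+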
+
+## Performance Considerations
+
+### Execution Speed
+- **Without Token**: 60 requests/hour (very slow, only suitable for testing)
+- **With Token**: 5000 requests/hour (much faster, recommended for real analysis)
+- **Each fork requires 3-8 API requests** depending on the fork's complexity
+
+### Fast Mode
+Use the `--fast` flag to improve performance:
+- Skips detailed analysis of forks inactive for 3+ years
+- Reduces API calls for very old forks by ~80%
+- Keeps the age distribution accurate; branch, PR, and owner-commit details are recorded as zero for skipped forks
+- Recommended for initial analysis or large repository scans
+
+### Progress Tracking
+The tool provides detailed progress information including:
+- Current fork being analyzed
+- Time taken per fork analysis
+- API requests made and remaining rate limit
+- Estimated completion time
+
+## Example Output
+
+```
+============================================================
+FORK ANALYSIS SUMMARY FOR wled/WLED
+============================================================
+
+Repository Details:
+ - Total Forks: 1,243
+ - Analyzed: 100
+ - Stars: 15,500
+ - Watchers: 326
+
+Fork Age Distribution:
+ - Last updated ≤ 1 month:     8 (  8.0%)
+ - Last updated ≤ 3 months:   12 ( 12.0%)
+ - Last updated ≤ 6 months:   15 ( 15.0%)
+ - Last updated ≤ 1 year:     23 ( 23.0%)
+ - Last updated ≤ 2 years:    25 ( 25.0%)
+ - Last updated > 2 years:    17 ( 17.0%)
+
+Fork Activity Analysis:
+ - Forks with unique branches:           34 (34.0%)
+ - Forks with recent main branch:        42 (42.0%)
+ - Forks that contributed PRs:           18 (18.0%)
+ - Active forks (no PR contributions):   23 (23.0%)
+
+Owner Commit Analysis:
+ - Forks with owner commits:             67 (67.0%)
+ - Total commits by fork owners:       2845
+ - Average commits per fork:           28.5
+
+Key Insights:
+ - Most forks are significantly behind main branch
+ - Significant number of forks have custom development
+ - Majority of forks show some owner development activity
+```
+
+## Use Cases
+
+- **Project Maintenance**: Identify which forks are actively maintained
+- **Community Engagement**: Find potential contributors who haven't submitted PRs
+- **Code Discovery**: Locate interesting custom features in fork branches
+- **Health Assessment**: Monitor overall ecosystem health of the project
+- **Outreach Planning**: Target active fork maintainers for collaboration
+
+## Implementation Details
+
+The script uses the GitHub REST API v3 and implements:
+
+- Rate limiting with automatic backoff
+- Error handling for private/deleted repositories
+- Efficient pagination for large fork lists
+- Branch comparison algorithms
+- PR attribution analysis
+- Commit recency detection
+
+## Troubleshooting
+
+- **Rate Limit Errors**: Use a GitHub token or reduce `--max-forks`
+- **Permission Errors**: Ensure token has `public_repo` scope
+- **Network Errors**: Check internet connection and GitHub status
+- **Large Repository Timeouts**: Use `--max-forks` to limit analysis scope
\ No newline at end of file
diff --git a/tools/README_fork_stats_visualizer.md b/tools/README_fork_stats_visualizer.md
new file mode 100644
index 0000000000..9ff4690762
--- /dev/null
+++ b/tools/README_fork_stats_visualizer.md
@@ -0,0 +1,240 @@
+# Fork Statistics Visualizer
+
+A Python script that loads JSON data generated by `fork_stats.py` and displays detailed statistics both as formatted text lists and visual graphs.
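+
+The helper functions can also be used programmatically. A minimal sketch, assuming it is run from the `tools/` directory (otherwise add `tools/` to `sys.path` first); it uses the module's own `load_json_data` and `print_age_distribution` functions:
+
+```python
+from fork_stats_visualizer import load_json_data, print_age_distribution
+
+# load_json_data() exits with a clear error message if the file is missing or invalid
+data = load_json_data("results.json")
+
+# Text-mode age table with percentage bars; no matplotlib required
+print_age_distribution(data.get("statistics", {}))
+```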
+
+## Features
+
+- **Text-based Statistics**: Formatted tables with visual bars showing percentages
+- **Interactive Graphs**: Pie charts, bar charts, histograms, and combined dashboards
+- **Top Forks Lists**: Shows top forks by unique branches, owner commits, and activity
+- **Export Capabilities**: Save all visualizations as high-quality PNG images
+- **Works Without Graphics**: Can run in text-only mode without matplotlib
+
+## Installation
+
+### Basic Installation (Text-only)
+The visualizer works without additional dependencies for text-based output:
+```bash
+python3 tools/fork_stats_visualizer.py results.json --no-graphs
+```
+
+### Full Installation (With Graphics)
+For graphical visualizations, install matplotlib:
+```bash
+pip install -r tools/fork_stats_visualizer_requirements.txt
+```
+
+## Usage
+
+### Basic Usage
+Display text statistics and interactive graphs:
+```bash
+python3 tools/fork_stats_visualizer.py results.json
+```
+
+### Text-Only Mode
+Skip graphs and only show text statistics:
+```bash
+python3 tools/fork_stats_visualizer.py results.json --no-graphs
+```
+
+### Save Plots to Files
+Save all plots as PNG files instead of displaying them:
+```bash
+python3 tools/fork_stats_visualizer.py results.json --save-plots
+```
+
+### Custom Output Directory
+Specify where to save the plots:
+```bash
+python3 tools/fork_stats_visualizer.py results.json --save-plots --output-dir ./my_plots
+```
+
+### Show Top N Forks
+Display the top 30 forks instead of the default 20:
+```bash
+python3 tools/fork_stats_visualizer.py results.json --top-n 30
+```
+
+## Output
+
+### Text Statistics
+
+The script displays:
+
+1. **Repository Information**
+   - Repository name, total forks, stars, watchers
+   - Number of forks analyzed
+   - Analysis timestamp
+
+2. **Fork Age Distribution**
+   - Breakdown by age categories (≤1 month, ≤3 months, ≤6 months, ≤1 year, ≤2 years, >2 years)
+   - Count and percentage for each category
+   - Visual bars showing proportions
+
+3. **Fork Activity Analysis**
+   - Forks with unique branches
+   - Forks with recent main branch
+   - Forks that contributed PRs
+   - Active forks without PR contributions
+   - Visual bars for each metric
+
+4. **Owner Commit Analysis**
+   - Number of forks with owner commits
+   - Total commits by all fork owners
+   - Average commits per fork
+
+5. **Top Forks Lists**
+   - Top N forks by unique branches
+   - Top N forks by owner commits
+   - Active forks without PR contributions
+
+### Visual Graphs
+
+When matplotlib is available, the script generates:
+
+1. **Age Distribution Pie Chart**
+   - Shows the distribution of fork ages
+   - Color-coded by recency (green for recent, red for old)
+
+2. **Activity Metrics Bar Chart**
+   - Compares different activity metrics side-by-side
+   - Shows percentages for each metric
+
+3. **Owner Commits Distribution Histogram**
+   - Shows the distribution of owner commits across forks
+   - Includes mean and max statistics
+
+4. **Combined Dashboard**
+   - All-in-one view with multiple charts
+   - Summary statistics panel
+   - Perfect for presentations and reports
+
+## Example Output
+
+### Text Output
+```
+================================================================================
+  REPOSITORY INFORMATION
+================================================================================
+
+Repository: wled/WLED
+Total Forks: 1,243
+Stars: 15,500
+Watchers: 326
+
+Analyzed Forks: 100
+
+================================================================================
+  FORK AGE DISTRIBUTION
+================================================================================
+
+Age Category                    Count   Percentage
+------------------------------------------------------------
+Last updated ≤ 1 month             8     8.0%  ████
+Last updated ≤ 3 months           12    12.0%  ██████
+Last updated ≤ 6 months           15    15.0%  ███████
+Last updated ≤ 1 year             23    23.0%  ███████████
+Last updated ≤ 2 years            25    25.0%  ████████████
+Last updated > 2 years            17    17.0%  ████████
+```
+
+### Saved Files
+When using `--save-plots`, the following files are created:
+- `age_distribution.png` - Pie chart of fork ages
+- `activity_metrics.png` - Bar chart of activity metrics
+- `owner_commits_distribution.png` - Histogram of owner commits
+- `dashboard.png` - Combined dashboard view
+
+## Integration with fork_stats.py
+
+### Complete Workflow
+
+1. **Collect Statistics**
+   ```bash
+   python3 tools/fork_stats.py --output results.json --max-forks 100
+   ```
+
+2. **Visualize Results**
+   ```bash
+   python3 tools/fork_stats_visualizer.py results.json --save-plots
+   ```
+
+3. **View Dashboard**
+   Open `fork_plots/dashboard.png` for a complete overview
+
+### Automated Analysis
+
+Create a bash script to run both tools:
+```bash
+#!/bin/bash
+# analyze_forks.sh
+
+# Set your GitHub token
+export GITHUB_TOKEN="your_token_here"
+
+# Collect statistics
+python3 tools/fork_stats.py \
+    --repo wled/WLED \
+    --max-forks 200 \
+    --fast \
+    --output fork_analysis.json
+
+# Generate visualizations
+python3 tools/fork_stats_visualizer.py \
+    fork_analysis.json \
+    --save-plots \
+    --output-dir ./fork_analysis_plots \
+    --top-n 25
+
+echo "Analysis complete! Check fork_analysis_plots/ for visualizations."
+```
+
+## Troubleshooting
+
+### "matplotlib not installed" Warning
+This is normal if you haven't installed matplotlib. The script will still work in text-only mode. To enable graphs:
+```bash
+pip install -r tools/fork_stats_visualizer_requirements.txt
+```
+
+### "No detailed fork data available"
+The JSON file doesn't contain individual fork details. This happens when:
+- Using demo mode in fork_stats.py
+- The analysis was interrupted before collecting fork data
+- The JSON file is incomplete
+
+The visualizer will still show aggregate statistics from the statistics section.
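+
+To check whether a results file contains per-fork detail, a quick sketch using only the standard library (the `analyzed_forks` and `statistics` keys are the ones `fork_stats.py` writes):
+
+```python
+import json
+
+with open("results.json") as f:
+    data = json.load(f)
+
+# Aggregate sections only need 'statistics'; the top-fork lists need 'analyzed_forks'
+print("Per-fork records:", len(data.get("analyzed_forks", [])))
+print("Aggregate statistics present:", bool(data.get("statistics")))
+```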
+ +### Plots Not Saving +Ensure the output directory is writable: +```bash +mkdir -p ./fork_plots +chmod 755 ./fork_plots +python3 tools/fork_stats_visualizer.py results.json --save-plots --output-dir ./fork_plots +``` + +## Command-Line Options + +| Option | Description | Default | +|--------|-------------|---------| +| `json_file` | Path to JSON file with fork statistics | Required | +| `--save-plots` | Save plots to files instead of displaying | False | +| `--output-dir DIR` | Directory to save plots | `./fork_plots` | +| `--top-n N` | Number of top forks to display | 20 | +| `--no-graphs` | Skip graph generation, text only | False | + +## Dependencies + +### Required +- Python 3.7+ +- json (built-in) +- argparse (built-in) + +### Optional (for graphs) +- matplotlib >= 3.5.0 + +Install optional dependencies: +```bash +pip install -r tools/fork_stats_visualizer_requirements.txt +``` diff --git a/tools/fork_stats.py b/tools/fork_stats.py new file mode 100755 index 0000000000..ebc929b1ea --- /dev/null +++ b/tools/fork_stats.py @@ -0,0 +1,782 @@ +#!/usr/bin/env python3 +""" +Fork Statistics Analysis Tool for WLED Repository + +This script analyzes the forks of the WLED repository to provide insights on: +- Which forks have branches that do not exist in the main repo +- Which forks have a recent version of main (vs outdated forks) +- Which fork repos have been the source of PRs into the main repo +- Which forks have active development but haven't contributed PRs +- Statistics on how far behind forks are (1 month, 3 months, 6 months, 1+ years) + +Usage: + python3 tools/fork_stats.py [--token GITHUB_TOKEN] [--repo owner/repo] [--output OUTPUT_FILE] + +Environment Variables: + GITHUB_TOKEN: GitHub personal access token for API access +""" + +import argparse +import json +import os +import sys +import time +from datetime import datetime, timedelta, timezone +from typing import Dict, List, Optional, Set, Tuple +import requests +from dataclasses import dataclass + + +@dataclass +class ForkInfo: + """Information about a repository fork.""" + name: str + full_name: str + owner: str + html_url: str + updated_at: datetime + pushed_at: datetime + default_branch: str + branches: List[str] + unique_branches: List[str] + behind_main_by_commits: int + behind_main_by_days: int + has_contributed_prs: bool + recent_commits: int + is_active: bool + owner_commits: int + + +class GitHubAPIError(Exception): + """Custom exception for GitHub API errors.""" + pass + + +class ForkStatsAnalyzer: + """Analyzes fork statistics for a GitHub repository.""" + + def __init__(self, token: Optional[str] = None): + self.token = token or os.getenv('GITHUB_TOKEN') + if not self.token: + print("Warning: No GitHub token provided. API rate limits will be severely restricted.") + + self.session = requests.Session() + if self.token: + self.session.headers.update({'Authorization': f'token {self.token}'}) + + self.session.headers.update({ + 'Accept': 'application/vnd.github.v3+json', + 'User-Agent': 'WLED-Fork-Stats-Analyzer/1.0' + }) + + # Rate limiting + self.requests_made = 0 + self.rate_limit_remaining = None + self.rate_limit_reset = None + + def _make_request(self, url: str, params: Optional[Dict] = None) -> Dict: + """Make a GitHub API request with rate limiting.""" + if self.rate_limit_remaining is not None and self.rate_limit_remaining <= 5: + if self.rate_limit_reset: + wait_time = self.rate_limit_reset - time.time() + if wait_time > 0: + print(f"Rate limit low ({self.rate_limit_remaining} remaining). 
Waiting {wait_time:.1f} seconds...") + time.sleep(wait_time + 1) + + try: + response = self.session.get(url, params=params) + + # Update rate limit info + self.rate_limit_remaining = int(response.headers.get('X-RateLimit-Remaining', 0)) + reset_timestamp = response.headers.get('X-RateLimit-Reset') + if reset_timestamp: + self.rate_limit_reset = int(reset_timestamp) + + self.requests_made += 1 + + if self.requests_made % 50 == 0: + print(f"API requests made: {self.requests_made}, remaining: {self.rate_limit_remaining}") + + if response.status_code == 403: + if 'rate limit' in response.text.lower(): + raise GitHubAPIError("Rate limit exceeded") + else: + raise GitHubAPIError("API access forbidden (check token permissions)") + + response.raise_for_status() + return response.json() + + except requests.exceptions.RequestException as e: + raise GitHubAPIError(f"API request failed: {e}") + + def get_repository_info(self, repo: str) -> Dict: + """Get basic repository information.""" + url = f"https://api.github.com/repos/{repo}" + return self._make_request(url) + + def get_forks(self, repo: str, max_forks: Optional[int] = None) -> List[Dict]: + """Get all forks of a repository.""" + forks = [] + page = 1 + per_page = 100 + + while True: + url = f"https://api.github.com/repos/{repo}/forks" + params = {'page': page, 'per_page': per_page, 'sort': 'newest'} + + print(f"Fetching forks page {page}...") + data = self._make_request(url, params) + + if not data: + break + + forks.extend(data) + + if len(data) < per_page: + break + + if max_forks and len(forks) >= max_forks: + forks = forks[:max_forks] + break + + page += 1 + + return forks + + def get_branches(self, repo: str, max_branches: int = 500) -> List[str]: + """Get all branches for a repository (with limits for performance).""" + branches = [] + page = 1 + per_page = 100 + max_pages = max_branches // 100 + 1 + + while page <= max_pages: + url = f"https://api.github.com/repos/{repo}/branches" + params = {'page': page, 'per_page': per_page} + + try: + data = self._make_request(url, params) + if not data: + break + + branches.extend([branch['name'] for branch in data]) + + if len(data) < per_page: + break + + # Limit total branches to avoid excessive API calls + if len(branches) >= max_branches: + branches = branches[:max_branches] + break + + page += 1 + + except GitHubAPIError as e: + if "404" in str(e): # Repository might be empty or deleted + break + raise + + return branches + + def get_pull_requests_from_fork(self, main_repo: str, fork_owner: str) -> List[Dict]: + """Get pull requests created from a specific fork (optimized).""" + prs = [] + page = 1 + per_page = 100 + max_pages = 10 # Limit to first 1000 PRs to avoid excessive API calls + + print(f" Checking PRs from {fork_owner}...") + + while page <= max_pages: + url = f"https://api.github.com/repos/{main_repo}/pulls" + params = { + 'state': 'all', + 'head': f'{fork_owner}:', + 'page': page, + 'per_page': per_page + } + + try: + data = self._make_request(url, params) + if not data: + break + + # Filter PRs that are actually from this fork owner + fork_prs = [pr for pr in data if pr['head']['repo'] and + pr['head']['repo']['owner']['login'] == fork_owner] + prs.extend(fork_prs) + + if len(data) < per_page: + break + + page += 1 + + # Early exit if we found PRs (for performance - we just need to know if any exist) + if fork_prs: + break + + except GitHubAPIError: + break # Some API limitations or permissions issues + + return prs + + def get_commits_by_author(self, repo: str, author: str, 
                              branch: Optional[str] = None, max_commits: int = 500) -> int:
+        """Get number of commits by a specific author (optimized for performance)."""
+        url = f"https://api.github.com/repos/{repo}/commits"
+        params = {
+            'author': author,
+            'per_page': 100
+        }
+        if branch:
+            params['sha'] = branch
+
+        try:
+            commits_count = 0
+            page = 1
+            max_pages = max_commits // 100 + 1  # Limit pages to avoid excessive API calls
+
+            while page <= max_pages:
+                params['page'] = page
+                print(f"    Fetching commits by {author}, page {page}...")
+                response = self.session.get(url, params=params)
+                if response.status_code != 200:
+                    break
+
+                # Update rate limit tracking
+                self.rate_limit_remaining = int(response.headers.get('X-RateLimit-Remaining', 0))
+                reset_timestamp = response.headers.get('X-RateLimit-Reset')
+                if reset_timestamp:
+                    self.rate_limit_reset = int(reset_timestamp)
+                self.requests_made += 1
+
+                data = response.json()
+                if not data:
+                    break
+
+                commits_count += len(data)
+
+                # If we got less than per_page results, we're done
+                if len(data) < 100:
+                    break
+
+                page += 1
+
+                # Early exit if we hit our limit
+                if commits_count >= max_commits:
+                    commits_count = max_commits  # Cap at max to indicate truncation
+                    break
+
+                # Add small delay between requests to be nice to API
+                time.sleep(0.1)
+
+            return commits_count
+
+        except Exception as e:
+            print(f"    Error fetching commits for {author}: {e}")
+            return 0
+
+    def get_commits_since_date(self, repo: str, since_date: datetime, branch: Optional[str] = None) -> int:
+        """Get number of commits since a specific date."""
+        url = f"https://api.github.com/repos/{repo}/commits"
+        params = {
+            'since': since_date.isoformat(),
+            'per_page': 1
+        }
+        if branch:
+            params['sha'] = branch
+
+        try:
+            response = self.session.get(url, params=params)
+            if response.status_code != 200:
+                return 0
+
+            # Get total count from Link header if available
+            link_header = response.headers.get('Link')
+            if link_header and 'rel="last"' in link_header:
+                # Parse the last page number from the Link header
+                import re
+                match = re.search(r'page=(\d+).*rel="last"', link_header)
+                if match:
+                    # With per_page=1, the last page number equals the commit count; cap at 1000
+                    return min(int(match.group(1)), 1000)
+
+            data = response.json()
+            return len(data) if data else 0
+
+        except Exception:
+            return 0
+
+    def compare_repositories(self, base_repo: str, head_repo: str) -> Dict:
+        """Compare two repositories to see how far behind head is from base."""
+        # Cross-fork comparisons use the owner:branch form; assumes both default branches are named 'main'
+        head_owner = head_repo.split('/')[0]
+        url = f"https://api.github.com/repos/{base_repo}/compare/main...{head_owner}:main"
+
+        try:
+            return self._make_request(url)
+        except GitHubAPIError:
+            return {}
+
+    def analyze_fork(self, fork: Dict, main_repo: str, main_branches: Set[str]) -> ForkInfo:
+        """Analyze a single fork and return detailed information."""
+        fork_name = fork['full_name']
+        fork_owner = fork['owner']['login']
+
+        start_time = time.time()
+        print(f"Analyzing fork: {fork_name}")
+
+        # Get fork branches
+        try:
+            print(f"  Fetching branches...")
+            fork_branches = self.get_branches(fork_name)
+            print(f"  Found {len(fork_branches)} branches")
+        except GitHubAPIError as e:
+            print(f"  Error fetching branches: {e}")
+            fork_branches = []
+
+        # Find unique branches (branches in fork but not in main repo)
+        unique_branches = [branch for branch in fork_branches if branch not in main_branches]
+        if unique_branches:
+            print(f"  Found {len(unique_branches)} unique branches: {unique_branches[:5]}")
+
+        # Check if fork has contributed PRs
+        prs_from_fork = self.get_pull_requests_from_fork(main_repo, fork_owner)
+        has_contributed = len(prs_from_fork) > 0
+        if
has_contributed: + print(f" Found {len(prs_from_fork)} PRs from this fork") + + # Compare with main repository + print(f" Comparing with main repository...") + comparison = self.compare_repositories(main_repo, fork_name) + behind_commits = comparison.get('behind_by', 0) + + # Calculate days behind based on last push + pushed_at = datetime.fromisoformat(fork['pushed_at'].replace('Z', '+00:00')) + now = datetime.now(timezone.utc) + days_behind = (now - pushed_at).days + print(f" Last pushed {days_behind} days ago") + + # Check for recent activity + print(f" Checking recent activity...") + thirty_days_ago = now - timedelta(days=30) + recent_commits = self.get_commits_since_date(fork_name, thirty_days_ago) + is_active = recent_commits > 0 or days_behind < 30 + + # Get commits by fork owner + print(f" Analyzing owner commits...") + owner_commits = self.get_commits_by_author(fork_name, fork_owner, max_commits=200) + if owner_commits >= 200: + print(f" Owner has 200+ commits (truncated for performance)") + else: + print(f" Owner has {owner_commits} commits") + + elapsed_time = time.time() - start_time + print(f" Analysis completed in {elapsed_time:.1f} seconds") + + return ForkInfo( + name=fork['name'], + full_name=fork_name, + owner=fork_owner, + html_url=fork['html_url'], + updated_at=datetime.fromisoformat(fork['updated_at'].replace('Z', '+00:00')), + pushed_at=pushed_at, + default_branch=fork['default_branch'], + branches=fork_branches, + unique_branches=unique_branches, + behind_main_by_commits=behind_commits, + behind_main_by_days=days_behind, + has_contributed_prs=has_contributed, + recent_commits=recent_commits, + is_active=is_active, + owner_commits=owner_commits + ) + + def analyze_repository_forks(self, repo: str, max_forks: Optional[int] = None, fast_mode: bool = False) -> Dict: + """Main analysis function for repository forks.""" + start_time = time.time() + print(f"Starting fork analysis for {repo}") + if fast_mode: + print("Fast mode enabled: Skipping detailed analysis of forks inactive for 3+ years") + + # Get main repository info + main_repo_info = self.get_repository_info(repo) + print(f"Repository: {main_repo_info['full_name']}") + print(f"Forks count: {main_repo_info['forks_count']}") + + # Get main repository branches + print("Fetching main repository branches...") + main_branches = set(self.get_branches(repo)) + print(f"Main repository has {len(main_branches)} branches") + + # Get all forks + forks = self.get_forks(repo, max_forks) + print(f"Found {len(forks)} forks to analyze") + print(f"Estimated API requests needed: {len(forks) * (3 if fast_mode else 8)}") + if self.rate_limit_remaining: + print(f"Current rate limit: {self.rate_limit_remaining} requests remaining") + + if not forks: + return { + 'main_repo': main_repo_info, + 'total_forks': 0, + 'analyzed_forks': [], + 'statistics': {} + } + + # Analyze each fork + analyzed_forks = [] + temp_results_file = "tempresults.json" + + for i, fork in enumerate(forks, 1): + try: + print(f"Progress: {i}/{len(forks)} - {fork['full_name']}") + + # Skip very old forks to improve performance (if fast mode enabled) + if fast_mode: + pushed_at = datetime.fromisoformat(fork['pushed_at'].replace('Z', '+00:00')) + days_since_push = (datetime.now(timezone.utc) - pushed_at).days + + if days_since_push > 1095: # Skip forks not updated in 3+ years + print(f" Skipping fork (not updated in {days_since_push} days)") + # Create minimal fork info for very old forks + fork_info = ForkInfo( + name=fork['name'], + full_name=fork['full_name'], + 
owner=fork['owner']['login'], + html_url=fork['html_url'], + updated_at=datetime.fromisoformat(fork['updated_at'].replace('Z', '+00:00')), + pushed_at=pushed_at, + default_branch=fork['default_branch'], + branches=[], + unique_branches=[], + behind_main_by_commits=0, + behind_main_by_days=days_since_push, + has_contributed_prs=False, + recent_commits=0, + is_active=False, + owner_commits=0 + ) + analyzed_forks.append(fork_info) + continue + + fork_info = self.analyze_fork(fork, repo, main_branches) + analyzed_forks.append(fork_info) + except Exception as e: + print(f"Error analyzing fork {fork['full_name']}: {e}") + continue + + # Save intermediate results every 10 forks + if i % 10 == 0: + print(f"Saving intermediate results to {temp_results_file}...") + temp_results = { + 'main_repo': main_repo_info, + 'total_forks': len(forks), + 'analyzed_so_far': i, + 'analyzed_forks': [ + { + 'name': fork.name, + 'full_name': fork.full_name, + 'owner': fork.owner, + 'html_url': fork.html_url, + 'updated_at': fork.updated_at.isoformat(), + 'pushed_at': fork.pushed_at.isoformat(), + 'default_branch': fork.default_branch, + 'branches': fork.branches, + 'unique_branches': fork.unique_branches, + 'behind_main_by_commits': fork.behind_main_by_commits, + 'behind_main_by_days': fork.behind_main_by_days, + 'has_contributed_prs': fork.has_contributed_prs, + 'recent_commits': fork.recent_commits, + 'is_active': fork.is_active, + 'owner_commits': fork.owner_commits + } + for fork in analyzed_forks + ], + 'statistics': self._calculate_statistics(analyzed_forks), + 'analysis_timestamp': datetime.now(timezone.utc).isoformat() + } + + try: + with open(temp_results_file, 'w') as f: + json.dump(temp_results, f, indent=2) + except Exception as save_error: + print(f"Warning: Failed to save intermediate results: {save_error}") + + # Be nice to the API + time.sleep(1) + + # Calculate statistics + statistics = self._calculate_statistics(analyzed_forks) + + # Clean up temporary results file on successful completion + temp_results_file = "tempresults.json" + if os.path.exists(temp_results_file): + try: + os.remove(temp_results_file) + print(f"Cleaned up temporary file: {temp_results_file}") + except Exception as e: + print(f"Warning: Could not remove temporary file {temp_results_file}: {e}") + + return { + 'main_repo': main_repo_info, + 'total_forks': len(forks), + 'analyzed_forks': analyzed_forks, + 'statistics': statistics, + 'analysis_timestamp': datetime.now(timezone.utc).isoformat() + } + + def _calculate_statistics(self, forks: List[ForkInfo]) -> Dict: + """Calculate summary statistics from analyzed forks.""" + if not forks: + return {} + + total_forks = len(forks) + + # Categorize by age + now = datetime.now(timezone.utc) + age_categories = { + '1_month': 0, + '3_months': 0, + '6_months': 0, + '1_year': 0, + '2_years': 0, + '5_plus_years': 0 + } + + for fork in forks: + days_old = (now - fork.pushed_at).days + if days_old <= 30: + age_categories['1_month'] += 1 + elif days_old <= 90: + age_categories['3_months'] += 1 + elif days_old <= 180: + age_categories['6_months'] += 1 + elif days_old <= 365: + age_categories['1_year'] += 1 + elif days_old <= 730: + age_categories['2_years'] += 1 + else: + age_categories['5_plus_years'] += 1 + + # Other statistics + forks_with_unique_branches = len([f for f in forks if f.unique_branches]) + forks_with_recent_main = len([f for f in forks if f.behind_main_by_days <= 365]) + forks_with_contributed_prs = len([f for f in forks if f.has_contributed_prs]) + active_non_contributing = 
len([f for f in forks if f.is_active and not f.has_contributed_prs]) + + # Owner commit statistics + forks_with_owner_commits = len([f for f in forks if f.owner_commits > 0]) + total_owner_commits = sum(f.owner_commits for f in forks) + avg_owner_commits = total_owner_commits / total_forks if total_forks > 0 else 0 + + return { + 'total_analyzed': total_forks, + 'age_distribution': age_categories, + 'forks_with_unique_branches': forks_with_unique_branches, + 'forks_with_recent_main': forks_with_recent_main, + 'forks_that_contributed_prs': forks_with_contributed_prs, + 'active_non_contributing_forks': active_non_contributing, + 'forks_with_owner_commits': forks_with_owner_commits, + 'total_owner_commits': total_owner_commits, + 'avg_owner_commits_per_fork': round(avg_owner_commits, 1), + 'percentage_with_unique_branches': (forks_with_unique_branches / total_forks) * 100, + 'percentage_with_recent_main': (forks_with_recent_main / total_forks) * 100, + 'percentage_contributed_prs': (forks_with_contributed_prs / total_forks) * 100, + 'percentage_active_non_contributing': (active_non_contributing / total_forks) * 100, + 'percentage_with_owner_commits': (forks_with_owner_commits / total_forks) * 100 + } + + +def main(): + """Main entry point.""" + parser = argparse.ArgumentParser(description='Analyze GitHub repository fork statistics') + parser.add_argument('--token', help='GitHub personal access token') + parser.add_argument('--repo', default='wled/WLED', help='Repository in owner/repo format') + parser.add_argument('--output', help='Output file for JSON results') + parser.add_argument('--max-forks', type=int, help='Maximum number of forks to analyze') + parser.add_argument('--format', choices=['json', 'summary'], default='summary', + help='Output format') + parser.add_argument('--demo', action='store_true', + help='Run with sample data for demonstration (no API calls)') + parser.add_argument('--dry-run', action='store_true', + help='Show what would be analyzed without making API calls') + parser.add_argument('--fast', action='store_true', + help='Fast mode: skip detailed analysis of very old forks (3+ years) for better performance') + + args = parser.parse_args() + + # Validate repository format + if '/' not in args.repo: + print("Error: Repository must be in 'owner/repo' format") + sys.exit(1) + + # Create analyzer + analyzer = ForkStatsAnalyzer(args.token) + + if args.demo: + # Create sample data for demonstration + print("DEMO MODE: Using sample data for demonstration") + print("This shows what the output would look like for WLED repository analysis\n") + + sample_results = { + 'main_repo': { + 'full_name': 'wled/WLED', + 'forks_count': 1243, + 'stargazers_count': 15500, + 'watchers_count': 326 + }, + 'total_forks': 100, # Sample size + 'analyzed_forks': [], # Not needed for summary + 'statistics': { + 'total_analyzed': 100, + 'age_distribution': { + '1_month': 8, + '3_months': 12, + '6_months': 15, + '1_year': 23, + '2_years': 25, + '5_plus_years': 17 + }, + 'forks_with_unique_branches': 34, + 'forks_with_recent_main': 42, + 'forks_that_contributed_prs': 18, + 'active_non_contributing_forks': 23, + 'forks_with_owner_commits': 67, + 'total_owner_commits': 2845, + 'avg_owner_commits_per_fork': 28.5, + 'percentage_with_unique_branches': 34.0, + 'percentage_with_recent_main': 42.0, + 'percentage_contributed_prs': 18.0, + 'percentage_active_non_contributing': 23.0, + 'percentage_with_owner_commits': 67.0 + } + } + + if args.output: + # Save sample results to JSON for demo + with 
open(args.output, 'w') as f: + json.dump(sample_results, f, indent=2) + print(f"Sample JSON results saved to {args.output}") + + if args.format == 'summary' or not args.output: + print_summary(sample_results) + return + + if args.dry_run: + try: + # Just get basic repository info for dry run + print(f"DRY RUN: Analyzing repository {args.repo}") + repo_info = analyzer.get_repository_info(args.repo) + print(f"Repository: {repo_info['full_name']}") + print(f"Total forks: {repo_info['forks_count']:,}") + + forks_to_analyze = args.max_forks or min(repo_info['forks_count'], 100) + print(f"Would analyze: {forks_to_analyze} forks") + print(f"Estimated API requests: {forks_to_analyze * 3 + 10}") + print(f"Rate limit status: {analyzer.rate_limit_remaining or 'Unknown'} requests remaining") + + if not analyzer.token: + print("WARNING: No token provided. Rate limit is 60 requests/hour for unauthenticated requests.") + if repo_info['forks_count'] > 20: + print("Consider using a GitHub token for analyzing larger repositories.") + else: + print("GitHub token provided. Rate limit is 5000 requests/hour.") + + return + except GitHubAPIError as e: + print(f"Error accessing repository: {e}") + sys.exit(1) + + try: + # Run analysis + results = analyzer.analyze_repository_forks(args.repo, args.max_forks, args.fast) + + if args.output: + # Save detailed results as JSON + with open(args.output, 'w') as f: + # Convert ForkInfo objects to dicts for JSON serialization + serializable_results = results.copy() + serializable_results['analyzed_forks'] = [ + { + 'name': fork.name, + 'full_name': fork.full_name, + 'owner': fork.owner, + 'html_url': fork.html_url, + 'updated_at': fork.updated_at.isoformat(), + 'pushed_at': fork.pushed_at.isoformat(), + 'default_branch': fork.default_branch, + 'branches': fork.branches, + 'unique_branches': fork.unique_branches, + 'behind_main_by_commits': fork.behind_main_by_commits, + 'behind_main_by_days': fork.behind_main_by_days, + 'has_contributed_prs': fork.has_contributed_prs, + 'recent_commits': fork.recent_commits, + 'is_active': fork.is_active, + 'owner_commits': fork.owner_commits + } + for fork in results['analyzed_forks'] + ] + json.dump(serializable_results, f, indent=2) + print(f"\nDetailed results saved to {args.output}") + + # Print summary + if args.format == 'summary' or not args.output: + print_summary(results) + + except KeyboardInterrupt: + print("\nAnalysis interrupted by user") + sys.exit(1) + except Exception as e: + print(f"Error: {e}") + sys.exit(1) + + +def print_summary(results: Dict): + """Print a human-readable summary of the analysis.""" + stats = results['statistics'] + main_repo = results['main_repo'] + + print("\n" + "="*60) + print(f"FORK ANALYSIS SUMMARY FOR {main_repo['full_name']}") + print("="*60) + + print(f"\nRepository Details:") + print(f" - Total Forks: {main_repo['forks_count']:,}") + print(f" - Analyzed: {stats.get('total_analyzed', 0):,}") + print(f" - Stars: {main_repo['stargazers_count']:,}") + print(f" - Watchers: {main_repo['watchers_count']:,}") + + if 'age_distribution' in stats: + print(f"\nFork Age Distribution:") + age_dist = stats['age_distribution'] + total = stats['total_analyzed'] + print(f" - Last updated ≤ 1 month: {age_dist['1_month']:4d} ({age_dist['1_month']/total*100:5.1f}%)") + print(f" - Last updated ≤ 3 months: {age_dist['3_months']:4d} ({age_dist['3_months']/total*100:5.1f}%)") + print(f" - Last updated ≤ 6 months: {age_dist['6_months']:4d} ({age_dist['6_months']/total*100:5.1f}%)") + print(f" - Last updated ≤ 1 year: 
{age_dist['1_year']:4d} ({age_dist['1_year']/total*100:5.1f}%)")
+        print(f" - Last updated ≤ 2 years:  {age_dist['2_years']:4d} ({age_dist['2_years']/total*100:5.1f}%)")
+        # The '5_plus_years' bucket holds everything older than 2 years (see _calculate_statistics)
+        print(f" - Last updated > 2 years:  {age_dist['5_plus_years']:4d} ({age_dist['5_plus_years']/total*100:5.1f}%)")
+
+    print(f"\nFork Activity Analysis:")
+    print(f" - Forks with unique branches:          {stats.get('forks_with_unique_branches', 0):4d} ({stats.get('percentage_with_unique_branches', 0):.1f}%)")
+    print(f" - Forks with recent main branch:       {stats.get('forks_with_recent_main', 0):4d} ({stats.get('percentage_with_recent_main', 0):.1f}%)")
+    print(f" - Forks that contributed PRs:          {stats.get('forks_that_contributed_prs', 0):4d} ({stats.get('percentage_contributed_prs', 0):.1f}%)")
+    print(f" - Active forks (no PR contributions):  {stats.get('active_non_contributing_forks', 0):4d} ({stats.get('percentage_active_non_contributing', 0):.1f}%)")
+
+    print(f"\nOwner Commit Analysis:")
+    print(f" - Forks with owner commits:            {stats.get('forks_with_owner_commits', 0):4d} ({stats.get('percentage_with_owner_commits', 0):.1f}%)")
+    print(f" - Total commits by fork owners:        {stats.get('total_owner_commits', 0):4d}")
+    print(f" - Average commits per fork:            {stats.get('avg_owner_commits_per_fork', 0):4.1f}")
+
+    print(f"\nKey Insights:")
+    if stats.get('percentage_with_recent_main', 0) < 50:
+        print(f" - Most forks are significantly behind main branch")
+    if stats.get('percentage_contributed_prs', 0) < 10:
+        print(f" - Very few forks have contributed back to main repository")
+    if stats.get('percentage_with_unique_branches', 0) > 20:
+        print(f" - Significant number of forks have custom development")
+    if stats.get('percentage_with_owner_commits', 0) > 60:
+        print(f" - Majority of forks show some owner development activity")
+
+    print("\n" + "="*60)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/tools/fork_stats_visualizer.py b/tools/fork_stats_visualizer.py
new file mode 100755
index 0000000000..0a608349f4
--- /dev/null
+++ b/tools/fork_stats_visualizer.py
@@ -0,0 +1,466 @@
+#!/usr/bin/env python3
+"""
+Fork Statistics Visualizer
+
+This script loads JSON data generated by fork_stats.py and displays
+detailed statistics both as formatted lists and visual graphs.
+
+Usage:
+    python3 tools/fork_stats_visualizer.py results.json
+    python3 tools/fork_stats_visualizer.py results.json --save-plots
+    python3 tools/fork_stats_visualizer.py results.json --output-dir ./plots
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import Dict, List
+from datetime import datetime
+
+# Try to import matplotlib, but allow running without it
+try:
+    import matplotlib.pyplot as plt
+    import matplotlib.patches as mpatches
+    HAS_MATPLOTLIB = True
+except ImportError:
+    HAS_MATPLOTLIB = False
+    print("Warning: matplotlib not installed. Graphical visualizations will be disabled.")
+    print("Install with: pip install -r tools/fork_stats_visualizer_requirements.txt")
+    print()
+
+def load_json_data(filepath: str) -> Dict:
+    """Load the JSON data file."""
+    try:
+        with open(filepath, 'r') as f:
+            return json.load(f)
+    except FileNotFoundError:
+        print(f"Error: File '{filepath}' not found.")
+        sys.exit(1)
+    except json.JSONDecodeError as e:
+        print(f"Error: Invalid JSON in '{filepath}': {e}")
+        sys.exit(1)
+
+def print_section_header(title: str):
+    """Print a formatted section header."""
+    print("\n" + "=" * 80)
+    print(f"  {title}")
+    print("=" * 80)
+
+def print_repository_info(data: Dict):
+    """Print repository information."""
+    print_section_header("REPOSITORY INFORMATION")
+
+    main_repo = data.get('main_repo', {})
+    print(f"\nRepository: {main_repo.get('full_name', 'Unknown')}")
+    print(f"Total Forks: {main_repo.get('forks_count', 0):,}")
+    print(f"Stars: {main_repo.get('stargazers_count', 0):,}")
+    print(f"Watchers: {main_repo.get('watchers_count', 0):,}")
+    print(f"\nAnalyzed Forks: {data.get('total_forks', 0)}")
+
+    if 'analysis_timestamp' in data:
+        timestamp = data['analysis_timestamp']
+        try:
+            dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
+            print(f"Analysis Date: {dt.strftime('%Y-%m-%d %H:%M:%S UTC')}")
+        except (ValueError, AttributeError):
+            print(f"Analysis Date: {timestamp}")
+
+def print_age_distribution(stats: Dict):
+    """Print fork age distribution statistics."""
+    print_section_header("FORK AGE DISTRIBUTION")
+
+    age_dist = stats.get('age_distribution', {})
+    total = stats.get('total_analyzed', 1)
+
+    age_labels = {
+        '1_month': 'Last updated ≤ 1 month',
+        '3_months': 'Last updated ≤ 3 months',
+        '6_months': 'Last updated ≤ 6 months',
+        '1_year': 'Last updated ≤ 1 year',
+        '2_years': 'Last updated ≤ 2 years',
+        # key name is historical; this bucket holds everything older than 2 years
+        '5_plus_years': 'Last updated > 2 years'
+    }
+
+    print("\nAge Category                    Count   Percentage")
+    print("-" * 60)
+    for key, label in age_labels.items():
+        count = age_dist.get(key, 0)
+        pct = (count / total * 100) if total > 0 else 0
+        bar = '█' * int(pct / 2)  # Visual bar
+        print(f"{label:30} {count:5d}   {pct:5.1f}%  {bar}")
+
+def print_activity_analysis(stats: Dict):
+    """Print fork activity analysis."""
+    print_section_header("FORK ACTIVITY ANALYSIS")
+
+    total = stats.get('total_analyzed', 1)
+
+    metrics = [
+        ('Forks with unique branches', stats.get('forks_with_unique_branches', 0),
+         stats.get('percentage_with_unique_branches', 0)),
+        ('Forks with recent main branch', stats.get('forks_with_recent_main', 0),
+         stats.get('percentage_with_recent_main', 0)),
+        ('Forks that contributed PRs', stats.get('forks_that_contributed_prs', 0),
+         stats.get('percentage_contributed_prs', 0)),
+        ('Active forks (no PR contributions)', stats.get('active_non_contributing_forks', 0),
+         stats.get('percentage_active_non_contributing', 0)),
+    ]
+
+    print("\nMetric                              Count   Percentage")
+    print("-" * 65)
+    for label, count, pct in metrics:
+        bar = '█' * int(pct / 2)  # Visual bar
+        print(f"{label:35} {count:5d}   {pct:5.1f}%  {bar}")
+
+def print_owner_commits(stats: Dict):
+    """Print owner commit statistics."""
+    print_section_header("OWNER COMMIT ANALYSIS")
+
+    total_forks = stats.get('total_analyzed', 0)
+    forks_with_commits = stats.get('forks_with_owner_commits', 0)
+    total_commits = stats.get('total_owner_commits', 0)
+    avg_commits = stats.get('avg_owner_commits_per_fork', 0)
+    pct_with_commits = stats.get('percentage_with_owner_commits', 0)
+
+    print(f"\nForks with owner commits:     {forks_with_commits:5d} / {total_forks:5d} ({pct_with_commits:.1f}%)")
+    print(f"Total commits by fork owners: {total_commits:5d}")
+    print(f"Average commits per fork:     {avg_commits:5.1f}")
+
+def print_top_forks(data: Dict, n: int = 20):
+    """Print top forks by various metrics."""
+    analyzed_forks = data.get('analyzed_forks', [])
+
+    if not analyzed_forks:
+        print("\nNo detailed fork data available in JSON file.")
+        return
+
+    # Top forks by unique branches
+    print_section_header(f"TOP {n} FORKS BY UNIQUE BRANCHES")
+    forks_by_branches = sorted(analyzed_forks,
+                               key=lambda x: len(x.get('unique_branches', [])),
+                               reverse=True)[:n]
+
+    print(f"\n{'Rank':<6} {'Fork':<45} {'Unique Branches':<20} {'Owner Commits'}")
+    print("-" * 90)
+    for i, fork in enumerate(forks_by_branches, 1):
+        unique_count = len(fork.get('unique_branches', []))
+        owner_commits = fork.get('owner_commits', 0)
+        print(f"{i:<6} {fork.get('full_name', 'Unknown'):<45} {unique_count:<20} {owner_commits}")
+
+    # Top forks by owner commits
+    print_section_header(f"TOP {n} FORKS BY OWNER COMMITS")
+    forks_by_commits = sorted(analyzed_forks,
+                              key=lambda x: x.get('owner_commits', 0),
+                              reverse=True)[:n]
+
+    print(f"\n{'Rank':<6} {'Fork':<45} {'Owner Commits':<20} {'Active'}")
+    print("-" * 90)
+    for i, fork in enumerate(forks_by_commits, 1):
+        commits = fork.get('owner_commits', 0)
+        is_active = "Yes" if fork.get('is_active', False) else "No"
+        print(f"{i:<6} {fork.get('full_name', 'Unknown'):<45} {commits:<20} {is_active}")
+
+    # Active forks that haven't contributed
+    print_section_header("ACTIVE FORKS WITHOUT PR CONTRIBUTIONS")
+    active_no_pr = [f for f in analyzed_forks
+                    if f.get('is_active', False) and not f.get('has_contributed_prs', False)][:n]
+
+    if active_no_pr:
+        print(f"\n{'Fork':<45} {'Recent Commits':<20} {'Owner Commits':<20} {'Days Behind'}")
+        print("-" * 110)
+        for fork in active_no_pr:
+            recent = fork.get('recent_commits', 0)
+            owner = fork.get('owner_commits', 0)
+            days = fork.get('behind_main_by_days', 0)
+            print(f"{fork.get('full_name', 'Unknown'):<45} {recent:<20} {owner:<20} {days}")
+    else:
+        print("\nNo active forks without PR contributions found.")
+
+def create_age_distribution_chart(stats: Dict, save_path: str = None):
+    """Create a pie chart for fork age distribution."""
+    if not HAS_MATPLOTLIB:
+        print("Skipping age distribution chart (matplotlib not available)")
+        return
+
+    age_dist = stats.get('age_distribution', {})
+
+    labels = ['≤ 1 month', '≤ 3 months', '≤ 6 months', '≤ 1 year', '≤ 2 years', '> 2 years']
+    sizes = [
+        age_dist.get('1_month', 0),
+        age_dist.get('3_months', 0),
+        age_dist.get('6_months', 0),
+        age_dist.get('1_year', 0),
+        age_dist.get('2_years', 0),
+        age_dist.get('5_plus_years', 0)
+    ]
+
+    colors = ['#2ecc71', '#27ae60', '#f39c12', '#e67e22', '#e74c3c', '#95a5a6']
+
+    fig, ax = plt.subplots(figsize=(10, 8))
+    wedges, texts, autotexts = ax.pie(sizes, labels=labels, autopct='%1.1f%%',
+                                      colors=colors, startangle=90)
+
+    ax.set_title('Fork Age Distribution (Last Update)', fontsize=16, fontweight='bold', pad=20)
+
+    # Make percentage text more readable
+    for autotext in autotexts:
+        autotext.set_color('white')
+        autotext.set_fontsize(10)
+        autotext.set_fontweight('bold')
+
+    plt.tight_layout()
+
+    if save_path:
+        plt.savefig(save_path, dpi=300, bbox_inches='tight')
+        print(f"Saved: {save_path}")
+    else:
+        plt.show()
+
+    plt.close()
+
+def create_activity_bar_chart(stats: Dict, save_path: str = None):
+    """Create a bar chart for fork activity metrics."""
+    if not HAS_MATPLOTLIB:
+        print("Skipping activity bar chart (matplotlib not available)")
+        return
+
+    metrics = [
+        'Unique\nBranches',
+        'Recent\nMain',
+        'Contributed\nPRs',
+        'Active\nNo PRs',
+        'Owner\nCommits'
+    ]
+
+    values = [
+        stats.get('percentage_with_unique_branches', 0),
+        stats.get('percentage_with_recent_main', 0),
+        stats.get('percentage_contributed_prs', 0),
+        stats.get('percentage_active_non_contributing', 0),
+        stats.get('percentage_with_owner_commits', 0)
+    ]
+
+    colors = ['#3498db', '#2ecc71', '#9b59b6', '#e67e22', '#e74c3c']
+
+    fig, ax = plt.subplots(figsize=(12, 7))
+    bars = ax.bar(metrics, values, color=colors, alpha=0.8, edgecolor='black', linewidth=1.5)
+
+    ax.set_ylabel('Percentage of Forks (%)', fontsize=12, fontweight='bold')
+    ax.set_title('Fork Activity Metrics', fontsize=16, fontweight='bold', pad=20)
+    ax.set_ylim(0, 100)
+    ax.grid(axis='y', alpha=0.3, linestyle='--')
+
+    # Add value labels on bars
+    for bar in bars:
+        height = bar.get_height()
+        ax.text(bar.get_x() + bar.get_width()/2., height + 1,
+                f'{height:.1f}%', ha='center', va='bottom', fontweight='bold')
+
+    plt.tight_layout()
+
+    if save_path:
+        plt.savefig(save_path, dpi=300, bbox_inches='tight')
+        print(f"Saved: {save_path}")
+    else:
+        plt.show()
+
+    plt.close()
+
+def create_owner_commits_distribution(data: Dict, save_path: str = None):
+    """Create a histogram of owner commits distribution."""
+    if not HAS_MATPLOTLIB:
+        print("Skipping owner commits distribution chart (matplotlib not available)")
+        return
+
+    analyzed_forks = data.get('analyzed_forks', [])
+
+    if not analyzed_forks:
+        print("No detailed fork data for commits distribution chart.")
+        return
+
+    commits = [fork.get('owner_commits', 0) for fork in analyzed_forks if fork.get('owner_commits', 0) > 0]
+
+    if not commits:
+        print("No owner commits data available.")
+        return
+
+    fig, ax = plt.subplots(figsize=(12, 7))
+
+    # Create histogram with bins
+    n, bins, patches = ax.hist(commits, bins=20, color='#3498db', alpha=0.7, edgecolor='black')
+
+    # Color code the bins
+    cm = plt.cm.RdYlGn_r
+    bin_centers = 0.5 * (bins[:-1] + bins[1:])
+    col = bin_centers - min(bin_centers)
+    col /= max(col)
+
+    for c, p in zip(col, patches):
+        plt.setp(p, 'facecolor', cm(c))
+
+    ax.set_xlabel('Number of Owner Commits', fontsize=12, fontweight='bold')
+    ax.set_ylabel('Number of Forks', fontsize=12, fontweight='bold')
+    ax.set_title('Distribution of Owner Commits Across Forks', fontsize=16, fontweight='bold', pad=20)
+    ax.grid(axis='y', alpha=0.3, linestyle='--')
+
+    # Add statistics text
+    stats_text = f'Total Forks: {len(commits)}\nMean: {sum(commits)/len(commits):.1f}\nMax: {max(commits)}'
+    ax.text(0.95, 0.95, stats_text, transform=ax.transAxes,
+            verticalalignment='top', horizontalalignment='right',
+            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
+            fontsize=10, fontweight='bold')
+
+    plt.tight_layout()
+
+    if save_path:
+        plt.savefig(save_path, dpi=300, bbox_inches='tight')
+        print(f"Saved: {save_path}")
+    else:
+        plt.show()
+
+    plt.close()
+
+def create_combined_dashboard(stats: Dict, data: Dict, save_path: str = None):
+    """Create a combined dashboard with multiple charts."""
+    if not HAS_MATPLOTLIB:
+        print("Skipping combined dashboard (matplotlib not available)")
+        return
+
+    fig = plt.figure(figsize=(16, 10))
+
+    # Age distribution pie chart
+    ax1 = plt.subplot(2, 2, 1)
+    age_dist = stats.get('age_distribution', {})
+    labels = ['≤1mo', '≤3mo', '≤6mo', '≤1yr', '≤2yr', '>2yr']
+    sizes = [
+        age_dist.get('1_month', 0),
+        age_dist.get('3_months', 0),
+        age_dist.get('6_months', 0),
+        age_dist.get('1_year', 0),
+        age_dist.get('2_years', 0),
+        age_dist.get('5_plus_years', 0)
+    ]
+    colors = ['#2ecc71', '#27ae60', '#f39c12', '#e67e22', '#e74c3c', '#95a5a6']
+    ax1.pie(sizes, labels=labels, autopct='%1.1f%%', colors=colors, startangle=90)
+    ax1.set_title('Fork Age Distribution', fontweight='bold')
+
+    # Activity metrics bar chart
+    ax2 = plt.subplot(2, 2, 2)
+    metrics = ['Unique\nBranches', 'Recent\nMain', 'PRs', 'Active\nNo PRs', 'Owner\nCommits']
+    values = [
+        stats.get('percentage_with_unique_branches', 0),
+        stats.get('percentage_with_recent_main', 0),
+        stats.get('percentage_contributed_prs', 0),
+        stats.get('percentage_active_non_contributing', 0),
+        stats.get('percentage_with_owner_commits', 0)
+    ]
+    colors_bar = ['#3498db', '#2ecc71', '#9b59b6', '#e67e22', '#e74c3c']
+    bars = ax2.bar(metrics, values, color=colors_bar, alpha=0.8)
+    ax2.set_ylabel('Percentage (%)')
+    ax2.set_title('Activity Metrics', fontweight='bold')
+    ax2.set_ylim(0, 100)
+    ax2.grid(axis='y', alpha=0.3)
+
+    # Owner commits histogram
+    ax3 = plt.subplot(2, 2, 3)
+    analyzed_forks = data.get('analyzed_forks', [])
+    commits = [fork.get('owner_commits', 0) for fork in analyzed_forks if fork.get('owner_commits', 0) > 0]
+    if commits:
+        ax3.hist(commits, bins=15, color='#3498db', alpha=0.7, edgecolor='black')
+        ax3.set_xlabel('Owner Commits')
+        ax3.set_ylabel('Frequency')
+        ax3.set_title('Owner Commits Distribution', fontweight='bold')
+        ax3.grid(axis='y', alpha=0.3)
+
+    # Summary statistics
+    ax4 = plt.subplot(2, 2, 4)
+    ax4.axis('off')
+
+    main_repo = data.get('main_repo', {})
+    summary_text = f"""
+    REPOSITORY STATISTICS
+    {'='*35}
+
+    Repository: {main_repo.get('full_name', 'Unknown')}
+    Total Forks: {main_repo.get('forks_count', 0):,}
+    Analyzed: {stats.get('total_analyzed', 0)}
+
+    KEY METRICS:
+    • Unique Branches: {stats.get('forks_with_unique_branches', 0)} ({stats.get('percentage_with_unique_branches', 0):.1f}%)
+    • PR Contributors: {stats.get('forks_that_contributed_prs', 0)} ({stats.get('percentage_contributed_prs', 0):.1f}%)
+    • Owner Commits: {stats.get('total_owner_commits', 0):,}
+    • Avg Commits/Fork: {stats.get('avg_owner_commits_per_fork', 0):.1f}
+
+    INSIGHTS:
+    • Recent Forks: {age_dist.get('1_month', 0) + age_dist.get('3_months', 0)}
+    • Very Old (>2yr): {age_dist.get('5_plus_years', 0)}
+    • Active No PRs: {stats.get('active_non_contributing_forks', 0)}
+    """
+
+    ax4.text(0.1, 0.9, summary_text, transform=ax4.transAxes,
+             verticalalignment='top', fontsize=11, fontfamily='monospace',
+             bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.3))
+
+    fig.suptitle('Fork Statistics Dashboard', fontsize=18, fontweight='bold', y=0.98)
+    plt.tight_layout(rect=[0, 0.03, 1, 0.96])
+
+    if save_path:
+        plt.savefig(save_path, dpi=300, bbox_inches='tight')
+        print(f"Saved: {save_path}")
+    else:
+        plt.show()
+
+    plt.close()
+
+def main():
+    parser = argparse.ArgumentParser(description='Visualize fork statistics data')
+    parser.add_argument('json_file', help='Path to the JSON file with fork statistics')
+    parser.add_argument('--save-plots', action='store_true', help='Save plots to files instead of displaying')
+    parser.add_argument('--output-dir', default='./fork_plots', help='Directory to save plots (default: ./fork_plots)')
+    parser.add_argument('--top-n', type=int, default=20, help='Number of top forks to display (default: 20)')
+    parser.add_argument('--no-graphs', action='store_true', help='Skip graph generation, only show text statistics')
+
+    args = parser.parse_args()
+
+    # Load data
+    data
= load_json_data(args.json_file) + stats = data.get('statistics', {}) + + # Print text statistics + print_repository_info(data) + print_age_distribution(stats) + print_activity_analysis(stats) + print_owner_commits(stats) + print_top_forks(data, args.top_n) + + # Generate graphs if not disabled + if not args.no_graphs: + print_section_header("GENERATING VISUAL CHARTS") + + if args.save_plots: + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + print(f"\nSaving plots to: {output_dir}") + + create_age_distribution_chart(stats, output_dir / "age_distribution.png") + create_activity_bar_chart(stats, output_dir / "activity_metrics.png") + create_owner_commits_distribution(data, output_dir / "owner_commits_distribution.png") + create_combined_dashboard(stats, data, output_dir / "dashboard.png") + + print(f"\nAll plots saved to: {output_dir}") + else: + print("\nDisplaying interactive plots...") + print("Close each plot window to see the next one.") + + create_age_distribution_chart(stats) + create_activity_bar_chart(stats) + create_owner_commits_distribution(data) + create_combined_dashboard(stats, data) + + print_section_header("VISUALIZATION COMPLETE") + print(f"\nData source: {args.json_file}") + print(f"Total forks analyzed: {stats.get('total_analyzed', 0)}") + +if __name__ == '__main__': + main() diff --git a/tools/fork_stats_visualizer_requirements.txt b/tools/fork_stats_visualizer_requirements.txt new file mode 100644 index 0000000000..15cf7ae045 --- /dev/null +++ b/tools/fork_stats_visualizer_requirements.txt @@ -0,0 +1,4 @@ +# Optional requirements for fork statistics visualization tool +# Install with: pip install -r tools/fork_stats_visualizer_requirements.txt + +matplotlib>=3.5.0