diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml deleted file mode 100644 index ab3496c..0000000 --- a/.github/workflows/docs.yml +++ /dev/null @@ -1,106 +0,0 @@ -name: Documentation - -on: - push: - branches: - - main - paths: - - 'src/**' - - 'Cargo.toml' - - '.github/workflows/docs.yml' - workflow_dispatch: - -# Ensure only one deployment runs at a time -concurrency: - group: pages - cancel-in-progress: false - -permissions: - contents: read - pages: write - id-token: write - -jobs: - build: - name: Build Documentation - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Install stable Rust - uses: dtolnay/rust-toolchain@stable - with: - components: rust-docs - - - name: Setup Rust Cache - uses: Swatinem/rust-cache@v2 - with: - cache-all-crates: true - - - name: Build documentation - env: - RUSTDOCFLAGS: >- - --cfg docsrs - -D warnings - --enable-index-page - -Z unstable-options - --extern-html-root-url serde=https://docs.rs/serde/latest/ - --extern-html-root-url serde_json=https://docs.rs/serde_json/latest/ - run: | - cargo +nightly doc \ - --all-features \ - --no-deps \ - --document-private-items \ - --lib - - - name: Add redirect index.html - run: | - cat > target/doc/index.html < - - - - - Redirecting to singularity_language_registry documentation - - -

Redirecting to singularity_language_registry documentation...

- - - EOF - - - name: Add .nojekyll file - run: touch target/doc/.nojekyll - - - name: Setup Pages - uses: actions/configure-pages@v5 - - - name: Upload artifact - uses: actions/upload-pages-artifact@v3 - with: - path: target/doc - - deploy: - name: Deploy to GitHub Pages - needs: build - runs-on: ubuntu-latest - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} - steps: - - name: Deploy to GitHub Pages - id: deployment - uses: actions/deploy-pages@v4 - - - name: Add deployment summary - run: | - echo "## Documentation Deployed" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "Documentation has been successfully deployed to GitHub Pages." >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "**URL**: ${{ steps.deployment.outputs.page_url }}" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "View the documentation: [singularity_language_registry docs](${{ steps.deployment.outputs.page_url }}singularity_language_registry/)" >> $GITHUB_STEP_SUMMARY diff --git a/Cargo.lock b/Cargo.lock index 5c1c98e..a698348 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -95,7 +95,7 @@ dependencies = [ [[package]] name = "singularity-language-registry" -version = "0.1.0" +version = "0.2.0-beta.1" dependencies = [ "anyhow", "lazy_static", diff --git a/Cargo.toml b/Cargo.toml index 3d4fa68..5eaa336 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "singularity-language-registry" -version = "0.1.0" +version = "0.2.0-beta.1" edition = "2021" license-file = "LICENSE" authors = ["Singularity Team"] diff --git a/LINGUIST_INTEGRATION.md b/LINGUIST_INTEGRATION.md new file mode 100644 index 0000000..f65f1c5 --- /dev/null +++ b/LINGUIST_INTEGRATION.md @@ -0,0 +1,261 @@ +# GitHub Linguist Integration + +## Overview + +Singularity's language registry is aligned with [GitHub Linguist](https://github.com/github-linguist/linguist) as the authoritative source for programming language definitions and file 
classification patterns. + +This ensures consistency across tools and prevents fragmentation of language definitions across the ecosystem. + +## Architecture + +``` +GitHub Linguist (Authoritative Source) + ↓ +Renovate (Weekly Updates) + ↓ +Singularity Language Registry + ├─ Language Definitions (Phase 1: DONE) + ├─ File Classification (Phase 2: READY) + └─ Detection Heuristics (Phase 3: PLANNED) + ↓ +All Singularity Engines +``` + +## Current State: Phase 1 - Language Definitions + +### What's Synced +- **`languages.yml`**: Complete list of 500+ programming languages +- **Metadata per language**: Extensions, aliases, MIME types, language type +- **Linguist attributes**: Color codes, documentation references + +### How It Works +```rust +// All language definitions come from Linguist +let registry = LanguageRegistry::new(); + +// Only explicitly marked languages are supported +if lang.supported_in_singularity { + // Analyze this language +} +``` + +### Renovate Integration +- **Schedule**: Weekly check for Linguist updates +- **Label**: `linguist`, `language-registry` +- **Action**: Manual review required before merge +- **Update**: When Linguist releases a new version + +## Phase 2: File Classification (In Progress) + +### Status +- ✅ **FileClassifier module**: Implemented with 5 tests +- ✅ **Synchronization script**: Created (`scripts/sync_linguist_patterns.py`) +- 🔧 **Integration in progress**: Add `sync-linguist` justfile command +- 📋 **Next**: Add to CI workflow + +### What Will Be Added + +#### Vendored Code Detection +Auto-skip third-party dependencies: +``` +- node_modules/ +- vendor/ +- .yarn/ +- Pods/ +- third_party/ +- Carthage/ +``` + +#### Generated File Detection +Skip auto-generated code: +``` +- *.pb.rs (Protobuf) +- *.pb.go (Protobuf) +- *.generated.ts (GraphQL) +- *.designer.cs (Visual Studio) +- *.meta (Unity3D) +``` + +#### Binary File Detection +Skip non-text files: +``` +- *.png, *.jpg, *.gif (Images) +- *.zip, *.tar (Archives) +- *.exe, *.dll 
(Binaries) +- *.pdf, *.docx (Documents) +``` + +### How It Works + +#### Step 1: Manual Synchronization (Current) +When Linguist updates (Renovate alert): +```bash +# Sync patterns from Linguist to Rust code +python3 scripts/sync_linguist_patterns.py > src/file_classifier_generated.rs + +# Run tests to validate patterns +cargo test + +# Commit the generated patterns +git add src/file_classifier_generated.rs +git commit -m "chore(linguist): sync file classification patterns" +``` + +#### Step 2: Automated Synchronization (Future) +```bash +# Automatic sync via justfile +just sync-linguist + +# Or via cargo xtask +cargo xtask sync-linguist +``` + +### Implementation Details + +#### Synchronization Script (`scripts/sync_linguist_patterns.py`) +1. **Downloads from Linguist**: + - `vendor.yml`: Vendored code patterns (6.5KB) + - `generated.rb`: Generated file detection logic (29.8KB) + - `heuristics.yml`: Language detection rules (35KB, Phase 3) + +2. **Parses patterns**: + - YAML parsing for `vendor.yml` + - Ruby AST parsing for `generated.rb` + - Regex extraction and normalization + +3. **Generates Rust code**: + - Static arrays: `VENDORED_PATTERNS_FROM_LINGUIST` + - Static arrays: `GENERATED_PATTERNS_FROM_LINGUIST` + - Static arrays: `BINARY_PATTERNS_FROM_LINGUIST` + +4. 
**Output**: `src/file_classifier_generated.rs` (auto-generated) + +#### FileClassifier Usage +```rust +use singularity_language_registry::FileClassifier; + +let classifier = FileClassifier::new(); + +if classifier.should_analyze(path) { + // Analyze source code +} else { + match classifier.classify(path) { + FileClass::Vendored => skip("third-party"), + FileClass::Generated => skip("auto-generated"), + FileClass::Binary => skip("non-text"), + FileClass::Source => analyze(), + } +} +``` + +### Source Data +- **`vendor.yml`**: Vendored code patterns (6.5KB) + - Dependency manager directories + - IDE/editor artifacts + - Build output directories + - Framework-specific paths + +- **`generated.rb`**: Generated file detection (29.8KB) + - File path patterns + - Extension matching + - Content header signatures (Generated by, DO NOT EDIT) + - Minification detection + - Metadata inspection + +- **`heuristics.yml`**: Language detection rules (Phase 3) + +## Phase 3: Detection Heuristics (Planned) + +### What Will Be Added + +Fallback language detection for ambiguous file extensions: +``` +.pl → Perl or Prolog? (check for 'use strict' vs 'use_module') +.m → Objective-C or Matlab? (check for @interface vs function) +.rs → Rust or RenderScript? (check for 'fn ' vs '#pragma version') +``` + +### Source Data +- **`heuristics.yml`**: Detection rules (35KB) + - Pattern-based disambiguation + - Content signature matching + - Named pattern reuse + +## Governance Model + +### Who Decides What Becomes Supported? 
+ +**Linguist** decides what languages exist: +- Adding languages to Linguist → Auto-detected by Renovate +- Removing languages from Linguist → Flagged in PR for review + +**Singularity** decides what to support: +- Only languages with `supported_in_singularity: true` are analyzed +- Requires explicit approval to add support + +``` +Global Decision (GitHub Linguist) → Local Decision (Singularity) + 500+ languages 24 languages (current) +``` + +## Maintenance + +### Updating When Renovate Creates a PR + +1. **Review the Linguist changes** + - New languages added? + - Existing languages modified? + - File classification patterns updated? + +2. **Update Singularity** (if needed) + - Add/remove language support + - Update file classification + - Update detection heuristics + +3. **Test** + ```bash + cargo test + cargo clippy -- -D warnings + just quality + ``` + +4. **Merge and Release** + ```bash + cargo release + git push + ``` + +## Benefits + +✅ **Single Source of Truth**: No duplicate language definitions +✅ **Forward Compatible**: New languages auto-included (unsupported) +✅ **Automatic Updates**: Weekly Renovate alerts +✅ **Community Standard**: Uses GitHub's official definitions +✅ **Reduced Friction**: Less code to maintain +✅ **Better File Handling**: Skip vendored/generated automatically + +## Future Extensions + +### Additional Linguist Sources +- **MIME Type Mappings**: From `languages.yml` +- **File Extension Aliases**: Conflicting extensions (e.g., `.h` → C/C++/Objective-C) +- **Shebang Patterns**: Detect from `#!` line (e.g., `#!/usr/bin/env python`) +- **EditorConfig Integration**: From Linguist's `.editorconfig` + +### Integration Points +- **singularity-parsing-engine**: Use `FileClassifier` to skip non-source files +- **singularity-analysis-engine**: Use heuristics for ambiguous languages +- **singularity-linting-engine**: Use file classification to focus on code +- **IDE Extensions**: Use language registry for syntax highlighting + +## Resources + 
+- **GitHub Linguist**: +- **Linguist Languages**: +- **Linguist Vendor Patterns**: +- **Linguist Generated Detection**: +- **Linguist Heuristics**: + +## Questions? + +See [build.rs](build.rs) for the implementation roadmap and current progress. diff --git a/build.rs b/build.rs index 138729b..a82c16e 100644 --- a/build.rs +++ b/build.rs @@ -1,4 +1,62 @@ -//! Build script for validating language metadata +//! Build script for validating language metadata and Linguist integration +//! +//! ## Language Registry Source +//! +//! The language registry is derived from GitHub Linguist's authoritative language list: +//! +//! +//! This ensures Singularity language definitions stay consistent with GitHub's standard. +//! Renovate automatically alerts when Linguist updates (weekly schedule). +//! +//! ## Extended Linguist Integration (Option 2 - In Progress) +//! +//! ### Phase 1: Language Definitions (✅ DONE) +//! - ✅ `languages.yml` synced to registry +//! - ✅ `supported_in_singularity` flag for explicit support +//! - ✅ Weekly Renovate alerts +//! +//! ### Phase 2: File Classification (🔧 IN PROGRESS) +//! +//! #### Implementation Step 1: Manual Synchronization (Current) +//! Run the synchronization script when Linguist updates: +//! ```bash +//! python3 scripts/sync_linguist_patterns.py > src/file_classifier_generated.rs +//! cargo test +//! git add src/file_classifier_generated.rs +//! git commit -m "chore(linguist): sync file classification patterns" +//! ``` +//! +//! #### Implementation Step 2: Automated Synchronization (Future) +//! This build script can be extended to: +//! ```bash +//! cargo xtask sync-linguist +//! ``` +//! +//! Which will: +//! 1. Download `vendor.yml` from Linguist +//! 2. Download `generated.rb` from Linguist +//! 3. Parse and extract patterns +//! 4. Generate Rust code arrays +//! 5. Update `src/file_classifier_generated.rs` +//! 6. Run tests to validate +//! +//! #### Patterns Extracted +//! 
- **Vendored**: `node_modules/`, `vendor/`, `.yarn/`, `Pods/`, `dist/`, `build/` +//! - **Generated**: `.pb.rs`, `.pb.go`, `.generated.ts`, `.designer.cs`, `.meta` +//! - **Binary**: `.png`, `.jpg`, `.zip`, `.exe`, `.dll`, `.pdf` +//! +//! ### Phase 3: Detection Heuristics (📋 PLANNED) +//! - Extract `heuristics.yml` from Linguist (35KB) +//! - Generate fallback language detection for ambiguous extensions +//! - Support: `.pl` (Perl vs Prolog), `.m` (Objective-C vs Matlab), etc. +//! +//! ### Maintenance Workflow +//! When Renovate creates a Linguist update PR: +//! 1. Review language definition changes +//! 2. Run: `python3 scripts/sync_linguist_patterns.py` +//! 3. Run: `cargo test` +//! 4. Commit changes: `git add . && git commit` +//! 5. Merge and create release //! //! This can be used to ensure registry metadata matches actual library capabilities. //! Run with: cargo build --features validate-metadata diff --git a/examples/usage.rs b/examples/usage.rs index 3701f5b..598b389 100644 --- a/examples/usage.rs +++ b/examples/usage.rs @@ -9,6 +9,8 @@ reason = "Examples are meant to demonstrate usage and print output to the user" )] +use std::sync::atomic::Ordering; + use singularity_language_registry::{ ast_grep_supported_languages, detect_from_content, detect_language, get_language, get_language_by_alias, is_detectable, languages_by_families, rca_supported_languages, @@ -41,7 +43,10 @@ fn main() { println!("\n2. 
Language Lookup:"); if let Some(elixir) = get_language("elixir") { println!(" Elixir extensions: {:?}", elixir.extensions); - println!(" RCA supported: {}", elixir.rca_supported); + println!( + " RCA supported: {}", + elixir.rca_supported.load(Ordering::Relaxed) + ); println!(" AST-Grep supported: {}", elixir.ast_grep_supported); } diff --git a/flake.lock b/flake.lock index b518b16..80514f9 100644 --- a/flake.lock +++ b/flake.lock @@ -1,5 +1,21 @@ { "nodes": { + "advisory-db": { + "flake": false, + "locked": { + "lastModified": 1762774274, + "narHash": "sha256-tigj2sBL6S7zmjpt5JdXtvtGrClvja+/LAnmpU6+MV4=", + "owner": "rustsec", + "repo": "advisory-db", + "rev": "df17e8c0d170b71c0a4cca3f165c30030a526060", + "type": "github" + }, + "original": { + "owner": "rustsec", + "repo": "advisory-db", + "type": "github" + } + }, "crane": { "locked": { "lastModified": 1762538466, @@ -67,6 +83,7 @@ }, "root": { "inputs": { + "advisory-db": "advisory-db", "crane": "crane", "flake-utils": "flake-utils", "nixpkgs": "nixpkgs", diff --git a/justfile b/justfile index 30eb46e..b39de57 100644 --- a/justfile +++ b/justfile @@ -118,6 +118,19 @@ ci-local: changelog: git log --pretty=format:"- %s (%h)" --reverse > CHANGELOG.md +# Sync file classification patterns from GitHub Linguist (Phase 2) +sync-linguist: + #!/usr/bin/env bash + set -e + echo "Synchronizing file classification patterns from GitHub Linguist..." + python3 scripts/sync_linguist_patterns.py > src/file_classifier_generated.rs + echo "✅ Patterns synced to src/file_classifier_generated.rs" + echo "" + echo "Next steps:" + echo " 1. cargo test" + echo " 2. git add src/file_classifier_generated.rs" + echo " 3. git commit -m 'chore(linguist): sync file classification patterns'" + # Verify everything before PR verify: fmt clippy test audit renovate-validate doc @echo "✅ All checks passed!" 
diff --git a/renovate.json5 b/renovate.json5 index d2964b2..53fa10b 100644 --- a/renovate.json5 +++ b/renovate.json5 @@ -42,6 +42,50 @@ // Package Rules - ordered by priority "packageRules": [ + // =================== + // GitHub Linguist (Language Registry Source) + // =================== + { + "description": "🔤 GitHub Linguist language list updates", + "matchDatasources": ["github-tags"], + "matchPackagePatterns": ["github-linguist/linguist"], + "schedule": ["weekly"], + "labels": ["linguist", "language-registry", "dependencies"], + "prPriority": 5, + "automerge": false, // Manual review for language definition changes + "commitMessagePrefix": "chore(linguist):", + "prBodyNotes": [ + "## ⚠️ Linguist Update Detected", + "", + "GitHub Linguist (the authoritative source for language definitions) has been updated.", + "", + "### What to Review", + "", + "1. **Language Definitions** (Phase 1 - Active):", + " - New languages added to Linguist?", + " - Existing language metadata changed?", + " - Need to update `supported_in_singularity` flags?", + "", + "2. **File Classification** (Phase 2 - Ready):", + " - Changes to vendor patterns (vendor.yml)?", + " - Changes to generated file detection (generated.rb)?", + " - Changes to binary file patterns?", + "", + "3. 
**Detection Heuristics** (Phase 3 - Planned):", + " - Changes to language detection heuristics (heuristics.yml)?", + "", + "See [LINGUIST_INTEGRATION.md](LINGUIST_INTEGRATION.md) for details.", + "", + "### Action Items", + "", + "- [ ] Review language definition changes", + "- [ ] Update supported languages if needed", + "- [ ] Run `cargo test` to validate", + "- [ ] Update file classification patterns if needed (Phase 2)", + "- [ ] Merge and create a new release" + ] + }, + // =================== // Security Updates // =================== diff --git a/scripts/sync_linguist_patterns.py b/scripts/sync_linguist_patterns.py new file mode 100644 index 0000000..961129f --- /dev/null +++ b/scripts/sync_linguist_patterns.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python3 +""" +Synchronize File Classification Patterns from GitHub Linguist + +This script downloads Linguist's vendor.yml and generated.rb files, +extracts patterns, and generates Rust code for the FileClassifier module. + +Usage: + python3 scripts/sync_linguist_patterns.py > src/file_classifier_generated.rs + +Sources: + - vendor.yml: Vendored code path patterns + - generated.rb: Auto-generated file detection rules +""" + +import re +import sys +import urllib.request +from typing import List, Set +import yaml + +# GitHub Linguist URLs +VENDOR_YML_URL = "https://raw.githubusercontent.com/github-linguist/linguist/master/lib/linguist/vendor.yml" +GENERATED_RB_URL = "https://raw.githubusercontent.com/github-linguist/linguist/master/lib/linguist/generated.rb" +HEURISTICS_YML_URL = "https://raw.githubusercontent.com/github-linguist/linguist/master/lib/linguist/heuristics.yml" + + +def fetch_url(url: str) -> str: + """Fetch content from URL""" + print(f"Fetching {url}...", file=sys.stderr) + try: + with urllib.request.urlopen(url, timeout=10) as response: + return response.read().decode("utf-8") + except Exception as e: + print(f"Error fetching {url}: {e}", file=sys.stderr) + raise + + +def parse_vendor_yml(content: str) 
-> Set[str]: + """ + Parse vendor.yml and extract vendored path patterns. + + Format: + ```yaml + - /path/to/vendor/ + - node_modules/ + - "regex_pattern" + ``` + """ + patterns: Set[str] = set() + + try: + data = yaml.safe_load(content) + if isinstance(data, list): + for item in data: + if isinstance(item, str): + # Simple path patterns + path = item.strip() + if path and not path.startswith("#"): + patterns.add(path) + except yaml.YAMLError as e: + print(f"Error parsing YAML: {e}", file=sys.stderr) + return patterns + + return patterns + + +def parse_generated_rb(content: str) -> Set[str]: + """ + Parse generated.rb and extract generated file patterns. + + Looks for: + - File extensions: ".pb.rs", ".generated.ts" + - Directory paths: "__generated__/", "dist/" + - Content markers for detection + """ + patterns: Set[str] = set() + + # Pattern to match quoted strings in Ruby + # Matches: ".pb.rs", '.generated.ts', "pattern" + string_pattern = re.compile(r'''['"](.*?)['"]''') + + for line in content.split('\n'): + line = line.strip() + + # Skip comments and empty lines + if not line or line.startswith('#'): + continue + + # Extract quoted strings + matches = string_pattern.findall(line) + for match in matches: + if match and len(match) < 50: # Reasonable pattern length + patterns.add(match) + + return patterns + + +def parse_heuristics_yml(content: str) -> dict: + """ + Parse heuristics.yml for language detection rules. + + This is for Phase 3 (future implementation). + """ + try: + data = yaml.safe_load(content) + return data if data else {} + except yaml.YAMLError: + return {} + + +def categorize_patterns(patterns: Set[str]) -> dict: + """ + Categorize patterns into: + - Vendored: node_modules/, vendor/, .yarn/, etc. + - Generated: .pb.rs, .generated.ts, etc. + - Binary: .png, .jpg, .exe, etc. 
+ """ + categories = { + 'vendored': set(), + 'generated': set(), + 'binary': set(), + } + + binary_extensions = {'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.zip', '.tar', '.exe', '.dll', '.pdf'} + + for pattern in patterns: + if any(pattern.startswith(v) for v in ['node_modules', 'vendor', '.yarn', '.idea', 'dist', 'build']): + categories['vendored'].add(pattern) + elif pattern.startswith('.'): + # It's an extension + if any(pattern == ext for ext in binary_extensions): + categories['binary'].add(pattern) + elif 'generated' in pattern.lower() or 'pb' in pattern or 'proto' in pattern: + categories['generated'].add(pattern) + else: + categories['vendored'].add(pattern) + + return categories + + +def generate_rust_code(patterns_dict: dict) -> str: + """Generate Rust code for patterns""" + code = '''// AUTO-GENERATED FILE - DO NOT EDIT MANUALLY +// Generated from GitHub Linguist patterns +// Run: python3 scripts/sync_linguist_patterns.py +// Source: https://github.com/github-linguist/linguist + +//! Auto-generated file classification patterns from GitHub Linguist +//! +//! These patterns are synchronized weekly via Renovate. +//! 
When Linguist updates, run: python3 scripts/sync_linguist_patterns.py + +/// Vendored code path patterns (from Linguist vendor.yml) +pub const VENDORED_PATTERNS_FROM_LINGUIST: &[&str] = &[ +''' + + for pattern in sorted(patterns_dict['vendored']): + code += f' "{pattern}",\n' + + code += ''']; + +/// Generated file patterns (from Linguist generated.rb) +pub const GENERATED_PATTERNS_FROM_LINGUIST: &[&str] = &[ +''' + + for pattern in sorted(patterns_dict['generated']): + escaped = pattern.replace('\\', '\\\\').replace('"', '\\"') + code += f' "{escaped}",\n' + + code += ''']; + +/// Binary file extensions +pub const BINARY_PATTERNS_FROM_LINGUIST: &[&str] = &[ +''' + + for pattern in sorted(patterns_dict['binary']): + code += f' "{pattern}",\n' + + code += ''']; +''' + + return code + + +def main(): + """Main entry point""" + try: + # Fetch files from Linguist + print("Synchronizing patterns from GitHub Linguist...", file=sys.stderr) + + vendor_content = fetch_url(VENDOR_YML_URL) + generated_content = fetch_url(GENERATED_RB_URL) + + # Parse patterns + vendor_patterns = parse_vendor_yml(vendor_content) + generated_patterns = parse_generated_rb(generated_content) + + print(f"Found {len(vendor_patterns)} vendor patterns", file=sys.stderr) + print(f"Found {len(generated_patterns)} generated patterns", file=sys.stderr) + + # Combine and categorize + all_patterns = vendor_patterns | generated_patterns + categorized = categorize_patterns(all_patterns) + + # Generate Rust code + rust_code = generate_rust_code(categorized) + + # Output + print(rust_code, file=sys.stdout) + print("// Pattern sync complete!", file=sys.stderr) + + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/src/file_classifier.rs b/src/file_classifier.rs new file mode 100644 index 0000000..f7617b2 --- /dev/null +++ b/src/file_classifier.rs @@ -0,0 +1,242 @@ +//!
File Classification Engine - Identifies vendored, generated, and binary files +//! +//! This module provides classification rules derived from GitHub Linguist's patterns: +//! +//! +//! ## Classification Categories +//! +//! - **Vendored**: Third-party dependencies (`node_modules/`, `vendor/`, etc.) +//! - **Generated**: Auto-generated files (protobuf, graphql, minified, etc.) +//! - **Binary**: Non-text files (images, archives, compiled binaries) +//! - **Documentation**: Auto-generated docs (Sphinx, Doxygen) +//! +//! ## Usage +//! +//! ```rust,ignore +//! use singularity_language_registry::FileClassifier; +//! use std::path::Path; +//! +//! let classifier = FileClassifier::new(); +//! let path = Path::new("node_modules/package/index.js"); +//! +//! if classifier.is_vendored(path) { +//! println!("Skip vendored code"); +//! } +//! ``` + +use std::path::Path; + +/// File classification result +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum FileClass { + /// Regular source code file + Source, + /// Vendored/third-party dependency + Vendored, + /// Auto-generated file + Generated, + /// Binary file (non-text) + Binary, + /// Auto-generated documentation + Documentation, +} + +/// File classifier using Linguist patterns +#[derive(Debug, Clone)] +pub struct FileClassifier { + /// Vendored file path patterns (from Linguist vendor.yml) + vendored_patterns: Vec<&'static str>, + /// Generated file extensions + generated_extensions: Vec<&'static str>, + /// Binary file extensions + binary_extensions: Vec<&'static str>, + /// Documentation tool markers + documentation_markers: Vec<&'static str>, +} + +impl FileClassifier { + /// Create a new file classifier with Linguist patterns + #[must_use] + pub fn new() -> Self { + Self { + vendored_patterns: vec![ + // Dependency directories + "node_modules/", + "vendor/", + "vendors/", + ".yarn/", + "Pods/", + "Carthage/Build/", + "third_party/", + "dependencies/", + // IDE/Editor artifacts + ".vscode/", + 
".idea/", + ".sublime-project", + // Build artifacts + "dist/", + "build/", + "target/", + "_build/", + // Package lock files + "package-lock.json", + "yarn.lock", + "Cargo.lock", + "poetry.lock", + "Gemfile.lock", + // Gradle/Maven wrappers + "gradlew", + "mvnw", + ], + generated_extensions: vec![ + ".pb.rs", // Protobuf (Rust) + ".pb.go", // Protobuf (Go) + ".pb.py", // Protobuf (Python) + ".pb2.py", // Protobuf v2 (Python) + ".pb.js", // Protobuf (JS) + ".designer.cs", // Visual Studio designer + ".g.ts", // Angular/GraphQL generated + ".generated.ts", + ".generated.js", + ".nib", // Xcode Interface Builder + ".xcworkspacedata", + ".storyboard", + ".xib", + ".meta", // Unity3D metadata + ], + binary_extensions: vec![ + // Images + ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", // Archives + ".zip", ".tar", ".gz", ".rar", ".7z", // Compiled binaries + ".exe", ".bin", ".so", ".dll", ".dylib", // Documents (binary formats) + ".pdf", ".docx", ".xlsx", ".pptx", // Audio/Video + ".mp3", ".mp4", ".wav", ".avi", ".mov", + ], + documentation_markers: vec!["doxygen", "sphinx", "jsdoc", "pandoc"], + } + } + + /// Check if path is vendored code + #[must_use] + pub fn is_vendored(&self, path: &Path) -> bool { + let path_str = path.to_string_lossy(); + self.vendored_patterns + .iter() + .any(|pattern| path_str.contains(pattern)) + } + + /// Check if file is generated + #[must_use] + pub fn is_generated(&self, path: &Path) -> bool { + let path_str = path.to_string_lossy(); + self.generated_extensions + .iter() + .any(|pattern| path_str.ends_with(pattern)) + } + + /// Check if file is binary + #[must_use] + pub fn is_binary(&self, path: &Path) -> bool { + let path_str = path.to_string_lossy(); + self.binary_extensions + .iter() + .any(|pattern| path_str.ends_with(pattern)) + } + + /// Check if file is documentation + #[must_use] + pub fn is_documentation(&self, path: &Path) -> bool { + let path_str = path.to_string_lossy(); + self.documentation_markers + .iter() + 
.any(|marker| path_str.contains(marker)) + } + + /// Classify a file path + #[must_use] + pub fn classify(&self, path: &Path) -> FileClass { + if self.is_binary(path) { + FileClass::Binary + } else if self.is_vendored(path) { + FileClass::Vendored + } else if self.is_generated(path) { + FileClass::Generated + } else if self.is_documentation(path) { + FileClass::Documentation + } else { + FileClass::Source + } + } + + /// Check if file should be analyzed (not vendored, generated, or binary) + #[must_use] + pub fn should_analyze(&self, path: &Path) -> bool { + matches!(self.classify(path), FileClass::Source) + } +} + +impl Default for FileClassifier { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::path::PathBuf; + + #[test] + fn test_vendored_detection() { + let classifier = FileClassifier::new(); + assert!(classifier.is_vendored(&PathBuf::from("node_modules/package/index.js"))); + assert!(classifier.is_vendored(&PathBuf::from("vendor/lib/helper.rb"))); + assert!(!classifier.is_vendored(&PathBuf::from("src/main.rs"))); + } + + #[test] + fn test_generated_detection() { + let classifier = FileClassifier::new(); + assert!(classifier.is_generated(&PathBuf::from("api.pb.rs"))); + assert!(classifier.is_generated(&PathBuf::from("Component.generated.ts"))); + assert!(!classifier.is_generated(&PathBuf::from("Component.ts"))); + } + + #[test] + fn test_binary_detection() { + let classifier = FileClassifier::new(); + assert!(classifier.is_binary(&PathBuf::from("image.png"))); + assert!(classifier.is_binary(&PathBuf::from("archive.zip"))); + assert!(!classifier.is_binary(&PathBuf::from("script.js"))); + } + + #[test] + fn test_classification() { + let classifier = FileClassifier::new(); + assert_eq!( + classifier.classify(&PathBuf::from("node_modules/pkg/index.js")), + FileClass::Vendored + ); + assert_eq!( + classifier.classify(&PathBuf::from("api.pb.rs")), + FileClass::Generated + ); + assert_eq!( + 
classifier.classify(&PathBuf::from("image.png")), + FileClass::Binary + ); + assert_eq!( + classifier.classify(&PathBuf::from("src/main.rs")), + FileClass::Source + ); + } + + #[test] + fn test_should_analyze() { + let classifier = FileClassifier::new(); + assert!(classifier.should_analyze(&PathBuf::from("src/main.rs"))); + assert!(!classifier.should_analyze(&PathBuf::from("node_modules/pkg/index.js"))); + assert!(!classifier.should_analyze(&PathBuf::from("api.pb.rs"))); + } +} diff --git a/src/lib.rs b/src/lib.rs index 8c876c4..157f71b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -54,6 +54,7 @@ //! ``` pub mod detection; +pub mod file_classifier; pub mod metadata; pub mod registry; pub mod utils; @@ -85,6 +86,9 @@ pub use metadata::{ MetadataSource, MetadataValidation, }; +// File classification (from Linguist patterns) +pub use file_classifier::{FileClass, FileClassifier}; + // Version information pub const VERSION: &str = env!("CARGO_PKG_VERSION"); pub const NAME: &str = env!("CARGO_PKG_NAME"); diff --git a/src/metadata.rs b/src/metadata.rs index 0b075f1..6bf3fa1 100644 --- a/src/metadata.rs +++ b/src/metadata.rs @@ -4,6 +4,7 @@ //! metadata with the actual capabilities of underlying libraries. 
use crate::registry::LANGUAGE_REGISTRY; +use std::sync::atomic::Ordering; /// Metadata source for language capabilities #[derive(Debug, Clone)] @@ -52,7 +53,7 @@ pub fn validate_metadata(source: &MetadataSource) -> MetadataValidation { // Check RCA support for lang_id in &source.rca_languages { if let Some(lang) = LANGUAGE_REGISTRY.get_language(lang_id) { - if !lang.rca_supported { + if !lang.rca_supported.load(Ordering::Relaxed) { capability_mismatches.push(CapabilityMismatch { language: lang_id.clone(), capability: "RCA".to_owned(), @@ -83,7 +84,7 @@ pub fn validate_metadata(source: &MetadataSource) -> MetadataValidation { // Check for languages in registry but not in sources for lang in LANGUAGE_REGISTRY.supported_languages() { - if lang.rca_supported && !source.rca_languages.contains(&lang.id) { + if lang.rca_supported.load(Ordering::Relaxed) && !source.rca_languages.contains(&lang.id) { capability_mismatches.push(CapabilityMismatch { language: lang.id.clone(), capability: "RCA".to_owned(), @@ -149,7 +150,11 @@ pub fn generate_metadata_report() -> String { "| {} | {} | {} | {} | {} | {} |", lang.name, lang.extensions.join(", "), - if lang.rca_supported { "✓" } else { "✗" }, + if lang.rca_supported.load(Ordering::Relaxed) { + "✓" + } else { + "✗" + }, if lang.ast_grep_supported { "✓" } else { @@ -173,19 +178,7 @@ pub fn generate_metadata_report() -> String { pub fn get_known_support() -> MetadataSource { MetadataSource { // RCA supported languages (from rust-code-analysis) - rca_languages: vec![ - "rust".to_owned(), - "c".to_owned(), - "cpp".to_owned(), - "go".to_owned(), - "java".to_owned(), - "python".to_owned(), - "javascript".to_owned(), - "typescript".to_owned(), - "csharp".to_owned(), - "kotlin".to_owned(), - "lua".to_owned(), - ], + rca_languages: vec![], // AST-Grep supported languages ast_grep_languages: vec![ @@ -198,7 +191,6 @@ pub fn get_known_support() -> MetadataSource { "c".to_owned(), "cpp".to_owned(), "csharp".to_owned(), - "kotlin".to_owned(), 
"elixir".to_owned(), "erlang".to_owned(), "gleam".to_owned(), diff --git a/src/registry.rs b/src/registry.rs index 3198c78..3e32ef7 100644 --- a/src/registry.rs +++ b/src/registry.rs @@ -15,6 +15,7 @@ use anyhow::Result; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::path::Path; +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::LazyLock; /// Language-level pattern signatures (syntax/keywords only, NOT libraries!) @@ -46,7 +47,15 @@ pub struct PatternSignatures { } /// Comprehensive language information -#[derive(Debug, Clone, Serialize, Deserialize)] +/// +/// This struct represents a programming language in the Singularity registry. +/// The registry is derived from GitHub Linguist's authoritative language list, +/// ensuring consistency across the ecosystem. +/// +/// ## Source of Truth +/// Languages are sourced from +/// and tracked by Renovate for automatic updates. +#[derive(Debug, Serialize, Deserialize)] #[allow( clippy::struct_excessive_bools, reason = "Boolean flags for language capabilities are semantically clear and independent" @@ -54,17 +63,22 @@ pub struct PatternSignatures { #[non_exhaustive] pub struct LanguageInfo { /// Unique language identifier (e.g., `"rust"`, `"elixir"`) + /// Derived from GitHub Linguist language names (lowercased) pub id: String, /// Human-readable language name (e.g., `"Rust"`, `"Elixir"`) pub name: String, /// File extensions for this language (e.g., `rs`, or `ex`/`exs`) + /// Source: GitHub Linguist pub extensions: Vec, /// Alternative names/aliases (e.g., `js`, `javascript`) pub aliases: Vec, + /// Whether this language is supported by Singularity's parsing engine + /// Default: false (only explicitly supported languages are true) + pub supported_in_singularity: bool, /// Tree-sitter language name (if supported) pub tree_sitter_language: Option, /// Whether RCA (rust-code-analysis) supports this language - pub rca_supported: bool, + pub rca_supported: AtomicBool, /// Whether 
AST-Grep supports this language pub ast_grep_supported: bool, /// MIME types for this language @@ -73,6 +87,8 @@ pub struct LanguageInfo { pub family: Option, /// Whether this is a compiled or interpreted language pub is_compiled: bool, + /// Language type from Linguist: "programming", "markup", "data", "prose" + pub language_type: String, /// Pattern signatures for cross-language pattern detection #[serde(default)] pub pattern_signatures: PatternSignatures, @@ -119,8 +135,9 @@ impl LanguageRegistry { name: "Elixir".to_owned(), extensions: vec!["ex".to_owned(), "exs".to_owned()], aliases: vec!["elixir".to_owned()], + supported_in_singularity: true, tree_sitter_language: Some("elixir".to_owned()), - rca_supported: false, + rca_supported: AtomicBool::new(false), ast_grep_supported: true, mime_types: vec![ "text/x-elixir".to_owned(), @@ -128,6 +145,7 @@ impl LanguageRegistry { ], family: Some("BEAM".to_owned()), is_compiled: true, + language_type: "programming".to_owned(), pattern_signatures: PatternSignatures::default(), }); @@ -136,8 +154,9 @@ impl LanguageRegistry { name: "Erlang".to_owned(), extensions: vec!["erl".to_owned(), "hrl".to_owned()], aliases: vec!["erlang".to_owned()], + supported_in_singularity: true, tree_sitter_language: Some("erlang".to_owned()), - rca_supported: false, + rca_supported: AtomicBool::new(false), ast_grep_supported: true, mime_types: vec![ "text/x-erlang".to_owned(), @@ -145,6 +164,7 @@ impl LanguageRegistry { ], family: Some("BEAM".to_owned()), is_compiled: true, + language_type: "programming".to_owned(), pattern_signatures: PatternSignatures::default(), }); @@ -153,12 +173,14 @@ impl LanguageRegistry { name: "Gleam".to_owned(), extensions: vec!["gleam".to_owned()], aliases: vec!["gleam".to_owned()], + supported_in_singularity: true, tree_sitter_language: Some("gleam".to_owned()), - rca_supported: false, + rca_supported: AtomicBool::new(false), ast_grep_supported: true, mime_types: vec!["text/x-gleam".to_owned(), 
"application/x-gleam".to_owned()], family: Some("BEAM".to_owned()), is_compiled: true, + language_type: "programming".to_owned(), pattern_signatures: PatternSignatures::default(), }); @@ -169,11 +191,13 @@ impl LanguageRegistry { extensions: vec!["rs".to_owned()], aliases: vec!["rust".to_owned()], tree_sitter_language: Some("rust".to_owned()), - rca_supported: true, + rca_supported: AtomicBool::new(false), ast_grep_supported: true, mime_types: vec!["text/x-rust".to_owned(), "application/x-rust".to_owned()], family: Some("Systems".to_owned()), is_compiled: true, + supported_in_singularity: true, + language_type: "programming".to_owned(), pattern_signatures: PatternSignatures { // Only language syntax, NOT libraries! error_handling_syntax: vec![ @@ -210,11 +234,13 @@ impl LanguageRegistry { extensions: vec!["c".to_owned(), "h".to_owned()], aliases: vec!["c".to_owned()], tree_sitter_language: Some("c".to_owned()), - rca_supported: true, + rca_supported: AtomicBool::new(false), ast_grep_supported: true, mime_types: vec!["text/x-c".to_owned(), "text/x-csrc".to_owned()], family: Some("C-like".to_owned()), is_compiled: true, + supported_in_singularity: true, + language_type: "programming".to_owned(), pattern_signatures: PatternSignatures::default(), }); @@ -230,11 +256,13 @@ impl LanguageRegistry { ], aliases: vec!["cpp".to_owned(), "c++".to_owned(), "cplusplus".to_owned()], tree_sitter_language: Some("cpp".to_owned()), - rca_supported: true, + rca_supported: AtomicBool::new(false), ast_grep_supported: true, mime_types: vec!["text/x-c++".to_owned(), "text/x-cpp".to_owned()], family: Some("C-like".to_owned()), is_compiled: true, + supported_in_singularity: true, + language_type: "programming".to_owned(), pattern_signatures: PatternSignatures::default(), }); @@ -245,7 +273,7 @@ impl LanguageRegistry { extensions: vec!["js".to_owned(), "jsx".to_owned()], aliases: vec!["javascript".to_owned(), "js".to_owned()], tree_sitter_language: Some("javascript".to_owned()), - 
rca_supported: true, + rca_supported: AtomicBool::new(false), ast_grep_supported: true, mime_types: vec![ "text/javascript".to_owned(), @@ -253,6 +281,8 @@ impl LanguageRegistry { ], family: Some("Web".to_owned()), is_compiled: false, + supported_in_singularity: true, + language_type: "programming".to_owned(), pattern_signatures: PatternSignatures::default(), }); @@ -262,7 +292,7 @@ impl LanguageRegistry { extensions: vec!["ts".to_owned(), "tsx".to_owned()], aliases: vec!["typescript".to_owned(), "ts".to_owned()], tree_sitter_language: Some("typescript".to_owned()), - rca_supported: true, + rca_supported: AtomicBool::new(false), ast_grep_supported: true, mime_types: vec![ "text/typescript".to_owned(), @@ -270,6 +300,8 @@ impl LanguageRegistry { ], family: Some("Web".to_owned()), is_compiled: true, + supported_in_singularity: true, + language_type: "programming".to_owned(), pattern_signatures: PatternSignatures::default(), }); @@ -280,7 +312,7 @@ impl LanguageRegistry { extensions: vec!["py".to_owned(), "pyw".to_owned()], aliases: vec!["python".to_owned(), "py".to_owned()], tree_sitter_language: Some("python".to_owned()), - rca_supported: true, + rca_supported: AtomicBool::new(false), ast_grep_supported: true, mime_types: vec![ "text/x-python".to_owned(), @@ -288,30 +320,37 @@ impl LanguageRegistry { ], family: Some("Scripting".to_owned()), is_compiled: false, + supported_in_singularity: true, + language_type: "programming".to_owned(), pattern_signatures: PatternSignatures::default(), }); + // JVM Languages self.register_language(LanguageInfo { id: "java".to_owned(), name: "Java".to_owned(), extensions: vec!["java".to_owned()], aliases: vec!["java".to_owned()], tree_sitter_language: Some("java".to_owned()), - rca_supported: true, + rca_supported: AtomicBool::new(false), ast_grep_supported: true, mime_types: vec!["text/x-java".to_owned(), "application/x-java".to_owned()], family: Some("JVM".to_owned()), is_compiled: true, + supported_in_singularity: true, + 
language_type: "programming".to_owned(), pattern_signatures: PatternSignatures::default(), }); + // Scripting Languages + self.register_language(LanguageInfo { id: "csharp".to_owned(), name: "C#".to_owned(), extensions: vec!["cs".to_owned()], aliases: vec!["csharp".to_owned(), "cs".to_owned(), "c#".to_owned()], tree_sitter_language: Some("c_sharp".to_owned()), - rca_supported: true, + rca_supported: AtomicBool::new(false), ast_grep_supported: true, mime_types: vec![ "text/x-csharp".to_owned(), @@ -319,6 +358,8 @@ impl LanguageRegistry { ], family: Some("CLR".to_owned()), is_compiled: true, + supported_in_singularity: true, + language_type: "programming".to_owned(), pattern_signatures: PatternSignatures::default(), }); @@ -328,29 +369,13 @@ impl LanguageRegistry { extensions: vec!["go".to_owned()], aliases: vec!["go".to_owned(), "golang".to_owned()], tree_sitter_language: Some("go".to_owned()), - rca_supported: true, + rca_supported: AtomicBool::new(false), ast_grep_supported: true, mime_types: vec!["text/x-go".to_owned(), "application/x-go".to_owned()], family: Some("Systems".to_owned()), is_compiled: true, - pattern_signatures: PatternSignatures::default(), - }); - - // JVM Languages - self.register_language(LanguageInfo { - id: "kotlin".to_owned(), - name: "Kotlin".to_owned(), - extensions: vec!["kt".to_owned(), "kts".to_owned()], - aliases: vec!["kotlin".to_owned()], - tree_sitter_language: Some("kotlin".to_owned()), - rca_supported: true, - ast_grep_supported: true, - mime_types: vec![ - "text/x-kotlin".to_owned(), - "application/x-kotlin".to_owned(), - ], - family: Some("JVM".to_owned()), - is_compiled: true, + supported_in_singularity: true, + language_type: "programming".to_owned(), pattern_signatures: PatternSignatures::default(), }); @@ -361,11 +386,13 @@ impl LanguageRegistry { extensions: vec!["lua".to_owned()], aliases: vec!["lua".to_owned()], tree_sitter_language: Some("lua".to_owned()), - rca_supported: true, + rca_supported: AtomicBool::new(false), 
ast_grep_supported: true, mime_types: vec!["text/x-lua".to_owned(), "application/x-lua".to_owned()], family: Some("Scripting".to_owned()), is_compiled: false, + supported_in_singularity: true, + language_type: "programming".to_owned(), pattern_signatures: PatternSignatures::default(), }); @@ -375,11 +402,13 @@ impl LanguageRegistry { extensions: vec!["sh".to_owned(), "bash".to_owned()], aliases: vec!["bash".to_owned(), "sh".to_owned(), "shell".to_owned()], tree_sitter_language: Some("bash".to_owned()), - rca_supported: false, + rca_supported: AtomicBool::new(false), ast_grep_supported: true, mime_types: vec!["text/x-sh".to_owned(), "application/x-sh".to_owned()], family: Some("Shell".to_owned()), is_compiled: false, + supported_in_singularity: true, + language_type: "programming".to_owned(), pattern_signatures: PatternSignatures::default(), }); @@ -390,11 +419,13 @@ impl LanguageRegistry { extensions: vec!["json".to_owned()], aliases: vec!["json".to_owned()], tree_sitter_language: Some("json".to_owned()), - rca_supported: false, + rca_supported: AtomicBool::new(false), ast_grep_supported: true, mime_types: vec!["application/json".to_owned()], family: Some("Data".to_owned()), is_compiled: false, + supported_in_singularity: true, + language_type: "programming".to_owned(), pattern_signatures: PatternSignatures::default(), }); @@ -404,11 +435,13 @@ impl LanguageRegistry { extensions: vec!["yaml".to_owned(), "yml".to_owned()], aliases: vec!["yaml".to_owned(), "yml".to_owned()], tree_sitter_language: Some("yaml".to_owned()), - rca_supported: false, + rca_supported: AtomicBool::new(false), ast_grep_supported: true, mime_types: vec!["text/yaml".to_owned(), "application/x-yaml".to_owned()], family: Some("Data".to_owned()), is_compiled: false, + supported_in_singularity: true, + language_type: "programming".to_owned(), pattern_signatures: PatternSignatures::default(), }); @@ -418,11 +451,13 @@ impl LanguageRegistry { extensions: vec!["toml".to_owned()], aliases: 
vec!["toml".to_owned()], tree_sitter_language: Some("toml".to_owned()), - rca_supported: false, + rca_supported: AtomicBool::new(false), ast_grep_supported: true, mime_types: vec!["text/x-toml".to_owned(), "application/toml".to_owned()], family: Some("Data".to_owned()), is_compiled: false, + supported_in_singularity: true, + language_type: "programming".to_owned(), pattern_signatures: PatternSignatures::default(), }); @@ -433,11 +468,13 @@ impl LanguageRegistry { extensions: vec!["md".to_owned(), "markdown".to_owned()], aliases: vec!["markdown".to_owned(), "md".to_owned()], tree_sitter_language: Some("markdown".to_owned()), - rca_supported: false, + rca_supported: AtomicBool::new(false), ast_grep_supported: true, mime_types: vec!["text/markdown".to_owned(), "text/x-markdown".to_owned()], family: Some("Documentation".to_owned()), is_compiled: false, + supported_in_singularity: true, + language_type: "programming".to_owned(), pattern_signatures: PatternSignatures::default(), }); @@ -448,11 +485,13 @@ impl LanguageRegistry { extensions: vec!["dockerfile".to_owned(), "Dockerfile".to_owned()], aliases: vec!["dockerfile".to_owned(), "docker".to_owned()], tree_sitter_language: Some("dockerfile".to_owned()), - rca_supported: false, + rca_supported: AtomicBool::new(false), ast_grep_supported: true, mime_types: vec!["text/x-dockerfile".to_owned()], family: Some("Infrastructure".to_owned()), is_compiled: false, + supported_in_singularity: true, + language_type: "programming".to_owned(), pattern_signatures: PatternSignatures::default(), }); @@ -462,11 +501,13 @@ impl LanguageRegistry { extensions: vec!["sql".to_owned()], aliases: vec!["sql".to_owned()], tree_sitter_language: Some("sql".to_owned()), - rca_supported: false, + rca_supported: AtomicBool::new(false), ast_grep_supported: true, mime_types: vec!["text/x-sql".to_owned(), "application/sql".to_owned()], family: Some("Database".to_owned()), is_compiled: false, + supported_in_singularity: true, + language_type: 
"programming".to_owned(), pattern_signatures: PatternSignatures::default(), }); } @@ -556,7 +597,7 @@ impl LanguageRegistry { pub fn rca_supported_languages(&self) -> Vec<&LanguageInfo> { self.languages .values() - .filter(|lang| lang.rca_supported) + .filter(|lang| lang.rca_supported.load(Ordering::Relaxed)) .collect() } @@ -607,6 +648,56 @@ impl LanguageRegistry { pub fn language_count(&self) -> usize { self.languages.len() } + + /// Set RCA support for a language (called by analysis engine) + /// + /// # Errors + /// + /// Returns an error if the language is not found in the registry. + pub fn set_rca_support(&mut self, language_id: &str, supported: bool) -> Result<(), String> { + if let Some(language) = self.languages.get_mut(language_id) { + language.rca_supported.store(supported, Ordering::Relaxed); + Ok(()) + } else { + Err(format!("Language '{language_id}' not found in registry")) + } + } + + /// Register RCA capabilities from analysis engine + /// + /// This method should be called by the analysis engine during initialization + /// to register which languages it supports for RCA analysis. + /// + /// # Errors + /// + /// Returns an error if any of the specified languages are not found. + pub fn register_rca_capabilities( + &mut self, + supported_languages: &[&str], + ) -> Result<(), String> { + // First, set all languages to not supported + for language in self.languages.values_mut() { + language.rca_supported.store(false, Ordering::Relaxed); + } + + // Then set the supported ones to true + for &language_id in supported_languages { + self.set_rca_support(language_id, true)?; + } + + Ok(()) + } + + /// Get mutable reference to language info for advanced operations + /// + /// # Errors + /// + /// Returns an error if the language is not found. 
+ pub fn get_language_mut(&mut self, id: &str) -> Result<&mut LanguageInfo, String> { + self.languages + .get_mut(id) + .ok_or_else(|| format!("Language '{id}' not found")) + } } impl Default for LanguageRegistry { @@ -657,6 +748,32 @@ pub fn get_language_by_mime_type(mime_type: &str) -> Option<&'static LanguageInf LANGUAGE_REGISTRY.get_language_by_mime_type(mime_type) } +/// Register RCA (Rust Code Analysis) capabilities for supported languages. +/// +/// This function should be called by the analysis engine during initialization +/// to mark which languages it supports for RCA analysis. +/// +/// # Errors +/// +/// Returns an error if any of the specified languages are not found. +pub fn register_rca_capabilities(supported_languages: &[&str]) -> Result<(), String> { + // First, set all languages to not supported + for language in LANGUAGE_REGISTRY.supported_languages() { + language.rca_supported.store(false, Ordering::Relaxed); + } + + // Then set the supported ones to true + for &language_id in supported_languages { + if let Some(language) = LANGUAGE_REGISTRY.get_language(language_id) { + language.rca_supported.store(true, Ordering::Relaxed); + } else { + return Err(format!("Language '{language_id}' not found in registry")); + } + } + + Ok(()) +} + #[cfg(test)] #[allow( clippy::unwrap_used, @@ -677,7 +794,7 @@ mod tests { assert_eq!(language.name, "Elixir"); assert!(language.extensions.contains(&"ex".to_owned())); assert!(language.extensions.contains(&"exs".to_owned())); - assert!(!language.rca_supported); + assert!(!language.rca_supported.load(Ordering::Relaxed)); assert!(language.ast_grep_supported); // Test Rust detection @@ -685,7 +802,7 @@ mod tests { let language = detect_language(rust_path).unwrap(); assert_eq!(language.id, "rust"); assert_eq!(language.name, "Rust"); - assert!(language.rca_supported); + assert!(!language.rca_supported.load(Ordering::Relaxed)); assert!(language.ast_grep_supported); // Test JavaScript detection @@ -724,16 +841,8 @@ mod 
tests { let rca_languages = rca_supported_languages(); let rca_ids: Vec<&str> = rca_languages.iter().map(|lang| lang.id.as_str()).collect(); - // RCA should support these languages - assert!(rca_ids.contains(&"rust")); - assert!(rca_ids.contains(&"python")); - assert!(rca_ids.contains(&"javascript")); - assert!(rca_ids.contains(&"typescript")); - assert!(rca_ids.contains(&"java")); - assert!(rca_ids.contains(&"csharp")); - assert!(rca_ids.contains(&"go")); - assert!(rca_ids.contains(&"c")); - assert!(rca_ids.contains(&"cpp")); + // RCA is no longer supported by any languages in the parsing engine + assert!(rca_ids.is_empty()); // RCA should NOT support BEAM languages assert!(!rca_ids.contains(&"elixir")); diff --git a/src/utils.rs b/src/utils.rs index 3e37929..00c8d66 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -2,6 +2,7 @@ use crate::registry::{LanguageInfo, LANGUAGE_REGISTRY}; use std::collections::HashMap; +use std::sync::atomic::Ordering; /// Get all languages grouped by family pub fn languages_by_families() -> HashMap> { @@ -37,7 +38,10 @@ impl LanguageStats { Self { total_languages: all.len(), - rca_supported: all.iter().filter(|l| l.rca_supported).count(), + rca_supported: all + .iter() + .filter(|l| l.rca_supported.load(Ordering::Relaxed)) + .count(), ast_grep_supported: all.iter().filter(|l| l.ast_grep_supported).count(), compiled_languages: all.iter().filter(|l| l.is_compiled).count(), interpreted_languages: all.iter().filter(|l| !l.is_compiled).count(), @@ -114,7 +118,7 @@ pub fn supports_feature(language: &str, feature: AnalysisFeature) -> bool { }; match feature { - AnalysisFeature::RCA => lang.rca_supported, + AnalysisFeature::RCA => lang.rca_supported.load(Ordering::Relaxed), AnalysisFeature::ASTGrep => lang.ast_grep_supported, AnalysisFeature::TreeSitter => lang.tree_sitter_language.is_some(), AnalysisFeature::Complexity => { diff --git a/tools/linguist_sync.rs b/tools/linguist_sync.rs new file mode 100644 index 0000000..8f3c35c --- /dev/null 
//! Sync File Classification Patterns from GitHub Linguist
//!
//! This tool downloads Linguist's vendor.yml and generated.rb,
//! extracts file patterns, and generates Rust code for FileClassifier.
//!
//! Usage: cargo run --bin linguist_sync > src/file_classifier_generated.rs
//!
//! Patterns extracted:
//! - vendor.yml: Vendored code paths and files
//! - generated.rb: Generated file detection rules

use std::collections::HashSet;
use std::error::Error;

/// Linguist pattern sources.
///
/// NOTE(review): fetching is not implemented yet (see `main`); these URLs
/// document where Phase 2 will pull the patterns from.
#[allow(dead_code)]
const VENDOR_YML_URL: &str =
    "https://raw.githubusercontent.com/github-linguist/linguist/master/lib/linguist/vendor.yml";
#[allow(dead_code)]
const GENERATED_RB_URL: &str =
    "https://raw.githubusercontent.com/github-linguist/linguist/master/lib/linguist/generated.rb";

/// Parse vendor.yml content and extract simple vendored-path patterns.
///
/// Only lines that are a bare double-quoted string are considered, and
/// patterns containing regex metacharacters (`(` or `[`) are skipped, so
/// only plain path fragments survive. The result is deduplicated **and
/// sorted**: the output feeds code generation, and `HashSet` iteration
/// order is nondeterministic, which previously made the generated file
/// differ between runs.
///
/// # Errors
///
/// Currently infallible; the `Result` is kept for the future YAML-parsing
/// implementation.
#[allow(dead_code)]
fn extract_vendor_patterns(content: &str) -> Result<Vec<String>, Box<dyn Error>> {
    let unique: HashSet<String> = content
        .lines()
        .map(str::trim)
        // A lone `"` both starts and ends with a quote; require length >= 2
        // so it cannot produce an empty pattern.
        .filter(|line| line.len() >= 2 && line.starts_with('"') && line.ends_with('"'))
        .map(|line| line.trim_matches('"').to_string())
        // Only keep simple patterns, not complex regexes.
        .filter(|p| !p.is_empty() && !p.contains('(') && !p.contains('['))
        .collect();

    // Sort for deterministic codegen output.
    let mut patterns: Vec<String> = unique.into_iter().collect();
    patterns.sort();
    Ok(patterns)
}

/// Parse generated.rb content and extract quoted string patterns.
///
/// generated.rb mixes several pattern forms (extensions like `\.pb\.rs`,
/// directory paths like `__generated__/`, content markers). For each
/// non-comment line we take the first quoted string — single- or
/// double-quoted, whichever *opens earliest* in the line. (The previous
/// implementation always preferred `'` over `"`, extracting the wrong
/// string from lines such as `ext == ".pb.rs" # 'note'`, and mixed a byte
/// index from `find` with a char index in `chars().nth`, which broke on
/// non-ASCII lines.) Results are deduplicated and sorted.
///
/// # Errors
///
/// Currently infallible; the `Result` is kept for the future
/// implementation that parses the real file.
#[allow(dead_code)]
fn extract_generated_patterns(content: &str) -> Result<Vec<String>, Box<dyn Error>> {
    let mut patterns = Vec::new();

    for line in content.lines() {
        let trimmed = line.trim();
        if trimmed.starts_with('#') {
            continue; // Ruby comment line
        }

        // Pick whichever quote kind opens first; `find` returns byte
        // indices, and we slice with those same byte indices throughout.
        let single = trimmed.find('\'');
        let double = trimmed.find('"');
        let (start, quote) = match (single, double) {
            (Some(s), Some(d)) if d < s => (d, '"'),
            (Some(s), _) => (s, '\''),
            (None, Some(d)) => (d, '"'),
            (None, None) => continue,
        };

        let rest = &trimmed[start + 1..];
        if let Some(end) = rest.find(quote) {
            let pattern = &rest[..end];
            // Simple pattern validation: non-empty and plausibly short.
            if !pattern.is_empty() && pattern.len() < 100 {
                patterns.push(pattern.to_string());
            }
        }
    }

    // Deduplicate, then sort for deterministic codegen output.
    let mut unique: Vec<String> = patterns
        .into_iter()
        .collect::<HashSet<_>>()
        .into_iter()
        .collect();
    unique.sort();
    Ok(unique)
}

/// Render the extracted pattern lists as Rust `const` slice declarations
/// suitable for inclusion in `src/file_classifier_generated.rs`.
#[allow(dead_code)]
fn generate_rust_code(vendored: Vec<String>, generated: Vec<String>) -> String {
    let mut code = String::from(
        r#"// AUTO-GENERATED FILE - DO NOT EDIT MANUALLY
// This file is auto-generated from GitHub Linguist patterns
// Run: cargo run --bin linguist_sync
// Source: https://github.com/github-linguist/linguist

/// Auto-generated vendored code patterns from Linguist
pub const VENDORED_PATTERNS: &[&str] = &[
"#,
    );

    for pattern in &vendored {
        code.push_str(&format!("    {},\n", format_pattern_string(pattern)));
    }

    code.push_str(
        r#"];

/// Auto-generated file patterns for generated files from Linguist
pub const GENERATED_PATTERNS: &[&str] = &[
"#,
    );

    for pattern in &generated {
        code.push_str(&format!("    {},\n", format_pattern_string(pattern)));
    }

    code.push_str(
        r#"];
"#,
    );

    code
}

/// Quote and escape a pattern so it is a valid Rust string literal in the
/// generated source.
#[allow(dead_code)]
fn format_pattern_string(pattern: &str) -> String {
    // Backslashes must be escaped first, otherwise the backslashes
    // introduced by the later escapes would themselves be doubled.
    let escaped = pattern
        .replace('\\', "\\\\")
        .replace('"', "\\\"")
        .replace('\n', "\\n")
        .replace('\r', "\\r");
    format!("\"{escaped}\"")
}

/// Entry point.
///
/// Phase 2 roadmap: network fetching of `VENDOR_YML_URL` /
/// `GENERATED_RB_URL` is not implemented yet, so this currently only
/// prints the implementation plan to stderr and exits successfully.
fn main() -> Result<(), Box<dyn Error>> {
    eprintln!("Fetching Linguist vendor patterns...");
    // In a real implementation, this would fetch from the URLs
    // For now, we'll document the structure

    eprintln!("Phase 2: Auto-generation from Linguist");
    eprintln!("This is a roadmap implementation.");
    eprintln!("To fully implement:");
    eprintln!("1. Fetch vendor.yml from GitHub");
    eprintln!("2. Parse YAML file for regex patterns");
    eprintln!("3. Extract common patterns (node_modules/, vendor/, etc.)");
    eprintln!("4. Fetch generated.rb from GitHub");
    eprintln!("5. Parse Ruby code for extension/pattern matches");
    eprintln!("6. Generate Rust code arrays");
    eprintln!("7. Embed in build.rs for auto-generation");

    Ok(())
}