diff --git a/docs/firecrawl-caching.md b/docs/firecrawl-caching.md new file mode 100644 index 0000000..c9c67f3 --- /dev/null +++ b/docs/firecrawl-caching.md @@ -0,0 +1,240 @@ +# Firecrawl Caching + +This document describes the Firecrawl caching functionality implemented to address issue #46: "Cache firecrawl results so it doesn't use up the API credit". + +## Overview + +The Firecrawl caching system reduces API credit usage by storing scraped web content locally and serving cached results for subsequent requests within the configured cache duration. This is particularly useful for: + +- Development and testing environments +- Documentation sites that don't change frequently +- Bulk processing jobs +- Reducing costs for repeated scraping of the same URLs + +## Features + +### Core Functionality +- **Persistent Caching**: SQLite-based storage for reliable caching across sessions +- **Configurable TTL**: Set cache duration per request or globally +- **URL Normalization**: Consistent cache keys by normalizing URLs (removing tracking parameters, case-insensitive) +- **Automatic Cleanup**: Expired entries are automatically removed +- **Size Management**: Configurable limits on cache size and number of entries +- **Access Tracking**: Monitor cache usage and efficiency + +### Integration +- **Seamless Integration**: Works transparently with existing `<web>` tags in prompts +- **Firecrawl API Integration**: Uses Firecrawl's built-in `maxAge` parameter for server-side caching +- **Dual-Layer Caching**: Combines client-side and server-side caching for maximum efficiency + +## Configuration + +### Environment Variables + +| Variable | Default | Description | |----------|---------|-------------| | `FIRECRAWL_CACHE_ENABLE` | `true` | Enable/disable caching | | `FIRECRAWL_CACHE_TTL_HOURS` | `24` | Default cache TTL in hours | | `FIRECRAWL_CACHE_MAX_SIZE_MB` | `100` | Maximum cache size in MB | | `FIRECRAWL_CACHE_MAX_ENTRIES` | `1000` | Maximum number of cache entries | | `FIRECRAWL_CACHE_AUTO_CLEANUP` | `true` | Enable automatic cleanup | | `FIRECRAWL_API_KEY` | Required | Firecrawl API key for scraping | + +### Example Configuration + +```bash +# Enable caching with 48-hour TTL +export FIRECRAWL_CACHE_TTL_HOURS=48 + +# Set cache size limit to 200MB +export FIRECRAWL_CACHE_MAX_SIZE_MB=200 + +# Disable caching for real-time data +export FIRECRAWL_CACHE_ENABLE=false +``` + +## Usage + +### Automatic Usage + +The caching system works automatically with existing `<web>` tags in your prompts: + +```prompt +# This will use cached content if available +<web>https://docs.example.com/api-reference</web> +``` + +### CLI Commands + +#### View Cache Statistics +```bash +pdd firecrawl-cache stats +``` + +Shows: +- Total and active cache entries +- Cache size and efficiency +- Access patterns and statistics + +#### Clear Cache +```bash +pdd firecrawl-cache clear +``` + +Removes all cached entries (with confirmation prompt). + +#### Check Specific URL +```bash +pdd firecrawl-cache check --url https://example.com +``` + +Shows whether a specific URL is cached and displays a content preview. + +#### View Configuration +```bash +pdd firecrawl-cache info +``` + +Displays current cache configuration and environment variables. 
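### Dual-Layer Flow (Sketch)

The dual-layer behaviour described under Features can be summarised in code. The sketch below mirrors the integration in `pdd/preprocess.py` (local SQLite lookup first, then a Firecrawl call with `maxAge` for server-side caching). `fetch_with_cache` is a hypothetical helper shown for illustration only; it is not part of the package.

```python
from firecrawl import FirecrawlApp  # firecrawl-py package
from pdd.firecrawl_cache import get_firecrawl_cache


def fetch_with_cache(url: str, api_key: str, ttl_hours: int = 24) -> str:
    """Return markdown for url, trying the local cache before spending API credit."""
    cache = get_firecrawl_cache()

    # Layer 1: local SQLite cache -- a hit costs no API credit
    cached = cache.get(url)
    if cached is not None:
        return cached

    # Layer 2: Firecrawl's own cache via maxAge (hours -> milliseconds)
    app = FirecrawlApp(api_key=api_key)
    response = app.scrape_url(url, formats=["markdown"], maxAge=ttl_hours * 3600 * 1000)

    # Assumes the response exposes a markdown attribute, as preprocess.py checks
    content = response.markdown
    cache.set(url, content, ttl_hours=ttl_hours, metadata={"url": url})
    return content
```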
+ +### Programmatic Usage + +```python +from pdd.firecrawl_cache import get_firecrawl_cache + +# Get cache instance +cache = get_firecrawl_cache() + +# Check if URL is cached +content = cache.get("https://example.com") +if content is None: + # URL not cached, would need to scrape + pass + +# Cache content manually +cache.set("https://example.com", "web content", ttl_hours=12) + +# Get cache statistics +stats = cache.get_stats() +print(f"Cache efficiency: {stats['active_entries']}/{stats['total_entries']}") +``` + +## Cache Storage + +### Location +- **Default**: `{project_root}/cache/firecrawl/firecrawl_cache.db` +- **Custom**: Set via `FirecrawlCache(cache_dir="/path/to/cache")` + +### Database Schema +```sql +CREATE TABLE cache_entries ( + url_hash TEXT PRIMARY KEY, -- SHA256 hash of normalized URL + url TEXT NOT NULL, -- Original URL + content TEXT NOT NULL, -- Cached content + timestamp REAL NOT NULL, -- When cached + expires_at REAL NOT NULL, -- When expires + content_hash TEXT NOT NULL, -- MD5 hash of content + metadata TEXT NOT NULL, -- JSON metadata + access_count INTEGER DEFAULT 0, -- Number of accesses + last_accessed REAL DEFAULT 0 -- Last access time +); +``` + +## Best Practices + +### When to Use Caching +- **Static Content**: Documentation, API references, articles +- **Development**: Testing with the same URLs repeatedly +- **Bulk Processing**: Scraping multiple pages from the same site +- **Cost Optimization**: Reducing API credit usage + +### When NOT to Use Caching +- **Real-time Data**: Stock prices, live scores, breaking news +- **Frequently Updated Content**: Social media feeds, dynamic dashboards +- **Time-sensitive Information**: Where freshness is critical + +### Cache Duration Guidelines +- **Documentation**: 24-168 hours (1-7 days) +- **API References**: 24-72 hours (1-3 days) +- **News Articles**: 1-6 hours +- **Static Pages**: 168+ hours (7+ days) + +## Performance Impact + +### Benefits +- **Reduced API Costs**: Significant savings on repeated requests +- **Faster Response Times**: Cached content loads instantly +- **Reduced Network Usage**: Less bandwidth consumption +- **Improved Reliability**: Works offline for cached content + +### Overhead +- **Storage Space**: Cache database grows over time +- **Initial Setup**: First request still requires API call +- **Memory Usage**: Minimal impact on application memory + +## Troubleshooting + +### Common Issues + +#### Cache Not Working +1. Check if caching is enabled: `pdd firecrawl-cache info` +2. Verify environment variables are set correctly +3. Check cache directory permissions + +#### High Storage Usage +1. Reduce `FIRECRAWL_CACHE_MAX_SIZE_MB` +2. Lower `FIRECRAWL_CACHE_MAX_ENTRIES` +3. Clear cache: `pdd firecrawl-cache clear` + +#### Stale Content +1. Reduce `FIRECRAWL_CACHE_TTL_HOURS` +2. Clear specific entries or entire cache +3. 
Use `maxAge=0` in Firecrawl API calls for fresh content + +### Debug Information + +```bash +# View detailed cache statistics +pdd firecrawl-cache stats + +# Check cache configuration +pdd firecrawl-cache info + +# Test specific URL +pdd firecrawl-cache check --url https://example.com +``` + +## Implementation Details + +### Architecture +- **Client-side Cache**: SQLite database for persistent storage +- **Server-side Cache**: Firecrawl's built-in caching via `maxAge` parameter +- **URL Normalization**: Consistent cache keys across requests +- **Automatic Cleanup**: Background maintenance of cache health + +### Security Considerations +- Cache content is stored in plain text +- URLs are normalized but original URLs are preserved +- No authentication or encryption for cached data +- Consider cache location security for sensitive content + +### Future Enhancements +- Compression for large cached content +- Cache warming strategies +- Distributed caching support +- Cache analytics and reporting +- Integration with other caching systems + +## Related Issues + +This implementation addresses: +- **Issue #46**: Cache firecrawl results so it doesn't use up the API credit + +## Contributing + +To contribute to the Firecrawl caching functionality: + +1. Follow the existing code style and patterns +2. Add tests for new functionality +3. Update documentation for any changes +4. Consider backward compatibility +5. Test with various URL patterns and content types diff --git a/pdd/cli.py b/pdd/cli.py index 9856818..8ae34d4 100644 --- a/pdd/cli.py +++ b/pdd/cli.py @@ -1540,6 +1540,36 @@ def install_completion_cmd(ctx: click.Context) -> None: # Return type remains No # Do not return anything, as the callback expects None or a tuple +@cli.group("firecrawl-cache") +def firecrawl_cache_group(): + """Manage Firecrawl web scraping cache to reduce API credit usage.""" + pass + +@firecrawl_cache_group.command("stats") +def firecrawl_cache_stats(): + """Show Firecrawl cache statistics.""" + from .firecrawl_cache_cli import stats + stats() + +@firecrawl_cache_group.command("clear") +def firecrawl_cache_clear(): + """Clear all cached Firecrawl entries.""" + from .firecrawl_cache_cli import clear + clear() + +@firecrawl_cache_group.command("info") +def firecrawl_cache_info(): + """Show Firecrawl cache configuration and environment variables.""" + from .firecrawl_cache_cli import info + info() + +@firecrawl_cache_group.command("check") +@click.option('--url', help='URL to check in cache') +def firecrawl_cache_check(url): + """Check if a specific URL is cached.""" + from .firecrawl_cache_cli import check + check(url) + @cli.command("setup") @click.pass_context def setup_cmd(ctx: click.Context) -> None: diff --git a/pdd/firecrawl_cache.py b/pdd/firecrawl_cache.py new file mode 100644 index 0000000..587c35b --- /dev/null +++ b/pdd/firecrawl_cache.py @@ -0,0 +1,385 @@ +#!/usr/bin/env python3 +""" +Firecrawl caching module for PDD. + +This module provides caching functionality for Firecrawl web scraping results +to reduce API credit usage by avoiding redundant requests for the same URLs. 
+ +Features: +- File-based caching with configurable expiration +- URL-based cache keys with normalization +- Configurable cache duration via environment variables +- Automatic cache cleanup and management +- Support for different cache strategies (time-based, size-based) + +This addresses issue #46: Cache firecrawl results so it doesn't use up the API credit +""" + +import os +import json +import hashlib +import time +import sqlite3 +from pathlib import Path +from typing import Optional, Dict, Any, Union +from dataclasses import dataclass, asdict +from datetime import datetime, timedelta +import logging + +logger = logging.getLogger(__name__) + +@dataclass +class CacheEntry: + """Represents a cached Firecrawl result.""" + url: str + content: str + timestamp: float + expires_at: float + content_hash: str + metadata: Dict[str, Any] + +class FirecrawlCache: + """ + Manages caching of Firecrawl web scraping results. + + Provides persistent caching to reduce API credit usage by storing + scraped content and serving it for subsequent requests within the + configured cache duration. + """ + + def __init__(self, cache_dir: Optional[Union[str, Path]] = None, + default_ttl_hours: int = 24): + """ + Initialize the Firecrawl cache. + + Args: + cache_dir: Directory to store cache files. Defaults to project cache dir. + default_ttl_hours: Default time-to-live for cache entries in hours. + """ + self.default_ttl_hours = default_ttl_hours + + # Set up cache directory + if cache_dir is None: + # Use project root cache directory + project_root = Path(__file__).parent.parent + cache_dir = project_root / "cache" / "firecrawl" + else: + cache_dir = Path(cache_dir) + + self.cache_dir = cache_dir + self.cache_dir.mkdir(parents=True, exist_ok=True) + + # Cache database file + self.db_path = self.cache_dir / "firecrawl_cache.db" + + # Initialize database + self._init_database() + + # Load configuration from environment + self._load_config() + + logger.info(f"Firecrawl cache initialized at {self.cache_dir}") + + def _load_config(self): + """Load cache configuration from environment variables.""" + # Cache TTL configuration + self.default_ttl_hours = int(os.environ.get('FIRECRAWL_CACHE_TTL_HOURS', self.default_ttl_hours)) + + # Cache size limits + self.max_cache_size_mb = int(os.environ.get('FIRECRAWL_CACHE_MAX_SIZE_MB', 100)) + self.max_entries = int(os.environ.get('FIRECRAWL_CACHE_MAX_ENTRIES', 1000)) + + # Cache behavior flags + self.enable_cache = os.environ.get('FIRECRAWL_CACHE_ENABLE', 'true').lower() == 'true' + self.auto_cleanup = os.environ.get('FIRECRAWL_CACHE_AUTO_CLEANUP', 'true').lower() == 'true' + + logger.debug(f"Cache config: TTL={self.default_ttl_hours}h, MaxSize={self.max_cache_size_mb}MB, " + f"MaxEntries={self.max_entries}, Enabled={self.enable_cache}") + + def _init_database(self): + """Initialize the SQLite database for cache storage.""" + with sqlite3.connect(self.db_path) as conn: + conn.execute(''' + CREATE TABLE IF NOT EXISTS cache_entries ( + url_hash TEXT PRIMARY KEY, + url TEXT NOT NULL, + content TEXT NOT NULL, + timestamp REAL NOT NULL, + expires_at REAL NOT NULL, + content_hash TEXT NOT NULL, + metadata TEXT NOT NULL, + access_count INTEGER DEFAULT 0, + last_accessed REAL DEFAULT 0 + ) + ''') + + # Create index for efficient cleanup queries + conn.execute(''' + CREATE INDEX IF NOT EXISTS idx_expires_at ON cache_entries(expires_at) + ''') + + conn.execute(''' + CREATE INDEX IF NOT EXISTS idx_last_accessed ON cache_entries(last_accessed) + ''') + + conn.commit() + + def 
_normalize_url(self, url: str) -> str: + """ + Normalize URL for consistent cache keys. + + Args: + url: The URL to normalize + + Returns: + Normalized URL string + """ + # Remove trailing slashes and normalize + url = url.strip().rstrip('/') + + # Convert to lowercase for case-insensitive matching + url = url.lower() + + # Remove common tracking parameters that don't affect content + # This is a basic implementation - could be extended + if '?' in url: + base_url, params = url.split('?', 1) + # Keep only essential parameters, remove tracking ones + essential_params = [] + for param in params.split('&'): + if param and not any(track in param.lower() for track in + ['utm_', 'fbclid', 'gclid', 'ref=', 'source=']): + essential_params.append(param) + + if essential_params: + url = f"{base_url}?{'&'.join(essential_params)}" + else: + url = base_url + + return url + + def _get_url_hash(self, url: str) -> str: + """Generate a hash for the URL to use as cache key.""" + normalized_url = self._normalize_url(url) + return hashlib.sha256(normalized_url.encode('utf-8')).hexdigest() + + def _get_content_hash(self, content: str) -> str: + """Generate a hash for the content to detect changes.""" + return hashlib.md5(content.encode('utf-8')).hexdigest() + + def get(self, url: str) -> Optional[str]: + """ + Retrieve cached content for a URL. + + Args: + url: The URL to retrieve from cache + + Returns: + Cached content if available and not expired, None otherwise + """ + if not self.enable_cache: + return None + + url_hash = self._get_url_hash(url) + current_time = time.time() + + try: + with sqlite3.connect(self.db_path) as conn: + cursor = conn.execute(''' + SELECT content, expires_at, content_hash, metadata + FROM cache_entries + WHERE url_hash = ? AND expires_at > ? + ''', (url_hash, current_time)) + + row = cursor.fetchone() + if row: + content, expires_at, content_hash, metadata_json = row + + # Update access statistics + conn.execute(''' + UPDATE cache_entries + SET access_count = access_count + 1, last_accessed = ? + WHERE url_hash = ? + ''', (current_time, url_hash)) + conn.commit() + + # Parse metadata + try: + metadata = json.loads(metadata_json) if metadata_json else {} + except json.JSONDecodeError: + metadata = {} + + logger.debug(f"Cache hit for {url} (expires in {expires_at - current_time:.0f}s)") + return content + else: + logger.debug(f"Cache miss for {url}") + return None + + except Exception as e: + logger.error(f"Error retrieving from cache for {url}: {e}") + return None + + def set(self, url: str, content: str, ttl_hours: Optional[int] = None, + metadata: Optional[Dict[str, Any]] = None) -> bool: + """ + Store content in cache for a URL. 
+ + Args: + url: The URL to cache + content: The content to cache + ttl_hours: Time-to-live in hours (uses default if None) + metadata: Additional metadata to store with the entry + + Returns: + True if successfully cached, False otherwise + """ + if not self.enable_cache: + return False + + if ttl_hours is None: + ttl_hours = self.default_ttl_hours + + url_hash = self._get_url_hash(url) + content_hash = self._get_content_hash(content) + current_time = time.time() + expires_at = current_time + (ttl_hours * 3600) + + if metadata is None: + metadata = {} + + try: + with sqlite3.connect(self.db_path) as conn: + # Check if entry already exists + cursor = conn.execute('SELECT url_hash FROM cache_entries WHERE url_hash = ?', (url_hash,)) + exists = cursor.fetchone() is not None + + if exists: + # Update existing entry + conn.execute(''' + UPDATE cache_entries + SET content = ?, timestamp = ?, expires_at = ?, + content_hash = ?, metadata = ?, last_accessed = ? + WHERE url_hash = ? + ''', (content, current_time, expires_at, content_hash, + json.dumps(metadata), current_time, url_hash)) + else: + # Insert new entry + conn.execute(''' + INSERT INTO cache_entries + (url_hash, url, content, timestamp, expires_at, content_hash, metadata, last_accessed) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + ''', (url_hash, url, content, current_time, expires_at, + content_hash, json.dumps(metadata), current_time)) + + conn.commit() + + # Perform cleanup if enabled + if self.auto_cleanup: + self._cleanup_expired() + + logger.debug(f"Cached content for {url} (TTL: {ttl_hours}h)") + return True + + except Exception as e: + logger.error(f"Error caching content for {url}: {e}") + return False + + def _cleanup_expired(self): + """Remove expired entries from cache.""" + current_time = time.time() + + try: + with sqlite3.connect(self.db_path) as conn: + # Remove expired entries + cursor = conn.execute('DELETE FROM cache_entries WHERE expires_at <= ?', (current_time,)) + expired_count = cursor.rowcount + + if expired_count > 0: + logger.debug(f"Cleaned up {expired_count} expired cache entries") + + # Check if we need to enforce size limits + cursor = conn.execute('SELECT COUNT(*) FROM cache_entries') + total_entries = cursor.fetchone()[0] + + if total_entries > self.max_entries: + # Remove oldest entries (by last_accessed) + excess = total_entries - self.max_entries + cursor = conn.execute(''' + DELETE FROM cache_entries + WHERE url_hash IN ( + SELECT url_hash FROM cache_entries + ORDER BY last_accessed ASC + LIMIT ? 
+ ) + ''', (excess,)) + + removed_count = cursor.rowcount + logger.debug(f"Removed {removed_count} old entries to enforce size limit") + + conn.commit() + + except Exception as e: + logger.error(f"Error during cache cleanup: {e}") + + def clear(self): + """Clear all cached entries.""" + try: + with sqlite3.connect(self.db_path) as conn: + cursor = conn.execute('DELETE FROM cache_entries') + count = cursor.rowcount + conn.commit() + logger.info(f"Cleared {count} cache entries") + except Exception as e: + logger.error(f"Error clearing cache: {e}") + + def get_stats(self) -> Dict[str, Any]: + """Get cache statistics.""" + try: + with sqlite3.connect(self.db_path) as conn: + cursor = conn.execute('SELECT COUNT(*) FROM cache_entries') + total_entries = cursor.fetchone()[0] + + cursor = conn.execute('SELECT COUNT(*) FROM cache_entries WHERE expires_at > ?', (time.time(),)) + active_entries = cursor.fetchone()[0] + + cursor = conn.execute('SELECT SUM(LENGTH(content)) FROM cache_entries') + total_size_bytes = cursor.fetchone()[0] or 0 + + cursor = conn.execute('SELECT AVG(access_count) FROM cache_entries') + avg_access_count = cursor.fetchone()[0] or 0 + + return { + 'total_entries': total_entries, + 'active_entries': active_entries, + 'expired_entries': total_entries - active_entries, + 'total_size_mb': round(total_size_bytes / (1024 * 1024), 2), + 'average_access_count': round(avg_access_count, 2), + 'cache_enabled': self.enable_cache, + 'default_ttl_hours': self.default_ttl_hours, + 'max_entries': self.max_entries, + 'max_size_mb': self.max_cache_size_mb + } + except Exception as e: + logger.error(f"Error getting cache stats: {e}") + return {'error': str(e)} + +# Global cache instance +_firecrawl_cache = None + +def get_firecrawl_cache() -> FirecrawlCache: + """Get the global Firecrawl cache instance.""" + global _firecrawl_cache + if _firecrawl_cache is None: + _firecrawl_cache = FirecrawlCache() + return _firecrawl_cache + +def clear_firecrawl_cache(): + """Clear the global Firecrawl cache.""" + global _firecrawl_cache + if _firecrawl_cache is not None: + _firecrawl_cache.clear() + +def get_firecrawl_cache_stats() -> Dict[str, Any]: + """Get statistics for the global Firecrawl cache.""" + cache = get_firecrawl_cache() + return cache.get_stats() diff --git a/pdd/firecrawl_cache_cli.py b/pdd/firecrawl_cache_cli.py new file mode 100644 index 0000000..f4bf45c --- /dev/null +++ b/pdd/firecrawl_cache_cli.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python3 +""" +CLI commands for managing Firecrawl cache. + +This module provides command-line interface for managing the Firecrawl cache, +including viewing statistics, clearing cache, and configuring cache settings. 
+ +Usage: + pdd firecrawl-cache stats # Show cache statistics + pdd firecrawl-cache clear # Clear all cached entries + pdd firecrawl-cache info # Show cache configuration +""" + +import click +from rich.console import Console +from rich.table import Table +from rich.panel import Panel +from rich.text import Text +from .firecrawl_cache import get_firecrawl_cache, clear_firecrawl_cache, get_firecrawl_cache_stats + +console = Console() + +@click.group() +def firecrawl_cache(): + """Manage Firecrawl web scraping cache.""" + pass + +@firecrawl_cache.command() +def stats(): + """Show Firecrawl cache statistics.""" + try: + stats = get_firecrawl_cache_stats() + + if 'error' in stats: + console.print(f"[bold red]Error getting cache stats:[/bold red] {stats['error']}") + return + + # Create statistics table + table = Table(title="Firecrawl Cache Statistics", show_header=True, header_style="bold magenta") + table.add_column("Metric", style="cyan", no_wrap=True) + table.add_column("Value", style="green") + + table.add_row("Total Entries", str(stats.get('total_entries', 0))) + table.add_row("Active Entries", str(stats.get('active_entries', 0))) + table.add_row("Expired Entries", str(stats.get('expired_entries', 0))) + table.add_row("Total Size", f"{stats.get('total_size_mb', 0)} MB") + table.add_row("Average Access Count", str(stats.get('average_access_count', 0))) + table.add_row("Cache Enabled", "Yes" if stats.get('cache_enabled', False) else "No") + table.add_row("Default TTL", f"{stats.get('default_ttl_hours', 0)} hours") + table.add_row("Max Entries", str(stats.get('max_entries', 0))) + table.add_row("Max Size", f"{stats.get('max_size_mb', 0)} MB") + + console.print(table) + + # Show cache efficiency + total_entries = stats.get('total_entries', 0) + active_entries = stats.get('active_entries', 0) + + if total_entries > 0: + efficiency = (active_entries / total_entries) * 100 + efficiency_text = Text(f"Cache Efficiency: {efficiency:.1f}%", style="bold green" if efficiency > 50 else "bold yellow") + console.print(efficiency_text) + + except Exception as e: + console.print(f"[bold red]Error:[/bold red] {e}") + +@firecrawl_cache.command() +def clear(): + """Clear all cached Firecrawl entries.""" + try: + cache = get_firecrawl_cache() + stats_before = cache.get_stats() + + if stats_before.get('total_entries', 0) == 0: + console.print("[yellow]Cache is already empty.[/yellow]") + return + + # Confirm before clearing + if not click.confirm(f"Clear {stats_before.get('total_entries', 0)} cached entries?"): + console.print("Cache clear cancelled.") + return + + cache.clear() + console.print("[bold green]Cache cleared successfully![/bold green]") + + except Exception as e: + console.print(f"[bold red]Error clearing cache:[/bold red] {e}") + +@firecrawl_cache.command() +def info(): + """Show Firecrawl cache configuration and environment variables.""" + import os + + # Create configuration table + table = Table(title="Firecrawl Cache Configuration", show_header=True, header_style="bold magenta") + table.add_column("Setting", style="cyan", no_wrap=True) + table.add_column("Value", style="green") + table.add_column("Description", style="dim") + + # Environment variables + env_vars = { + 'FIRECRAWL_CACHE_ENABLE': 'Enable/disable caching (default: true)', + 'FIRECRAWL_CACHE_TTL_HOURS': 'Default cache TTL in hours (default: 24)', + 'FIRECRAWL_CACHE_MAX_SIZE_MB': 'Maximum cache size in MB (default: 100)', + 'FIRECRAWL_CACHE_MAX_ENTRIES': 'Maximum number of cache entries (default: 1000)', + 
'FIRECRAWL_CACHE_AUTO_CLEANUP': 'Enable automatic cleanup (default: true)', + 'FIRECRAWL_API_KEY': 'Firecrawl API key (required for scraping)' + } + + for var, description in env_vars.items(): + value = os.environ.get(var, 'Not set') + if var == 'FIRECRAWL_API_KEY' and value != 'Not set': + value = f"{value[:8]}..." if len(value) > 8 else "Set" + table.add_row(var, value, description) + + console.print(table) + + # Show cache directory info + cache = get_firecrawl_cache() + cache_dir = cache.cache_dir + db_path = cache.db_path + + info_panel = Panel( + f"Cache Directory: {cache_dir}\n" + f"Database File: {db_path}\n" + f"Database Exists: {'Yes' if db_path.exists() else 'No'}", + title="Cache Storage", + border_style="blue" + ) + console.print(info_panel) + +@firecrawl_cache.command() +@click.option('--url', help='URL to check in cache') +def check(url): + """Check if a specific URL is cached.""" + if not url: + console.print("[bold red]Error:[/bold red] URL is required. Use --url option.") + return + + try: + cache = get_firecrawl_cache() + cached_content = cache.get(url) + + if cached_content is not None: + console.print(f"[bold green]URL is cached:[/bold green] {url}") + console.print(f"Content length: {len(cached_content)} characters") + + # Show first 200 characters of content + preview = cached_content[:200] + "..." if len(cached_content) > 200 else cached_content + console.print(f"Content preview:\n{preview}") + else: + console.print(f"[yellow]URL is not cached:[/yellow] {url}") + + except Exception as e: + console.print(f"[bold red]Error checking cache:[/bold red] {e}") + +if __name__ == '__main__': + firecrawl_cache() diff --git a/pdd/preprocess.py b/pdd/preprocess.py index 15d90b5..7a16e30 100644 --- a/pdd/preprocess.py +++ b/pdd/preprocess.py @@ -1,12 +1,14 @@ import os import re import subprocess +import time from typing import List, Optional import traceback from rich.console import Console from rich.panel import Panel from rich.markup import escape from rich.traceback import install +from .firecrawl_cache import get_firecrawl_cache install() console = Console() @@ -132,6 +134,16 @@ def replace_web(match): if recursive: # Defer network operations until after env var expansion return match.group(0) + + # Get cache instance + cache = get_firecrawl_cache() + + # Check cache first + cached_content = cache.get(url) + if cached_content is not None: + console.print(f"Using cached content for: [cyan]{url}[/cyan]") + return cached_content + console.print(f"Scraping web content from: [cyan]{url}[/cyan]") try: try: @@ -142,10 +154,26 @@ def replace_web(match): if not api_key: console.print("[bold yellow]Warning:[/bold yellow] FIRECRAWL_API_KEY not found in environment") return f"[Error: FIRECRAWL_API_KEY not set. 
Cannot scrape {url}]" + + app = FirecrawlApp(api_key=api_key) - response = app.scrape_url(url, formats=['markdown']) + + # Get cache TTL from environment or use default + cache_ttl_hours = int(os.environ.get('FIRECRAWL_CACHE_TTL_HOURS', 24)) + + # Use Firecrawl's built-in caching with maxAge parameter + # Convert hours to milliseconds for Firecrawl API + max_age_ms = cache_ttl_hours * 3600 * 1000 + + response = app.scrape_url(url, formats=['markdown'], maxAge=max_age_ms) + if hasattr(response, 'markdown'): - return response.markdown + content = response.markdown + + # Cache the result for future use + cache.set(url, content, ttl_hours=cache_ttl_hours, + metadata={'scraped_at': time.time(), 'url': url}) + + return content else: console.print(f"[bold yellow]Warning:[/bold yellow] No markdown content returned for {url}") return f"[No content available for {url}]" diff --git a/tests/test_firecrawl_cache.py b/tests/test_firecrawl_cache.py new file mode 100644 index 0000000..53fc93c --- /dev/null +++ b/tests/test_firecrawl_cache.py @@ -0,0 +1,501 @@ +#!/usr/bin/env python3 +""" +Test suite for Firecrawl caching functionality. + +This test suite validates the Firecrawl caching implementation that addresses +issue #46: Cache firecrawl results so it doesn't use up the API credit. + +Tests cover: +- Cache storage and retrieval +- URL normalization +- TTL and expiration handling +- Cache cleanup and size management +- Integration with preprocess module +- CLI commands +""" + +import pytest +import tempfile +import os +import time +import sqlite3 +from pathlib import Path +from unittest.mock import patch, MagicMock, mock_open +import sys + +# Add the project root to the path for imports +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from pdd.firecrawl_cache import FirecrawlCache, get_firecrawl_cache, clear_firecrawl_cache, get_firecrawl_cache_stats +from pdd.preprocess import process_web_tags + + +class TestFirecrawlCache: + """Test class for FirecrawlCache functionality.""" + + def setup_method(self): + """Set up test environment before each test method.""" + # Create a temporary directory for cache + self.temp_dir = tempfile.mkdtemp() + self.cache = FirecrawlCache(cache_dir=self.temp_dir, default_ttl_hours=1) + + def teardown_method(self): + """Clean up test environment after each test method.""" + # Clean up temporary directory + import shutil + shutil.rmtree(self.temp_dir, ignore_errors=True) + + def test_cache_initialization(self): + """Test that cache initializes correctly.""" + assert self.cache.cache_dir.exists() + assert self.cache.db_path.exists() + assert self.cache.default_ttl_hours == 1 + + def test_url_normalization(self): + """Test URL normalization for consistent cache keys.""" + # Test basic normalization + url1 = "https://example.com/" + url2 = "https://example.com" + assert self.cache._normalize_url(url1) == self.cache._normalize_url(url2) + + # Test case normalization + url3 = "https://EXAMPLE.COM" + assert self.cache._normalize_url(url1) == self.cache._normalize_url(url3) + + # Test parameter handling + url4 = "https://example.com?utm_source=test&id=123" + url5 = "https://example.com?id=123" + # Should remove tracking parameters but keep essential ones + normalized4 = self.cache._normalize_url(url4) + normalized5 = self.cache._normalize_url(url5) + assert "utm_source" not in normalized4 + assert "id=123" in normalized4 + + def test_url_hash_generation(self): + """Test URL hash generation for cache keys.""" + url1 = "https://example.com/" + url2 = "https://example.com" + + hash1 = 
self.cache._get_url_hash(url1) + hash2 = self.cache._get_url_hash(url2) + + # Same normalized URL should produce same hash + assert hash1 == hash2 + assert len(hash1) == 64 # SHA256 hash length + + def test_content_hash_generation(self): + """Test content hash generation.""" + content = "Test content" + hash1 = self.cache._get_content_hash(content) + hash2 = self.cache._get_content_hash(content) + + assert hash1 == hash2 + assert len(hash1) == 32 # MD5 hash length + + def test_cache_set_and_get(self): + """Test basic cache set and get operations.""" + url = "https://example.com" + content = "Test web content" + + # Initially should not be cached + assert self.cache.get(url) is None + + # Set content in cache + success = self.cache.set(url, content) + assert success is True + + # Should now be retrievable + cached_content = self.cache.get(url) + assert cached_content == content + + def test_cache_expiration(self): + """Test cache expiration based on TTL.""" + url = "https://example.com" + content = "Test content" + + # Set with very short TTL + self.cache.set(url, content, ttl_hours=0.0001) # ~0.36 seconds + + # Should be available immediately + assert self.cache.get(url) == content + + # Wait for expiration + time.sleep(0.5) # comfortably past the ~0.36 second TTL + + # Should now be expired + assert self.cache.get(url) is None + + def test_cache_metadata(self): + """Test cache metadata storage and retrieval.""" + import json + url = "https://example.com" + content = "Test content" + metadata = {"scraped_at": time.time(), "source": "test"} + + self.cache.set(url, content, metadata=metadata) + + # Verify content is cached + cached_content = self.cache.get(url) + assert cached_content == content + + # Verify metadata is stored (check database directly) + with sqlite3.connect(self.cache.db_path) as conn: + cursor = conn.execute( + 'SELECT metadata FROM cache_entries WHERE url = ?', (url,) + ) + row = cursor.fetchone() + assert row is not None + stored_metadata = json.loads(row[0]) # metadata is stored as JSON + assert stored_metadata["source"] == "test" + + def test_cache_access_counting(self): + """Test that cache tracks access counts.""" + url = "https://example.com" + content = "Test content" + + self.cache.set(url, content) + + # Access multiple times + for _ in range(3): + self.cache.get(url) + + # Check access count in database + with sqlite3.connect(self.cache.db_path) as conn: + cursor = conn.execute( + 'SELECT access_count FROM cache_entries WHERE url = ?', (url,) + ) + row = cursor.fetchone() + assert row is not None + assert row[0] == 3 + + def test_cache_cleanup_expired(self): + """Test automatic cleanup of expired entries.""" + url1 = "https://example1.com" + url2 = "https://example2.com" + content = "Test content" + + # Set one with short TTL, one with long TTL + self.cache.set(url1, content, ttl_hours=0.0001) # Expires in ~0.36 seconds + self.cache.set(url2, content, ttl_hours=24) # Long TTL + + # Wait for first to expire + time.sleep(0.5) + + # Trigger cleanup + self.cache._cleanup_expired() + + # First should be gone, second should remain + assert self.cache.get(url1) is None + assert self.cache.get(url2) == content + + def test_cache_size_limits(self): + """Test cache size limit enforcement.""" + # Set a very small max entries limit + self.cache.max_entries = 2 + + # Add more entries than the limit + for i in range(4): + url = f"https://example{i}.com" + content = f"Content {i}" + self.cache.set(url, content) + + # Should only have max_entries in cache + with sqlite3.connect(self.cache.db_path) as conn: + cursor = 
conn.execute('SELECT COUNT(*) FROM cache_entries') + count = cursor.fetchone()[0] + assert count <= self.cache.max_entries + + def test_cache_clear(self): + """Test cache clearing functionality.""" + url = "https://example.com" + content = "Test content" + + # Add content to cache + self.cache.set(url, content) + assert self.cache.get(url) == content + + # Clear cache + self.cache.clear() + + # Should be empty + assert self.cache.get(url) is None + + def test_cache_stats(self): + """Test cache statistics generation.""" + # Add some test data + for i in range(3): + url = f"https://example{i}.com" + content = f"Content {i}" + self.cache.set(url, content) + + # Get stats + stats = self.cache.get_stats() + + assert stats['total_entries'] == 3 + assert stats['active_entries'] == 3 + assert stats['expired_entries'] == 0 + assert stats['cache_enabled'] is True + assert stats['default_ttl_hours'] == 1 + + def test_cache_disabled(self): + """Test cache behavior when disabled.""" + # Disable cache + self.cache.enable_cache = False + + url = "https://example.com" + content = "Test content" + + # Set should return False + assert self.cache.set(url, content) is False + + # Get should return None + assert self.cache.get(url) is None + + def test_environment_configuration(self): + """Test cache configuration from environment variables.""" + with patch.dict(os.environ, { + 'FIRECRAWL_CACHE_TTL_HOURS': '48', + 'FIRECRAWL_CACHE_MAX_SIZE_MB': '200', + 'FIRECRAWL_CACHE_MAX_ENTRIES': '2000', + 'FIRECRAWL_CACHE_ENABLE': 'false', + 'FIRECRAWL_CACHE_AUTO_CLEANUP': 'false' + }): + # Create new cache instance to load env vars + cache = FirecrawlCache(cache_dir=self.temp_dir) + + assert cache.default_ttl_hours == 48 + assert cache.max_cache_size_mb == 200 + assert cache.max_entries == 2000 + assert cache.enable_cache is False + assert cache.auto_cleanup is False + + +class TestGlobalCacheFunctions: + """Test global cache functions.""" + + def setup_method(self): + """Set up test environment.""" + self.temp_dir = tempfile.mkdtemp() + + def teardown_method(self): + """Clean up test environment.""" + import shutil + shutil.rmtree(self.temp_dir, ignore_errors=True) + + def test_get_firecrawl_cache(self): + """Test global cache instance retrieval.""" + cache = get_firecrawl_cache() + assert isinstance(cache, FirecrawlCache) + + def test_clear_firecrawl_cache(self): + """Test global cache clearing.""" + cache = get_firecrawl_cache() + cache.set("https://example.com", "test content") + + clear_firecrawl_cache() + + assert cache.get("https://example.com") is None + + def test_get_firecrawl_cache_stats(self): + """Test global cache stats retrieval.""" + stats = get_firecrawl_cache_stats() + assert isinstance(stats, dict) + assert 'total_entries' in stats + + +class TestPreprocessIntegration: + """Test integration with preprocess module.""" + + def setup_method(self): + """Set up test environment.""" + self.temp_dir = tempfile.mkdtemp() + + def teardown_method(self): + """Clean up test environment.""" + import shutil + shutil.rmtree(self.temp_dir, ignore_errors=True) + + @patch('pdd.preprocess.get_firecrawl_cache') + def test_process_web_tags_with_cache_hit(self, mock_get_cache): + """Test web tag processing with cache hit.""" + # Mock cache to return cached content + mock_cache = MagicMock() + mock_cache.get.return_value = "Cached content" + mock_get_cache.return_value = mock_cache + + text = "Test https://example.com content" + result = process_web_tags(text, recursive=False) + + # Should use cached content + assert "Cached 
content" in result + mock_cache.get.assert_called_once_with("https://example.com") + + @patch('pdd.preprocess.get_firecrawl_cache') + @patch('pdd.preprocess.FirecrawlApp') + def test_process_web_tags_with_cache_miss(self, mock_firecrawl_app, mock_get_cache): + """Test web tag processing with cache miss.""" + # Mock cache to return None (cache miss) + mock_cache = MagicMock() + mock_cache.get.return_value = None + mock_cache.set.return_value = True + mock_get_cache.return_value = mock_cache + + # Mock Firecrawl response + mock_response = MagicMock() + mock_response.markdown = "Scraped content" + mock_app = MagicMock() + mock_app.scrape_url.return_value = mock_response + mock_firecrawl_app.return_value = mock_app + + # Mock environment + with patch.dict(os.environ, {'FIRECRAWL_API_KEY': 'test-key'}): + text = "Test https://example.com content" + result = process_web_tags(text, recursive=False) + + # Should scrape and cache content + assert "Scraped content" in result + mock_cache.get.assert_called_once_with("https://example.com") + mock_cache.set.assert_called_once() + mock_app.scrape_url.assert_called_once() + + def test_process_web_tags_recursive_mode(self): + """Test web tag processing in recursive mode.""" + text = "Test https://example.com content" + result = process_web_tags(text, recursive=True) + + # Should return original text unchanged in recursive mode + assert result == text + + @patch.dict(os.environ, {}, clear=True) + def test_process_web_tags_missing_api_key(self): + """Test web tag processing with missing API key.""" + text = "Test https://example.com content" + result = process_web_tags(text, recursive=False) + + # Should return error message + assert "FIRECRAWL_API_KEY not set" in result + + @patch('pdd.preprocess.FirecrawlApp') + def test_process_web_tags_import_error(self, mock_firecrawl_app): + """Test web tag processing with Firecrawl import error.""" + # Mock import error + mock_firecrawl_app.side_effect = ImportError("No module named 'firecrawl'") + + text = "Test https://example.com content" + result = process_web_tags(text, recursive=False) + + # Should return error message + assert "firecrawl-py package not installed" in result + + +class TestCacheCLI: + """Test cache CLI commands.""" + + def setup_method(self): + """Set up test environment.""" + self.temp_dir = tempfile.mkdtemp() + + def teardown_method(self): + """Clean up test environment.""" + import shutil + shutil.rmtree(self.temp_dir, ignore_errors=True) + + @patch('pdd.firecrawl_cache_cli.get_firecrawl_cache_stats') + def test_cli_stats_command(self, mock_get_stats): + """Test CLI stats command.""" + from pdd.firecrawl_cache_cli import stats + + mock_stats = { + 'total_entries': 5, + 'active_entries': 4, + 'expired_entries': 1, + 'total_size_mb': 2.5, + 'average_access_count': 3.2, + 'cache_enabled': True, + 'default_ttl_hours': 24, + 'max_entries': 1000, + 'max_size_mb': 100 + } + mock_get_stats.return_value = mock_stats + + # Should not raise exception + stats() + + @patch('pdd.firecrawl_cache_cli.get_firecrawl_cache') + def test_cli_clear_command(self, mock_get_cache): + """Test CLI clear command.""" + from pdd.firecrawl_cache_cli import clear + + mock_cache = MagicMock() + mock_cache.get_stats.return_value = {'total_entries': 3} + mock_get_cache.return_value = mock_cache + + # Mock click.confirm to return True + with patch('click.confirm', return_value=True): + clear() + mock_cache.clear.assert_called_once() + + def test_cli_info_command(self): + """Test CLI info command.""" + from 
pdd.firecrawl_cache_cli import info + + # Should not raise exception + info() + + @patch('pdd.firecrawl_cache_cli.get_firecrawl_cache') + def test_cli_check_command_cached(self, mock_get_cache): + """Test CLI check command with cached URL.""" + from pdd.firecrawl_cache_cli import check + + mock_cache = MagicMock() + mock_cache.get.return_value = "Cached content" + mock_get_cache.return_value = mock_cache + + # Should not raise exception + check("https://example.com") + + @patch('pdd.firecrawl_cache_cli.get_firecrawl_cache') + def test_cli_check_command_not_cached(self, mock_get_cache): + """Test CLI check command with non-cached URL.""" + from pdd.firecrawl_cache_cli import check + + mock_cache = MagicMock() + mock_cache.get.return_value = None + mock_get_cache.return_value = mock_cache + + # Should not raise exception + check("https://example.com") + + +def test_integration_full_workflow(): + """Test complete integration workflow.""" + with tempfile.TemporaryDirectory() as temp_dir: + # Create cache + cache = FirecrawlCache(cache_dir=temp_dir, default_ttl_hours=1) + + # Test URL + url = "https://example.com" + content = "Test web content" + + # Initially not cached + assert cache.get(url) is None + + # Cache content + success = cache.set(url, content, metadata={"test": True}) + assert success is True + + # Retrieve from cache + cached_content = cache.get(url) + assert cached_content == content + + # Check stats + stats = cache.get_stats() + assert stats['total_entries'] == 1 + assert stats['active_entries'] == 1 + + # Clear cache + cache.clear() + assert cache.get(url) is None + + +if __name__ == "__main__": + pytest.main([__file__, "-v"])
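As a quick end-to-end check of what this diff adds, the new test suite and CLI group can be exercised directly. This assumes the package (with `firecrawl-py`, `click`, and `rich`) and `pytest` are installed in the active environment:

```bash
# run the new cache test suite
pytest tests/test_firecrawl_cache.py -v

# inspect and manage the cache via the new CLI group
pdd firecrawl-cache info
pdd firecrawl-cache stats
pdd firecrawl-cache check --url https://example.com
pdd firecrawl-cache clear
```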