diff --git a/.jules/bolt.md b/.jules/bolt.md index bb8f0d2..83bab32 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -57,3 +57,7 @@ ## 2025-02-13 - [Substring pre-filtering for regex optimization] **Learning:** In hot paths (like `PriorityEngine._calculate_urgency`), executing pre-compiled regular expressions (`re.search`) for simple keyword extraction or grouping (e.g., `\b(word1|word2)\b`) is significantly slower than simple Python substring checks (`in text`). The regex engine execution overhead in Python adds up in high-iteration loops like priority scoring. **Action:** Always consider pre-extracting literal keywords from simple regex patterns and executing a quick `any(k in text for k in keywords)` pre-filter. Only invoke `regex.search` if the pre-filter passes, avoiding the expensive regex operation on texts that obviously do not match. + +## 2026-02-14 - Stable Cryptographic Cache Keys +**Learning:** Python's built-in `hash()` is salted and non-deterministic across process restarts or different worker processes. Using `hash(image_bytes)` as a cache key in a multi-worker production environment (like Gunicorn/Uvicorn) results in a 0% hit rate across workers and process restarts. +**Action:** Always use a stable content digest such as `hashlib.md5(data).hexdigest()` for cache keys involving binary data to ensure consistency across the entire application cluster. Note that MD5 is acceptable here only because a cache key is a non-security use; prefer `hashlib.sha256` anywhere collision resistance or adversarial input actually matters. 
diff --git a/backend/routers/detection.py b/backend/routers/detection.py index cf076ff..0bf7017 100644 --- a/backend/routers/detection.py +++ b/backend/routers/detection.py @@ -3,6 +3,7 @@ from PIL import Image import logging import time +import hashlib from backend.utils import process_and_detect, validate_uploaded_file, process_uploaded_image from backend.schemas import DetectionResponse, UrgencyAnalysisRequest, UrgencyAnalysisResponse @@ -68,35 +69,44 @@ async def _get_cached_result(key: str, func, *args, **kwargs): return result async def _cached_detect_severity(image_bytes: bytes): - key = f"severity_{hash(image_bytes)}" + # Stable cache key using MD5 (hash() is unstable across processes) + image_hash = hashlib.md5(image_bytes).hexdigest() + key = f"severity_{image_hash}" return await _get_cached_result(key, detect_severity_clip, image_bytes) async def _cached_detect_smart_scan(image_bytes: bytes): - key = f"smart_scan_{hash(image_bytes)}" + image_hash = hashlib.md5(image_bytes).hexdigest() + key = f"smart_scan_{image_hash}" return await _get_cached_result(key, detect_smart_scan_clip, image_bytes) async def _cached_generate_caption(image_bytes: bytes): - key = f"caption_{hash(image_bytes)}" + image_hash = hashlib.md5(image_bytes).hexdigest() + key = f"caption_{image_hash}" return await _get_cached_result(key, generate_image_caption, image_bytes) async def _cached_detect_waste(image_bytes: bytes): - key = f"waste_{hash(image_bytes)}" + image_hash = hashlib.md5(image_bytes).hexdigest() + key = f"waste_{image_hash}" return await _get_cached_result(key, detect_waste_clip, image_bytes) async def _cached_detect_civic_eye(image_bytes: bytes): - key = f"civic_eye_{hash(image_bytes)}" + image_hash = hashlib.md5(image_bytes).hexdigest() + key = f"civic_eye_{image_hash}" return await _get_cached_result(key, detect_civic_eye_clip, image_bytes) async def _cached_detect_graffiti(image_bytes: bytes): - key = f"graffiti_{hash(image_bytes)}" + image_hash = 
hashlib.md5(image_bytes).hexdigest() + key = f"graffiti_{image_hash}" return await _get_cached_result(key, detect_graffiti_art_clip, image_bytes) async def _cached_detect_traffic_sign(image_bytes: bytes): - key = f"traffic_sign_{hash(image_bytes)}" + image_hash = hashlib.md5(image_bytes).hexdigest() + key = f"traffic_sign_{image_hash}" return await _get_cached_result(key, detect_traffic_sign_clip, image_bytes) async def _cached_detect_abandoned_vehicle(image_bytes: bytes): - key = f"abandoned_vehicle_{hash(image_bytes)}" + image_hash = hashlib.md5(image_bytes).hexdigest() + key = f"abandoned_vehicle_{image_hash}" return await _get_cached_result(key, detect_abandoned_vehicle_clip, image_bytes) # Endpoints diff --git a/backend/routers/issues.py b/backend/routers/issues.py index 4b93ad0..ace7e1e 100644 --- a/backend/routers/issues.py +++ b/backend/routers/issues.py @@ -236,8 +236,7 @@ async def create_issue( # Invalidate cache so new issue appears try: recent_issues_cache.clear() - recent_issues_cache.clear() - user_issues_cache.clear() + user_issues_cache.clear() except Exception as e: logger.error(f"Error clearing cache: {e}") @@ -347,24 +346,27 @@ def get_nearby_issues( ) # Convert to response format and limit results - nearby_responses = [ - NearbyIssueResponse( - id=issue.id, - description=issue.description[:100] + "..." if len(issue.description) > 100 else issue.description, - category=issue.category, - latitude=issue.latitude, - longitude=issue.longitude, - distance_meters=distance, - upvotes=issue.upvotes or 0, - created_at=issue.created_at, - status=issue.status - ) - for issue, distance in nearby_issues_with_distance[:limit] - ] + # Performance Boost: Map directly to dictionaries to avoid Pydantic overhead + nearby_data = [] + for issue, distance in nearby_issues_with_distance[:limit]: + desc = issue.description or "" + short_desc = desc[:100] + "..." 
if len(desc) > 100 else desc + + nearby_data.append({ + "id": issue.id, + "description": short_desc, + "category": issue.category, + "latitude": issue.latitude, + "longitude": issue.longitude, + "distance_meters": distance, + "upvotes": issue.upvotes or 0, + "created_at": issue.created_at.isoformat() if issue.created_at else None, + "status": issue.status + }) # Performance Boost: Cache serialized JSON to bypass redundant Pydantic validation # and serialization on cache hits. - json_data = json.dumps([r.model_dump(mode='json') for r in nearby_responses]) + json_data = json.dumps(nearby_data) nearby_issues_cache.set(json_data, cache_key) return Response(content=json_data, media_type="application/json")