Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .jules/bolt.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,11 @@
## 2025-02-13 - [Substring pre-filtering for regex optimization]
**Learning:** In hot paths (like `PriorityEngine._calculate_urgency`), executing pre-compiled regular expressions (`re.search`) for simple keyword extraction or grouping (e.g., `\b(word1|word2)\b`) is significantly slower than simple Python substring checks (`in text`). The regex engine execution overhead in Python adds up in high-iteration loops like priority scoring.
**Action:** Always consider pre-extracting literal keywords from simple regex patterns and executing a quick `any(k in text for k in keywords)` pre-filter. Only invoke `regex.search` if the pre-filter passes, avoiding the expensive regex operation on texts that obviously do not match.

## 2025-02-14 - Stable Hashing for Binary Cache Keys
**Learning:** Python's built-in `hash()` function is salted and randomized across process restarts. Using `hash(image_bytes)` as a cache key for binary data leads to cache invalidation on server restarts and potential collisions.
**Action:** Use a stable cryptographic hash (e.g., `hashlib.md5(data).hexdigest()`) for cache keys involving binary data to ensure consistency across process lifecycles and reduce collision risk.

## 2025-02-14 - Serialized JSON Caching for List Endpoints
**Learning:** Caching Pydantic models or SQLAlchemy objects in list-heavy endpoints still incurs significant overhead due to FastAPI/Pydantic re-validating and re-serializing the entire list on every request.
**Action:** Serialize the list to a JSON string using `json.dumps()` BEFORE caching. On a cache hit, return a raw `fastapi.Response` with `media_type="application/json"`. This bypasses the entire validation/serialization layer and is ~2-3x faster for large lists.
1 change: 1 addition & 0 deletions backend/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,5 +162,6 @@ def invalidate(self):
# Global instances with improved configuration
recent_issues_cache = ThreadSafeCache(ttl=300, max_size=20) # 5 minutes TTL, max 20 entries
nearby_issues_cache = ThreadSafeCache(ttl=60, max_size=100) # 1 minute TTL, max 100 entries
user_issues_cache = ThreadSafeCache(ttl=300, max_size=50) # 5 minutes TTL, max 50 entries
user_upload_cache = ThreadSafeCache(ttl=3600, max_size=1000) # 1 hour TTL for upload limits
blockchain_last_hash_cache = ThreadSafeCache(ttl=3600, max_size=1)
56 changes: 27 additions & 29 deletions backend/routers/detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from PIL import Image
import logging
import time
import hashlib

from backend.utils import process_and_detect, validate_uploaded_file, process_uploaded_image
from backend.schemas import DetectionResponse, UrgencyAnalysisRequest, UrgencyAnalysisResponse
Expand Down Expand Up @@ -38,6 +39,7 @@
detect_abandoned_vehicle_clip
)
from backend.dependencies import get_http_client
from backend.cache import ThreadSafeCache
import backend.dependencies

logger = logging.getLogger(__name__)
Expand All @@ -46,67 +48,63 @@

# Cached Functions

# Simple Cache Implementation to avoid async-lru dependency issues on Render
_cache_store = {}
CACHE_TTL = 3600 # 1 hour
MAX_CACHE_SIZE = 500
# Optimized: Use ThreadSafeCache with TTL and LRU eviction (Issue #CACHE-DETECTION)
detection_cache = ThreadSafeCache(ttl=3600, max_size=500)

async def _get_cached_result(key: str, func, *args, **kwargs):
current_time = time.time()

"""
Optimized: Thread-safe cache lookup using ThreadSafeCache.
"""
# Check cache
if key in _cache_store:
result, timestamp = _cache_store[key]
if current_time - timestamp < CACHE_TTL:
return result
else:
del _cache_store[key]

# Prune cache if too large
if len(_cache_store) > MAX_CACHE_SIZE:
keys_to_remove = list(_cache_store.keys())[:int(MAX_CACHE_SIZE * 0.2)]
for k in keys_to_remove:
del _cache_store[k]

# Execute function
cached_result = detection_cache.get(key)
if cached_result is not None:
return cached_result

# Execute function if cache miss
if 'client' not in kwargs:
import backend.dependencies
kwargs['client'] = backend.dependencies.SHARED_HTTP_CLIENT

result = await func(*args, **kwargs)
_cache_store[key] = (result, current_time)

# Store in cache
detection_cache.set(data=result, key=key)
return result

def _get_image_hash(image_bytes: bytes) -> str:
"""Stable MD5 hash for image bytes to ensure reliable cache keys."""
return hashlib.md5(image_bytes).hexdigest()

async def _cached_detect_severity(image_bytes: bytes):
key = f"severity_{hash(image_bytes)}"
key = f"severity_{_get_image_hash(image_bytes)}"
return await _get_cached_result(key, detect_severity_clip, image_bytes)

async def _cached_detect_smart_scan(image_bytes: bytes):
key = f"smart_scan_{hash(image_bytes)}"
key = f"smart_scan_{_get_image_hash(image_bytes)}"
return await _get_cached_result(key, detect_smart_scan_clip, image_bytes)

async def _cached_generate_caption(image_bytes: bytes):
key = f"caption_{hash(image_bytes)}"
key = f"caption_{_get_image_hash(image_bytes)}"
return await _get_cached_result(key, generate_image_caption, image_bytes)

async def _cached_detect_waste(image_bytes: bytes):
key = f"waste_{hash(image_bytes)}"
key = f"waste_{_get_image_hash(image_bytes)}"
return await _get_cached_result(key, detect_waste_clip, image_bytes)

async def _cached_detect_civic_eye(image_bytes: bytes):
key = f"civic_eye_{hash(image_bytes)}"
key = f"civic_eye_{_get_image_hash(image_bytes)}"
return await _get_cached_result(key, detect_civic_eye_clip, image_bytes)

async def _cached_detect_graffiti(image_bytes: bytes):
key = f"graffiti_{hash(image_bytes)}"
key = f"graffiti_{_get_image_hash(image_bytes)}"
return await _get_cached_result(key, detect_graffiti_art_clip, image_bytes)

async def _cached_detect_traffic_sign(image_bytes: bytes):
key = f"traffic_sign_{hash(image_bytes)}"
key = f"traffic_sign_{_get_image_hash(image_bytes)}"
return await _get_cached_result(key, detect_traffic_sign_clip, image_bytes)

async def _cached_detect_abandoned_vehicle(image_bytes: bytes):
key = f"abandoned_vehicle_{hash(image_bytes)}"
key = f"abandoned_vehicle_{_get_image_hash(image_bytes)}"
return await _get_cached_result(key, detect_abandoned_vehicle_clip, image_bytes)

# Endpoints
Expand Down
18 changes: 14 additions & 4 deletions backend/routers/issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
send_status_notification
)
from backend.spatial_utils import get_bounding_box, find_nearby_issues
from backend.cache import recent_issues_cache, nearby_issues_cache, blockchain_last_hash_cache
from backend.cache import recent_issues_cache, nearby_issues_cache, blockchain_last_hash_cache, user_issues_cache
from backend.hf_api_service import verify_resolution_vqa
from backend.dependencies import get_http_client
from backend.rag_service import rag_service
Expand Down Expand Up @@ -236,6 +236,7 @@ async def create_issue(
# Invalidate cache so new issue appears
try:
recent_issues_cache.clear()
user_issues_cache.clear()
except Exception as e:
logger.error(f"Error clearing cache: {e}")

Expand Down Expand Up @@ -586,8 +587,13 @@ def get_user_issues(
):
"""
Get issues reported by a specific user (identified by email).
Optimized: Uses column projection to avoid loading full model instances and large fields.
Optimized: Uses column projection and serialized JSON caching to bypass Pydantic overhead.
"""
cache_key = f"user_issues_{user_email}_{limit}_{offset}"
Copy link
Contributor

@cubic-dev-ai cubic-dev-ai bot Mar 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: Cache key includes raw user email (PII), which may be written to logs by ThreadSafeCache at debug level. Hash the email portion to avoid leaking PII — the same hashlib.md5 pattern already used in detection.py applies here.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At backend/routers/issues.py, line 592:

<comment>Cache key includes raw user email (PII), which may be written to logs by `ThreadSafeCache` at debug level. Hash the email portion to avoid leaking PII — the same `hashlib.md5` pattern already used in `detection.py` applies here.</comment>

<file context>
@@ -586,8 +587,13 @@ def get_user_issues(
-    Optimized: Uses column projection to avoid loading full model instances and large fields.
+    Optimized: Uses column projection and serialized JSON caching to bypass Pydantic overhead.
     """
+    cache_key = f"user_issues_{user_email}_{limit}_{offset}"
+    cached_json = user_issues_cache.get(cache_key)
+    if cached_json:
</file context>
Suggested change
cache_key = f"user_issues_{user_email}_{limit}_{offset}"
email_hash = hashlib.md5(user_email.encode()).hexdigest()[:12]
cache_key = f"user_issues_{email_hash}_{limit}_{offset}"
Fix with Cubic

cached_json = user_issues_cache.get(cache_key)
if cached_json:
return Response(content=cached_json, media_type="application/json")
Comment on lines +592 to +595
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Cache key contains user email (PII) which may be logged.

The cache key f"user_issues_{user_email}_{limit}_{offset}" includes the user's email directly. ThreadSafeCache logs cache keys at debug level (e.g., logger.debug(f"Cache set: key={key}, ...")). If debug logging is enabled in production, this could expose PII in logs.

Consider hashing the email portion of the key:

🛡️ Proposed fix to hash email in cache key
-    cache_key = f"user_issues_{user_email}_{limit}_{offset}"
+    email_hash = hashlib.md5(user_email.encode()).hexdigest()[:12]
+    cache_key = f"user_issues_{email_hash}_{limit}_{offset}"
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/routers/issues.py` around lines 592 - 595, The cache key currently
includes raw user_email (cache_key =
f"user_issues_{user_email}_{limit}_{offset}"), which can leak PII because
ThreadSafeCache logs keys; change it to use a deterministic hash of the email
(e.g., SHA256 hex of user_email) when building cache_key so logs contain only
the hash: produce a key like "user_issues_{email_hash}_{limit}_{offset}"; update
any helper code that computes the key to reuse the same hashing routine and
ensure user_issues_cache and any cache inspection uses the hashed key
consistently.


results = db.query(
Issue.id,
Issue.category,
Expand All @@ -613,7 +619,7 @@ def get_user_issues(
"id": row.id,
"category": row.category,
"description": short_desc,
"created_at": row.created_at,
"created_at": row.created_at.isoformat() if row.created_at else None,
"image_path": row.image_path,
"status": row.status,
"upvotes": row.upvotes if row.upvotes is not None else 0,
Comment on lines 593 to 625
Expand All @@ -622,7 +628,11 @@ def get_user_issues(
"longitude": row.longitude
})

return data
# Performance Boost: Cache serialized JSON to bypass redundant Pydantic validation
# and serialization on cache hits. Returning Response directly is ~2-3x faster.
json_data = json.dumps(data)
user_issues_cache.set(data=json_data, key=cache_key)
Copy link
Contributor

@cubic-dev-ai cubic-dev-ai bot Mar 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: This cache is never invalidated when an issue's status or upvotes change, so /issues/user can serve stale issue data for up to 5 minutes.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At backend/routers/issues.py, line 634:

<comment>This cache is never invalidated when an issue's status or upvotes change, so `/issues/user` can serve stale issue data for up to 5 minutes.</comment>

<file context>
@@ -622,7 +628,11 @@ def get_user_issues(
+    # Performance Boost: Cache serialized JSON to bypass redundant Pydantic validation
+    # and serialization on cache hits. Returning Response directly is ~2-3x faster.
+    json_data = json.dumps(data)
+    user_issues_cache.set(data=json_data, key=cache_key)
+    return Response(content=json_data, media_type="application/json")
 
</file context>
Fix with Cubic

return Response(content=json_data, media_type="application/json")
Comment on lines +631 to +635

@router.get("/issues/{issue_id}/blockchain-verify", response_model=BlockchainVerificationResponse)
async def verify_blockchain_integrity(issue_id: int, db: Session = Depends(get_db)):
Expand Down
Loading