Add stale-hash-cache detection/reset to the remote and standalone upload
clients, and empty-directory cleanup (for moves and deletes) to the delta
bundle processor in the upload service.

Reviewer notes:
- The patch text had lost its newlines; the line structure below is rebuilt
  from the +/- markers and verified against every hunk header's line counts.
  Blank context lines inside hunks are inferred and may need `patch` fuzz or
  `git apply -C1` if they differ from the real files — TODO confirm against
  the repository before merging.
- In the added helpers, `for k in d.keys()` was tightened to `for k in d`
  (identical behavior, idiomatic Python); hunk line counts are unchanged.
- NOTE(review): `_cleanup_empty_dirs` walks upward until `stop_at`; callers
  here always pass paths under `workspace`, so it cannot climb past the
  workspace root — keep that invariant if new call sites are added.

diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py
index a8e8a5be..ef66daf0 100644
--- a/scripts/remote_upload_client.py
+++ b/scripts/remote_upload_client.py
@@ -47,6 +47,25 @@ import scripts.ingest_code as idx
 
 
 
+def _cache_missing_stats(file_hashes: Dict[str, Any]) -> Tuple[bool, int, int]:
+    """Return (is_stale, missing_count, checked_count) for cached paths."""
+    if not file_hashes:
+        return (False, 0, 0)
+    missing = 0
+    checked = 0
+    for path_str in file_hashes:
+        try:
+            if not Path(path_str).exists():
+                missing += 1
+        except Exception:
+            missing += 1
+        checked += 1
+    if checked == 0:
+        return (False, 0, 0)
+    missing_ratio = missing / checked
+    return (missing_ratio >= 0.25, missing, checked)
+
+
 def _find_git_root(start: Path) -> Optional[Path]:
     """Best-effort detection of the git repository root for a workspace.
 
@@ -337,6 +356,24 @@ def _load_local_cache_file_hashes(workspace_path: str, repo_name: Optional[str])
         file_hashes = data.get("file_hashes", {})
         if not isinstance(file_hashes, dict):
             return {}
+        is_stale, missing, checked = _cache_missing_stats(file_hashes)
+        if is_stale:
+            logger.warning(
+                "[remote_upload] Detected stale local cache (%d/%d missing); clearing %s",
+                missing,
+                checked,
+                cache_path,
+            )
+            try:
+                cache_path.unlink(missing_ok=True)  # type: ignore[arg-type]
+            except TypeError:
+                try:
+                    cache_path.unlink()
+                except Exception:
+                    pass
+            except Exception:
+                pass
+            return {}
         return file_hashes
     except Exception:
         return {}
diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py
index 979244ad..11f9cba3 100644
--- a/scripts/standalone_upload_client.py
+++ b/scripts/standalone_upload_client.py
@@ -155,7 +155,15 @@ def _load_cache(self) -> Dict[str, str]:
         try:
             with open(self.cache_file, 'r', encoding='utf-8') as f:
                 data = json.load(f)
-                return data.get("file_hashes", {})
+                file_hashes = data.get("file_hashes", {})
+                if self._cache_seems_stale(file_hashes):
+                    logger.warning(
+                        "[hash_cache] Detected stale cache with missing paths; resetting %s",
+                        self.cache_file,
+                    )
+                    self._save_cache({})
+                    return {}
+                return file_hashes
         except Exception:
             return {}
@@ -197,6 +205,21 @@ def remove_hash(self, file_path: str) -> None:
         file_hashes.pop(abs_path, None)
         self._save_cache(file_hashes)
 
+    def _cache_seems_stale(self, file_hashes: Dict[str, str]) -> bool:
+        """Return True if a large portion of cached paths no longer exist on disk."""
+        total = len(file_hashes)
+        if total == 0:
+            return False
+        missing = 0
+        for path_str in file_hashes:
+            try:
+                if not Path(path_str).exists():
+                    missing += 1
+            except Exception:
+                missing += 1
+        missing_ratio = missing / total
+        return missing_ratio >= 0.25
+
 
 # Create global cache instance (will be initialized in RemoteUploadClient)
 _hash_cache: Optional[SimpleHashCache] = None
diff --git a/scripts/upload_service.py b/scripts/upload_service.py
index ad386faf..4807ef0f 100644
--- a/scripts/upload_service.py
+++ b/scripts/upload_service.py
@@ -175,6 +175,25 @@ def validate_bundle_format(bundle_path: Path) -> Dict[str, Any]:
     except Exception as e:
         raise ValueError(f"Invalid bundle format: {str(e)}")
 
+def _cleanup_empty_dirs(path: Path, stop_at: Path) -> None:
+    """Recursively remove empty directories up to stop_at (exclusive)."""
+    try:
+        path = path.resolve()
+        stop_at = stop_at.resolve()
+    except Exception:
+        pass
+    while True:
+        try:
+            if path == stop_at or not path.exists() or not path.is_dir():
+                break
+            if any(path.iterdir()):
+                break
+            path.rmdir()
+            path = path.parent
+        except Exception:
+            break
+
+
 def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: Dict[str, Any]) -> Dict[str, int]:
     """Process delta bundle and return operation counts."""
     operations_count = {
@@ -313,10 +332,23 @@
                 else:
                     operations_count["failed"] += 1
 
+                # Remove original source file if provided
+                source_rel_path = operation.get("source_path") or operation.get("source_relative_path")
+                if source_rel_path:
+                    source_path = workspace / source_rel_path
+                    if source_path.exists():
+                        try:
+                            source_path.unlink()
+                            operations_count["deleted"] += 1
+                            _cleanup_empty_dirs(source_path.parent, workspace)
+                        except Exception as del_err:
+                            logger.error(f"Error deleting source file for move {source_rel_path}: {del_err}")
+
             elif op_type == "deleted":
                 # Delete file
                 if target_path.exists():
                     target_path.unlink()
+                    _cleanup_empty_dirs(target_path.parent, workspace)
                     operations_count["deleted"] += 1
                 else:
                     operations_count["skipped"] += 1