Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,4 @@ tests/.codebase/state.json
CLAUDE.md
.qodo/.cursor/rules
/.augment
/dev-workspace
63 changes: 38 additions & 25 deletions scripts/standalone_upload_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,7 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str,
operations.append(operation)
file_hashes[rel_path] = f"sha1:{file_hash}"
total_size += stat.st_size
set_cached_file_hash(str(path.resolve()), file_hash, self.repo_name)

except Exception as e:
print(f"[bundle_create] Error processing created file {path}: {e}")
Expand Down Expand Up @@ -516,6 +517,7 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str,
operations.append(operation)
file_hashes[rel_path] = f"sha1:{file_hash}"
total_size += stat.st_size
set_cached_file_hash(str(path.resolve()), file_hash, self.repo_name)

except Exception as e:
print(f"[bundle_create] Error processing updated file {path}: {e}")
Expand Down Expand Up @@ -550,13 +552,14 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str,
"source_absolute_path": str(source_path.resolve()),
"size_bytes": stat.st_size,
"content_hash": content_hash,
"file_hash": f"sha1:{idx.hash_id(content.decode('utf-8', errors='ignore'), dest_rel_path, 1, len(content.splitlines()))}",
"file_hash": f"sha1:{hash_id(content.decode('utf-8', errors='ignore'), dest_rel_path, 1, len(content.splitlines()))}",
"modified_time": datetime.fromtimestamp(stat.st_mtime).isoformat(),
"language": language
}
operations.append(operation)
file_hashes[dest_rel_path] = f"sha1:{file_hash}"
total_size += stat.st_size
set_cached_file_hash(str(dest_path.resolve()), file_hash, self.repo_name)

except Exception as e:
print(f"[bundle_create] Error processing moved file {source_path} -> {dest_path}: {e}")
Expand All @@ -576,7 +579,7 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str,
"previous_hash": f"sha1:{previous_hash}" if previous_hash else None,
"file_hash": None,
"modified_time": datetime.now().isoformat(),
"language": idx.CODE_EXTS.get(path.suffix.lower(), "unknown")
"language": CODE_EXTS.get(path.suffix.lower(), "unknown")
}
operations.append(operation)

Expand Down Expand Up @@ -890,24 +893,41 @@ def watch_loop(self, interval: int = 5):
logger.info(f"[watch] File monitoring stopped by user")

def get_all_code_files(self) -> List[Path]:
    """Get all code files in the workspace, excluding heavy/third-party dirs.

    Performs a single ``os.walk`` over the workspace, pruning hidden and
    known-heavy directories (VCS metadata, dependency trees, build output,
    caches) in-place before descending, so excluded subtrees are never
    traversed at all.

    Returns:
        List[Path]: matching code files, deduplicated by resolved path
        (guards against the same file being reached twice, e.g. via
        symlinks). Empty list if the workspace does not exist or the scan
        raises.
    """
    files: List[Path] = []
    try:
        workspace_path = Path(self.workspace_path)
        if not workspace_path.exists():
            return files

        # CODE_EXTS may contain suffixes (".py") or exact filenames
        # ("Makefile"); split into two sets for O(1) membership tests.
        ext_suffixes = {str(ext).lower() for ext in CODE_EXTS if str(ext).startswith('.')}
        name_matches = {str(ext) for ext in CODE_EXTS if not str(ext).startswith('.')}
        EXCLUDED_DIRS = {
            "node_modules", "vendor", "dist", "build", "target", "out", "dev-workspace",
            ".git", ".hg", ".svn", ".vscode", ".idea", ".venv", "venv", "__pycache__",
            ".pytest_cache", ".mypy_cache", ".cache", ".context-engine",
            ".context-engine-uploader", ".codebase",
        }

        seen = set()  # resolved paths already collected
        for root, dirnames, filenames in os.walk(workspace_path):
            # Prune heavy/hidden directories before os.walk descends into them.
            dirnames[:] = [
                d for d in dirnames
                if d not in EXCLUDED_DIRS and not d.startswith('.')
            ]

            for filename in filenames:
                if filename.startswith('.'):
                    continue
                candidate = Path(root) / filename
                if filename in name_matches or candidate.suffix.lower() in ext_suffixes:
                    resolved = candidate.resolve()
                    if resolved not in seen:
                        seen.add(resolved)
                        files.append(candidate)
    except Exception as e:
        # Best-effort scan: log and return whatever was collected so the
        # watch loop keeps running rather than crashing.
        logger.error(f"[watch] Error scanning files: {e}")

    return files

def process_and_upload_changes(self, changed_paths: List[Path]) -> bool:
"""
Expand Down Expand Up @@ -1226,16 +1246,9 @@ def main():
logger.info("Scanning repository for files...")
workspace_path = Path(config['workspace_path'])

# Find all files in the repository
all_files = []
for file_path in workspace_path.rglob('*'):
if file_path.is_file() and not file_path.name.startswith('.'):
rel_path = file_path.relative_to(workspace_path)
# Skip .codebase directory and other metadata
if not str(rel_path).startswith('.codebase'):
all_files.append(file_path)

logger.info(f"Found {len(all_files)} files to upload")
# Find code files in the repository (exclude hidden and heavy dirs)
all_files = client.get_all_code_files()
logger.info(f"Found {len(all_files)} code files to upload")

if not all_files:
logger.warning("No files found to upload")
Expand Down
39 changes: 32 additions & 7 deletions scripts/upload_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,22 +362,23 @@ async def upload_delta_bundle(
):
"""Upload and process delta bundle."""
start_time = datetime.now()
client_host = request.client.host if hasattr(request, 'client') and request.client else 'unknown'

try:
logger.info(f"[upload_service] Begin processing upload for workspace={workspace_path} from {client_host}")
# Validate workspace path
workspace = Path(workspace_path)
if not workspace.is_absolute():
workspace = Path(WORK_DIR) / workspace

workspace_path = str(workspace.resolve())
repo_name = _extract_repo_name_from_path(workspace_path) if _extract_repo_name_from_path else None
if not repo_name:
repo_name = Path(workspace_path).name

# Get collection name
if not collection_name:
if get_collection_name:
repo_name = _extract_repo_name_from_path(workspace_path) if _extract_repo_name_from_path else None
# Fallback to directory name if repo detection fails
if not repo_name:
repo_name = Path(workspace_path).name
collection_name = get_collection_name(repo_name)
else:
collection_name = DEFAULT_COLLECTION
Expand Down Expand Up @@ -408,9 +409,32 @@ async def upload_delta_bundle(
with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as temp_file:
bundle_path = Path(temp_file.name)

# Stream upload to file
content = await bundle.read()
bundle_path.write_bytes(content)
max_bytes = MAX_BUNDLE_SIZE_MB * 1024 * 1024
if bundle.size and bundle.size > max_bytes:
raise HTTPException(
status_code=413,
detail=f"Bundle too large. Max size: {MAX_BUNDLE_SIZE_MB}MB"
)

# Stream upload to file while enforcing size
total = 0
chunk_size = 1024 * 1024
while True:
chunk = await bundle.read(chunk_size)
if not chunk:
break
total += len(chunk)
if total > max_bytes:
try:
temp_file.close()
bundle_path.unlink(missing_ok=True)
except Exception:
pass
raise HTTPException(
status_code=413,
detail=f"Bundle too large. Max size: {MAX_BUNDLE_SIZE_MB}MB"
)
temp_file.write(chunk)

try:
# Validate bundle format
Expand Down Expand Up @@ -460,6 +484,7 @@ async def upload_delta_bundle(

# Calculate processing time
processing_time = (datetime.now() - start_time).total_seconds() * 1000
logger.info(f"[upload_service] Completed bundle {bundle_id} seq={sequence_number} ops={operations_count} in {int(processing_time)}ms")

return UploadResponse(
success=True,
Expand Down
Loading
Loading