diff --git a/.env b/.env index 3d34067c..b26ac1a1 100644 --- a/.env +++ b/.env @@ -3,6 +3,11 @@ QDRANT_URL=http://qdrant:6333 # QDRANT_API_KEY= # not needed for local +# Repository mode: 0=single-repo (default), 1=multi-repo +# Single-repo: All files go into one collection (COLLECTION_NAME) +# Multi-repo: Each subdirectory gets its own collection +MULTI_REPO_MODE=0 + # Single unified collection for seamless cross-repo search # Default: "codebase" - all your code in one collection for unified search # This enables searching across multiple repos/workspaces without fragmentation @@ -144,7 +149,7 @@ MEMORY_COLLECTION_TTL_SECS=300 # INDEX_UPSERT_BATCH=128 # INDEX_UPSERT_RETRIES=5 # INDEX_UPSERT_BACKOFF=0.5 - WATCH_DEBOUNCE_SECS=4 +WATCH_DEBOUNCE_SECS=4 # Duplicate Streamable HTTP MCP instances (run alongside SSE) @@ -161,3 +166,6 @@ HYBRID_RESULTS_CACHE_ENABLED=1 INDEX_CHUNK_LINES=60 INDEX_CHUNK_OVERLAP=10 USE_GPU_DECODER=0 + +# Development Remote Upload Configuration +HOST_INDEX_PATH=./dev-workspace diff --git a/.env.example b/.env.example index 87c7e330..5a80abea 100644 --- a/.env.example +++ b/.env.example @@ -1,10 +1,16 @@ # Qdrant connection QDRANT_URL=http://localhost:6333 QDRANT_API_KEY= + +# Repository mode: 0=single-repo (default), 1=multi-repo +# Single-repo: All files go into one collection (COLLECTION_NAME) +# Multi-repo: Each subdirectory gets its own collection +MULTI_REPO_MODE=0 + # Single unified collection for seamless cross-repo search (default: "codebase") # Leave unset or use "codebase" for unified search across all your code COLLECTION_NAME=codebase # Embeddings EMBEDDING_MODEL=BAAI/bge-base-en-v1.5 EMBEDDING_PROVIDER=fastembed diff --git a/Dockerfile.mcp b/Dockerfile.mcp index ef40683b..22524111 100644 --- a/Dockerfile.mcp +++ b/Dockerfile.mcp @@
-3,11 +3,16 @@ FROM python:3.11-slim ENV PYTHONDONTWRITEBYTECODE=1 \ PYTHONUNBUFFERED=1 \ - WORK_ROOTS="/work,/app" + WORK_ROOTS="/work,/app" \ + HF_HOME=/tmp/cache \ + TRANSFORMERS_CACHE=/tmp/cache # Install latest FastMCP with Streamable HTTP (RMCP) support + deps RUN pip install --no-cache-dir --upgrade mcp fastmcp qdrant-client fastembed +# Create cache directory with proper permissions +RUN mkdir -p /tmp/cache && chmod 755 /tmp/cache + # Bake scripts into image so server can run even when /work points elsewhere COPY scripts /app/scripts diff --git a/Dockerfile.upload-service b/Dockerfile.upload-service new file mode 100644 index 00000000..ef6d4538 --- /dev/null +++ b/Dockerfile.upload-service @@ -0,0 +1,56 @@ +# Dockerfile for Context-Engine Delta Upload Service +FROM python:3.11-slim + +# Set environment variables +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + PYTHONPATH=/app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + git \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Create app directory +WORKDIR /app + +# Copy requirements first for better caching +COPY requirements.txt . + +# Install Python dependencies +RUN pip install --upgrade pip && \ + pip install -r requirements.txt + +# Copy application code +COPY scripts/ ./scripts/ +COPY . . 
+ +# Create work directory for repositories +RUN mkdir -p /work && \ + chmod 755 /work + +# Create non-root user for security +RUN useradd --create-home --shell /bin/bash app && \ + chown -R app:app /app /work +USER app + +# Expose port +EXPOSE 8002 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:8002/health || exit 1 + +# Default environment variables +ENV UPLOAD_SERVICE_HOST=0.0.0.0 \ + UPLOAD_SERVICE_PORT=8002 \ + QDRANT_URL=http://qdrant:6333 \ + WORK_DIR=/work \ + MAX_BUNDLE_SIZE_MB=100 \ + UPLOAD_TIMEOUT_SECS=300 + +# Run the upload service +CMD ["python", "scripts/upload_service.py"] \ No newline at end of file diff --git a/Makefile b/Makefile index 95ea9e6d..8d598bf9 100644 --- a/Makefile +++ b/Makefile @@ -4,8 +4,8 @@ SHELL := /bin/bash # An empty export forces docker to use its default context/socket. export DOCKER_HOST = -.PHONY: help up down logs ps restart rebuild index reindex watch env hybrid bootstrap history rerank-local setup-reranker prune warm health -.PHONY: venv venv-install +.PHONY: help up down logs ps restart rebuild index reindex watch watch-remote env hybrid bootstrap history rerank-local setup-reranker prune warm health test-e2e +.PHONY: venv venv-install dev-remote-up dev-remote-down dev-remote-logs dev-remote-restart dev-remote-bootstrap dev-remote-test dev-remote-client dev-remote-clean .PHONY: qdrant-status qdrant-list qdrant-prune qdrant-index-root @@ -77,6 +77,23 @@ index-here: ## index the current directory: make index-here [RECREATE=1] [REPO_N watch: ## watch mode: reindex changed files on save (Ctrl+C to stop) docker compose run --rm --entrypoint python indexer /work/scripts/watch_index.py +watch-remote: ## remote watch mode: upload delta bundles to remote server (Ctrl+C to stop) + @echo "Starting remote watch mode..." 
+ @if [ -z "$(REMOTE_UPLOAD_ENDPOINT)" ]; then \ + echo "Error: REMOTE_UPLOAD_ENDPOINT is required"; \ + echo "Usage: make watch-remote REMOTE_UPLOAD_ENDPOINT=http://your-server:8080 [REMOTE_UPLOAD_MAX_RETRIES=3] [REMOTE_UPLOAD_TIMEOUT=30]"; \ + exit 1; \ + fi + @echo "Remote upload endpoint: $(REMOTE_UPLOAD_ENDPOINT)" + @echo "Max retries: $${REMOTE_UPLOAD_MAX_RETRIES:-3}" + @echo "Timeout: $${REMOTE_UPLOAD_TIMEOUT:-30} seconds" + docker compose run --rm --entrypoint python \ + -e REMOTE_UPLOAD_ENABLED=1 \ + -e REMOTE_UPLOAD_ENDPOINT=$(REMOTE_UPLOAD_ENDPOINT) \ + -e REMOTE_UPLOAD_MAX_RETRIES=$${REMOTE_UPLOAD_MAX_RETRIES:-3} \ + -e REMOTE_UPLOAD_TIMEOUT=$${REMOTE_UPLOAD_TIMEOUT:-30} \ + indexer /work/scripts/watch_index.py + rerank: ## multi-query re-ranker helper example docker compose run --rm --entrypoint python indexer /work/scripts/rerank_query.py \ --query "chunk code by lines with overlap for indexing" \ @@ -216,12 +233,54 @@ llamacpp-build-image: ## build custom llama.cpp image with baked model (override # Download a tokenizer.json for micro-chunking (default: BAAI/bge-base-en-v1.5) TOKENIZER_URL ?= https://huggingface.co/BAAI/bge-base-en-v1.5/resolve/main/tokenizer.json TOKENIZER_PATH ?= models/tokenizer.json - tokenizer: ## download tokenizer.json to models/tokenizer.json (override with TOKENIZER_URL/TOKENIZER_PATH) @mkdir -p $(dir $(TOKENIZER_PATH)) @echo "Downloading: $(TOKENIZER_URL) -> $(TOKENIZER_PATH)" && \ curl -L --fail --retry 3 -C - "$(TOKENIZER_URL)" -o "$(TOKENIZER_PATH)" +# --- Development Remote Upload System Targets --- + +dev-remote-up: ## start dev-remote stack with upload service + @echo "Starting development remote upload system..." + @mkdir -p dev-workspace/.codebase + docker compose -f docker-compose.dev-remote.yml up -d --build + +dev-remote-down: ## stop dev-remote stack + @echo "Stopping development remote upload system..." 
+ docker compose -f docker-compose.dev-remote.yml down + +dev-remote-logs: ## follow logs for dev-remote stack + docker compose -f docker-compose.dev-remote.yml logs -f --tail=100 + +dev-remote-restart: ## restart dev-remote stack (rebuild) + docker compose -f docker-compose.dev-remote.yml down && docker compose -f docker-compose.dev-remote.yml up -d --build + +dev-remote-bootstrap: env dev-remote-up ## bootstrap dev-remote: up -> wait -> init -> index -> warm + @echo "Bootstrapping development remote upload system..." + ./scripts/wait-for-qdrant.sh + docker compose -f docker-compose.dev-remote.yml run --rm init_payload || true + $(MAKE) tokenizer + docker compose -f docker-compose.dev-remote.yml run --rm indexer --root /work --recreate + $(MAKE) warm || true + $(MAKE) health + +dev-remote-test: ## test remote upload workflow + @echo "Testing remote upload workflow..." + @echo "Upload service should be accessible at http://localhost:8004" + @echo "Health check: curl http://localhost:8004/health" + @echo "Status check: curl 'http://localhost:8004/api/v1/delta/status?workspace_path=/work/test-repo'" + @echo "Test upload: curl -X POST -F 'bundle=@test-bundle.tar.gz' -F 'workspace_path=/work/test-repo' http://localhost:8004/api/v1/delta/upload" + +dev-remote-client: ## start remote upload client for testing + @echo "Starting remote upload client..." + docker compose -f docker-compose.dev-remote.yml --profile client up -d remote_upload_client + +dev-remote-clean: ## clean up dev-remote volumes and containers + @echo "Cleaning up development remote upload system..." + docker compose -f docker-compose.dev-remote.yml down -v + docker volume rm context-engine_shared_workspace context-engine_shared_codebase context-engine_upload_temp context-engine_qdrant_storage_dev_remote 2>/dev/null || true + rm -rf dev-workspace + # Router helpers Q ?= what is hybrid search? 
diff --git a/README.md b/README.md index 5b4a5f80..607bbcc7 100644 --- a/README.md +++ b/README.md @@ -730,6 +730,8 @@ Indexer/Search MCP (8001 SSE, 8003 RMCP): - search_callers_for — intent wrapper for probable callers/usages - search_importers_for — intent wrapper for files importing a module/symbol - change_history_for_path(path) — summarize recent changes using stored metadata +- collection_map - return collection↔repo mappings +- default_collection - set the collection to use for the session Notes: - Most search tools accept filters like language, under, path_glob, kind, symbol, ext. @@ -888,11 +890,25 @@ For production-grade backup/migration strategies, see the official Qdrant docume Operational notes: - Collection name comes from `COLLECTION_NAME` (see .env). This stack defaults to a single collection for both code and memories; filtering uses `metadata.kind`. -- If you switch to a dedicated memory collection, update the MCP Memory server and the Indexer’s memory blending env to point at it. +- If you switch to a dedicated memory collection, update the MCP Memory server and the Indexer's memory blending env to point at it. - Consider pruning expired memories by filtering `expires_at < now`. - Call `context_search` on :8001 (SSE) or :8003 (RMCP) with `{ "include_memories": true }` to return both memory and code results. 
+### Collection Naming Strategies + +Different hash lengths are used for different workspace types: + +**Local Workspaces:** `repo-name-8charhash` +- Example: `Anesidara-e8d0f5fc` +- Used by local indexer/watcher +- Assumes unique repo names within workspace + +**Remote Uploads:** `folder-name-16charhash-8charhash` +- Example: `testupload2-04e680d5939dd035-b8b8d4cc` +- Collision avoidance for duplicate folder names for different codebases +- 16-char hash identifies workspace, 8-char hash identifies collection + ### Enable memory blending (for context_search) diff --git a/build-images.sh b/build-images.sh new file mode 100644 index 00000000..2cc3cd3a --- /dev/null +++ b/build-images.sh @@ -0,0 +1,226 @@ +#!/bin/bash +# Docker Build Script for Context-Engine +# Builds all service images with custom registry tagging + +set -euo pipefail + +# Configuration +REGISTRY="192.168.96.61:30009/library" +PROJECT_NAME="context-engine" +TAG="${TAG:-latest}" + +# Service mapping (service_name:dockerfile:final_image_name) +declare -A SERVICES=( + ["memory"]="Dockerfile.mcp:${PROJECT_NAME}-memory" + ["indexer"]="Dockerfile.mcp-indexer:${PROJECT_NAME}-indexer" + ["indexer-service"]="Dockerfile.indexer:${PROJECT_NAME}-indexer-service" + ["llamacpp"]="Dockerfile.llamacpp:${PROJECT_NAME}-llamacpp" +) + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Logging functions write to stderr so $(build_image ...) captures only the image name +log_info() { echo -e "${GREEN}[INFO]${NC} $1" >&2; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $1" >&2; } +log_error() { echo -e "${RED}[ERROR]${NC} $1" >&2; } + +# Build function +build_image() { + local service=$1 + local dockerfile=$2 + local image_name=$3 + local full_image="${REGISTRY}/${image_name}:${TAG}" + + log_info "Building ${service} service..." + log_info "Dockerfile: ${dockerfile}" + log_info "Image: ${full_image}" + + if !
docker build \ + -f "${dockerfile}" \ + -t "${full_image}" \ + --build-arg BUILDKIT_INLINE_CACHE=1 \ + .; then + log_error "Failed to build ${service} image" + return 1 + fi + + log_info "Successfully built ${service} image: ${full_image}" + + # Push if registry is accessible + if [[ "${PUSH_IMAGES:-false}" == "true" ]]; then + log_info "Pushing ${service} image..." + if ! docker push "${full_image}"; then + log_warn "Failed to push ${service} image (registry may be inaccessible)" + return 1 + fi + log_info "Successfully pushed ${service} image" + fi + + echo "${full_image}" +} + +# Main build process +main() { + log_info "Starting Context-Engine Docker build process..." + log_info "Registry: ${REGISTRY}" + log_info "Tag: ${TAG}" + log_info "Push enabled: ${PUSH_IMAGES:-false}" + echo + + # Check if Docker is running + if ! docker info >/dev/null 2>&1; then + log_error "Docker is not running or not accessible" + exit 1 + fi + + # Check if Dockerfiles exist + for service in "${!SERVICES[@]}"; do + IFS=':' read -r dockerfile image_name <<< "${SERVICES[$service]}" + if [[ ! -f "${dockerfile}" ]]; then + log_error "Dockerfile not found: ${dockerfile}" + exit 1 + fi + done + + local built_images=() + local failed_services=() + + # Build each service + for service in "${!SERVICES[@]}"; do + IFS=':' read -r dockerfile image_name <<< "${SERVICES[$service]}" + + if built_image=$(build_image "$service" "$dockerfile" "$image_name"); then + built_images+=("$built_image") + else + failed_services+=("$service") + fi + echo + done + + # Summary + log_info "Build Summary:" + log_info "Successfully built: ${#built_images[@]} images" + for img in "${built_images[@]}"; do + log_info " ✓ ${img}" + done + + if [[ ${#failed_services[@]} -gt 0 ]]; then + log_error "Failed to build: ${#failed_services[@]} services" + for service in "${failed_services[@]}"; do + log_error " ✗ ${service}" + done + exit 1 + fi + + log_info "All images built successfully!" 
+ + # Generate updated kustomization.yaml + cat > "deploy/kubernetes/kustomization-images.yaml" << 'EOF' +# Image overrides for Context-Engine Kubernetes deployment +# Use this with: kustomize build . --load-restrictor=LoadRestrictionsNone | kubectl apply -f - +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - namespace.yaml + - configmap.yaml + - qdrant.yaml + - mcp-memory.yaml + - mcp-indexer.yaml + - mcp-http.yaml + - indexer-services.yaml + - llamacpp.yaml + - ingress.yaml + +images: +EOF + + # Add images to kustomization + for service in "${!SERVICES[@]}"; do + IFS=':' read -r dockerfile image_name <<< "${SERVICES[$service]}" + full_image="${REGISTRY}/${image_name}:${TAG}" + cat >> "deploy/kubernetes/kustomization-images.yaml" << EOF + - name: ${image_name} + newName: ${full_image%:*} # Remove tag + newTag: ${TAG} +EOF + done + + cat >> "deploy/kubernetes/kustomization-images.yaml" << 'EOF' + +# Common labels +commonLabels: + app.kubernetes.io/name: context-engine + app.kubernetes.io/component: kubernetes-deployment + app.kubernetes.io/managed-by: kustomize + +# Namespace override +namespace: context-engine +EOF + + log_info "Generated deploy/kubernetes/kustomization-images.yaml" + log_info "To deploy: kustomize build deploy/kubernetes/ | kubectl apply -f -" +} + +# Help function +show_help() { + cat << EOF +Context-Engine Docker Build Script + +Usage: $0 [OPTIONS] + +Options: + -t, --tag TAG Set image tag (default: latest) + -p, --push Push images to registry after build + -h, --help Show this help message + +Examples: + $0 # Build with default tag + $0 -t v1.0.0 # Build with custom tag + $0 --push # Build and push to registry + TAG=dev-branch $0 # Build using environment variable + +Environment Variables: + TAG Image tag to use + PUSH_IMAGES Set to 'true' to push after build + +Registry Configuration: + Current registry: ${REGISTRY} + To change: modify REGISTRY variable in script + +Generated Files: + - 
deploy/kubernetes/kustomization-images.yaml + Contains image references for Kubernetes deployment + +EOF +} + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + -t|--tag) + TAG="$2" + shift 2 + ;; + -p|--push) + export PUSH_IMAGES=true + shift + ;; + -h|--help) + show_help + exit 0 + ;; + *) + log_error "Unknown option: $1" + show_help + exit 1 + ;; + esac +done + +# Run main function +main "$@" \ No newline at end of file diff --git a/deploy/kubernetes/Makefile b/deploy/kubernetes/Makefile index 8307bbbe..76d9df7d 100644 --- a/deploy/kubernetes/Makefile +++ b/deploy/kubernetes/Makefile @@ -53,7 +53,6 @@ kustomize-apply: check-kubectl ## Apply manifests with Kustomize .PHONY: kustomize-delete kustomize-delete: check-kubectl ## Delete manifests with Kustomize kustomize build . | kubectl delete -f - - # Management targets .PHONY: status status: check-kubectl ## Show deployment status @@ -79,9 +78,18 @@ status: check-kubectl ## Show deployment status kubectl get jobs -n $(NAMESPACE) || echo "No jobs found" .PHONY: logs -logs: check-kubectl ## Show logs for all services +logs: check-kubectl ## Show logs for core services (tail 100) @echo "=== Qdrant Logs ===" - kubectl logs -f statefulset/qdrant -n $(NAMESPACE) --tail=50 || echo "Qdrant logs not available" + kubectl logs -f statefulset/qdrant -n $(NAMESPACE) --tail=100 || echo "Qdrant logs not available" + @echo "" + @echo "=== MCP Memory Logs ===" + kubectl logs -f deployment/mcp-memory -n $(NAMESPACE) --tail=100 || echo "MCP Memory logs not available" + @echo "" + @echo "=== MCP Indexer Logs ===" + kubectl logs -f deployment/mcp-indexer -n $(NAMESPACE) --tail=100 || echo "MCP Indexer logs not available" + @echo "" + @echo "=== Watcher Logs ===" + kubectl logs -f deployment/watcher -n $(NAMESPACE) --tail=100 || echo "Watcher logs not available" .PHONY: logs-service logs-service: check-kubectl ## Show logs for specific service (usage: make logs-service SERVICE=mcp-memory) @@ -136,9 +144,8 @@ 
port-forward-service: check-kubectl ## Port forward specific service (usage: mak stop-port-forward: ## Stop all port forwards pkill -f "kubectl port-forward" || echo "No port forwards found" -# Build and push targets .PHONY: build-image -build-image: ## Build Docker image +build-image: ## Build Docker image (requires Docker) docker build -t $(IMAGE_REGISTRY)/context-engine:$(IMAGE_TAG) ../../ .PHONY: push-image @@ -152,9 +159,9 @@ test-connection: check-kubectl ## Test connectivity to all services @echo "Qdrant:" @kubectl run qdrant-test --image=curlimages/curl --rm -i --restart=Never -n $(NAMESPACE) -- curl -f http://qdrant.$(NAMESPACE).svc.cluster.local:6333/health || echo "Qdrant test failed" @echo "MCP Memory:" - @kubectl run memory-test --image=curlimages/curl --rm -i --restart=Never -n $(NAMESPACE) -- curl -f http://mcp-memory.$(NAMESPACE).svc.cluster.local:18000/health || echo "MCP Memory test failed" + @kubectl run memory-test --image=curlimages/curl --rm -i --restart=Never -n $(NAMESPACE) -- curl -f http://mcp-memory.$(NAMESPACE).svc.cluster.local:8000/health || echo "MCP Memory test failed" @echo "MCP Indexer:" - @kubectl run indexer-test --image=curlimages/curl --rm -i --restart=Never -n $(NAMESPACE) -- curl -f http://mcp-indexer.$(NAMESPACE).svc.cluster.local:18001/health || echo "MCP Indexer test failed" + @kubectl run indexer-test --image=curlimages/curl --rm -i --restart=Never -n $(NAMESPACE) -- curl -f http://mcp-indexer.$(NAMESPACE).svc.cluster.local:8001/health || echo "MCP Indexer test failed" # Configuration targets .PHONY: show-config @@ -167,7 +174,7 @@ show-config: ## Show current configuration @echo "Quick start commands:" @echo " make deploy # Deploy all services" @echo " make status # Show deployment status" - @echo " make logs-service SERVICE=mcp-memory # Show logs" + @echo " make logs # Show logs" @echo " make cleanup # Remove everything" .PHONY: show-urls @@ -196,4 +203,3 @@ describe-service: check-kubectl ## Describe a service (usage: 
make describe-serv .PHONY: events events: check-kubectl ## Show recent events kubectl get events -n $(NAMESPACE) --sort-by=.metadata.creationTimestamp - diff --git a/deploy/kubernetes/cleanup.sh b/deploy/kubernetes/cleanup.sh index 2ce5d64d..dadfa6ec 100755 --- a/deploy/kubernetes/cleanup.sh +++ b/deploy/kubernetes/cleanup.sh @@ -48,60 +48,150 @@ check_kubectl() { log_success "Kubernetes connection verified" } -# Confirm cleanup -confirm_cleanup() { - if [[ "$FORCE" != "true" ]]; then - log_warning "This will delete all Context-Engine resources in namespace: $NAMESPACE" - read -p "Are you sure you want to continue? (yes/no): " -r +# Check if namespace exists +check_namespace() { + if ! kubectl get namespace $NAMESPACE &> /dev/null; then + log_warning "Namespace $NAMESPACE does not exist" + return 1 + fi + return 0 +} + +# Show what will be deleted +show_deletion_plan() { + log_info "The following resources will be deleted:" + echo + + # Show current resources + echo "Pods:" + kubectl get pods -n $NAMESPACE 2>/dev/null || echo " No pods found" + echo + echo "Services:" + kubectl get services -n $NAMESPACE 2>/dev/null || echo " No services found" + echo + echo "Deployments:" + kubectl get deployments -n $NAMESPACE 2>/dev/null || echo " No deployments found" + echo + echo "StatefulSets:" + kubectl get statefulsets -n $NAMESPACE 2>/dev/null || echo " No statefulsets found" + echo + echo "Jobs:" + kubectl get jobs -n $NAMESPACE 2>/dev/null || echo " No jobs found" + echo + echo "PersistentVolumeClaims:" + kubectl get pvc -n $NAMESPACE 2>/dev/null || echo " No PVCs found" + echo + echo "ConfigMaps:" + kubectl get configmaps -n $NAMESPACE 2>/dev/null || echo " No configmaps found" + echo + if kubectl get ingress -n $NAMESPACE &> /dev/null; then + echo "Ingress:" + kubectl get ingress -n $NAMESPACE echo - if [[ ! 
$REPLY =~ ^[Yy][Ee][Ss]$ ]]; then - log_info "Cleanup cancelled" - exit 0 - fi fi + + log_warning "This will permanently delete all data in Qdrant and any other persistent storage!" +} + +confirm_cleanup() { + if [[ "$FORCE" == "true" ]]; then + return 0 + fi + read -p "Are you sure you want to delete all Context-Engine resources? (yes/no): " -r + echo + if [[ ! $REPLY =~ ^[Yy][Ee][Ss]$ ]]; then + log_info "Cleanup cancelled" + exit 0 + fi +} + +# Delete namespace and all resources +delete_namespace() { + log_info "Deleting namespace: $NAMESPACE" + kubectl delete namespace $NAMESPACE --ignore-not-found=true + log_success "Namespace deleted" +} + +# Wait for namespace deletion +wait_for_deletion() { + log_info "Waiting for namespace deletion to complete..." + + local timeout=60 + local count=0 + + while kubectl get namespace $NAMESPACE &> /dev/null; do + if [[ $count -ge $timeout ]]; then + log_warning "Namespace deletion is taking longer than expected" + log_info "You may need to manually delete remaining resources" + return 1 + fi + + echo -n "." + sleep 1 + ((count++)) + done + + echo + log_success "Namespace deletion completed" } -# Delete resources -cleanup_resources() { - log_info "Cleaning up Context-Engine resources..." +# Force delete if needed +force_delete() { + log_warning "Attempting to force delete remaining resources..." + + # Force delete any remaining pods + kubectl delete pods --all -n $NAMESPACE --grace-period=0 --force 2>/dev/null || true + + # Force delete any remaining PVCs + kubectl delete pvc --all -n $NAMESPACE --grace-period=0 --force 2>/dev/null || true - # Delete deployments - log_info "Deleting deployments..." - kubectl delete deployment --all -n $NAMESPACE --ignore-not-found=true + log_success "Force delete completed" +} + +# Verify cleanup +verify_cleanup() { + log_info "Verifying cleanup..." - # Delete statefulsets - log_info "Deleting statefulsets..." 
- kubectl delete statefulset --all -n $NAMESPACE --ignore-not-found=true + if kubectl get namespace $NAMESPACE &> /dev/null; then + log_error "Namespace $NAMESPACE still exists" + return 1 + fi - # Delete jobs - log_info "Deleting jobs..." - kubectl delete job --all -n $NAMESPACE --ignore-not-found=true + log_success "Cleanup completed successfully" +} - # Delete services - log_info "Deleting services..." - kubectl delete service --all -n $NAMESPACE --ignore-not-found=true +# Main cleanup function +main() { + log_info "Starting Context-Engine Kubernetes cleanup" - # Delete ingress - log_info "Deleting ingress..." - kubectl delete ingress --all -n $NAMESPACE --ignore-not-found=true + # Check prerequisites + check_kubectl - # Delete configmaps - log_info "Deleting configmaps..." - kubectl delete configmap --all -n $NAMESPACE --ignore-not-found=true + # Check if namespace exists + if ! check_namespace; then + log_success "Nothing to clean up - namespace $NAMESPACE does not exist" + exit 0 + fi - # Delete secrets - log_info "Deleting secrets..." - kubectl delete secret --all -n $NAMESPACE --ignore-not-found=true + # Show what will be deleted + show_deletion_plan - # Delete PVCs - log_info "Deleting persistent volume claims..." - kubectl delete pvc --all -n $NAMESPACE --ignore-not-found=true + # Ask for confirmation (unless forced) + confirm_cleanup # Delete namespace - log_info "Deleting namespace..." - kubectl delete namespace $NAMESPACE --ignore-not-found=true + delete_namespace - log_success "Cleanup complete!" + # Wait for deletion + if ! wait_for_deletion; then + log_warning "Standard deletion incomplete, attempting force delete..." + force_delete + fi + + # Verify cleanup + verify_cleanup + + log_success "Context-Engine cleanup completed!" 
} # Help function @@ -111,14 +201,17 @@ show_help() { echo "Usage: $0 [OPTIONS]" echo echo "Options:" - echo " -h, --help Show this help message" - echo " --namespace NAMESPACE Kubernetes namespace (default: context-engine)" - echo " --force Skip confirmation prompt" + echo " -h, --help Show this help message" + echo " -n, --namespace NAMESPACE Kubernetes namespace (default: context-engine)" + echo " -f, --force Skip confirmation prompt" + echo + echo "Environment variables:" + echo " NAMESPACE=context-engine Kubernetes namespace" echo echo "Examples:" - echo " $0 # Interactive cleanup" - echo " $0 --force # Force cleanup without confirmation" - echo " $0 --namespace my-ns # Cleanup specific namespace" + echo " $0 # Interactive cleanup with confirmation" + echo " $0 --force # Cleanup without confirmation" + echo " $0 -n my-namespace # Cleanup different namespace" } # Parse command line arguments @@ -128,11 +221,11 @@ while [[ $# -gt 0 ]]; do show_help exit 0 ;; - --namespace) + -n|--namespace) NAMESPACE="$2" shift 2 ;; - --force) + -f|--force|--force=true) FORCE=true shift ;; @@ -144,20 +237,11 @@ while [[ $# -gt 0 ]]; do esac done -# Main cleanup function -main() { - log_info "Starting Context-Engine Kubernetes cleanup" - - # Check prerequisites - check_kubectl - - # Confirm cleanup - confirm_cleanup - - # Cleanup resources - cleanup_resources -} +# Check if we're in the right directory +if [[ ! 
-f "qdrant.yaml" ]]; then + log_error "Please run this script from the deploy/kubernetes directory" + exit 1 +fi # Run main cleanup main - diff --git a/deploy/kubernetes/code-models-pvc.yaml b/deploy/kubernetes/code-models-pvc.yaml new file mode 100644 index 00000000..c69358d5 --- /dev/null +++ b/deploy/kubernetes/code-models-pvc.yaml @@ -0,0 +1,18 @@ +--- +# Persistent Volume Claim for model storage (CephFS RWX) +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: code-models-pvc + namespace: context-engine + labels: + app: context-engine + component: llamacpp + type: storage +spec: + accessModes: + - ReadWriteMany # CephFS supports RWX for multiple pods + storageClassName: rook-cephfs # Adjust based on your storage class + resources: + requests: + storage: 20Gi # Adjust size based on expected model footprint diff --git a/deploy/kubernetes/configmap.yaml b/deploy/kubernetes/configmap.yaml index 0c514bc2..1c73f60c 100644 --- a/deploy/kubernetes/configmap.yaml +++ b/deploy/kubernetes/configmap.yaml @@ -5,71 +5,112 @@ metadata: namespace: context-engine labels: app: context-engine +component: configuration data: - # Core Configuration COLLECTION_NAME: "codebase" EMBEDDING_MODEL: "BAAI/bge-base-en-v1.5" EMBEDDING_PROVIDER: "fastembed" - - # Qdrant Configuration - QDRANT_URL: "http://qdrant:6333" - QDRANT_TIMEOUT: "60" - - # Indexing Configuration - INDEX_MICRO_CHUNKS: "1" - MAX_MICRO_CHUNKS_PER_FILE: "200" - INDEX_CHUNK_LINES: "120" - INDEX_CHUNK_OVERLAP: "20" - INDEX_BATCH_SIZE: "64" - INDEX_UPSERT_BATCH: "128" - INDEX_UPSERT_RETRIES: "5" - INDEX_UPSERT_BACKOFF: "0.5" - - # Watcher Configuration - WATCH_DEBOUNCE_SECS: "1.5" - - # ReFRAG Configuration + + FASTMCP_HOST: "0.0.0.0" + FASTMCP_PORT: "8000" + FASTMCP_INDEXER_PORT: "8001" + + TOOL_STORE_DESCRIPTION: "Store reusable code snippets for later retrieval. 
The 'information' is a clear NL description; include the actual code in 'metadata.code' and add 'metadata.language' (e.g., python, typescript) and 'metadata.path' when known. Use this whenever you generate or refine a code snippet." + TOOL_FIND_DESCRIPTION: "Search for relevant code snippets using multiple phrasings of the query (multi-query). Prefer results where metadata.language matches the target file and metadata.path is relevant. You may pass optional filters (language, path_prefix, kind) which the server applies server-side. Include 'metadata.code', 'metadata.path', and 'metadata.language' in responses." + + RERANKER_ENABLED: "1" + RERANKER_TOPN: "100" + RERANKER_RETURN_M: "20" + RERANKER_TIMEOUT_MS: "3000" + RERANK_TIMEOUT_FLOOR_MS: "1000" + + EMBEDDING_WARMUP: "0" + RERANK_WARMUP: "0" + + HYBRID_IN_PROCESS: "1" + RERANK_IN_PROCESS: "1" + + USE_TREE_SITTER: "1" + + HYBRID_EXPAND: "1" + HYBRID_PER_PATH: "1" + HYBRID_SYMBOL_BOOST: "0.35" + HYBRID_RECENCY_WEIGHT: "0.1" + RERANK_EXPAND: "1" + + INDEX_SEMANTIC_CHUNKS: "0" + + MEMORY_SSE_ENABLED: "true" + MEMORY_MCP_URL: "http://mcp:8000/sse" + MEMORY_MCP_TIMEOUT: "6" + + LLM_PROVIDER: "ollama" + OLLAMA_HOST: "http://ollama:11434" + LLM_EXPAND_MODEL: "phi3:mini" + LLM_EXPAND_MAX: "4" + PRF_ENABLED: "1" + REFRAG_MODE: "1" - REFRAG_GATE_FIRST: "1" - REFRAG_CANDIDATES: "200" + MINI_VECTOR_NAME: "mini" + MINI_VEC_DIM: "64" + MINI_VEC_SEED: "1337" + HYBRID_MINI_WEIGHT: "1.0" + + INDEX_MICRO_CHUNKS: "1" MICRO_CHUNK_TOKENS: "16" MICRO_CHUNK_STRIDE: "8" + REFRAG_GATE_FIRST: "1" + REFRAG_CANDIDATES: "200" + MICRO_OUT_MAX_SPANS: "3" MICRO_MERGE_LINES: "4" MICRO_BUDGET_TOKENS: "512" MICRO_TOKENS_PER_LINE: "32" - - # Decoder Configuration (optional) + + CTX_SUMMARY_CHARS: "0" + REFRAG_DECODER: "1" REFRAG_RUNTIME: "llamacpp" + REFRAG_ENCODER_MODEL: "BAAI/bge-base-en-v1.5" + REFRAG_PHI_PATH: "/work/models/refrag_phi_768_to_dmodel.bin" + REFRAG_SENSE: "heuristic" + LLAMACPP_URL: "http://llamacpp:8080" LLAMACPP_TIMEOUT_SEC: 
"180" DECODER_MAX_TOKENS: "4000" + REFRAG_DECODER_MODE: "prompt" + REFRAG_SOFT_SCALE: "1.0" - # Model download configuration (for init container) - LLAMACPP_MODEL_URL: "https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q8_0.gguf" - LLAMACPP_MODEL_NAME: "qwen2.5-1.5b-instruct-q8_0.gguf" - - # Reranker Configuration - RERANKER_ENABLED: "1" - - # MCP Configuration - FASTMCP_HOST: "0.0.0.0" - FASTMCP_PORT: "8000" - FASTMCP_INDEXER_PORT: "8001" + MAX_MICRO_CHUNKS_PER_FILE: "200" + QDRANT_TIMEOUT: "60" + MEMORY_AUTODETECT: "1" + MEMORY_COLLECTION_TTL_SECS: "300" + + FASTMCP_HTTP_TRANSPORT: "http" + FASTMCP_HTTP_PORT: "8002" + FASTMCP_HTTP_HEALTH_PORT: "18002" + FASTMCP_INDEXER_HTTP_PORT: "8003" + FASTMCP_INDEXER_HTTP_HEALTH_PORT: "18003" + + WATCH_DEBOUNCE_SECS: "1.5" + INDEX_UPSERT_BATCH: "128" + INDEX_UPSERT_RETRIES: "5" + + QDRANT_URL: "http://qdrant:6333" + + QDRANT_API_KEY: "" + REPO_NAME: "workspace" + FASTMCP_SERVER_NAME: "qdrant-mcp" + HOST_INDEX_PATH: "/work" + + INDEX_CHUNK_LINES: "120" + INDEX_CHUNK_OVERLAP: "20" + INDEX_BATCH_SIZE: "64" + INDEX_UPSERT_BACKOFF: "0.5" FASTMCP_HEALTH_PORT: "18000" - - # Memory Configuration - MEMORY_SSE_ENABLED: "true" - MEMORY_MCP_URL: "http://mcp-memory:8000/sse" - MEMORY_MCP_TIMEOUT: "6" - - # Multi-collection Configuration CTX_MULTI_COLLECTION: "1" CTX_DOC_PASS: "1" - - # Logging DEBUG_CONTEXT_ANSWER: "0" - - # Tokenizer TOKENIZER_JSON: "/app/models/tokenizer.json" + LLAMACPP_MODEL_URL: "https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q8_0.gguf" + LLAMACPP_MODEL_NAME: "qwen2.5-1.5b-instruct-q8_0.gguf" diff --git a/deploy/kubernetes/deploy.sh b/deploy/kubernetes/deploy.sh index 156f0e68..61fdf1c6 100755 --- a/deploy/kubernetes/deploy.sh +++ b/deploy/kubernetes/deploy.sh @@ -7,8 +7,9 @@ set -e # Configuration NAMESPACE="context-engine" -IMAGE_REGISTRY="context-engine" +IMAGE_REGISTRY="context-engine" # Change to your registry if needed 
IMAGE_TAG="latest" +USE_KUSTOMIZE=${USE_KUSTOMIZE:-"false"} # Colors for output RED='\033[0;31m' @@ -73,7 +74,7 @@ deploy_core() { # Wait for Qdrant to be ready log_info "Waiting for Qdrant to be ready..." - kubectl wait --for=condition=ready pod -l component=qdrant -n $NAMESPACE --timeout=300s || log_warning "Qdrant may not be ready yet" + kubectl wait --for=condition=ready pod -l component=qdrant -n "$NAMESPACE" --timeout=300s log_success "Core services deployed" } @@ -88,8 +89,8 @@ deploy_mcp_servers() { # Wait for MCP servers to be ready log_info "Waiting for MCP servers to be ready..." - kubectl wait --for=condition=ready pod -l component=mcp-memory -n $NAMESPACE --timeout=300s || log_warning "MCP Memory may not be ready yet" - kubectl wait --for=condition=ready pod -l component=mcp-indexer -n $NAMESPACE --timeout=300s || log_warning "MCP Indexer may not be ready yet" + kubectl wait --for=condition=ready pod -l component=mcp-memory -n "$NAMESPACE" --timeout=300s + kubectl wait --for=condition=ready pod -l component=mcp-indexer -n "$NAMESPACE" --timeout=300s log_success "MCP servers deployed" } @@ -99,9 +100,9 @@ deploy_http_servers() { log_info "Deploying HTTP servers (optional)" kubectl apply -f mcp-http.yaml - # Wait for HTTP servers to be ready - kubectl wait --for=condition=ready pod -l component=mcp-memory-http -n $NAMESPACE --timeout=300s || log_warning "MCP Memory HTTP may not be ready yet" - kubectl wait --for=condition=ready pod -l component=mcp-indexer-http -n $NAMESPACE --timeout=300s || log_warning "MCP Indexer HTTP may not be ready yet" + log_info "Waiting for HTTP servers to be ready..." 
+ kubectl wait --for=condition=ready pod -l component=mcp-memory-http -n "$NAMESPACE" --timeout=300s + kubectl wait --for=condition=ready pod -l component=mcp-indexer-http -n "$NAMESPACE" --timeout=300s log_success "HTTP servers deployed" } @@ -132,7 +133,7 @@ deploy_ingress() { kubectl apply -f ingress.yaml log_success "Ingress deployed" else - log_warning "Skipping Ingress deployment (set --deploy-ingress to enable)" + log_warning "Skipping Ingress deployment (set DEPLOY_INGRESS=true or pass --deploy-ingress to enable)" fi } @@ -148,6 +149,9 @@ show_status() { echo "Services:" kubectl get services -n $NAMESPACE echo + echo "Persistent Volumes:" + kubectl get pvc -n $NAMESPACE || echo "No PVCs found" + echo log_success "Deployment complete!" echo @@ -162,7 +166,6 @@ show_status() { fi } - # Patch images to the chosen registry:tag and refresh jobs set_images() { local full="${IMAGE_REGISTRY}:${IMAGE_TAG}" @@ -234,7 +237,6 @@ apply_with_kustomize() { rm -rf "${tmp_dir}" } - # Main deployment function main() { log_info "Starting Context-Engine Kubernetes deployment" @@ -329,6 +331,4 @@ if [[ ! 
-f "qdrant.yaml" ]]; then exit 1 fi -# Run main deployment main - diff --git a/deploy/kubernetes/indexer-services.yaml b/deploy/kubernetes/indexer-services.yaml index a2695dc3..a8351c35 100644 --- a/deploy/kubernetes/indexer-services.yaml +++ b/deploy/kubernetes/indexer-services.yaml @@ -1,5 +1,5 @@ --- -# Watcher Deployment (File change monitoring and reindexing) +# Indexer Service Deployment (file change monitoring and reindexing) # This is a template - copy and customize for each repository apiVersion: apps/v1 kind: Deployment @@ -8,24 +8,23 @@ metadata: namespace: context-engine labels: app: context-engine - component: watcher + component: indexer-service spec: replicas: 1 selector: matchLabels: app: context-engine - component: watcher + component: indexer-service template: metadata: labels: app: context-engine - component: watcher + component: indexer-service spec: serviceAccountName: context-engine - containers: - name: watcher - image: context-engine:latest + image: context-engine-indexer-service imagePullPolicy: IfNotPresent command: ["python", "/app/scripts/watch_index.py"] workingDir: /work @@ -78,15 +77,20 @@ spec: cpu: "250m" limits: memory: "2Gi" - cpu: "1" + cpu: "1000m" volumeMounts: - - name: work + - name: work-volume mountPath: /work + readOnly: true + - name: metadata-volume + mountPath: /work/.codebase volumes: - - name: work - hostPath: - path: /tmp/context-engine-work - type: DirectoryOrCreate + - name: work-volume + persistentVolumeClaim: + claimName: code-repos-pvc + - name: metadata-volume + persistentVolumeClaim: + claimName: code-metadata-pvc --- # Indexer Job (One-shot code indexing) @@ -109,7 +113,7 @@ spec: restartPolicy: OnFailure containers: - name: indexer - image: context-engine:latest + image: context-engine-indexer-service imagePullPolicy: IfNotPresent command: ["python", "/app/scripts/ingest_code.py"] workingDir: /work @@ -135,16 +139,20 @@ spec: cpu: "500m" limits: memory: "4Gi" - cpu: "2" + cpu: "2000m" volumeMounts: - - name: 
work + - name: work-volume mountPath: /work readOnly: true + - name: metadata-volume + mountPath: /work/.codebase volumes: - - name: work - hostPath: - path: /tmp/context-engine-work - type: DirectoryOrCreate + - name: work-volume + persistentVolumeClaim: + claimName: code-repos-pvc + - name: metadata-volume + persistentVolumeClaim: + claimName: code-metadata-pvc --- # Index Initialization Job @@ -167,7 +175,7 @@ spec: restartPolicy: OnFailure containers: - name: init-payload - image: context-engine:latest + image: context-engine-indexer-service imagePullPolicy: IfNotPresent command: ["python", "/app/scripts/create_indexes.py"] workingDir: /work @@ -190,12 +198,15 @@ spec: memory: "1Gi" cpu: "500m" volumeMounts: - - name: work + - name: work-volume mountPath: /work readOnly: true + - name: metadata-volume + mountPath: /work/.codebase volumes: - - name: work - hostPath: - path: /tmp/context-engine-work - type: DirectoryOrCreate - + - name: work-volume + persistentVolumeClaim: + claimName: code-repos-pvc + - name: metadata-volume + persistentVolumeClaim: + claimName: code-metadata-pvc diff --git a/deploy/kubernetes/ingress.yaml b/deploy/kubernetes/ingress.yaml index a415be73..99c1b5a6 100644 --- a/deploy/kubernetes/ingress.yaml +++ b/deploy/kubernetes/ingress.yaml @@ -1,5 +1,5 @@ --- -# Ingress for Context-Engine services +# Ingress for Context-Engine services (optional) # Requires an Ingress controller (e.g., nginx-ingress, traefik) apiVersion: networking.k8s.io/v1 kind: Ingress @@ -8,15 +8,14 @@ metadata: namespace: context-engine labels: app: context-engine + component: ingress annotations: - # Nginx Ingress annotations nginx.ingress.kubernetes.io/use-regex: "true" nginx.ingress.kubernetes.io/rewrite-target: /$2 - nginx.ingress.kubernetes.io/ssl-redirect: "true" - # Increase timeouts for SSE connections + nginx.ingress.kubernetes.io/ssl-redirect: "false" + nginx.ingress.kubernetes.io/proxy-body-size: "100m" nginx.ingress.kubernetes.io/proxy-read-timeout: "3600" 
nginx.ingress.kubernetes.io/proxy-send-timeout: "3600" - # Enable CORS if needed # nginx.ingress.kubernetes.io/enable-cors: "true" # nginx.ingress.kubernetes.io/cors-allow-origin: "*" spec: @@ -25,7 +24,6 @@ spec: - host: context-engine.example.com # Change to your domain http: paths: - # Qdrant - path: /qdrant(/|$)(.*) pathType: ImplementationSpecific backend: @@ -33,8 +31,6 @@ spec: name: qdrant port: number: 6333 - - # MCP Memory (SSE) - path: /mcp/memory(/|$)(.*) pathType: ImplementationSpecific backend: @@ -42,8 +38,6 @@ spec: name: mcp-memory port: number: 8000 - - # MCP Indexer (SSE) - path: /mcp/indexer(/|$)(.*) pathType: ImplementationSpecific backend: @@ -51,8 +45,6 @@ spec: name: mcp-indexer port: number: 8001 - - # MCP Memory HTTP - path: /mcp-http/memory(/|$)(.*) pathType: ImplementationSpecific backend: @@ -60,8 +52,6 @@ spec: name: mcp-memory-http port: number: 8002 - - # MCP Indexer HTTP - path: /mcp-http/indexer(/|$)(.*) pathType: ImplementationSpecific backend: @@ -69,8 +59,6 @@ spec: name: mcp-indexer-http port: number: 8003 - - # Llama.cpp (optional) - path: /llamacpp(/|$)(.*) pathType: ImplementationSpecific backend: @@ -78,10 +66,9 @@ spec: name: llamacpp port: number: 8080 - + # TLS configuration (optional) # tls: # - hosts: # - context-engine.example.com # secretName: context-engine-tls - diff --git a/deploy/kubernetes/kustomization.yaml b/deploy/kubernetes/kustomization.yaml index 908af460..10944621 100644 --- a/deploy/kubernetes/kustomization.yaml +++ b/deploy/kubernetes/kustomization.yaml @@ -12,6 +12,7 @@ resources: # Core services - qdrant.yaml + - code-models-pvc.yaml # MCP servers - mcp-memory.yaml @@ -22,20 +23,17 @@ resources: - indexer-services.yaml - rbac.yaml - networkpolicy.yaml - - hpa.yaml - # Optional services - llamacpp.yaml - ingress.yaml # Common labels -labels: - - pairs: - app.kubernetes.io/name: context-engine - app.kubernetes.io/component: kubernetes-deployment - app.kubernetes.io/managed-by: kustomize +commonLabels: + 
app.kubernetes.io/name: context-engine + app.kubernetes.io/component: kubernetes-deployment + app.kubernetes.io/managed-by: kustomize # Patches for production customization patchesStrategicMerge: [] @@ -81,4 +79,3 @@ patches: target: kind: Deployment name: mcp-memory - diff --git a/deploy/kubernetes/llamacpp.yaml b/deploy/kubernetes/llamacpp.yaml index aec6fcfe..695e3770 100644 --- a/deploy/kubernetes/llamacpp.yaml +++ b/deploy/kubernetes/llamacpp.yaml @@ -1,5 +1,5 @@ --- -# Llama.cpp Deployment (Optional - for text generation) +# Optional Llama.cpp Service (Text Generation) apiVersion: apps/v1 kind: Deployment metadata: @@ -9,7 +9,7 @@ metadata: app: context-engine component: llamacpp spec: - replicas: 1 + replicas: 1 # Set to 0 if not needed selector: matchLabels: app: context-engine @@ -86,26 +86,33 @@ spec: - name: http containerPort: 8080 protocol: TCP - command: - - /app/llama-server + env: + - name: LLAMA_ARG_MODEL + value: "/models/model.gguf" + - name: LLAMA_ARG_CTX_SIZE + value: "8192" + - name: LLAMA_ARG_HOST + value: "0.0.0.0" + - name: LLAMA_ARG_PORT + value: "8080" + command: ["llama-server"] args: - - --host - - "0.0.0.0" - - --port - - "8080" - - --model - - /models/qwen2.5-1.5b-instruct-q8_0.gguf - - --ctx-size - - "4096" - - --n-gpu-layers - - "0" + - "--model" + - "/models/model.gguf" + - "--host" + - "0.0.0.0" + - "--port" + - "8080" + - "--ctx-size" + - "8192" + - "--no-warmup" resources: requests: memory: "2Gi" - cpu: "1" + cpu: "1000m" limits: memory: "8Gi" - cpu: "4" + cpu: "4000m" volumeMounts: - name: models mountPath: /models @@ -115,18 +122,19 @@ spec: path: /health port: http initialDelaySeconds: 60 - periodSeconds: 10 + periodSeconds: 30 + timeoutSeconds: 10 readinessProbe: httpGet: path: /health port: http initialDelaySeconds: 30 - periodSeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 volumes: - name: models - hostPath: - path: /tmp/context-engine-models - type: DirectoryOrCreate + persistentVolumeClaim: + claimName: 
code-models-pvc --- # Llama.cpp Service @@ -139,16 +147,16 @@ metadata: app: context-engine component: llamacpp spec: - type: ClusterIP + type: NodePort # Change to LoadBalancer for external access ports: - name: http port: 8080 targetPort: http + nodePort: 30808 # Optional: specify node port protocol: TCP selector: app: context-engine component: llamacpp - --- # Optional: Llama.cpp External Service apiVersion: v1 @@ -170,4 +178,3 @@ spec: selector: app: context-engine component: llamacpp - diff --git a/deploy/kubernetes/mcp-http.yaml b/deploy/kubernetes/mcp-http.yaml index 8ecd7b55..9829b085 100644 --- a/deploy/kubernetes/mcp-http.yaml +++ b/deploy/kubernetes/mcp-http.yaml @@ -1,5 +1,5 @@ --- -# MCP Memory HTTP Deployment +# MCP Memory Server (HTTP) Deployment apiVersion: apps/v1 kind: Deployment metadata: @@ -21,31 +21,20 @@ spec: component: mcp-memory-http spec: serviceAccountName: context-engine - containers: - name: mcp-memory-http - image: context-engine:latest + image: context-engine-memory imagePullPolicy: IfNotPresent - command: ["python", "/app/scripts/mcp_memory_server.py"] + command: ["python", "-m", "mcp.server.fastmcp"] + args: ["--server-name", "context-engine-http", "--host", "0.0.0.0", "--port", "8000", "--transport", "http", "/app/scripts/memory_server.py"] ports: - name: http - containerPort: 8002 + containerPort: 8000 protocol: TCP - name: health - containerPort: 18002 + containerPort: 18000 protocol: TCP env: - - name: FASTMCP_TRANSPORT - value: "streamable-http" - - name: FASTMCP_HOST - valueFrom: - configMapKeyRef: - name: context-engine-config - key: FASTMCP_HOST - - name: FASTMCP_PORT - value: "8002" - - name: FASTMCP_HEALTH_PORT - value: "18002" - name: QDRANT_URL valueFrom: configMapKeyRef: @@ -61,15 +50,44 @@ spec: configMapKeyRef: name: context-engine-config key: EMBEDDING_MODEL + - name: EMBEDDING_PROVIDER + valueFrom: + configMapKeyRef: + name: context-engine-config + key: EMBEDDING_PROVIDER + - name: TOOL_STORE_DESCRIPTION + 
valueFrom: + configMapKeyRef: + name: context-engine-config + key: TOOL_STORE_DESCRIPTION + - name: TOOL_FIND_DESCRIPTION + valueFrom: + configMapKeyRef: + name: context-engine-config + key: TOOL_FIND_DESCRIPTION + - name: FASTMCP_HOST + valueFrom: + configMapKeyRef: + name: context-engine-config + key: FASTMCP_HOST + - name: FASTMCP_PORT + value: "8000" + - name: FASTMCP_TRANSPORT + valueFrom: + configMapKeyRef: + name: context-engine-config + key: FASTMCP_HTTP_TRANSPORT + - name: FASTMCP_HEALTH_PORT + value: "18000" resources: requests: - memory: "1Gi" - cpu: "500m" + memory: "512Mi" + cpu: "250m" limits: - memory: "4Gi" - cpu: "2" + memory: "2Gi" + cpu: "1000m" volumeMounts: - - name: work + - name: work-volume mountPath: /work readOnly: true livenessProbe: @@ -85,13 +103,12 @@ spec: initialDelaySeconds: 10 periodSeconds: 5 volumes: - - name: work - hostPath: - path: /tmp/context-engine-work - type: DirectoryOrCreate + - name: work-volume + persistentVolumeClaim: + claimName: code-repos-pvc --- -# MCP Memory HTTP Service +# MCP Memory Server (HTTP) Service apiVersion: v1 kind: Service metadata: @@ -101,15 +118,17 @@ metadata: app: context-engine component: mcp-memory-http spec: - type: ClusterIP + type: NodePort # Change to LoadBalancer for external access ports: - name: http port: 8002 targetPort: http + nodePort: 30804 # Optional: specify node port protocol: TCP - name: health port: 18002 targetPort: health + nodePort: 30805 # Optional: specify node port protocol: TCP selector: app: context-engine @@ -143,7 +162,7 @@ spec: component: mcp-memory-http --- -# MCP Indexer HTTP Deployment +# MCP Indexer Server (HTTP) Deployment apiVersion: apps/v1 kind: Deployment metadata: @@ -167,28 +186,18 @@ spec: serviceAccountName: context-engine containers: - name: mcp-indexer-http - image: context-engine:latest + image: context-engine-indexer imagePullPolicy: IfNotPresent - command: ["python", "/app/scripts/mcp_indexer_server.py"] + command: ["python", "-m", 
"mcp.server.fastmcp"] + args: ["--server-name", "context-engine-indexer-http", "--host", "0.0.0.0", "--port", "8001", "--transport", "http", "/app/scripts/indexer_server.py"] ports: - name: http - containerPort: 8003 + containerPort: 8001 protocol: TCP - name: health - containerPort: 18003 + containerPort: 18001 protocol: TCP env: - - name: FASTMCP_TRANSPORT - value: "streamable-http" - - name: FASTMCP_HOST - valueFrom: - configMapKeyRef: - name: context-engine-config - key: FASTMCP_HOST - - name: FASTMCP_INDEXER_PORT - value: "8003" - - name: FASTMCP_HEALTH_PORT - value: "18003" - name: QDRANT_URL valueFrom: configMapKeyRef: @@ -244,16 +253,32 @@ spec: configMapKeyRef: name: context-engine-config key: CTX_MULTI_COLLECTION + - name: FASTMCP_HOST + valueFrom: + configMapKeyRef: + name: context-engine-config + key: FASTMCP_HOST + - name: FASTMCP_INDEXER_PORT + value: "8001" + - name: FASTMCP_TRANSPORT + valueFrom: + configMapKeyRef: + name: context-engine-config + key: FASTMCP_HTTP_TRANSPORT + - name: FASTMCP_HEALTH_PORT + value: "18001" resources: requests: - memory: "1Gi" - cpu: "500m" + memory: "512Mi" + cpu: "250m" limits: - memory: "4Gi" - cpu: "2" + memory: "2Gi" + cpu: "1000m" volumeMounts: - - name: work + - name: work-volume mountPath: /work + - name: codebase-volume + mountPath: /work/.codebase livenessProbe: httpGet: path: /readyz @@ -267,13 +292,15 @@ spec: initialDelaySeconds: 10 periodSeconds: 5 volumes: - - name: work - hostPath: - path: /tmp/context-engine-work - type: DirectoryOrCreate + - name: work-volume + persistentVolumeClaim: + claimName: code-repos-pvc + - name: codebase-volume + persistentVolumeClaim: + claimName: code-metadata-pvc --- -# MCP Indexer HTTP Service +# MCP Indexer Server (HTTP) Service apiVersion: v1 kind: Service metadata: @@ -283,15 +310,17 @@ metadata: app: context-engine component: mcp-indexer-http spec: - type: ClusterIP + type: NodePort # Change to LoadBalancer for external access ports: - name: http port: 8003 targetPort: 
http + nodePort: 30806 # Optional: specify node port protocol: TCP - name: health port: 18003 targetPort: health + nodePort: 30807 # Optional: specify node port protocol: TCP selector: app: context-engine @@ -323,4 +352,3 @@ spec: selector: app: context-engine component: mcp-indexer-http - diff --git a/deploy/kubernetes/mcp-indexer.yaml b/deploy/kubernetes/mcp-indexer.yaml index a11bff6b..389316f3 100644 --- a/deploy/kubernetes/mcp-indexer.yaml +++ b/deploy/kubernetes/mcp-indexer.yaml @@ -1,5 +1,5 @@ --- -# MCP Indexer Deployment +# MCP Indexer Server (SSE) Deployment apiVersion: apps/v1 kind: Deployment metadata: @@ -21,10 +21,9 @@ spec: component: mcp-indexer spec: serviceAccountName: context-engine - containers: - name: mcp-indexer - image: context-engine:latest + image: context-engine-indexer imagePullPolicy: IfNotPresent command: ["python", "/app/scripts/mcp_indexer_server.py"] ports: @@ -47,6 +46,8 @@ spec: key: FASTMCP_INDEXER_PORT - name: FASTMCP_HEALTH_PORT value: "18001" + - name: FASTMCP_TRANSPORT + value: "sse" - name: QDRANT_URL valueFrom: configMapKeyRef: @@ -62,41 +63,6 @@ spec: configMapKeyRef: name: context-engine-config key: EMBEDDING_MODEL - - name: INDEX_MICRO_CHUNKS - valueFrom: - configMapKeyRef: - name: context-engine-config - key: INDEX_MICRO_CHUNKS - - name: MAX_MICRO_CHUNKS_PER_FILE - valueFrom: - configMapKeyRef: - name: context-engine-config - key: MAX_MICRO_CHUNKS_PER_FILE - - name: REFRAG_MODE - valueFrom: - configMapKeyRef: - name: context-engine-config - key: REFRAG_MODE - - name: REFRAG_DECODER - valueFrom: - configMapKeyRef: - name: context-engine-config - key: REFRAG_DECODER - - name: LLAMACPP_URL - valueFrom: - configMapKeyRef: - name: context-engine-config - key: LLAMACPP_URL - - name: MEMORY_SSE_ENABLED - valueFrom: - configMapKeyRef: - name: context-engine-config - key: MEMORY_SSE_ENABLED - - name: MEMORY_MCP_URL - valueFrom: - configMapKeyRef: - name: context-engine-config - key: MEMORY_MCP_URL - name: CTX_MULTI_COLLECTION 
valueFrom: configMapKeyRef: @@ -104,14 +70,16 @@ spec: key: CTX_MULTI_COLLECTION resources: requests: - memory: "1Gi" - cpu: "500m" + memory: "512Mi" + cpu: "250m" limits: - memory: "4Gi" - cpu: "2" + memory: "2Gi" + cpu: "1000m" volumeMounts: - - name: work + - name: work-volume mountPath: /work + - name: codebase-volume + mountPath: /work/.codebase livenessProbe: httpGet: path: /readyz @@ -125,13 +93,15 @@ spec: initialDelaySeconds: 10 periodSeconds: 5 volumes: - - name: work - hostPath: - path: /tmp/context-engine-work - type: DirectoryOrCreate + - name: work-volume + persistentVolumeClaim: + claimName: code-repos-pvc + - name: codebase-volume + persistentVolumeClaim: + claimName: code-metadata-pvc --- -# MCP Indexer Service +# MCP Indexer Server (SSE) Service apiVersion: v1 kind: Service metadata: @@ -141,15 +111,17 @@ metadata: app: context-engine component: mcp-indexer spec: - type: ClusterIP + type: NodePort # Change to LoadBalancer for external access ports: - name: sse port: 8001 targetPort: sse + nodePort: 30802 # Optional: specify node port protocol: TCP - name: health port: 18001 targetPort: health + nodePort: 30803 # Optional: specify node port protocol: TCP selector: app: context-engine @@ -181,4 +153,3 @@ spec: selector: app: context-engine component: mcp-indexer - diff --git a/deploy/kubernetes/mcp-memory.yaml b/deploy/kubernetes/mcp-memory.yaml index aca68cb0..891ce15e 100644 --- a/deploy/kubernetes/mcp-memory.yaml +++ b/deploy/kubernetes/mcp-memory.yaml @@ -1,5 +1,5 @@ --- -# MCP Memory Deployment +# MCP Memory Server (SSE) Deployment apiVersion: apps/v1 kind: Deployment metadata: @@ -21,10 +21,9 @@ spec: component: mcp-memory spec: serviceAccountName: context-engine - containers: - name: mcp-memory - image: context-engine:latest + image: context-engine-memory imagePullPolicy: IfNotPresent command: ["python", "/app/scripts/mcp_memory_server.py"] ports: @@ -47,6 +46,8 @@ spec: key: FASTMCP_PORT - name: FASTMCP_HEALTH_PORT value: "18000" + - name: 
FASTMCP_TRANSPORT + value: "sse" - name: QDRANT_URL valueFrom: configMapKeyRef: @@ -62,11 +63,6 @@ spec: configMapKeyRef: name: context-engine-config key: EMBEDDING_MODEL - - name: EMBEDDING_PROVIDER - valueFrom: - configMapKeyRef: - name: context-engine-config - key: EMBEDDING_PROVIDER resources: requests: memory: "1Gi" @@ -75,7 +71,7 @@ spec: memory: "4Gi" cpu: "2" volumeMounts: - - name: work + - name: work-volume mountPath: /work readOnly: true livenessProbe: @@ -91,13 +87,12 @@ spec: initialDelaySeconds: 10 periodSeconds: 5 volumes: - - name: work - hostPath: - path: /tmp/context-engine-work - type: DirectoryOrCreate + - name: work-volume + persistentVolumeClaim: + claimName: code-repos-pvc --- -# MCP Memory Service +# MCP Memory Server (SSE) Service apiVersion: v1 kind: Service metadata: @@ -107,15 +102,17 @@ metadata: app: context-engine component: mcp-memory spec: - type: ClusterIP + type: NodePort # Change to LoadBalancer for external access ports: - name: sse port: 8000 targetPort: sse + nodePort: 30800 # Optional: specify node port protocol: TCP - name: health port: 18000 targetPort: health + nodePort: 30801 # Optional: specify node port protocol: TCP selector: app: context-engine diff --git a/deploy/kubernetes/namespace.yaml b/deploy/kubernetes/namespace.yaml index b972df16..0f0cecad 100644 --- a/deploy/kubernetes/namespace.yaml +++ b/deploy/kubernetes/namespace.yaml @@ -3,5 +3,6 @@ kind: Namespace metadata: name: context-engine labels: - name: context-engine +name: context-engine app: context-engine + component: infrastructure diff --git a/deploy/kubernetes/qdrant.yaml b/deploy/kubernetes/qdrant.yaml index 503af2f8..191041b1 100644 --- a/deploy/kubernetes/qdrant.yaml +++ b/deploy/kubernetes/qdrant.yaml @@ -39,7 +39,7 @@ spec: value: "6334" resources: requests: - memory: "2Gi" +memory: "2Gi" cpu: "1" limits: memory: "8Gi" @@ -49,13 +49,13 @@ spec: mountPath: /qdrant/storage livenessProbe: httpGet: - path: /healthz +path: /healthz port: http 
initialDelaySeconds: 30 periodSeconds: 10 readinessProbe: httpGet: - path: /readyz +path: /readyz port: http initialDelaySeconds: 5 periodSeconds: 5 @@ -67,10 +67,10 @@ spec: component: qdrant spec: accessModes: ["ReadWriteOnce"] - storageClassName: standard # Adjust based on your cluster +# storageClassName: "" # Uncomment and set if you want to specify a storage class resources: requests: - storage: 50Gi + storage: 20Gi --- # Qdrant Service @@ -108,19 +108,18 @@ metadata: app: context-engine component: qdrant spec: - type: NodePort +type: NodePort # Change to LoadBalancer if your cluster supports it ports: - name: http port: 6333 targetPort: http - nodePort: 30333 +nodePort: 30333 # Optional: specify node port protocol: TCP - name: grpc port: 6334 targetPort: grpc - nodePort: 30334 +nodePort: 30334 # Optional: specify node port protocol: TCP selector: app: context-engine component: qdrant - diff --git a/deploy/kubernetes/upload-codebase-pvc.yaml b/deploy/kubernetes/upload-codebase-pvc.yaml new file mode 100644 index 00000000..cd6d07a9 --- /dev/null +++ b/deploy/kubernetes/upload-codebase-pvc.yaml @@ -0,0 +1,23 @@ +--- +# Persistent Volume Claim for codebase metadata storage (CephFS RWX) +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: upload-codebase-pvc + namespace: context-engine + labels: + app: context-engine + component: upload-service + type: storage +spec: + accessModes: + - ReadWriteMany # CephFS supports RWX for multiple pods + storageClassName: rook-cephfs # Adjust based on your CephFS storage class + resources: + requests: + storage: 5Gi # Smaller size for metadata/cache + # Optional: selector for specific PV + # selector: + # matchLabels: + # app: context-engine + # component: upload-codebase \ No newline at end of file diff --git a/deploy/kubernetes/upload-pvc.yaml b/deploy/kubernetes/upload-pvc.yaml new file mode 100644 index 00000000..8e4487dd --- /dev/null +++ b/deploy/kubernetes/upload-pvc.yaml @@ -0,0 +1,47 @@ +--- +# Persistent 
Volume Claim for code repositories storage (CephFS RWX) +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: code-repos-pvc + namespace: context-engine + labels: + app: context-engine + component: upload-service + type: storage +spec: + accessModes: + - ReadWriteMany # CephFS supports RWX for multiple pods + storageClassName: rook-cephfs # Adjust based on your CephFS storage class + resources: + requests: + storage: 10Gi # Adjust size based on your needs + # Optional: selector for specific PV + # selector: + # matchLabels: + # app: context-engine + # component: code-repos + +--- +# Persistent Volume Claim for code metadata storage (CephFS RWX) +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: code-metadata-pvc + namespace: context-engine + labels: + app: context-engine + component: upload-service + type: storage +spec: + accessModes: + - ReadWriteMany # CephFS supports RWX for multiple pods + storageClassName: rook-cephfs # Adjust based on your CephFS storage class + resources: + requests: + storage: 5Gi # Smaller size for metadata/cache + # Optional: selector for specific PV + # selector: + # matchLabels: + # app: context-engine + # component: code-metadata diff --git a/deploy/kubernetes/upload-service.yaml b/deploy/kubernetes/upload-service.yaml new file mode 100644 index 00000000..189a35b1 --- /dev/null +++ b/deploy/kubernetes/upload-service.yaml @@ -0,0 +1,129 @@ +--- +# Delta Upload Service Deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: upload-service + namespace: context-engine + labels: + app: context-engine + component: upload-service +spec: + replicas: 1 + selector: + matchLabels: + app: context-engine + component: upload-service + template: + metadata: + labels: + app: context-engine + component: upload-service + spec: + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + containers: + - name: upload-service + image: context-engine-upload-service # Use service-specific image name + 
imagePullPolicy: IfNotPresent + command: ["python", "scripts/upload_service.py"] + ports: + - name: http + containerPort: 8002 + protocol: TCP + env: + - name: QDRANT_URL + valueFrom: + configMapKeyRef: + name: context-engine-config + key: QDRANT_URL + - name: COLLECTION_NAME + valueFrom: + configMapKeyRef: + name: context-engine-config + key: COLLECTION_NAME + - name: UPLOAD_SERVICE_HOST + value: "0.0.0.0" + - name: UPLOAD_SERVICE_PORT + value: "8002" + - name: WORK_DIR + value: "/work" + - name: MAX_BUNDLE_SIZE_MB + value: "100" + - name: UPLOAD_TIMEOUT_SECS + value: "300" + - name: EMBEDDING_MODEL + valueFrom: + configMapKeyRef: + name: context-engine-config + key: EMBEDDING_MODEL + - name: USE_TREE_SITTER + valueFrom: + configMapKeyRef: + name: context-engine-config + key: USE_TREE_SITTER + - name: INDEX_SEMANTIC_CHUNKS + valueFrom: + configMapKeyRef: + name: context-engine-config + key: INDEX_SEMANTIC_CHUNKS + - name: INDEX_MICRO_CHUNKS + valueFrom: + configMapKeyRef: + name: context-engine-config + key: INDEX_MICRO_CHUNKS + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "2Gi" + cpu: "1000m" + volumeMounts: + - name: work-volume + mountPath: /work + - name: codebase-volume + mountPath: /work/.codebase + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 10 + periodSeconds: 5 + volumes: + - name: work-volume + persistentVolumeClaim: + claimName: upload-work-pvc + - name: codebase-volume + persistentVolumeClaim: + claimName: upload-codebase-pvc + +--- +# Delta Upload Service Service +apiVersion: v1 +kind: Service +metadata: + name: upload-service + namespace: context-engine + labels: + app: context-engine + component: upload-service +spec: + type: NodePort # Change to LoadBalancer for external access + ports: + - name: http + port: 8002 + targetPort: http + nodePort: 30804 # Optional: specify node port + 
protocol: TCP + selector: + app: context-engine + component: upload-service \ No newline at end of file diff --git a/docker-compose.dev-remote.yml b/docker-compose.dev-remote.yml new file mode 100644 index 00000000..27a2a4ca --- /dev/null +++ b/docker-compose.dev-remote.yml @@ -0,0 +1,404 @@ +# Development Docker Compose for Remote Upload System Testing +# This file simulates Kubernetes environment with shared volumes that simulate the Kubernetes CephFS RWX PVC behavior. +# Repos stored in /work/ (which is project root - avoiding docker volumes) and metadata are stored in /work/.codebase/repos (project root/.codebase) +# Updated to use separate PVCs for workspace and codebase to eliminate circular dependencies + +version: '3.8' + +services: + # Qdrant vector database - same as base compose + qdrant: + image: qdrant/qdrant:latest + container_name: qdrant-db-dev-remote + ports: + - "6333:6333" + - "6334:6334" + volumes: + - qdrant_storage_dev_remote:/qdrant/storage + networks: + - dev-remote-network + + # MCP search service - same as base compose + mcp: + build: + context: . 
+ dockerfile: Dockerfile.mcp + container_name: mcp-search-dev-remote + user: "1000:1000" + depends_on: + - qdrant + env_file: + - .env + environment: + - FASTMCP_HOST=${FASTMCP_HOST} + - FASTMCP_PORT=${FASTMCP_PORT} + - QDRANT_URL=${QDRANT_URL} + - COLLECTION_NAME=${COLLECTION_NAME} + - PATH_EMIT_MODE=container + - HF_HOME=/work/.cache/huggingface + - TRANSFORMERS_CACHE=/work/.cache/huggingface + - HUGGINGFACE_HUB_CACHE=/work/.cache/huggingface + - EMBEDDING_MODEL=${EMBEDDING_MODEL} + - EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER} + - TOOL_STORE_DESCRIPTION=${TOOL_STORE_DESCRIPTION} + - TOOL_FIND_DESCRIPTION=${TOOL_FIND_DESCRIPTION} + - FASTMCP_HEALTH_PORT=18000 + - HF_HOME=/home/user/.cache + - TRANSFORMERS_CACHE=/home/user/.cache + ports: + - "18000:18000" + - "8000:8000" + volumes: + - workspace_pvc:/work:ro + - huggingface_cache:/home/user/.cache + networks: + - dev-remote-network + + # MCP indexer service - same as base compose + mcp_indexer: + build: + context: . + dockerfile: Dockerfile.mcp-indexer + container_name: mcp-indexer-dev-remote + user: "1000:1000" + # In K8s, scripts would be accessed directly at /app/scripts/ or via proper initContainer + # For Docker Compose dev-remote simulation, create symlink so /work/scripts/ works + # Use /tmp/huggingface for cache to avoid permission issues (universally writable) + # Set CORRECT environment variables for HuggingFace and FastEmbed + command: ["sh", "-c", "mkdir -p /tmp/huggingface/hub /tmp/huggingface/transformers /tmp/huggingface/fastembed && exec python /app/scripts/mcp_indexer_server.py"] + depends_on: + - qdrant + env_file: + - .env + environment: + - FASTMCP_HEALTH_PORT=18001 + - FASTMCP_HOST=${FASTMCP_HOST} + - FASTMCP_INDEXER_PORT=${FASTMCP_INDEXER_PORT} + - QDRANT_URL=${QDRANT_URL} + - COLLECTION_NAME=${COLLECTION_NAME} + - PATH_EMIT_MODE=container + - HF_HOME=/tmp/huggingface + - HF_HUB_CACHE=/tmp/huggingface/hub + - TRANSFORMERS_CACHE=/tmp/huggingface/transformers + - 
FASTEMBED_CACHE_PATH=/tmp/huggingface/fastembed + - EMBEDDING_MODEL=${EMBEDDING_MODEL} + - EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER} + - QDRANT_TIMEOUT=${QDRANT_TIMEOUT:-60} + - INDEX_SEMANTIC_CHUNKS=${INDEX_SEMANTIC_CHUNKS:-0} + - INDEX_MICRO_CHUNKS=${INDEX_MICRO_CHUNKS:-0} + - INDEX_UPSERT_BATCH=${INDEX_UPSERT_BATCH:-512} + - INDEX_UPSERT_RETRIES=${INDEX_UPSERT_RETRIES:-5} + - MAX_MICRO_CHUNKS_PER_FILE=${MAX_MICRO_CHUNKS_PER_FILE:-200} + ports: + - "${FASTMCP_INDEXER_PORT:-8001}:8001" + - "18001:18001" + volumes: + - workspace_pvc:/work:rw + - codebase_pvc:/work/.codebase:rw + networks: + - dev-remote-network + + # MCP HTTP search service - same as base compose + mcp_http: + build: + context: . + dockerfile: Dockerfile.mcp + container_name: mcp-search-http-dev-remote + user: "1000:1000" + depends_on: + - qdrant + env_file: + - .env + environment: + - FASTMCP_HOST=${FASTMCP_HOST} + - FASTMCP_PORT=8000 + - FASTMCP_TRANSPORT=${FASTMCP_HTTP_TRANSPORT} + - QDRANT_URL=${QDRANT_URL} + - COLLECTION_NAME=${COLLECTION_NAME} + - PATH_EMIT_MODE=container + - HF_HOME=/work/.cache/huggingface + - TRANSFORMERS_CACHE=/work/.cache/huggingface + - HUGGINGFACE_HUB_CACHE=/work/.cache/huggingface + - EMBEDDING_MODEL=${EMBEDDING_MODEL} + - EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER} + - TOOL_STORE_DESCRIPTION=${TOOL_STORE_DESCRIPTION} + - TOOL_FIND_DESCRIPTION=${TOOL_FIND_DESCRIPTION} + - FASTMCP_HEALTH_PORT=18000 + - HF_HOME=/home/user/.cache + - TRANSFORMERS_CACHE=/home/user/.cache + ports: + - "${FASTMCP_HTTP_HEALTH_PORT:-18002}:18000" + - "${FASTMCP_HTTP_PORT:-8002}:8000" + volumes: + - workspace_pvc:/work:ro + - huggingface_cache:/home/user/.cache + networks: + - dev-remote-network + + # MCP HTTP indexer service - same as base compose + mcp_indexer_http: + build: + context: . 
+ dockerfile: Dockerfile.mcp-indexer + container_name: mcp-indexer-http-dev-remote + user: "1000:1000" + # In K8s, scripts would be accessed directly at /app/scripts/ or via proper initContainer + # For Docker Compose dev-remote simulation, create symlink so /work/scripts/ works + # Use /tmp/huggingface for cache to avoid permission issues (universally writable) + # Set CORRECT environment variables for HuggingFace and FastEmbed + command: ["sh", "-c", "mkdir -p /tmp/huggingface/hub /tmp/huggingface/transformers /tmp/huggingface/fastembed && exec python /app/scripts/mcp_indexer_server.py"] + depends_on: + - qdrant + env_file: + - .env + environment: + - FASTMCP_HOST=${FASTMCP_HOST} + - FASTMCP_INDEXER_PORT=8001 + - FASTMCP_TRANSPORT=${FASTMCP_HTTP_TRANSPORT} + - QDRANT_URL=${QDRANT_URL} + - FASTMCP_HEALTH_PORT=18001 + - COLLECTION_NAME=${COLLECTION_NAME} + - PATH_EMIT_MODE=container + - HF_HOME=/tmp/huggingface + - HF_HUB_CACHE=/tmp/huggingface/hub + - TRANSFORMERS_CACHE=/tmp/huggingface/transformers + - FASTEMBED_CACHE_PATH=/tmp/huggingface/fastembed + - EMBEDDING_MODEL=${EMBEDDING_MODEL} + - EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER} + - QDRANT_TIMEOUT=${QDRANT_TIMEOUT:-60} + - INDEX_SEMANTIC_CHUNKS=${INDEX_SEMANTIC_CHUNKS:-0} + - INDEX_MICRO_CHUNKS=${INDEX_MICRO_CHUNKS:-0} + - INDEX_UPSERT_BATCH=${INDEX_UPSERT_BATCH:-512} + - INDEX_UPSERT_RETRIES=${INDEX_UPSERT_RETRIES:-5} + - MAX_MICRO_CHUNKS_PER_FILE=${MAX_MICRO_CHUNKS_PER_FILE:-200} + ports: + - "${FASTMCP_INDEXER_HTTP_PORT:-8003}:8001" + - "${FASTMCP_INDEXER_HTTP_HEALTH_PORT:-18003}:18001" + volumes: + - workspace_pvc:/work:rw + - codebase_pvc:/work/.codebase:rw + networks: + - dev-remote-network + + # Llama.cpp decoder service - same as base compose + llamacpp: + image: ghcr.io/ggerganov/llama.cpp:server + container_name: llama-decoder-dev-remote + environment: + - LLAMA_ARG_MODEL=/models/model.gguf + - LLAMA_ARG_CTX_SIZE=8192 + - LLAMA_ARG_HOST=0.0.0.0 + - LLAMA_ARG_PORT=8080 + ports: + - "8080:8080" + 
volumes: + - ./models:/models:ro + command: ["--model", "/models/model.gguf", "--host", "0.0.0.0", "--port", "8080", "--no-warmup"] + networks: + - dev-remote-network + + # Indexer service - modified for PVC volumes + indexer: + build: + context: . + dockerfile: Dockerfile.indexer + container_name: indexer-dev-remote + depends_on: + - qdrant + env_file: + - .env + environment: + - QDRANT_URL=${QDRANT_URL} + - COLLECTION_NAME=${COLLECTION_NAME} + - HF_HOME=/work/.cache/huggingface + - TRANSFORMERS_CACHE=/work/.cache/huggingface + - HUGGINGFACE_HUB_CACHE=/work/.cache/huggingface + - EMBEDDING_MODEL=${EMBEDDING_MODEL} + - HF_HOME=/home/user/.cache + - HOST_INDEX_PATH=/work + - TRANSFORMERS_CACHE=/home/user/.cache + - QDRANT_TIMEOUT=${QDRANT_TIMEOUT:-60} + - INDEX_SEMANTIC_CHUNKS=${INDEX_SEMANTIC_CHUNKS:-0} + - INDEX_MICRO_CHUNKS=${INDEX_MICRO_CHUNKS:-0} + - INDEX_UPSERT_BATCH=${INDEX_UPSERT_BATCH:-512} + - INDEX_UPSERT_RETRIES=${INDEX_UPSERT_RETRIES:-5} + - MAX_MICRO_CHUNKS_PER_FILE=${MAX_MICRO_CHUNKS_PER_FILE:-200} + volumes: + - workspace_pvc:/work:rw + - codebase_pvc:/work/.codebase:rw + - huggingface_cache:/home/user/.cache + entrypoint: ["sh", "-c", "mkdir -p /tmp/logs && /app/scripts/wait-for-qdrant.sh && cd /app && python /app/scripts/ingest_code.py --root /work"] + restart: "no" # Run once on startup, do not restart after completion + networks: + - dev-remote-network + + # Watcher service - modified for PVC volumes + watcher: + build: + context: . 
+ dockerfile: Dockerfile.indexer + container_name: watcher-dev-remote + user: "1000:1000" + depends_on: + - qdrant + env_file: + - .env + environment: + - QDRANT_URL=${QDRANT_URL} + - COLLECTION_NAME=${COLLECTION_NAME} + - HF_HOME=/tmp/huggingface + - HF_HUB_CACHE=/tmp/huggingface/hub + - TRANSFORMERS_CACHE=/tmp/huggingface/transformers + - FASTEMBED_CACHE_PATH=/tmp/huggingface/fastembed + - EMBEDDING_MODEL=${EMBEDDING_MODEL} + - EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER} + - WATCH_ROOT=${WATCH_ROOT:-/work} + - HOST_INDEX_PATH=/work + - QDRANT_TIMEOUT=${QDRANT_TIMEOUT:-60} + - INDEX_SEMANTIC_CHUNKS=${INDEX_SEMANTIC_CHUNKS:-0} + - INDEX_MICRO_CHUNKS=${INDEX_MICRO_CHUNKS:-0} + - INDEX_UPSERT_BATCH=${INDEX_UPSERT_BATCH:-512} + - INDEX_UPSERT_RETRIES=${INDEX_UPSERT_RETRIES:-5} + - MAX_MICRO_CHUNKS_PER_FILE=${MAX_MICRO_CHUNKS_PER_FILE:-200} + - WATCH_DEBOUNCE_SECS=${WATCH_DEBOUNCE_SECS:-1.5} + - REMOTE_UPLOAD_ENABLED=${REMOTE_UPLOAD_ENABLED:-0} + volumes: + - workspace_pvc:/work:rw + - codebase_pvc:/work/.codebase:rw + command: ["sh", "-c", "mkdir -p /tmp/huggingface/hub /tmp/huggingface/transformers /tmp/huggingface/fastembed && exec python /app/scripts/watch_index.py"] + networks: + - dev-remote-network + + # Init payload service - modified for PVC volumes with complete bootstrap + init_payload: + build: + context: . 
+ dockerfile: Dockerfile.indexer + container_name: init-payload-dev-remote + user: "1000:1000" + depends_on: + - qdrant + env_file: + - .env + environment: + - QDRANT_URL=${QDRANT_URL} + - COLLECTION_NAME=${COLLECTION_NAME} + - HF_HOME=/work/.cache/huggingface + - TRANSFORMERS_CACHE=/work/.cache/huggingface + - HUGGINGFACE_HUB_CACHE=/work/.cache/huggingface + - WORKDIR=/work + - TOKENIZER_URL=${TOKENIZER_URL:-https://huggingface.co/BAAI/bge-base-en-v1.5/resolve/main/tokenizer.json} + - TOKENIZER_PATH=${TOKENIZER_PATH:-/work/models/tokenizer.json} + - EMBEDDING_MODEL=${EMBEDDING_MODEL} + volumes: + - workspace_pvc:/work:rw + - codebase_pvc:/work/.codebase:rw + command: [ + "sh", + "-c", + "mkdir -p /tmp/logs && echo 'Starting initialization sequence...' && /app/scripts/wait-for-qdrant.sh && PYTHONPATH=/app python /app/scripts/create_indexes.py && echo 'Collections and metadata created' && python /app/scripts/warm_all_collections.py && echo 'Search caches warmed for all collections' && python /app/scripts/health_check.py && echo 'Initialization completed successfully!'" + ] + restart: "no" # Run once on startup + networks: + - dev-remote-network + + # NEW: Upload Service for Remote Upload System + upload_service: + build: + context: . 
+ dockerfile: Dockerfile.upload-service + container_name: upload-service-dev-remote + user: "1000:1000" + depends_on: + - qdrant + env_file: + - .env + environment: + # Upload service configuration + - UPLOAD_SERVICE_HOST=0.0.0.0 + - UPLOAD_SERVICE_PORT=8002 + - QDRANT_URL=${QDRANT_URL} + - WORKDIR=/work + - MAX_BUNDLE_SIZE_MB=100 + - UPLOAD_TIMEOUT_SECS=300 + + # Indexing configuration + - COLLECTION_NAME=${COLLECTION_NAME} + - HF_HOME=/work/.cache/huggingface + - TRANSFORMERS_CACHE=/work/.cache/huggingface + - HUGGINGFACE_HUB_CACHE=/work/.cache/huggingface + - EMBEDDING_MODEL=${EMBEDDING_MODEL} + - EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER} + - USE_TREE_SITTER=${USE_TREE_SITTER} + - INDEX_SEMANTIC_CHUNKS=${INDEX_SEMANTIC_CHUNKS} + - INDEX_MICRO_CHUNKS=${INDEX_MICRO_CHUNKS} + + # Remote upload mode configuration + - REMOTE_UPLOAD_ENABLED=1 + - REMOTE_UPLOAD_MODE=development + - REMOTE_UPLOAD_DEBUG=1 + + # Qdrant configuration + - QDRANT_TIMEOUT=${QDRANT_TIMEOUT} + - MAX_MICRO_CHUNKS_PER_FILE=${MAX_MICRO_CHUNKS_PER_FILE} + - INDEX_UPSERT_BATCH=${INDEX_UPSERT_BATCH} + - INDEX_UPSERT_RETRIES=${INDEX_UPSERT_RETRIES} + ports: + - "8004:8002" # Map to different host port to avoid conflicts + - "18004:18000" # Health check port + volumes: + - workspace_pvc:/work:rw + - codebase_pvc:/work/.codebase:rw + - upload_temp:/tmp/uploads + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8002/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + restart: unless-stopped + networks: + - dev-remote-network + + +# PVCs to simulate CephFS RWX behavior (production-like) +volumes: + # Main workspace volume - simulates CephFS RWX for repository storage + workspace_pvc: + driver: local + driver_opts: + type: none + o: bind + device: ${HOST_INDEX_PATH:-./dev-workspace} + + # Codebase metadata volume - simulates CephFS RWX for indexing metadata + codebase_pvc: + driver: local + driver_opts: + type: none + o: bind + device: ./.codebase + + # Temporary upload 
storage + upload_temp: + driver: local + + # HuggingFace cache for model downloads + huggingface_cache: + driver: local + + # Indexer cache for model downloads + indexer_cache: + driver: local + + # Qdrant storage - separate from base compose to avoid conflicts + qdrant_storage_dev_remote: + driver: local + +# Custom network for service discovery +networks: + dev-remote-network: + driver: bridge + ipam: + config: + - subnet: 172.20.0.0/16 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 0d1e3698..d6c0f21e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,6 +8,9 @@ tree_sitter>=0.25.2 tree_sitter_languages; python_version < "3.13" mcp==1.17.0 fastmcp==2.12.4 +fastapi +uvicorn[standard] +python-multipart openai>=1.0 # Test-only diff --git a/scripts/create_indexes.py b/scripts/create_indexes.py index c0f3ff62..970f5374 100644 --- a/scripts/create_indexes.py +++ b/scripts/create_indexes.py @@ -4,21 +4,36 @@ QDRANT_URL = os.environ.get("QDRANT_URL", "http://qdrant:6333") from datetime import datetime +# Import critical functions first try: - from scripts.workspace_state import update_workspace_state, update_last_activity, get_collection_name + from scripts.workspace_state import get_collection_name, is_multi_repo_mode except Exception: - update_workspace_state = None # type: ignore - update_last_activity = None # type: ignore get_collection_name = None # type: ignore + is_multi_repo_mode = None # type: ignore + +# Import other optional functions +try: + from scripts.workspace_state import log_activity +except Exception: + log_activity = None # type: ignore COLLECTION = os.environ.get("COLLECTION_NAME", "codebase") # Discover workspace path for state updates (allows subdir indexing) WS_PATH = os.environ.get("INDEX_ROOT") or os.environ.get("WORKSPACE_PATH") or "/work" -# Use workspace state to get collection name (defaults to "codebase") +# Skip creating root collection in multi-repo mode when indexing entire /work tree +if 
is_multi_repo_mode and is_multi_repo_mode() and WS_PATH == "/work": + print("Multi-repo mode enabled - skipping root collection creation for /work") + exit(0) + +# Prefer workspace-derived collection names when env value is a placeholder if 'get_collection_name' in globals() and get_collection_name: try: - COLLECTION = get_collection_name(WS_PATH) + resolved = get_collection_name(None) + if resolved: + placeholders = {"", "default-collection", "my-collection", "codebase"} + if COLLECTION in placeholders: + COLLECTION = resolved except Exception: pass @@ -37,19 +52,14 @@ field_schema=models.PayloadSchemaType.KEYWORD, ) -# Update workspace state to record collection and activity +# Log activity using cleaned workspace_state function try: - if update_workspace_state: - update_workspace_state(WS_PATH, {"qdrant_collection": COLLECTION}) - if update_last_activity: - update_last_activity( - WS_PATH, - { - "timestamp": datetime.now().isoformat(), - "action": "initialized", - "file_path": "", - "details": {"created_indexes": ["metadata.language", "metadata.path_prefix"]}, - }, + if log_activity: + log_activity( + repo_name=None, + action="initialized", + file_path="", + details={"created_indexes": ["metadata.language", "metadata.path_prefix"]}, ) except Exception: pass diff --git a/scripts/dev-setup.sh b/scripts/dev-setup.sh new file mode 100755 index 00000000..1a7c5553 --- /dev/null +++ b/scripts/dev-setup.sh @@ -0,0 +1,169 @@ +#!/bin/bash + +# Development Environment Setup Script for Remote Upload System +# This script sets up the development environment for testing the remote upload workflow + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Configuration +DEV_WORKSPACE="${DEV_WORKSPACE:-./dev-workspace}" + +# Functions +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} 
$1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Check if Docker is running +check_docker() { + log_info "Checking Docker installation..." + if ! command -v docker &> /dev/null; then + log_error "Docker is not installed or not in PATH" + exit 1 + fi + + if ! docker info &> /dev/null; then + log_error "Docker daemon is not running" + exit 1 + fi + + log_success "Docker is available and running" +} + +# Check if Docker Compose is available +check_docker_compose() { + log_info "Checking Docker Compose installation..." + if ! command -v docker-compose &> /dev/null && ! docker compose version &> /dev/null; then + log_error "Docker Compose is not installed" + exit 1 + fi + + log_success "Docker Compose is available" +} + +# Create development workspace directory structure +setup_workspace() { + log_info "Setting up development workspace..." + + # Create main workspace directory + mkdir -p "$DEV_WORKSPACE" + mkdir -p "$DEV_WORKSPACE/.codebase" + + log_success "Development workspace created at $DEV_WORKSPACE" + log_info "You can mount your existing repositories here for testing" +} + +# Create environment file +create_env_file() { + log_info "Creating environment configuration..." + + if [ ! -f ".env" ]; then + cp .env.example .env + log_success "Created .env from .env.example" + else + log_warning ".env file already exists, skipping creation" + fi + + # Add dev-remote specific configurations if not already present + if ! 
grep -q "HOST_INDEX_PATH=./dev-workspace" .env; then + cat >> .env << 'EOF' + +# Development Remote Upload Configuration +HOST_INDEX_PATH=./dev-workspace +DEV_REMOTE_MODE=1 +DEV_REMOTE_DEBUG=1 + +# Upload Service Configuration (Development) +UPLOAD_SERVICE_HOST=0.0.0.0 +UPLOAD_SERVICE_PORT=8002 +UPLOAD_SERVICE_DEBUG=1 + +# Remote Upload Client Configuration +REMOTE_UPLOAD_ENABLED=1 +REMOTE_UPLOAD_ENDPOINT=http://upload_service:8002 +REMOTE_UPLOAD_MAX_RETRIES=3 +REMOTE_UPLOAD_TIMEOUT=30 +REMOTE_UPLOAD_DEBUG=1 + +# Development-specific settings +QDRANT_TIMEOUT=60 +MAX_MICRO_CHUNKS_PER_FILE=200 +INDEX_UPSERT_BATCH=128 +INDEX_UPSERT_RETRIES=5 +WATCH_DEBOUNCE_SECS=1.5 +EOF + log_success "Added dev-remote configuration to .env" + else + log_warning "Dev-remote configuration already exists in .env" + fi +} + +# Print usage information +print_usage() { + log_info "Development environment setup complete!" + echo + echo "Quick Start:" + echo " 1. Copy your repository to dev-workspace/your-repo-name" + echo " 2. Run: make dev-remote-bootstrap" + echo " 3. Test with: make dev-remote-test" + echo + echo "Available commands:" + echo " make dev-remote-up - Start the dev-remote stack" + echo " make dev-remote-down - Stop the dev-remote stack" + echo " make dev-remote-bootstrap - Bootstrap the complete system" + echo " make dev-remote-test - Test the remote upload workflow" + echo " make dev-remote-client - Start remote upload client" + echo " make dev-remote-clean - Clean up all dev-remote resources" + echo + echo "Service URLs:" + echo " Upload Service: http://localhost:8004" + echo " Qdrant Dashboard: http://localhost:6333" + echo " MCP Search: http://localhost:8000" + echo " MCP Indexer: http://localhost:8001" + echo + echo "Testing Workflow:" + echo " 1. Place your code in: $DEV_WORKSPACE/your-repo" + echo " 2. Start the stack: make dev-remote-bootstrap" + echo " 3. Test upload: curl http://localhost:8004/health" + echo " 4. 
Check status: curl 'http://localhost:8004/api/v1/delta/status?workspace_path=/work/your-repo'" + echo + echo "For remote upload testing:" + echo " 1. Set REMOTE_UPLOAD_ENDPOINT=http://localhost:8004" + echo " 2. Run: make watch-remote REMOTE_UPLOAD_ENDPOINT=http://localhost:8004" + echo + log_success "Ready to test the remote upload system!" +} + +# Main execution +main() { + log_info "Setting up development environment for remote upload system..." + + check_docker + check_docker_compose + setup_workspace + create_env_file + print_usage + + log_success "Development environment setup completed successfully!" +} + +# Run main function +main "$@" \ No newline at end of file diff --git a/scripts/health_check.py b/scripts/health_check.py index 3a0137e7..67e80e2e 100644 --- a/scripts/health_check.py +++ b/scripts/health_check.py @@ -39,83 +39,101 @@ def main(): client = QdrantClient(url=qdrant_url, api_key=api_key or None) - # 1) Collection exists and has expected named vector/dimension - info = client.get_collection(collection) - cfg = info.config.params.vectors - if isinstance(cfg, dict): - present_names = list(cfg.keys()) - assert_true(len(present_names) >= 1, "Collection has at least one named vector") - assert_true( - vec_name_expect in present_names, - f"Expected vector name present: {vec_name_expect} in {present_names}", - ) - got_dim = cfg[vec_name_expect].size - else: - present_names = [""] - got_dim = cfg.size - assert_true( - got_dim == dim, f"Vector dimension matches embedding ({got_dim} == {dim})" - ) - - # 2) HNSW tuned params (best effort; allow >= thresholds) - hcfg = info.config.hnsw_config + # Get all collections and check each one try: - m = getattr(hcfg, "m", None) - efc = getattr(hcfg, "ef_construct", None) - assert_true(m is None or m >= 16, f"HNSW m>=16 (got {m})") - assert_true(efc is None or efc >= 256, f"HNSW ef_construct>=256 (got {efc})") - except Exception: - print("[WARN] Could not read HNSW config; continuing") - - # 3) Payload indexes 
created (language, path_prefix, repo, kind, symbol) - # Not all clients expose schema listing; we validate by running filtered queries - probe_text = "split code into overlapping line chunks" - probe_vec = next(model.embed([probe_text])).tolist() - - # Unfiltered query - qp = client.query_points( - collection_name=collection, - query=probe_vec, - using=vec_name_expect, - limit=3, - with_payload=True, - search_params=models.SearchParams(hnsw_ef=128), - ) - res_points = getattr(qp, "points", qp) - assert_true(isinstance(res_points, list), "query_points returns a list of points") - - # Filtered by language + kind (should not error; may return 0 results if dataset sparse) - flt = models.Filter( - must=[ - models.FieldCondition( - key="metadata.language", match=models.MatchValue(value="python") - ), - models.FieldCondition( - key="metadata.kind", match=models.MatchValue(value="function") - ), - ] - ) - qp2 = client.query_points( - collection_name=collection, - query=probe_vec, - using=vec_name_expect, - query_filter=flt, - limit=3, - with_payload=True, - ) - res2 = getattr(qp2, "points", qp2) or [] - # If results exist, ensure payload has kind/symbol keys - if res2: - md: Dict[str, Any] = (res2[0].payload or {}).get("metadata") or {} + collections_response = client.get_collections() + collections = [c.name for c in collections_response.collections] + print(f"Found collections: {collections}") + except Exception as e: + print(f"Error getting collections: {e}") + sys.exit(1) + + if not collections: + print("No collections found - nothing to health check") + return + + # Check each collection + for collection_name in collections: + print(f"Checking collection: {collection_name}") + + # 1) Collection exists and has expected named vector/dimension + info = client.get_collection(collection_name) + cfg = info.config.params.vectors + if isinstance(cfg, dict): + present_names = list(cfg.keys()) + assert_true(len(present_names) >= 1, "Collection has at least one named vector") + 
assert_true( + vec_name_expect in present_names, + f"Expected vector name present: {vec_name_expect} in {present_names}", + ) + got_dim = cfg[vec_name_expect].size + else: + present_names = [""] + got_dim = cfg.size assert_true( - "kind" in md and "symbol" in md, - "payload includes metadata.kind and metadata.symbol", + got_dim == dim, f"Vector dimension matches embedding ({got_dim} == {dim})" ) - else: - print("[OK] Filtered query ran (no results is acceptable depending on data)") - print("All checks passed.") + # 2) HNSW tuned params (best effort; allow >= thresholds) + hcfg = info.config.hnsw_config + try: + m = getattr(hcfg, "m", None) + efc = getattr(hcfg, "ef_construct", None) + assert_true(m is None or m >= 16, f"HNSW m>=16 (got {m})") + assert_true(efc is None or efc >= 256, f"HNSW ef_construct>=256 (got {efc})") + except Exception: + print("[WARN] Could not read HNSW config; continuing") + + # 3) Test queries on this collection + probe_text = "split code into overlapping line chunks" + probe_vec = next(model.embed([probe_text])).tolist() + + # Unfiltered query + qp = client.query_points( + collection_name=collection_name, + query=probe_vec, + using=vec_name_expect, + limit=3, + with_payload=True, + search_params=models.SearchParams(hnsw_ef=128), + ) + res_points = getattr(qp, "points", qp) + assert_true(isinstance(res_points, list), "query_points returns a list of points") + + # Filtered by language + kind (should not error; may return 0 results if dataset sparse) + flt = models.Filter( + must=[ + models.FieldCondition( + key="metadata.language", match=models.MatchValue(value="python") + ), + models.FieldCondition( + key="metadata.kind", match=models.MatchValue(value="function") + ), + ] + ) + qp2 = client.query_points( + collection_name=collection_name, + query=probe_vec, + using=vec_name_expect, + query_filter=flt, + limit=3, + with_payload=True, + ) + res2 = getattr(qp2, "points", qp2) or [] + # If results exist, ensure payload has kind/symbol keys + if 
res2: + md: Dict[str, Any] = (res2[0].payload or {}).get("metadata") or {} + assert_true( + "kind" in md and "symbol" in md, + "payload includes metadata.kind and metadata.symbol", + ) + else: + print("[OK] Filtered query ran (no results is acceptable depending on data)") + + print(f"[OK] Collection {collection_name} health check passed") + + print(f"[OK] All {len(collections)} collections passed health check") if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/scripts/hybrid_search.py b/scripts/hybrid_search.py index 6dd1b25d..d2fe5af8 100644 --- a/scripts/hybrid_search.py +++ b/scripts/hybrid_search.py @@ -41,8 +41,17 @@ logger = logging.getLogger("hybrid_search") -def _collection() -> str: - return os.environ.get("COLLECTION_NAME", "codebase") +def _collection(collection_name: str | None = None) -> str: + """Determine collection name with priority: CLI arg > env > default.""" + + if collection_name and collection_name.strip(): + return collection_name.strip() + + env_coll = os.environ.get("COLLECTION_NAME", "").strip() + if env_coll: + return env_coll + + return "codebase" MODEL_NAME = os.environ.get("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5") @@ -120,6 +129,29 @@ def _coerce_points(result: Any) -> List[Any]: return [result] + +def _legacy_vector_search( + client: QdrantClient, + collection: str, + vec_name: str, + vector: List[float], + per_query: int, + flt, +) -> List[Any]: + """Fallback to legacy client.search when query_points is unavailable.""" + + try: + result = client.search( + collection_name=collection, + query_vector={"name": vec_name, "vector": vector}, + limit=per_query, + with_payload=True, + query_filter=flt, + ) + return _coerce_points(getattr(result, "points", result)) + except Exception: + return [] + + def _embed_queries_cached( model: TextEmbedding, queries: List[str] ) -> List[List[float]]: @@ -1159,14 +1191,15 @@ def _sanitize_filter_obj(flt): return None -def lex_query(client: QdrantClient, v: 
List[float], flt, per_query: int) -> List[Any]: +def lex_query(client: QdrantClient, v: List[float], flt, per_query: int, collection_name: str | None = None) -> List[Any]: ef = max(EF_SEARCH, 32 + 4 * int(per_query)) flt = _sanitize_filter_obj(flt) + collection = _collection(collection_name) # Prefer modern API; handle kwarg rename between client versions (query_filter -> filter) try: qp = client.query_points( - collection_name=_collection(), + collection_name=collection, query=v, using=LEX_VECTOR_NAME, query_filter=flt, @@ -1180,7 +1213,7 @@ def lex_query(client: QdrantClient, v: List[float], flt, per_query: int) -> List if os.environ.get("DEBUG_HYBRID_SEARCH"): logger.debug("QP_FILTER_KWARG_SWITCH", extra={"using": LEX_VECTOR_NAME}) qp = client.query_points( - collection_name=_collection(), + collection_name=collection, query=v, using=LEX_VECTOR_NAME, filter=flt, @@ -1189,6 +1222,8 @@ def lex_query(client: QdrantClient, v: List[float], flt, per_query: int) -> List with_payload=True, ) return _coerce_points(getattr(qp, "points", qp)) + except AttributeError: + return _legacy_vector_search(client, collection, LEX_VECTOR_NAME, v, per_query, flt) except Exception as e: # Retry without a filter at all (handles servers that reject certain filter shapes) if os.environ.get("DEBUG_HYBRID_SEARCH"): @@ -1198,7 +1233,7 @@ def lex_query(client: QdrantClient, v: List[float], flt, per_query: int) -> List pass try: qp = client.query_points( - collection_name=_collection(), + collection_name=collection, query=v, using=LEX_VECTOR_NAME, query_filter=None, @@ -1209,7 +1244,7 @@ def lex_query(client: QdrantClient, v: List[float], flt, per_query: int) -> List return _coerce_points(getattr(qp, "points", qp)) except TypeError: qp = client.query_points( - collection_name=_collection(), + collection_name=collection, query=v, using=LEX_VECTOR_NAME, filter=None, @@ -1224,18 +1259,19 @@ def lex_query(client: QdrantClient, v: List[float], flt, per_query: int) -> List 
logger.debug("QP_FILTER_DROP_FAILED", extra={"using": LEX_VECTOR_NAME, "reason": str(e2)[:200]}) except Exception: pass - return [] + return _legacy_vector_search(client, collection, LEX_VECTOR_NAME, v, per_query, flt) def dense_query( - client: QdrantClient, vec_name: str, v: List[float], flt, per_query: int + client: QdrantClient, vec_name: str, v: List[float], flt, per_query: int, collection_name: str | None = None ) -> List[Any]: ef = max(EF_SEARCH, 32 + 4 * int(per_query)) flt = _sanitize_filter_obj(flt) + collection = _collection(collection_name) try: qp = client.query_points( - collection_name=_collection(), + collection_name=collection, query=v, using=vec_name, query_filter=flt, @@ -1248,7 +1284,7 @@ def dense_query( if os.environ.get("DEBUG_HYBRID_SEARCH"): logger.debug("QP_FILTER_KWARG_SWITCH", extra={"using": vec_name}) qp = client.query_points( - collection_name=_collection(), + collection_name=collection, query=v, using=vec_name, filter=flt, @@ -1278,7 +1314,7 @@ def dense_query( except TypeError: try: qp = client.query_points( - collection_name=_collection(), + collection_name=collection, query=v, using=vec_name, filter=None, @@ -1293,7 +1329,7 @@ def dense_query( logger.debug("QP_FILTER_DROP_FAILED", extra={"using": vec_name, "reason": str(e2)[:200]}) except Exception: pass - return [] + return _legacy_vector_search(client, collection, vec_name, v, per_query, flt) # In-process API: run hybrid search and return structured items list @@ -1316,6 +1352,7 @@ def run_hybrid_search( not_glob: str | list[str] | None = None, expand: bool = True, model: TextEmbedding | None = None, + collection: str | None = None, ) -> List[Dict[str, Any]]: client = QdrantClient(url=os.environ.get("QDRANT_URL", QDRANT_URL), api_key=API_KEY) model_name = os.environ.get("EMBEDDING_MODEL", MODEL_NAME) @@ -1622,7 +1659,7 @@ def _bn(p: str) -> str: score_map: Dict[str, Dict[str, Any]] = {} try: lex_vec = lex_hash_vector(qlist) - lex_results = lex_query(client, lex_vec, flt, max(24, 
limit)) + lex_results = lex_query(client, lex_vec, flt, max(24, limit), collection) except Exception: lex_results = [] @@ -1664,7 +1701,7 @@ def _bn(p: str) -> str: try: if embedded: dim = len(embedded[0]) - _ensure_collection(client, _collection(), dim, vec_name) + _ensure_collection(client, _collection(collection), dim, vec_name) except Exception: pass # Optional gate-first using mini vectors to restrict dense search to candidates @@ -1721,7 +1758,7 @@ def _bn(p: str) -> str: # Get top candidates using MINI vectors (fast prefilter) candidate_ids = set() for mv in mini_queries: - mini_results = dense_query(client, MINI_VECTOR_NAME, mv, flt, cand_n) + mini_results = dense_query(client, MINI_VECTOR_NAME, mv, flt, cand_n, collection) for result in mini_results: if hasattr(result, 'id'): candidate_ids.add(result.id) @@ -1775,7 +1812,7 @@ def _bn(p: str) -> str: flt_gated = _sanitize_filter_obj(flt_gated) result_sets: List[List[Any]] = [ - dense_query(client, vec_name, v, flt_gated, max(24, limit)) for v in embedded + dense_query(client, vec_name, v, flt_gated, max(24, limit), collection) for v in embedded ] if os.environ.get("DEBUG_HYBRID_SEARCH"): total_dense_results = sum(len(rs) for rs in result_sets) @@ -1792,7 +1829,7 @@ def _bn(p: str) -> str: try: mini_queries = [_project_mini(list(v), MINI_VEC_DIM) for v in embedded] mini_sets: List[List[Any]] = [ - dense_query(client, MINI_VECTOR_NAME, mv, flt, max(24, limit)) + dense_query(client, MINI_VECTOR_NAME, mv, flt, max(24, limit), collection) for mv in mini_queries ] for res in mini_sets: @@ -1947,7 +1984,7 @@ def _bn(p: str) -> str: try: lex_vec2 = lex_hash_vector(prf_qs) lex_results2 = lex_query( - client, lex_vec2, flt, max(12, limit // 2 or 6) + client, lex_vec2, flt, max(12, limit // 2 or 6), collection ) except Exception: lex_results2 = [] @@ -1976,7 +2013,7 @@ def _bn(p: str) -> str: try: embedded2 = _embed_queries_cached(_model, prf_qs) result_sets2: List[List[Any]] = [ - dense_query(client, vec_name, v, 
flt, max(12, limit // 2 or 6)) + dense_query(client, vec_name, v, flt, max(12, limit // 2 or 6), collection) for v in embedded2 ] for res2 in result_sets2: @@ -2695,6 +2732,8 @@ def main(): # Structured filters to mirror MCP tool fields ap.add_argument("--ext", type=str, default=None) ap.add_argument("--not", dest="not_filter", type=str, default=None) + ap.add_argument("--collection", type=str, default=None, + help="Target collection name") ap.add_argument( "--case", type=str, @@ -2707,6 +2746,9 @@ def main(): args = ap.parse_args() + # Resolve effective collection early to avoid variable usage errors + eff_collection = args.collection or os.environ.get("COLLECTION_NAME", "codebase") + model = TextEmbedding(model_name=MODEL_NAME) vec_name = _sanitize_vector_name(MODEL_NAME) client = QdrantClient(url=QDRANT_URL, api_key=API_KEY or None) @@ -2715,7 +2757,7 @@ def main(): try: first_vec = next(model.embed(["__dim__warmup__"])) dim = len(first_vec.tolist()) - _ensure_collection(client, _collection(), dim, vec_name) + _ensure_collection(client, _collection(eff_collection), dim, vec_name) except Exception: pass @@ -2822,7 +2864,7 @@ def _norm_under(u: str | None) -> str | None: # Server-side lexical vector search (hashing) as an additional ranked list try: lex_vec = lex_hash_vector(queries) - lex_results = lex_query(client, lex_vec, flt, args.per_query) + lex_results = lex_query(client, lex_vec, flt, args.per_query, eff_collection) except Exception: lex_results = [] @@ -2874,7 +2916,7 @@ def _norm_under(u: str | None) -> str | None: embedded = _embed_queries_cached(model, queries) result_sets: List[List[Any]] = [ - dense_query(client, vec_name, v, flt, args.per_query) for v in embedded + dense_query(client, vec_name, v, flt, args.per_query, eff_collection) for v in embedded ] # RRF fusion (weighted) diff --git a/scripts/ingest_code.py b/scripts/ingest_code.py index f3289c49..9434794b 100644 --- a/scripts/ingest_code.py +++ b/scripts/ingest_code.py @@ -1,38 +1,15 @@ 
from __future__ import annotations -# Helper: detect repository name automatically (no REPO_NAME env needed) +# Import repository detection from workspace_state to avoid duplication def _detect_repo_name_from_path(path: Path) -> str: + """Wrapper function to use workspace_state repository detection.""" try: - import subprocess, os as _os - - base = path if path.is_dir() else path.parent - r = subprocess.run( - ["git", "-C", str(base), "rev-parse", "--show-toplevel"], - capture_output=True, - text=True, - ) - top = r.stdout.strip() - if r.returncode == 0 and top: - return Path(top).name or "workspace" - except Exception: - pass - # Fallback: walk up to find a .git folder - try: - cur = path if path.is_dir() else path.parent - for p in [cur] + list(cur.parents): - try: - if (p / ".git").exists(): - return p.name or "workspace" - except Exception: - continue - except Exception: - pass - # Last resort: directory name - try: - return (path if path.is_dir() else path.parent).name or "workspace" - except Exception: - return "workspace" + from scripts.workspace_state import _extract_repo_name_from_path as _ws_detect + return _ws_detect(str(path)) + except ImportError: + # Fallback for when workspace_state is not available + return path.name if path.is_dir() else path.parent.name #!/usr/bin/env python3 @@ -56,27 +33,43 @@ def _detect_repo_name_from_path(path: Path) -> str: from fastembed import TextEmbedding - from datetime import datetime + +# Import critical multi-repo functions first try: from scripts.workspace_state import ( - update_indexing_status, - update_last_activity, - update_workspace_state, + is_multi_repo_mode, get_collection_name, + ) +except ImportError: + is_multi_repo_mode = None # type: ignore + get_collection_name = None # type: ignore + +# Import watcher's repo detection for surgical fix +try: + from scripts.watch_index import _detect_repo_for_file, _get_collection_for_file +except ImportError: + _detect_repo_for_file = None # type: ignore + 
_get_collection_for_file = None # type: ignore + +# Import other workspace state functions (optional) +try: + from scripts.workspace_state import ( + log_activity, get_cached_file_hash, set_cached_file_hash, remove_cached_file, + update_indexing_status, + update_workspace_state, ) -except Exception: +except ImportError: # State integration is optional; continue if not available - update_indexing_status = None # type: ignore - update_last_activity = None # type: ignore - update_workspace_state = None # type: ignore - get_collection_name = None # type: ignore + log_activity = None # type: ignore get_cached_file_hash = None # type: ignore set_cached_file_hash = None # type: ignore remove_cached_file = None # type: ignore + update_indexing_status = None # type: ignore + update_workspace_state = None # type: ignore # Optional Tree-sitter import (graceful fallback) try: @@ -463,7 +456,6 @@ def chunk_semantic( n = len(lines) - # Extract symbols with line ranges symbols = _extract_symbols(language, text) if not symbols: @@ -524,7 +516,6 @@ def chunk_by_tokens( Tokenizer = None # type: ignore - try: k = int(os.environ.get("MICRO_CHUNK_TOKENS", str(k_tokens or 16)) or 16) except Exception: @@ -688,23 +679,21 @@ def ensure_collection(client: QdrantClient, name: str, dim: int, vector_name: st """ try: info = client.get_collection(name) - # Ensure HNSW tuned params even if the collection already existed - try: - client.update_collection( - collection_name=name, - hnsw_config=models.HnswConfigDiff(m=16, ef_construct=256), - ) - except Exception: - pass - # Schema repair: add missing named vectors on existing collections + # Prevent I/O storm - only update vectors if they actually don't exist try: cfg = getattr(info.config.params, "vectors", None) if isinstance(cfg, dict): + # Check if collection already has required vectors before updating + has_lex = LEX_VECTOR_NAME in cfg + has_mini = MINI_VECTOR_NAME in cfg + + # Only add to missing if vector doesn't already exist missing = 
{} - if LEX_VECTOR_NAME not in cfg: + if not has_lex: missing[LEX_VECTOR_NAME] = models.VectorParams( size=LEX_VECTOR_DIM, distance=models.Distance.COSINE ) + try: refrag_on = os.environ.get("REFRAG_MODE", "").strip().lower() in { "1", @@ -714,13 +703,17 @@ def ensure_collection(client: QdrantClient, name: str, dim: int, vector_name: st } except Exception: refrag_on = False - if refrag_on and MINI_VECTOR_NAME not in cfg: + + if refrag_on and not has_mini: missing[MINI_VECTOR_NAME] = models.VectorParams( size=int( os.environ.get("MINI_VEC_DIM", MINI_VEC_DIM) or MINI_VEC_DIM ), distance=models.Distance.COSINE, ) + + # Only update collection if vectors are actually missing + # Previous behavior: always called update_collection() causing I/O storms if missing: try: client.update_collection( @@ -729,10 +722,13 @@ def ensure_collection(client: QdrantClient, name: str, dim: int, vector_name: st except Exception: # Best-effort; if server doesn't support adding vectors, leave to recreate path pass - except Exception: + except Exception as e: + print(f"[COLLECTION_ERROR] Failed to update collection {name}: {e}") pass return - except Exception: + except Exception as e: + # Collection doesn't exist - proceed to create it + print(f"[COLLECTION_INFO] Creating new collection {name}: {type(e).__name__}") pass vectors_cfg = { vector_name: models.VectorParams(size=dim, distance=models.Distance.COSINE), @@ -1199,7 +1195,6 @@ def _extract_symbols_java(text: str) -> List[_Sym]: return syms - def _extract_symbols_csharp(text: str) -> List[_Sym]: lines = text.splitlines() syms: List[_Sym] = [] @@ -1263,7 +1258,6 @@ def _extract_symbols_php(text: str) -> List[_Sym]: return syms - def _extract_symbols_shell(text: str) -> List[_Sym]: lines = text.splitlines() syms: List[_Sym] = [] @@ -1667,8 +1661,8 @@ def index_single_file( ws_path = os.environ.get("WATCH_ROOT") or os.environ.get("WORKSPACE_PATH") or "/work" try: if get_cached_file_hash: - prev_local = get_cached_file_hash(ws_path, 
str(file_path)) - if prev_local and prev_local == file_hash: + prev_local = get_cached_file_hash(str(file_path), repo_tag) + if prev_local and file_hash and prev_local == file_hash: print(f"Skipping unchanged file (cache): {file_path}") return False except Exception: @@ -1855,13 +1849,13 @@ def make_point(pid, dense_vec, lex_vec, payload): try: ws = os.environ.get("WATCH_ROOT") or os.environ.get("WORKSPACE_PATH") or "/work" if set_cached_file_hash: - set_cached_file_hash(ws, str(file_path), file_hash) + file_repo_tag = _detect_repo_name_from_path(file_path) + set_cached_file_hash(str(file_path), file_hash, file_repo_tag) except Exception: pass return True return False - def index_repo( root: Path, qdrant_url: str, @@ -1911,35 +1905,61 @@ def index_repo( if vector_name is None: vector_name = _sanitize_vector_name(model_name) - # Workspace state: use single unified collection for seamless cross-repo search + use_per_repo_collections = False + + # Workspace state: derive collection and persist metadata try: ws_path = str(root) - # Always use the unified collection (default: "codebase") - if 'get_collection_name' in globals() and get_collection_name: - collection = get_collection_name(ws_path) - if update_workspace_state: - update_workspace_state(ws_path, {"qdrant_collection": collection}) - if update_indexing_status: + repo_tag = _detect_repo_name_from_path(root) if _detect_repo_name_from_path else None + + is_multi_repo = bool(is_multi_repo_mode and is_multi_repo_mode()) + use_per_repo_collections = bool(is_multi_repo and _get_collection_for_file) + + if use_per_repo_collections: + collection = None # Determined per file later + print("[multi_repo] Using per-repo collections for root") + else: + if 'get_collection_name' in globals() and get_collection_name: + try: + resolved = get_collection_name(ws_path) + placeholders = {"", "default-collection", "my-collection", "codebase"} + if resolved and collection in placeholders: + collection = resolved + except Exception: + 
pass + + if update_workspace_state and not use_per_repo_collections: + update_workspace_state( + workspace_path=ws_path, + updates={"qdrant_collection": collection}, + repo_name=repo_tag, + ) + if update_indexing_status and repo_tag: update_indexing_status( - ws_path, - { + workspace_path=ws_path, + status={ "state": "indexing", "started_at": datetime.now().isoformat(), "progress": {"files_processed": 0, "total_files": None}, }, + repo_name=repo_tag, ) - except Exception: - pass + except Exception as e: + # Log state update errors instead of silent failure + import traceback + print(f"[ERROR] Failed to update workspace state during indexing: {e}") + print(f"[ERROR] Traceback: {traceback.format_exc()}") print( f"Indexing root={root} -> {qdrant_url} collection={collection} model={model_name} recreate={recreate}" ) - # Health check: detect cache/collection sync issues before indexing - if not recreate and skip_unchanged: + # Health check: detect cache/collection sync issues before indexing (single-collection mode only) + if not recreate and skip_unchanged and not use_per_repo_collections and collection: try: from scripts.collection_health import auto_heal_if_needed + print("[health_check] Checking collection health...") heal_result = auto_heal_if_needed(str(root), collection, qdrant_url, dry_run=False) if heal_result["action_taken"] == "cleared_cache": @@ -1951,15 +1971,21 @@ def index_repo( except Exception as e: print(f"[health_check] Warning: health check failed: {e}") - if recreate: - recreate_collection(client, collection, dim, vector_name) + # Skip single collection setup in multi-repo mode + if not use_per_repo_collections: + if recreate: + recreate_collection(client, collection, dim, vector_name) + else: + ensure_collection(client, collection, dim, vector_name) + # Ensure useful payload indexes exist (idempotent) + ensure_payload_indexes(client, collection) else: - ensure_collection(client, collection, dim, vector_name) - - # Ensure useful payload indexes 
exist (idempotent) - ensure_payload_indexes(client, collection) + print("[multi_repo] Skipping single collection setup - will create per-repo collections during indexing") # Repo tag for filtering: auto-detect from git or folder name repo_tag = _detect_repo_name_from_path(root) + workspace_root = os.environ.get("WATCH_ROOT") or os.environ.get("WORKSPACE_PATH") or "/work" + touched_repos: set[str] = set() + repo_roots: dict[str, str] = {} # Batch and scaling config (env/CLI overridable) batch_texts: list[str] = [] @@ -2010,6 +2036,18 @@ def make_point(pid, dense_vec, lex_vec, payload): for file_path in iter_files(root): files_seen += 1 + + # Determine collection per-file in multi-repo mode (use watcher's exact logic) + current_collection = collection + if use_per_repo_collections: + if _get_collection_for_file: + current_collection = _get_collection_for_file(file_path) + # Ensure collection exists on first use + ensure_collection(client, current_collection, dim, vector_name) + ensure_payload_indexes(client, current_collection) + else: + current_collection = get_collection_name(ws_path) if get_collection_name else "default-collection" + try: text = file_path.read_text(encoding="utf-8", errors="ignore") except Exception as e: @@ -2018,20 +2056,38 @@ def make_point(pid, dense_vec, lex_vec, payload): language = detect_language(file_path) file_hash = hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest() + per_file_repo = ( + _detect_repo_name_from_path(file_path) + if _detect_repo_name_from_path + else repo_tag + ) + if per_file_repo: + touched_repos.add(per_file_repo) + repo_roots.setdefault( + per_file_repo, + str(Path(workspace_root).resolve() / per_file_repo), + ) + # Skip unchanged files if enabled (default) if skip_unchanged: # Prefer local workspace cache to avoid Qdrant lookups try: if get_cached_file_hash: - prev_local = get_cached_file_hash(ws_path, str(file_path)) - if prev_local and prev_local == file_hash: + prev_local = 
get_cached_file_hash(str(file_path), per_file_repo) + if prev_local and file_hash and prev_local == file_hash: if PROGRESS_EVERY <= 0 and files_seen % 50 == 0: print(f"... processed {files_seen} files (skipping unchanged, cache)") try: if update_indexing_status: + target_workspace = ( + ws_path if not use_per_repo_collections else str(file_path.parent) + ) + target_repo = ( + repo_tag if not use_per_repo_collections else per_file_repo + ) update_indexing_status( - ws_path, - { + workspace_path=target_workspace, + status={ "state": "indexing", "progress": { "files_processed": files_seen, @@ -2039,6 +2095,7 @@ def make_point(pid, dense_vec, lex_vec, payload): "current_file": str(file_path), }, }, + repo_name=target_repo, ) except Exception: pass @@ -2047,16 +2104,28 @@ def make_point(pid, dense_vec, lex_vec, payload): continue except Exception: pass - prev = get_indexed_file_hash(client, collection, str(file_path)) - if prev and prev == file_hash: + prev = get_indexed_file_hash(client, current_collection, str(file_path)) + if prev and file_hash and prev == file_hash: + # File exists in Qdrant with same hash - cache it locally for next time + try: + if set_cached_file_hash: + set_cached_file_hash(str(file_path), file_hash, per_file_repo) + except Exception: + pass if PROGRESS_EVERY <= 0 and files_seen % 50 == 0: # minor heartbeat when no progress cadence configured print(f"... 
processed {files_seen} files (skipping unchanged)") try: if update_indexing_status: + target_workspace = ( + ws_path if not use_per_repo_collections else str(file_path.parent) + ) + target_repo = ( + repo_tag if not use_per_repo_collections else per_file_repo + ) update_indexing_status( - ws_path, - { + workspace_path=target_workspace, + status={ "state": "indexing", "progress": { "files_processed": files_seen, @@ -2064,6 +2133,7 @@ def make_point(pid, dense_vec, lex_vec, payload): "current_file": str(file_path), }, }, + repo_name=target_repo, ) except Exception: pass @@ -2073,7 +2143,7 @@ def make_point(pid, dense_vec, lex_vec, payload): # Dedupe per-file by deleting previous points for this path (default) if dedupe: - delete_points_by_path(client, collection, str(file_path)) + delete_points_by_path(client, current_collection, str(file_path)) files_indexed += 1 symbols = _extract_symbols(language, text) @@ -2168,7 +2238,7 @@ def make_point(pid, dense_vec, lex_vec, payload): "kind": kind, "symbol": sym, "symbol_path": sym_path or "", - "repo": repo_tag, + "repo": per_file_repo, "start_line": ch["start"], "end_line": ch["end"], "code": ch["text"], @@ -2220,14 +2290,22 @@ def make_point(pid, dense_vec, lex_vec, payload): make_point(i, v, lx, m) for i, v, lx, m in zip(batch_ids, vectors, batch_lex, batch_meta) ] - upsert_points(client, collection, points) + upsert_points(client, current_collection, points) # Update local file-hash cache for any files that had chunks in this flush try: if set_cached_file_hash: for _p, _h in list(batch_file_hashes.items()): try: if _p and _h: - set_cached_file_hash(ws_path, _p, _h) + file_repo_tag = _detect_repo_name_from_path(Path(_p)) + repos_touched_name = file_repo_tag or per_file_repo + if repos_touched_name: + touched_repos.add(repos_touched_name) + repo_roots.setdefault( + repos_touched_name, + str(Path(workspace_root).resolve() / repos_touched_name), + ) + set_cached_file_hash(_p, _h, file_repo_tag) except Exception: continue 
except Exception: @@ -2241,19 +2319,25 @@ def make_point(pid, dense_vec, lex_vec, payload): ) try: if update_indexing_status: - update_indexing_status( - ws_path, - { - "state": "indexing", - "progress": { - "files_processed": files_seen, - "total_files": None, - "current_file": str(file_path), + per_file_repo = _detect_repo_name_from_path(file_path) if _detect_repo_name_from_path else repo_tag + if per_file_repo: + update_indexing_status( + workspace_path=str(file_path.parent), + status={ + "state": "indexing", + "progress": { + "files_processed": repo_progress.get(per_file_repo, 0), + "total_files": repo_total.get(per_file_repo, None), + "current_file": str(file_path), + }, }, - }, - ) - except Exception: - pass + repo_name=per_file_repo, + ) + except Exception as e: + # Log progress update errors instead of silent failure + import traceback + print(f"[ERROR] Failed to update indexing progress: {e}") + print(f"[ERROR] Traceback: {traceback.format_exc()}") if batch_texts: vectors = embed_batch(model, batch_texts) @@ -2267,14 +2351,16 @@ def make_point(pid, dense_vec, lex_vec, payload): make_point(i, v, lx, m) for i, v, lx, m in zip(batch_ids, vectors, batch_lex, batch_meta) ] - upsert_points(client, collection, points) + upsert_points(client, current_collection, points) # Update local file-hash cache for any files that had chunks during this run (final flush) try: if set_cached_file_hash: for _p, _h in list(batch_file_hashes.items()): try: if _p and _h: - set_cached_file_hash(ws_path, _p, _h) + per_file_repo = _detect_repo_name_from_path(Path(_p)) + if per_file_repo: + set_cached_file_hash(_p, _h, per_file_repo) except Exception: continue except Exception: @@ -2286,30 +2372,43 @@ def make_point(pid, dense_vec, lex_vec, payload): # Workspace state: mark completion try: - if update_last_activity: - update_last_activity( - ws_path, - { - "timestamp": datetime.now().isoformat(), - "action": "scan-completed", - "file_path": "", - "details": { - "files_seen": 
files_seen, - "files_indexed": files_indexed, - "chunks_indexed": points_indexed, - }, + if log_activity: + # Extract repo name from workspace path for log_activity + repo_name = None + if use_per_repo_collections: + # In multi-repo mode, we need to determine which repo this activity belongs to + # For scan completion, we use the workspace path as the repo identifier + repo_name = _detect_repo_name_from_path(Path(ws_path)) + + log_activity( + repo_name=repo_name, + action="scan-completed", + file_path="", + details={ + "files_seen": files_seen, + "files_indexed": files_indexed, + "chunks_indexed": points_indexed, }, ) if update_indexing_status: - update_indexing_status( - ws_path, - { - "state": "idle", - "progress": {"files_processed": files_indexed, "total_files": None}, - }, - ) - except Exception: - pass + for repo_name in touched_repos or ({repo_tag} if repo_tag else set()): + try: + target_ws = repo_roots.get(repo_name) or ws_path + update_indexing_status( + workspace_path=target_ws, + status={ + "state": "idle", + "progress": {"files_processed": files_indexed, "total_files": None}, + }, + repo_name=repo_name, + ) + except Exception: + continue + except Exception as e: + # Log the error instead of silently swallowing it + import traceback + print(f"[ERROR] Failed to update workspace state after indexing completion: {e}") + print(f"[ERROR] Traceback: {traceback.format_exc()}") def main(): @@ -2401,9 +2500,29 @@ def main(): qdrant_url = os.environ.get("QDRANT_URL", "http://localhost:6333") api_key = os.environ.get("QDRANT_API_KEY") - collection = os.environ.get("COLLECTION_NAME", "codebase") + collection = os.environ.get("COLLECTION_NAME") or os.environ.get("DEFAULT_COLLECTION") or "codebase" model_name = os.environ.get("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5") + # Resolve collection name based on multi-repo mode + multi_repo = bool(is_multi_repo_mode and is_multi_repo_mode()) + if multi_repo: + # Multi-repo mode: pass collection=None to trigger per-repo 
collection resolution + collection = None + print("[multi_repo] Multi-repo mode enabled - will create separate collections per repository") + else: + # Single-repo mode: use environment variable + if 'get_collection_name' in globals() and get_collection_name: + try: + resolved = get_collection_name(str(Path(args.root).resolve())) + placeholders = {"", "default-collection", "my-collection", "codebase"} + if resolved and collection in placeholders: + collection = resolved + except Exception: + pass + if not collection: + collection = os.environ.get("COLLECTION_NAME", "codebase") + print(f"[single_repo] Single-repo mode enabled - using collection: {collection}") + index_repo( Path(args.root).resolve(), qdrant_url, diff --git a/scripts/mcp_indexer_server.py b/scripts/mcp_indexer_server.py index 76a34173..95b047be 100644 --- a/scripts/mcp_indexer_server.py +++ b/scripts/mcp_indexer_server.py @@ -50,6 +50,14 @@ # Cache for memory collection autodetection (name + timestamp) _MEM_COLL_CACHE = {"name": None, "ts": 0.0} +# Session defaults map (token -> defaults). Guarded for concurrency. 
+_SESSION_LOCK = threading.Lock() +SESSION_DEFAULTS: Dict[str, Dict[str, Any]] = {} +# Per-connection defaults keyed by ctx.session (no token required) +from weakref import WeakKeyDictionary +_SESSION_CTX_LOCK = threading.Lock() +SESSION_DEFAULTS_BY_SESSION: "WeakKeyDictionary[Any, Dict[str, Any]]" = WeakKeyDictionary() + _roots = [p.strip() for p in _roots_env.split(",") if p.strip()] or ["/work", "/app"] try: @@ -142,8 +150,9 @@ def _highlight_snippet(snippet, tokens): # type: ignore try: # Official MCP Python SDK (FastMCP convenience server) - from mcp.server.fastmcp import FastMCP + from mcp.server.fastmcp import FastMCP, Context # type: ignore except Exception as e: # pragma: no cover + # Keep FastMCP import error loud; Context is for type hints only raise SystemExit("mcp package is required inside the container: pip install mcp") APP_NAME = os.environ.get("FASTMCP_SERVER_NAME", "qdrant-indexer-mcp") @@ -225,7 +234,22 @@ def _score(token: str) -> int: QDRANT_URL = os.environ.get("QDRANT_URL", "http://qdrant:6333") -DEFAULT_COLLECTION = os.environ.get("COLLECTION_NAME", "codebase") +DEFAULT_COLLECTION = ( + os.environ.get("DEFAULT_COLLECTION") + or os.environ.get("COLLECTION_NAME") + or "my-collection" +) +try: + from scripts.workspace_state import get_collection_name as _ws_get_collection_name # type: ignore + + if DEFAULT_COLLECTION in {"", "default-collection", "my-collection", "codebase"}: + workspace_path = os.environ.get("WATCH_ROOT", "/work") + resolved = _ws_get_collection_name(workspace_path) + if resolved: + DEFAULT_COLLECTION = resolved +except Exception: + pass + MAX_LOG_TAIL = safe_int( os.environ.get("MCP_MAX_LOG_TAIL", "4000"), default=4000, @@ -258,11 +282,23 @@ def _score(token: str) -> int: # --- Workspace state integration helpers --- def _state_file_path(ws_path: str = "/work") -> str: + """Locate workspace state using centralized metadata helpers when available.""" try: - return os.path.join(ws_path, ".codebase", "state.json") - except 
Exception as e: - logger.warning(f"State file path construction failed, using fallback: {e}") - return "/work/.codebase/state.json" + from scripts.workspace_state import ( + _extract_repo_name_from_path, + _state_file_path as _ws_state_file_path, + ) + + repo_name = _extract_repo_name_from_path(ws_path) + return str(_ws_state_file_path(workspace_path=None, repo_name=repo_name)) + except Exception: + try: + from scripts.workspace_state import _state_file_path as _ws_state_file_path + + return str(_ws_state_file_path(workspace_path=ws_path, repo_name=None)) + except Exception as exc: + logger.warning(f"State file path construction failed, using fallback: {exc}") + return os.path.join(ws_path, ".codebase", "state.json") def _read_ws_state(ws_path: str = "/work") -> Optional[Dict[str, Any]]: @@ -279,38 +315,33 @@ def _read_ws_state(ws_path: str = "/work") -> Optional[Dict[str, Any]]: def _default_collection() -> str: + env_coll = (os.environ.get("DEFAULT_COLLECTION") or os.environ.get("COLLECTION_NAME") or "").strip() + if env_coll: + return env_coll st = _read_ws_state("/work") if st: coll = st.get("qdrant_collection") if isinstance(coll, str) and coll.strip(): return coll.strip() - # Fall back to current environment rather than module-load default so tests - # and dynamic collection switching work correctly. - return os.environ.get("COLLECTION_NAME", DEFAULT_COLLECTION) - + return DEFAULT_COLLECTION def _work_script(name: str) -> str: - """Return path to a script under /app/scripts (container installation). - - Scripts are always installed at /app/scripts in the container. - This is independent of where user repositories are mounted. 
- """ - return os.path.join("/app", "scripts", name) - - -# Invalidate router scratchpad after reindex to avoid stale state reuse -_def_ws = "/work" - + """Return path to script respecting bind mounts first, then /app, then local fallback.""" + try: + work_path = os.path.join("/work", "scripts", name) + if os.path.exists(work_path): + return work_path + except Exception: + pass -def _invalidate_router_scratchpad(ws_path: str = _def_ws) -> bool: try: - p = os.path.join(ws_path, ".codebase", "router_scratchpad.json") - if os.path.exists(p): - os.remove(p) - return True + app_path = os.path.join("/app", "scripts", name) + if os.path.exists(app_path): + return app_path except Exception: pass - return False + + return os.path.join(os.getcwd(), "scripts", name) mcp = FastMCP(APP_NAME) @@ -517,7 +548,6 @@ def _cap_tail(s: str) -> str: except Exception as e: return {"ok": False, "code": -2, "stdout": "", "stderr": str(e)} finally: - # Explicitly close pipes to avoid unraisable warnings on transport GC try: if proc is not None: if proc.stdout is not None: @@ -648,28 +678,90 @@ def _to_str_list_relaxed(x: _Any) -> list[str]: if x is None: return [] if isinstance(x, (list, tuple)): - return [str(e) for e in x if str(e).strip()] + flat: list[str] = [] + for item in x: + flat.extend(_to_str_list_relaxed(item)) + return [t for t in flat if t.strip()] if isinstance(x, str): s = x.strip() if not s: return [] - # Try JSON array or Python literal list - if s.startswith("[") and s.endswith("]"): - try: - arr = json.loads(s) - if isinstance(arr, list): - return [str(e) for e in arr if str(e).strip()] - except json.JSONDecodeError: - try: - arr = _ast.literal_eval(s) - if isinstance(arr, (list, tuple)): - return [str(e) for e in arr if str(e).strip()] - except (ValueError, SyntaxError): - pass - # Comma-separated fallback - if "," in s: - return [t.strip() for t in s.split(",") if t.strip()] - return [s] + + def _normalize_tokens(val: _Any, depth: int = 0) -> list[str]: + if depth > 
10: + text = str(val).strip() + return [text] if text else [] + if isinstance(val, (list, tuple)): + tokens: list[str] = [] + for item in val: + tokens.extend(_normalize_tokens(item, depth + 1)) + return tokens + + text = str(val).strip() + if not text: + return [] + + seen: set[str] = set() + current = text + while True: + if not current: + return [] + key = f"{depth}:{current}" + if key in seen: + return [current] + seen.add(key) + + if len(current) >= 2 and current[0] == current[-1] and current[0] in {'"', "'"}: + current = current[1:-1].strip() + continue + + changed = False + if current.startswith('/"'): + current = current[2:].strip() + changed = True + if current.endswith('"/'): + current = current[:-2].strip() + changed = True + if current.endswith('/"'): + current = current[:-2].strip() + changed = True + if changed: + continue + + parsed = None + for parser in (json.loads, _ast.literal_eval): + try: + parsed = parser(current) + except Exception: + continue + else: + break + if isinstance(parsed, (list, tuple)): + tokens: list[str] = [] + for item in parsed: + tokens.extend(_normalize_tokens(item, depth + 1)) + return tokens + if isinstance(parsed, str): + current = parsed.strip() + continue + if parsed is not None: + current = str(parsed).strip() + continue + + maybe = current.replace('\\"', '"').replace("\\'", "'") + if maybe != current: + current = maybe.strip() + continue + + if ',' in current: + tokens: list[str] = [] + for part in current.split(','): + tokens.extend(_normalize_tokens(part, depth + 1)) + return tokens + + return [current] + + return [t for t in _normalize_tokens(s) if t.strip()] return [str(x)] @@ -834,9 +926,13 @@ async def qdrant_index_root( try: from scripts.workspace_state import ( get_collection_name as _ws_get_collection_name, + is_multi_repo_mode as _ws_is_multi_repo_mode, ) # type: ignore - coll = _ws_get_collection_name("/work") + if _ws_is_multi_repo_mode(): + coll = _ws_get_collection_name("/work") or _default_collection() 
+ else: + coll = _ws_get_collection_name(None) or _default_collection() except Exception: coll = _default_collection() @@ -906,9 +1002,12 @@ async def workspace_info( - {"workspace_path": str, "default_collection": str, "source": "state_file"|"env", "state": dict} """ ws_path = (workspace_path or "/work").strip() or "/work" + + st = _read_ws_state(ws_path) or {} coll = ( (st.get("qdrant_collection") if isinstance(st, dict) else None) + or os.environ.get("DEFAULT_COLLECTION") or os.environ.get("COLLECTION_NAME") or DEFAULT_COLLECTION ) @@ -943,28 +1042,233 @@ async def list_workspaces(search_root: Optional[str] = None) -> Dict[str, Any]: @mcp.tool() -async def memory_store( - information: str, - metadata: Optional[Dict[str, Any]] = None, +async def collection_map( + search_root: Optional[str] = None, collection: Optional[str] = None, + repo_name: Optional[str] = None, + include_samples: Optional[bool] = None, + limit: Optional[int] = None, ) -> Dict[str, Any]: - """Store a free-form memory entry in Qdrant (no code path metadata). + """Return collection↔repo mappings with optional Qdrant payload samples.""" - What it does: - - Embeds the text and upserts a payload with {"information", "metadata"} - - Uses named vectors (dense + lexical; mini when enabled) - - Enables context_search(include_memories=true) to surface it alongside code + def _norm_str(val: Any) -> Optional[str]: + if val is None: + return None + try: + s = str(val).strip() + except Exception: + return None + return s or None - When to use: - - Save preferences, decisions, or notes to retrieve later with code context + collection_filter = _norm_str(collection) + repo_filter = _norm_str(repo_name) + sample_flag = _coerce_bool(include_samples, False) - Parameters: - - information: str. Required text to remember. - - metadata: dict (optional). Tags like {"kind": "preference", "source": "memory"}. - - collection: str (optional). Defaults to workspace/env COLLECTION_NAME. 
+ max_entries: Optional[int] = None + if limit is not None: + try: + max_entries = max(1, int(limit)) + except Exception: + max_entries = None - Returns: - - {"ok": true, "id": str, "collection": str} or {"error": "..."} + state_entries: List[Dict[str, Any]] = [] + state_error: Optional[str] = None + + try: + from scripts.workspace_state import get_collection_mappings as _get_collection_mappings # type: ignore + + try: + state_entries = await asyncio.to_thread( + lambda: _get_collection_mappings(search_root) + ) + except Exception as exc: + state_error = str(exc) + state_entries = [] + except Exception as exc: # pragma: no cover + state_error = f"workspace_state unavailable: {exc}" + state_entries = [] + + if repo_filter: + state_entries = [ + entry for entry in state_entries if _norm_str(entry.get("repo_name")) == repo_filter + ] + if collection_filter: + state_entries = [ + entry + for entry in state_entries + if _norm_str(entry.get("collection_name")) == collection_filter + ] + + results: List[Dict[str, Any]] = [] + seen_collections: set[str] = set() + + for entry in state_entries: + item = dict(entry) + item["source"] = "state" + results.append(item) + coll = _norm_str(entry.get("collection_name")) + if coll: + seen_collections.add(coll) + + # Qdrant helpers ----------------------------------------------------- + sample_cache: Dict[str, Tuple[Optional[Dict[str, Any]], Optional[str]]] = {} + qdrant_error: Optional[str] = None + qdrant_used = False + client = None + + def _ensure_qdrant_client(): + nonlocal client, qdrant_error, qdrant_used + if client is not None or qdrant_error: + return client + try: + from qdrant_client import QdrantClient # type: ignore + except Exception as exc: # pragma: no cover + qdrant_error = f"qdrant_client unavailable: {exc}" + return None + + try: + qdrant_used = True + return QdrantClient( + url=QDRANT_URL, + api_key=os.environ.get("QDRANT_API_KEY"), + timeout=float(os.environ.get("QDRANT_TIMEOUT", "20") or 20), + ) + except 
Exception as exc: # pragma: no cover + qdrant_error = str(exc) + return None + + async def _sample_payload(coll_name: Optional[str]) -> Tuple[Optional[Dict[str, Any]], Optional[str]]: + key = _norm_str(coll_name) or "" + if not key: + return None, "missing_collection" + if key in sample_cache: + return sample_cache[key] + + cli = _ensure_qdrant_client() + if cli is None: + sample_cache[key] = (None, qdrant_error) + return sample_cache[key] + + def _scroll_one(): + try: + points, _ = cli.scroll( + collection_name=key, + limit=1, + with_payload=True, + with_vectors=False, + ) + return points + except Exception as exc: # pragma: no cover + raise exc + + try: + points = await asyncio.to_thread(_scroll_one) + except Exception as exc: # pragma: no cover + err = str(exc) + sample_cache[key] = (None, err) + return sample_cache[key] + + if not points: + sample_cache[key] = (None, None) + return sample_cache[key] + + payload = points[0].payload or {} + metadata = payload.get("metadata") or {} + sample = { + "host_path": metadata.get("host_path"), + "container_path": metadata.get("container_path"), + "path": metadata.get("path") or payload.get("path"), + "start_line": metadata.get("start_line"), + "end_line": metadata.get("end_line"), + } + sample_cache[key] = (sample, None) + return sample_cache[key] + + # Attach samples to state-backed entries when requested + if sample_flag and results: + for entry in results: + coll_name = entry.get("collection_name") + sample, err = await _sample_payload(coll_name) + if sample: + entry["sample"] = sample + if err: + entry.setdefault("warnings", []).append(err) + + # If no state entries (or explicit collection filtered out), fall back to Qdrant listings + fallback_entries: List[Dict[str, Any]] = [] + need_qdrant_listing = not results + + if need_qdrant_listing: + cli = _ensure_qdrant_client() + if cli is not None: + def _list_collections(): + info = cli.get_collections() + return [c.name for c in info.collections] + + try: + 
collection_names = await asyncio.to_thread(_list_collections) + except Exception as exc: # pragma: no cover + qdrant_error = str(exc) + collection_names = [] + + if collection_filter: + collection_names = [ + name for name in collection_names if _norm_str(name) == collection_filter + ] + + count = 0 + for name in collection_names: + if name in seen_collections: + continue + entry: Dict[str, Any] = { + "collection_name": name, + "source": "qdrant", + } + sample, err = await _sample_payload(name) if sample_flag else (None, None) + if sample: + entry["sample"] = sample + if err: + entry.setdefault("warnings", []).append(err) + fallback_entries.append(entry) + count += 1 + if max_entries is not None and count >= max_entries: + break + + entries = results + fallback_entries + + return { + "results": entries, + "counts": { + "state": len(state_entries), + "returned": len(entries), + "fallback": len(fallback_entries), + }, + "errors": { + "state": state_error, + "qdrant": qdrant_error, + }, + "qdrant_used": qdrant_used, + "filters": { + "collection": collection_filter, + "repo_name": repo_filter, + "search_root": search_root, + "include_samples": sample_flag, + "limit": max_entries, + }, + } + + +@mcp.tool() +async def memory_store( + information: str, + metadata: Optional[Dict[str, Any]] = None, + collection: Optional[str] = None, +) -> Dict[str, Any]: + """Store a free-form memory entry in Qdrant using the active collection. + + - Embeds the text and writes both dense and lexical vectors (plus mini vector in ReFRAG mode). + - Honors explicit collection overrides; otherwise falls back to workspace/env defaults. + - Returns a payload compatible with context-aware tools. 
""" try: from qdrant_client import QdrantClient, models # type: ignore @@ -972,6 +1276,8 @@ async def memory_store( import time, hashlib, re, math from scripts.utils import sanitize_vector_name from scripts.ingest_code import ensure_collection as _ensure_collection # type: ignore + + from scripts.ingest_code import project_mini as _project_mini # type: ignore except Exception as e: # pragma: no cover @@ -1248,9 +1554,13 @@ async def qdrant_index( try: from scripts.workspace_state import ( get_collection_name as _ws_get_collection_name, + is_multi_repo_mode as _ws_is_multi_repo_mode, ) # type: ignore - coll = _ws_get_collection_name("/work") + if _ws_is_multi_repo_mode(): + coll = _ws_get_collection_name(root) or _default_collection() + else: + coll = _ws_get_collection_name(None) or _default_collection() except Exception: coll = _default_collection() @@ -1279,17 +1589,69 @@ async def qdrant_index( @mcp.tool() -async def qdrant_prune(kwargs: Any = None) -> Dict[str, Any]: - """Remove stale points for /work (files deleted/moved but still in the index). +async def set_session_defaults( + collection: Any = None, + session: Any = None, + ctx: Context = None, + **kwargs, +) -> Dict[str, Any]: + """Set defaults (e.g., collection) for subsequent calls. - When to use: - - After large deletes/moves when watcher/indexer may not have cleaned up + Behavior: + - If request Context is available, persist defaults per-connection so later calls on + the same MCP session automatically use them (no token required). + - Optionally also stores token-scoped defaults for cross-connection reuse. 
+ """ + try: + _extra = _extract_kwargs_payload(kwargs) + if _extra: + if (collection is None or (isinstance(collection, str) and collection.strip() == "")) and _extra.get("collection") is not None: + collection = _extra.get("collection") + if (session is None or (isinstance(session, str) and str(session).strip() == "")) and _extra.get("session") is not None: + session = _extra.get("session") + except Exception: + pass - Parameters: - - (none). Operates on the current collection for /work. + defaults: Dict[str, Any] = {} + if isinstance(collection, str) and collection.strip(): + defaults["collection"] = str(collection).strip() - Returns: - - Subprocess result from prune.py; on success code==0. + # Per-connection storage (preferred) + try: + if ctx is not None and getattr(ctx, "session", None) is not None and defaults: + with _SESSION_CTX_LOCK: + existing2 = SESSION_DEFAULTS_BY_SESSION.get(ctx.session) or {} + existing2.update(defaults) + SESSION_DEFAULTS_BY_SESSION[ctx.session] = existing2 + except Exception: + pass + + # Optional token storage + sid = str(session).strip() if session is not None else "" + if not sid: + sid = uuid.uuid4().hex[:12] + try: + if defaults: + with _SESSION_LOCK: + existing = SESSION_DEFAULTS.get(sid) or {} + existing.update(defaults) + SESSION_DEFAULTS[sid] = existing + except Exception: + pass + + return { + "ok": True, + "session": sid, + "defaults": SESSION_DEFAULTS.get(sid, {}), + "applied": ("connection" if (ctx is not None and getattr(ctx, "session", None) is not None) else "token"), + } + +@mcp.tool() +async def qdrant_prune(kwargs: Any = None, **ignored: Any) -> Dict[str, Any]: + """Remove stale points for /work (files deleted/moved but still in the index). + + Extra arguments are accepted for forward compatibility but ignored. + Returns the subprocess result from ``prune.py`` with status information. 
""" env = os.environ.copy() env["PRUNE_ROOT"] = "/work" @@ -1314,6 +1676,11 @@ async def repo_search( highlight_snippet: Any = None, collection: Any = None, workspace_path: Any = None, + + + session: Any = None, + ctx: Context = None, + # Structured filters (optional; mirrors hybrid_search flags) language: Any = None, under: Any = None, @@ -1340,7 +1707,8 @@ async def repo_search( - query: str or list[str]. Multiple queries are fused; accepts "queries" alias. - limit: int (default 10). Total results across files. - per_path: int (default 2). Max results per file. - - include_snippet: bool. If true, returns a short snippet near the hit; control length with context_lines. + - include_snippet/context_lines: return inline snippets near hits when true. + - rerank_*: optional ONNX reranker toggles; timeouts fall back to hybrid output. - collection: str. Target collection; defaults to workspace state or env COLLECTION_NAME. - Filters (optional): language, under (path prefix), kind, symbol, ext, path_regex, path_glob (str or list[str]), not_glob (str or list[str]), not_ (negative text), case. 
@@ -1416,6 +1784,12 @@ async def repo_search( or (isinstance(collection, str) and collection.strip() == "") ) and _extra.get("collection"): collection = _extra.get("collection") + # Optional session token for session-scoped defaults + if ( + (session is None) or (isinstance(session, str) and str(session).strip() == "") + ) and _extra.get("session") is not None: + session = _extra.get("session") + # Optional workspace_path routing if ( (workspace_path is None) @@ -1425,6 +1799,7 @@ async def repo_search( ) ) and _extra.get("workspace_path") is not None: workspace_path = _extra.get("workspace_path") + if ( language is None or (isinstance(language, str) and language.strip() == "") @@ -1489,6 +1864,10 @@ def _to_bool(x, default): return False return default + # Session token (top-level or parsed from nested kwargs above) + sid = (str(session).strip() if session is not None else "") + + def _to_str(x, default=""): if x is None: return default @@ -1515,17 +1894,39 @@ def _to_str(x, default=""): ) highlight_snippet = _to_bool(highlight_snippet, True) - # Resolve collection: explicit > workspace_path state > default - ws_hint = _to_str(workspace_path, "").strip() + # Resolve collection precedence: explicit > per-connection defaults > token defaults > env default coll_hint = _to_str(collection, "").strip() - if not coll_hint and ws_hint: + + # 1) Per-connection defaults via ctx (no token required) + if (not coll_hint) and ctx is not None and getattr(ctx, "session", None) is not None: + try: + with _SESSION_CTX_LOCK: + _d2 = SESSION_DEFAULTS_BY_SESSION.get(ctx.session) or {} + _sc2 = str((_d2.get("collection") or "")).strip() + if _sc2: + coll_hint = _sc2 + except Exception: + pass + + # 2) Legacy token-based defaults + if (not coll_hint) and sid: try: - st = _read_ws_state(ws_hint) - if st and isinstance(st.get("qdrant_collection"), str): - coll_hint = st.get("qdrant_collection").strip() + with _SESSION_LOCK: + _d = SESSION_DEFAULTS.get(sid) or {} + _sc = 
str((_d.get("collection") or "")).strip() + if _sc: + coll_hint = _sc except Exception: pass - collection = coll_hint or _default_collection() + + # 3) Environment default + env_coll = (os.environ.get("DEFAULT_COLLECTION") or os.environ.get("COLLECTION_NAME") or "").strip() + if (not coll_hint) and env_coll: + coll_hint = env_coll + + # Final fallback + env_fallback = (os.environ.get("DEFAULT_COLLECTION") or os.environ.get("COLLECTION_NAME") or "my-collection").strip() + collection = coll_hint or env_fallback language = _to_str(language, "").strip() under = _to_str(under, "").strip() @@ -1624,7 +2025,7 @@ def _to_str_list(x): path_regex=path_regex or None, path_glob=(path_globs or None), not_glob=(not_globs or None), - expand=str(os.environ.get("HYBRID_EXPAND", "0")).strip().lower() + expand=str(os.environ.get("HYBRID_EXPAND", "1")).strip().lower() in {"1", "true", "yes", "on"}, model=model, ) @@ -1675,6 +2076,8 @@ def _to_str_list(x): cmd += ["--not-glob", g] for q in queries: cmd += ["--query", q] + if collection: + cmd += ["--collection", str(collection)] res = await _run_async(cmd, env=env) for line in (res.get("stdout") or "").splitlines(): @@ -2090,6 +2493,8 @@ async def repo_search_compat(**arguments) -> Dict[str, Any]: "rerank_timeout_ms": args.get("rerank_timeout_ms"), "highlight_snippet": args.get("highlight_snippet"), "collection": args.get("collection"), + "session": args.get("session"), + "workspace_path": args.get("workspace_path"), "language": args.get("language"), "under": args.get("under"), "kind": args.get("kind"), @@ -3261,9 +3666,9 @@ def _poll_ready(): if tool_name: qtext = " ".join([q for q in queries if q]).strip() or queries[0] arg_variants: List[Dict[str, Any]] = [ - {"query": qtext, "limit": mem_limit}, - {"q": qtext, "limit": mem_limit}, - {"text": qtext, "limit": mem_limit}, + {"query": qtext, "limit": mem_limit, "collection": mcoll}, + {"q": qtext, "limit": mem_limit, "collection": mcoll}, + {"text": qtext, "limit": mem_limit, 
"collection": mcoll}, ] res_obj = None for args in arg_variants: @@ -6342,7 +6747,6 @@ def _k(s: Dict[str, Any]): include_snippet=bool(include_snippet), queries=queries, ) - # Debug: log span details if os.environ.get("DEBUG_CONTEXT_ANSWER"): logger.debug( diff --git a/scripts/mcp_memory_server.py b/scripts/mcp_memory_server.py index 5a782d82..6777f16a 100644 --- a/scripts/mcp_memory_server.py +++ b/scripts/mcp_memory_server.py @@ -2,14 +2,26 @@ from typing import Any, Dict, Optional, List import json import threading +from weakref import WeakKeyDictionary -from mcp.server.fastmcp import FastMCP +# FastMCP server and request Context (ctx) for per-connection state +try: + from mcp.server.fastmcp import FastMCP, Context # type: ignore +except Exception: + # Fallback: keep FastMCP import; treat Context as Any for type hints + from mcp.server.fastmcp import FastMCP # type: ignore + Context = Any # type: ignore + from qdrant_client import QdrantClient, models # Env QDRANT_URL = os.environ.get("QDRANT_URL", "http://qdrant:6333") -DEFAULT_COLLECTION = os.environ.get("COLLECTION_NAME", "codebase") +DEFAULT_COLLECTION = ( + os.environ.get("DEFAULT_COLLECTION") + or os.environ.get("COLLECTION_NAME") + or "my-collection" +) LEX_VECTOR_NAME = os.environ.get("LEX_VECTOR_NAME", "lex") LEX_VECTOR_DIM = int(os.environ.get("LEX_VECTOR_DIM", "4096") or 4096) EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5") @@ -37,8 +49,6 @@ def _get_embedding_model(): _EMBED_MODEL = m return m - - # Ensure repo roots are importable so 'scripts' resolves inside container import sys as _sys _roots_env = os.environ.get("WORK_ROOTS", "") @@ -59,6 +69,51 @@ def _get_embedding_model(): VECTOR_NAME = _sanitize_vector_name(EMBEDDING_MODEL) +# I/O-safety knobs for memory server behavior +# These env vars allow tuning startup latency vs. first-call latency, especially important +# on slow storage backends (e.g., Ceph + HDD). See comments below for rationale. 
+MEMORY_ENSURE_ON_START = str(os.environ.get("MEMORY_ENSURE_ON_START", "1")).strip().lower() in {"1", "true", "yes", "on"} +MEMORY_COLD_SKIP_DENSE = str(os.environ.get("MEMORY_COLD_SKIP_DENSE", "0")).strip().lower() in {"1", "true", "yes", "on"} +MEMORY_PROBE_EMBED_DIM = str(os.environ.get("MEMORY_PROBE_EMBED_DIM", "1")).strip().lower() in {"1", "true", "yes", "on"} +try: + MEMORY_VECTOR_DIM = int(os.environ.get("MEMORY_VECTOR_DIM") or os.environ.get("EMBED_DIM") or "768") +except Exception: + MEMORY_VECTOR_DIM = 768 + +# Lazy embedding model cache with double-checked locking. +# RATIONALE: Avoid loading the embedding model (100–500 MB) on module import. +# On slow storage (Ceph + HDD), eager loading can cause 30–60s startup delays. +# Instead, load on first tool call (store/find). Subsequent calls reuse cached instance. +_EMBED_MODEL_CACHE: Dict[str, Any] = {} +_EMBED_MODEL_LOCK = threading.Lock() + +def _get_embedding_model(): + """Lazily load and cache the embedding model to avoid startup I/O.""" + from fastembed import TextEmbedding + m = _EMBED_MODEL_CACHE.get(EMBEDDING_MODEL) + if m is None: + with _EMBED_MODEL_LOCK: + m = _EMBED_MODEL_CACHE.get(EMBEDDING_MODEL) + if m is None: + m = TextEmbedding(model_name=EMBEDDING_MODEL) + _EMBED_MODEL_CACHE[EMBEDDING_MODEL] = m + return m + +# Track ensured collections to reduce redundant ensure calls. +# RATIONALE: Avoid repeated Qdrant network calls for the same collection. 
+_ENSURED = set() + +def _ensure_once(name: str) -> bool: + """Ensure collection exists, but only once per process (cached result).""" + if name in _ENSURED: + return True + try: + _ensure_collection(name) + _ENSURED.add(name) + return True + except Exception: + return False + mcp = FastMCP(name="memory-server") # Capture tool registry automatically by wrapping the decorator once @@ -90,6 +145,13 @@ def _inner(fn): except Exception: HEALTH_PORT = 18000 +# In-memory session defaults (legacy token-based) +_SESSION_LOCK = threading.Lock() +SESSION_DEFAULTS: Dict[str, Dict[str, Any]] = {} +# In-memory per-connection defaults keyed by ctx.session (no token required) +_SESSION_CTX_LOCK = threading.Lock() +SESSION_DEFAULTS_BY_SESSION: "WeakKeyDictionary[Any, Dict[str, Any]]" = WeakKeyDictionary() + def _start_readyz_server(): try: @@ -137,35 +199,129 @@ def log_message(self, *args, **kwargs): def _ensure_collection(name: str): + """Create collection if missing. + + Default behavior mirrors the original implementation for PR compatibility: + - Probe the embedding model to detect the dense vector dimension (MEMORY_PROBE_EMBED_DIM=1) + - Eager ensure on startup (MEMORY_ENSURE_ON_START=1) + + For slow storage backends (e.g., Ceph + HDD), set the following in your env: + - MEMORY_PROBE_EMBED_DIM=0 -> skip model probing; use MEMORY_VECTOR_DIM/EMBED_DIM + - MEMORY_ENSURE_ON_START=0 -> ensure lazily on first tool call + """ try: - info = client.get_collection(name) + client.get_collection(name) return True except Exception: pass - # Derive dense vector dimension from embedding model to avoid mismatch - # Derive dense vector dimension from embedding model to avoid mismatch - try: - _model_probe = TextEmbedding(model_name=EMBEDDING_MODEL) - _dense_vec = next(_model_probe.embed(["probe"])) - _dense_dim = len(getattr(_dense_vec, "tolist", lambda: _dense_vec)()) if hasattr(_dense_vec, "tolist") else len(_dense_vec) - except Exception: + + # Choose dense dimension based on config: 
probe (default) vs env-configured + if MEMORY_PROBE_EMBED_DIM: try: - _dense_dim = int(os.environ.get("EMBED_DIM", "768") or 768) + from fastembed import TextEmbedding + _model_probe = TextEmbedding(model_name=EMBEDDING_MODEL) + _dense_vec = next(_model_probe.embed(["probe"])) + if hasattr(_dense_vec, "tolist"): + dense_dim = len(_dense_vec.tolist()) + else: + try: + dense_dim = len(_dense_vec) + except Exception: + dense_dim = int(os.environ.get("MEMORY_VECTOR_DIM") or os.environ.get("EMBED_DIM") or "768") except Exception: - _dense_dim = 768 + # Fallback to env-configured dimension if probing fails + try: + dense_dim = int(os.environ.get("MEMORY_VECTOR_DIM") or os.environ.get("EMBED_DIM") or "768") + except Exception: + dense_dim = 768 + else: + dense_dim = int(MEMORY_VECTOR_DIM or 768) + vectors_cfg = { - VECTOR_NAME: models.VectorParams( - size=int(_dense_dim or 768), distance=models.Distance.COSINE - ), - LEX_VECTOR_NAME: models.VectorParams( - size=LEX_VECTOR_DIM, distance=models.Distance.COSINE - ), + VECTOR_NAME: models.VectorParams(size=int(dense_dim or 768), distance=models.Distance.COSINE), + LEX_VECTOR_NAME: models.VectorParams(size=LEX_VECTOR_DIM, distance=models.Distance.COSINE), } client.create_collection(collection_name=name, vectors_config=vectors_cfg) return True -_ensure_collection(DEFAULT_COLLECTION) +# Optional eager collection ensure on startup (enabled by default for backward compatibility). +# Set MEMORY_ENSURE_ON_START=0 to defer ensure to first tool call (recommended on slow storage). +if MEMORY_ENSURE_ON_START: + try: + _ensure_collection(DEFAULT_COLLECTION) + except Exception: + pass + +@mcp.tool() +def set_session_defaults( + collection: Optional[str] = None, + session: Optional[str] = None, + ctx: Context = None, + **kwargs: Any, +) -> Dict[str, Any]: + """Set defaults (e.g., collection) for subsequent calls. 
+ + Behavior: + - If a request Context is provided (normal with FastMCP), store defaults per-connection + so subsequent calls on the same MCP session automatically use them (no token needed). + - Optionally, also supports a lightweight token for clients that prefer cross-connection reuse. + + Precedence everywhere: explicit collection > per-connection defaults > token defaults > env default. + """ + try: + _extra = kwargs or {} + if isinstance(_extra, dict) and "kwargs" in _extra: + inner = _extra.get("kwargs") + if isinstance(inner, dict): + _extra = inner + elif isinstance(inner, str): + try: + _extra = json.loads(inner) + except Exception: + _extra = {} + if (not collection) and isinstance(_extra, dict) and _extra.get("collection") is not None: + collection = _extra.get("collection") + if (not session) and isinstance(_extra, dict) and _extra.get("session") is not None: + session = _extra.get("session") + except Exception: + pass + + # Prepare defaults payload + defaults: Dict[str, Any] = {} + if isinstance(collection, str) and collection.strip(): + defaults["collection"] = collection.strip() + + # Store per-connection (preferred, no token required) + try: + if ctx is not None and getattr(ctx, "session", None) is not None and defaults: + with _SESSION_CTX_LOCK: + existing = SESSION_DEFAULTS_BY_SESSION.get(ctx.session) or {} + existing.update(defaults) + SESSION_DEFAULTS_BY_SESSION[ctx.session] = existing + except Exception: + pass + + # Optional: also support legacy token + sid = (str(session).strip() if session is not None else "") or None + if not sid: + import uuid as _uuid + sid = _uuid.uuid4().hex[:12] + try: + if defaults: + with _SESSION_LOCK: + existing = SESSION_DEFAULTS.get(sid) or {} + existing.update(defaults) + SESSION_DEFAULTS[sid] = existing + except Exception: + pass + + return { + "ok": True, + "session": sid, + "defaults": (SESSION_DEFAULTS.get(sid, {}) if sid else {}), + "applied": ("connection" if (ctx is not None and getattr(ctx, "session", 
None) is not None) else "token"), + } @mcp.tool() @@ -173,9 +329,16 @@ def store( information: str, metadata: Optional[Dict[str, Any]] = None, collection: Optional[str] = None, + session: Optional[str] = None, + ctx: Context = None, + **kwargs: Any, ) -> Dict[str, Any]: - """Store a memory entry into Qdrant (dual vectors consistent with indexer).""" - coll = collection or DEFAULT_COLLECTION + """Store a memory entry into Qdrant (dual vectors consistent with indexer). + + First call may be slower because the embedding model loads lazily. + """ + coll = _resolve_collection(collection, session=session, ctx=ctx, extra_kwargs=kwargs) + _ensure_once(coll) model = _get_embedding_model() dense = next(model.embed([str(information)])).tolist() lex = _lex_hash_vector_text(str(information), LEX_VECTOR_DIM) @@ -199,33 +362,51 @@ def find( limit: Optional[int] = None, collection: Optional[str] = None, top_k: Optional[int] = None, + session: Optional[str] = None, + ctx: Context = None, + **kwargs: Any, ) -> Dict[str, Any]: - """Find memory-like entries by vector similarity (dense + lexical fusion).""" - coll = collection or DEFAULT_COLLECTION - model = _get_embedding_model() - dense = next(model.embed([str(query)])).tolist() + """Find memory-like entries by vector similarity (dense + lexical fusion). + + Cold-start option: set MEMORY_COLD_SKIP_DENSE=1 to skip dense embedding until the + model is cached (useful on slow storage). 
+ """ + coll = _resolve_collection(collection, session=session, ctx=ctx, extra_kwargs=kwargs) + _ensure_once(coll) + + use_dense = True + if MEMORY_COLD_SKIP_DENSE and EMBEDDING_MODEL not in _EMBED_MODEL_CACHE: + use_dense = False + if use_dense: + model = _get_embedding_model() + dense = next(model.embed([str(query)])).tolist() + else: + dense = None lex = _lex_hash_vector_text(str(query), LEX_VECTOR_DIM) # Harmonize alias: top_k -> limit lim = int(limit if limit is not None else (top_k if top_k is not None else 5)) # Two searches (prefer query_points) then simple RRF-like merge - try: - qp_dense = client.query_points( - collection_name=coll, - query=dense, - using=VECTOR_NAME, - limit=max(10, lim), - with_payload=True, - ) - res_dense = getattr(qp_dense, "points", qp_dense) - except AttributeError: - res_dense = client.search( - collection_name=coll, - query_vector=(VECTOR_NAME, dense), - limit=max(10, lim), - with_payload=True, - ) + if use_dense: + try: + qp_dense = client.query_points( + collection_name=coll, + query=dense, + using=VECTOR_NAME, + limit=max(10, lim), + with_payload=True, + ) + res_dense = getattr(qp_dense, "points", qp_dense) + except AttributeError: + res_dense = client.search( + collection_name=coll, + query_vector=(VECTOR_NAME, dense), + limit=max(10, lim), + with_payload=True, + ) + else: + res_dense = [] try: qp_lex = client.query_points( @@ -287,6 +468,65 @@ def add_hits(hits, weight: float): return {"ok": True, "results": ordered, "count": len(ordered)} +def _resolve_collection( + collection: Optional[str], + session: Optional[str] = None, + ctx: Context = None, + extra_kwargs: Any = None, +) -> str: + """Resolve the collection name honoring explicit args, session defaults, and env fallbacks.""" + coll = (collection or "").strip() + sid: Optional[str] = None + + # Extract overrides from nested kwargs payloads some clients send + try: + payload = extra_kwargs or {} + if isinstance(payload, dict) and "kwargs" in payload: + payload = 
payload.get("kwargs") + if isinstance(payload, str): + try: + payload = json.loads(payload) + except Exception: + payload = {} + if not coll and isinstance(payload, dict) and payload.get("collection") is not None: + coll = str(payload.get("collection")).strip() + if isinstance(payload, dict) and payload.get("session") is not None: + sid = str(payload.get("session")).strip() + except Exception: + pass + + # Explicit session parameter wins over payload session + try: + if session is not None and str(session).strip(): + sid = str(session).strip() + except Exception: + pass + + # Per-connection defaults via Context session + if not coll and ctx is not None and getattr(ctx, "session", None) is not None: + try: + with _SESSION_CTX_LOCK: + defaults = SESSION_DEFAULTS_BY_SESSION.get(ctx.session) or {} + candidate = str(defaults.get("collection") or "").strip() + if candidate: + coll = candidate + except Exception: + pass + + # Legacy token-based session defaults + if not coll and sid: + try: + with _SESSION_LOCK: + defaults = SESSION_DEFAULTS.get(sid) or {} + candidate = str(defaults.get("collection") or "").strip() + if candidate: + coll = candidate + except Exception: + pass + + return coll or DEFAULT_COLLECTION + + if __name__ == "__main__": transport = os.environ.get("FASTMCP_TRANSPORT", "sse").strip().lower() # Start lightweight /readyz health endpoint in background (best-effort) diff --git a/scripts/memory_backup.py b/scripts/memory_backup.py new file mode 100644 index 00000000..410ed90a --- /dev/null +++ b/scripts/memory_backup.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 +""" +Memory Backup Utility for Qdrant Collections + +Exports memories (non-code points) from Qdrant collections to JSON for backup purposes. +Memories are identified as points without file path metadata - typically user-added notes, +context, or other information that's not tied to specific code files. 
+ +Usage: + python scripts/memory_backup.py --collection test-repo-58ecbbc8 --output memories_backup.json + python scripts/memory_backup.py --collection test-repo-58ecbbc8 --output memories_backup_$(date +%Y%m%d).json +""" + +import os +import sys +import json +import argparse +from datetime import datetime +from typing import List, Dict, Any, Optional +from pathlib import Path + +# Add project root to path for imports +ROOT_DIR = Path(__file__).resolve().parent.parent +if str(ROOT_DIR) not in sys.path: + sys.path.insert(0, str(ROOT_DIR)) + +try: + from qdrant_client import QdrantClient + from qdrant_client.models import Filter, FieldCondition, MatchValue +except ImportError: + print("ERROR: qdrant-client not installed. Install with: pip install qdrant-client") + sys.exit(1) + + +def get_qdrant_client() -> QdrantClient: + """Initialize Qdrant client with environment configuration.""" + qdrant_url = os.environ.get("QDRANT_URL", "http://localhost:6333") + api_key = os.environ.get("QDRANT_API_KEY") + + return QdrantClient(url=qdrant_url, api_key=api_key or None) + + +def is_memory_point(payload: Dict[str, Any]) -> bool: + """ + Determine if a point is a memory (user-added) rather than code-indexed content. 
+ + Memory points typically: + - Have no 'path' in metadata (not tied to a file) + - May have 'source' set to 'memory' + - Have 'content' field that's not extracted from code + + Args: + payload: Point payload from Qdrant + + Returns: + True if this appears to be a memory point, False if it's code content + """ + if not payload: + return False + + metadata = payload.get("metadata", {}) + + # Primary indicator: no file path means it's likely a memory + if not metadata.get("path"): + return True + + # Secondary indicator: explicit source marking + if metadata.get("source") == "memory": + return True + + # Tertiary: content-based heuristics + content = payload.get("information", "") + if content and not metadata.get("language") and not metadata.get("kind"): + # Content without language/kind metadata is likely user-added + return True + + return False + + +def export_memories( + collection_name: str, + output_file: str, + client: Optional[QdrantClient] = None, + include_vectors: bool = True, + batch_size: int = 1000 +) -> Dict[str, Any]: + """ + Export memories from a Qdrant collection to JSON. 
+ + Args: + collection_name: Qdrant collection name + output_file: Output JSON file path + client: Qdrant client instance (will create if None) + include_vectors: Whether to include vector embeddings in backup + batch_size: Number of points to fetch per request + + Returns: + Dict with backup statistics + """ + if client is None: + client = get_qdrant_client() + + # Verify collection exists + try: + collections = client.get_collections().collections + if collection_name not in [c.name for c in collections]: + raise ValueError(f"Collection '{collection_name}' not found") + except Exception as e: + raise RuntimeError(f"Failed to access Qdrant: {e}") + + print(f"Exporting memories from collection: {collection_name}") + print(f"Output file: {output_file}") + + # Get all points from collection + all_points = [] + total_count = 0 + memory_count = 0 + + # Use scroll to get all points efficiently + next_page_offset = None + while True: + points, next_page_offset = client.scroll( + collection_name=collection_name, + offset=next_page_offset, + limit=batch_size, + with_payload=True, + with_vectors=include_vectors + ) + + if not points: + break + + all_points.extend(points) + total_count += len(points) + + # Filter for memory points + memory_points = [] + for point in points: + if is_memory_point(point.payload or {}): + memory_points.append(point) + memory_count += 1 + + print(f"Fetched {len(points)} points (total: {total_count}), found {len(memory_points)} memories (total: {memory_count})") + + if next_page_offset is None: + break + + if memory_count == 0: + print("No memories found in collection!") + return { + "collection": collection_name, + "total_points": total_count, + "memory_count": 0, + "backup_file": output_file, + "success": True + } + + # Prepare backup data + backup_data = { + "backup_info": { + "collection_name": collection_name, + "export_date": datetime.now().isoformat(), + "total_points_exported": total_count, + "memory_points_found": memory_count, + 
"include_vectors": include_vectors, + "vector_dimension": None # Will be set if vectors included + }, + "memories": [] + } + + # Process memory points + for point in all_points: + if not is_memory_point(point.payload or {}): + continue + + payload = point.payload or {} + memory_entry = { + "id": str(point.id), + "content": payload.get("information", ""), + "metadata": payload.get("metadata", {}), + } + + # Include vector if requested + if include_vectors and point.vector: + if hasattr(point.vector, 'tolist'): + memory_entry["vector"] = point.vector.tolist() + else: + memory_entry["vector"] = point.vector + + # Set vector dimension from first memory + if backup_data["backup_info"]["vector_dimension"] is None: + vector_data = memory_entry["vector"] + if isinstance(vector_data, dict): + # Named vector format: {"memory": [values]} + first_vector = next(iter(vector_data.values())) + backup_data["backup_info"]["vector_dimension"] = len(first_vector) + else: + # Direct vector list format + backup_data["backup_info"]["vector_dimension"] = len(vector_data) + + backup_data["memories"].append(memory_entry) + + # Write backup file + output_path = Path(output_file) + output_path.parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, 'w') as f: + json.dump(backup_data, f, indent=2) + + print(f"✅ Backup completed successfully!") + print(f" Total points processed: {total_count}") + print(f" Memory points exported: {memory_count}") + print(f" Backup file: {output_path}") + print(f" File size: {output_path.stat().st_size / 1024:.1f} KB") + + return { + "collection": collection_name, + "total_points": total_count, + "memory_count": memory_count, + "backup_file": str(output_path), + "file_size": output_path.stat().st_size, + "success": True + } + + +def list_collections() -> None: + """List all available Qdrant collections.""" + client = get_qdrant_client() + + try: + collections = client.get_collections().collections + print("Available collections:") + for collection in 
collections: + info = client.get_collection(collection.name) + point_count = info.points_count + print(f" - {collection.name} ({point_count:,} points)") + except Exception as e: + print(f"Error listing collections: {e}") + + +def main(): + parser = argparse.ArgumentParser( + description="Backup memories (non-code points) from Qdrant collections", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s --collection test-repo-58ecbbc8 --output memories_backup.json + %(prog)s --list-collections + %(prog)s --collection test-repo-58ecbbc8 --output backup_$(date +%Y%m%d_%H%M%S).json --no-vectors + """ + ) + + parser.add_argument( + "--collection", "-c", + required=False, + help="Qdrant collection name to backup memories from" + ) + + parser.add_argument( + "--output", "-o", + help="Output JSON file path for backup" + ) + + parser.add_argument( + "--list-collections", "-l", + action="store_true", + help="List all available collections" + ) + + parser.add_argument( + "--no-vectors", + action="store_true", + help="Don't include vector embeddings in backup (smaller file, requires re-embedding)" + ) + + parser.add_argument( + "--batch-size", + type=int, + default=1000, + help="Number of points to fetch per request (default: 1000)" + ) + + args = parser.parse_args() + + if args.list_collections: + list_collections() + return + + if not args.collection: + parser.error("--collection required unless using --list-collections") + + if not args.output: + # Generate default filename with timestamp + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + args.output = f"{args.collection}_memories_{timestamp}.json" + + try: + result = export_memories( + collection_name=args.collection, + output_file=args.output, + include_vectors=not args.no_vectors, + batch_size=args.batch_size + ) + + if result["success"]: + print(f"\n🎉 Memory backup completed successfully!") + if result["memory_count"] == 0: + print(" (No memories found to backup)") + else: + 
print(f"\n❌ Memory backup failed!") + sys.exit(1) + + except Exception as e: + print(f"\n❌ Error during backup: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/memory_restore.py b/scripts/memory_restore.py new file mode 100644 index 00000000..cacddeda --- /dev/null +++ b/scripts/memory_restore.py @@ -0,0 +1,379 @@ +#!/usr/bin/env python3 +""" +Memory Restore Utility for Qdrant Collections + +Imports previously backed up memories into Qdrant collections. +Can restore to existing collections (append) or new ones. +Supports re-embedding memories if vectors were not included in backup. + +Usage: + python scripts/memory_restore.py --backup memories_backup.json --collection test-repo-58ecbbc8 + python scripts/memory_restore.py --backup memories_backup.json --collection new-test-repo --embedding-model BAAI/bge-large-en-v1.5 + python scripts/memory_restore.py --backup memories_backup.json --collection new-collection --new-collection +""" + +import os +import sys +import json +import argparse +from datetime import datetime +from typing import List, Dict, Any, Optional +from pathlib import Path + +# Add project root to path for imports +ROOT_DIR = Path(__file__).resolve().parent.parent +if str(ROOT_DIR) not in sys.path: + sys.path.insert(0, str(ROOT_DIR)) + +try: + from qdrant_client import QdrantClient + from qdrant_client.models import VectorParams, Distance + from fastembed import TextEmbedding +except ImportError as e: + print(f"ERROR: Missing required dependency: {e}") + print("Install with: pip install qdrant-client fastembed") + sys.exit(1) + + +def get_qdrant_client() -> QdrantClient: + """Initialize Qdrant client with environment configuration.""" + qdrant_url = os.environ.get("QDRANT_URL", "http://localhost:6333") + api_key = os.environ.get("QDRANT_API_KEY") + + return QdrantClient(url=qdrant_url, api_key=api_key or None) + + +def get_embedding_model(model_name: str): + """Initialize embedding model with 
def ensure_collection_exists(
    client: QdrantClient,
    collection_name: str,
    vector_dimension: int,
    vector_name: str = "memory"
) -> None:
    """
    Ensure the target collection exists with appropriate vector configuration.

    Args:
        client: Qdrant client instance
        collection_name: Collection name
        vector_dimension: Vector dimensions for memories
        vector_name: Name for the memory vector

    Raises:
        RuntimeError: If the collection is missing and cannot be created.
    """
    try:
        # Check if collection exists
        collections = client.get_collections().collections
        if collection_name in [c.name for c in collections]:
            print(f"Collection '{collection_name}' already exists")
            return
    except Exception as e:
        # Best-effort check: fall through and attempt creation anyway.
        print(f"Warning: Could not check collection existence: {e}")

    # Create collection with a single named memory vector
    try:
        client.create_collection(
            collection_name=collection_name,
            vectors_config={
                vector_name: VectorParams(
                    size=vector_dimension,
                    distance=Distance.COSINE
                )
            }
        )
        print(f"✅ Created collection '{collection_name}' with {vector_dimension}-dim vectors")
    except Exception as e:
        raise RuntimeError(f"Failed to create collection '{collection_name}': {e}") from e


def restore_memories(
    backup_file: str,
    collection_name: str,
    client: Optional[QdrantClient] = None,
    embedding_model_name: Optional[str] = None,
    vector_name: str = "memory",
    batch_size: int = 100,
    skip_existing: bool = True
) -> Dict[str, Any]:
    """
    Restore memories from backup file to Qdrant collection.

    Args:
        backup_file: Path to backup JSON file
        collection_name: Target collection name
        client: Qdrant client instance (will create if None)
        embedding_model_name: Model name for re-embedding (if vectors not in backup)
        vector_name: Name for the memory vector in collection
        batch_size: Number of memories to upload per batch
        skip_existing: Skip memories that already exist in collection

    Returns:
        Dict with restore statistics

    Raises:
        FileNotFoundError: If the backup file does not exist.
        ValueError: If the backup file is unreadable or malformed.
    """
    if client is None:
        client = get_qdrant_client()

    # Load backup file
    backup_path = Path(backup_file)
    if not backup_path.exists():
        raise FileNotFoundError(f"Backup file not found: {backup_file}")

    try:
        with open(backup_path, 'r') as f:
            backup_data = json.load(f)
    except Exception as e:
        raise ValueError(f"Invalid backup file format: {e}") from e

    # Validate backup structure
    if "memories" not in backup_data:
        raise ValueError("Invalid backup file: missing 'memories' section")

    memories = backup_data["memories"]
    backup_info = backup_data.get("backup_info", {})

    print(f"Restoring memories from: {backup_file}")
    print(f"Target collection: {collection_name}")
    print(f"Memories in backup: {len(memories)}")

    if backup_info:
        print(f"Original collection: {backup_info.get('collection_name', 'unknown')}")
        print(f"Backup date: {backup_info.get('export_date', 'unknown')}")
        print(f"Vector dimension: {backup_info.get('vector_dimension', 'unknown')}")

    # Determine vector configuration: vectors are usable only if the backup
    # claims to include them AND the first memory actually carries one.
    vectors_included = backup_info.get("include_vectors", True) and memories and "vector" in memories[0]

    if not vectors_included:
        if not embedding_model_name:
            # Use default model
            embedding_model_name = os.environ.get("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5")

        print(f"Vectors not included in backup, will re-embed with: {embedding_model_name}")
        embedding_model = get_embedding_model(embedding_model_name)

        # Probe the model once to learn its output dimensionality.
        test_vector = next(embedding_model.embed(["test"])).tolist()
        vector_dimension = len(test_vector)
        print(f"Embedding model vector dimension: {vector_dimension}")
    else:
        # Use dimension from backup
        vector_dimension = backup_info.get("vector_dimension", len(memories[0]["vector"]))
        embedding_model = None
        print(f"Using vectors from backup, dimension: {vector_dimension}")

    # Ensure collection exists
    ensure_collection_exists(client, collection_name, vector_dimension, vector_name)

    # Check for existing memories if skip_existing is True.
    # BUGFIX: scroll() returns a single page; the old code passed limit=None
    # (not a valid page size) and never paginated, so only the first page of
    # existing IDs was seen. Paginate with the returned offset until exhausted.
    existing_ids = set()
    if skip_existing:
        try:
            offset = None
            while True:
                page, offset = client.scroll(
                    collection_name=collection_name,
                    limit=1000,
                    offset=offset,
                    with_payload=False,
                    with_vectors=False
                )
                existing_ids.update(str(point.id) for point in page)
                if offset is None:
                    break
            print(f"Found {len(existing_ids)} existing points in collection")
        except Exception as e:
            print(f"Warning: Could not check existing points: {e}")
            skip_existing = False

    # Process and upload memories in batches
    restored_count = 0
    skipped_count = 0
    error_count = 0

    for i in range(0, len(memories), batch_size):
        batch = memories[i:i + batch_size]
        batch_points = []

        for memory in batch:
            memory_id = memory.get("id", "")

            # Skip if already exists
            if skip_existing and memory_id in existing_ids:
                skipped_count += 1
                continue

            try:
                # Prepare vector
                if vectors_included:
                    vector = memory.get("vector")
                    if not vector:
                        raise ValueError("Memory missing vector data")
                    # BUGFIX: backups may store either the named form
                    # {"memory": [...]} or a bare list (memory_backup copies
                    # point.vector verbatim). Normalize bare lists to the named
                    # form the collection's vector config expects.
                    if isinstance(vector, list):
                        vector = {vector_name: vector}
                else:
                    # Re-embed content
                    content = memory.get("content", "")
                    if not content:
                        raise ValueError("Memory missing content for embedding")

                    vector = next(embedding_model.embed([content])).tolist()
                    # For re-embedded vectors, structure them with the vector name
                    vector = {vector_name: vector}

                # Prepare point data
                point_data = {
                    "id": memory_id,
                    "vector": vector,
                    "payload": {
                        "information": memory.get("content", ""),
                        "metadata": memory.get("metadata", {})
                    }
                }

                batch_points.append(point_data)

            except Exception as e:
                print(f"Error processing memory {memory_id}: {e}")
                error_count += 1
                continue

        # Upload batch
        if batch_points:
            try:
                client.upsert(collection_name=collection_name, points=batch_points)
                restored_count += len(batch_points)
                print(f" Uploaded batch {i//batch_size + 1}: +{len(batch_points)} memories (total: {restored_count})")
            except Exception as e:
                print(f"Error uploading batch {i//batch_size + 1}: {e}")
                error_count += len(batch_points)

    # Final statistics
    print(f"\n✅ Memory restore completed!")
    print(f" Total memories in backup: {len(memories)}")
    print(f" Successfully restored: {restored_count}")
    print(f" Skipped (already exists): {skipped_count}")
    print(f" Errors: {error_count}")
    print(f" Target collection: {collection_name}")

    # Verify final count (best effort)
    try:
        final_count = client.count(collection_name).count
        print(f" Final collection size: {final_count:,} points")
    except Exception as e:
        print(f" Warning: Could not get final count: {e}")

    return {
        "collection": collection_name,
        "backup_file": backup_file,
        "total_memories": len(memories),
        "restored": restored_count,
        "skipped": skipped_count,
        "errors": error_count,
        "success": True
    }
Qdrant collection name" + ) + + parser.add_argument( + "--embedding-model", "-m", + help="Embedding model for re-embedding (if vectors not in backup)" + ) + + parser.add_argument( + "--vector-name", + default="memory", + help="Name for the memory vector in collection (default: memory)" + ) + + parser.add_argument( + "--batch-size", + type=int, + default=100, + help="Number of memories to upload per batch (default: 100)" + ) + + parser.add_argument( + "--no-skip-existing", + action="store_true", + help="Don't skip memories that already exist in collection" + ) + + parser.add_argument( + "--list-backup-info", + action="store_true", + help="Show backup file information without restoring" + ) + + args = parser.parse_args() + + try: + # Load backup to show info + with open(args.backup, 'r') as f: + backup_data = json.load(f) + + if args.list_backup_info: + print("Backup Information:") + print("=" * 50) + backup_info = backup_data.get("backup_info", {}) + for key, value in backup_info.items(): + print(f" {key}: {value}") + + memories = backup_data.get("memories", []) + print(f" Memory count: {len(memories)}") + + if memories: + sample = memories[0] + has_vector = "vector" in sample + print(f" Has vectors: {has_vector}") + if has_vector: + vector_dim = len(sample["vector"]) + print(f" Vector dimension: {vector_dim}") + + return + + # Restore memories + result = restore_memories( + backup_file=args.backup, + collection_name=args.collection, + embedding_model_name=args.embedding_model, + vector_name=args.vector_name, + batch_size=args.batch_size, + skip_existing=not args.no_skip_existing + ) + + if result["success"]: + print(f"\n🎉 Memory restoration completed successfully!") + else: + print(f"\n❌ Memory restoration failed!") + sys.exit(1) + + except Exception as e: + print(f"\n❌ Error during restoration: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py new 
file mode 100644 index 00000000..aac98034 --- /dev/null +++ b/scripts/remote_upload_client.py @@ -0,0 +1,1111 @@ +#!/usr/bin/env python3 +""" +Remote upload client for delta bundles in Context-Engine. + +This module provides functionality to create and upload delta bundles to a remote +server, enabling real-time code synchronization across distributed environments. + +Example usage: + export HOST_ROOT="/tmp/testupload" && export CONTAINER_ROOT="/work" && export + PYTHONPATH="/home/coder/project/Context-Engine:$PYTHONPATH" && python3 + scripts/remote_upload_client.py --path /tmp/testupload) +""" + +import os +import json +import time +import uuid +import hashlib +import tarfile +import tempfile +import logging +import argparse +from pathlib import Path +from typing import Dict, List, Any, Optional, Tuple +from datetime import datetime +import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# Import existing workspace state functions +from scripts.workspace_state import ( + get_cached_file_hash, + set_cached_file_hash, + get_collection_name, + _extract_repo_name_from_path, +) + +# Import existing hash function +import scripts.ingest_code as idx + + +class RemoteUploadClient: + """Client for uploading delta bundles to remote server.""" + + def _translate_to_container_path(self, host_path: str) -> str: + """Translate host path to container path for API communication.""" + # Use environment variable for path mapping if available + host_root = os.environ.get("HOST_ROOT", "/home/coder/project/Context-Engine/dev-workspace") + container_root = os.environ.get("CONTAINER_ROOT", "/work") + + if host_path.startswith(host_root): + return host_path.replace(host_root, container_root) + else: + # Fallback: if path doesn't match expected pattern, use as-is + return host_path + + def __init__(self, upload_endpoint: str, workspace_path: 
str, collection_name: str, + max_retries: int = 3, timeout: int = 30, metadata_path: Optional[str] = None): + """Initialize remote upload client.""" + self.upload_endpoint = upload_endpoint.rstrip('/') + self.workspace_path = workspace_path + self.collection_name = collection_name + self.max_retries = max_retries + self.timeout = timeout + self.temp_dir = None + + # Set environment variables for cache functions + os.environ["WORKSPACE_PATH"] = workspace_path + + # Get repo name for cache operations + try: + from scripts.workspace_state import _extract_repo_name_from_path + self.repo_name = _extract_repo_name_from_path(workspace_path) + # Fallback to directory name if repo detection fails (for non-git repos) + if not self.repo_name: + self.repo_name = Path(workspace_path).name + except ImportError: + self.repo_name = Path(workspace_path).name + + # Setup HTTP session with simple retry + self.session = requests.Session() + retry_strategy = Retry(total=max_retries, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]) + adapter = HTTPAdapter(max_retries=retry_strategy) + self.session.mount("http://", adapter) + self.session.mount("https://", adapter) + + def __enter__(self): + """Context manager entry.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit with cleanup.""" + self.cleanup() + + def cleanup(self): + """Clean up temporary directories.""" + if self.temp_dir and os.path.exists(self.temp_dir): + try: + import shutil + shutil.rmtree(self.temp_dir) + logger.debug(f"[remote_upload] Cleaned up temporary directory: {self.temp_dir}") + except Exception as e: + logger.warning(f"[remote_upload] Failed to cleanup temp directory {self.temp_dir}: {e}") + finally: + self.temp_dir = None + + def get_mapping_summary(self) -> Dict[str, Any]: + """Return derived collection mapping details.""" + container_path = self._translate_to_container_path(self.workspace_path) + return { + "repo_name": self.repo_name, + "collection_name": 
self.collection_name, + "source_path": self.workspace_path, + "container_path": container_path, + "upload_endpoint": self.upload_endpoint, + } + + def log_mapping_summary(self) -> None: + """Log mapping summary for user visibility.""" + info = self.get_mapping_summary() + logger.info("[remote_upload] Collection mapping:") + logger.info(f" repo_name: {info['repo_name']}") + logger.info(f" collection_name: {info['collection_name']}") + logger.info(f" source_path: {info['source_path']}") + logger.info(f" container_path: {info['container_path']}") + + def _get_temp_bundle_dir(self) -> Path: + """Get or create temporary directory for bundle creation.""" + if not self.temp_dir: + self.temp_dir = tempfile.mkdtemp(prefix="delta_bundle_") + return Path(self.temp_dir) + # CLI is stateless - sequence tracking is handled by server + + def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: + """ + Detect what type of changes occurred for each file path. + + Args: + changed_paths: List of changed file paths + + Returns: + Dictionary with change types: created, updated, deleted, moved, unchanged + """ + changes = { + "created": [], + "updated": [], + "deleted": [], + "moved": [], + "unchanged": [] + } + + for path in changed_paths: + abs_path = str(path.resolve()) + cached_hash = get_cached_file_hash(abs_path, self.repo_name) + + if not path.exists(): + # File was deleted + if cached_hash: + changes["deleted"].append(path) + else: + # File exists - calculate current hash + try: + with open(path, 'rb') as f: + content = f.read() + current_hash = hashlib.sha1(content).hexdigest() + + if not cached_hash: + # New file + changes["created"].append(path) + elif cached_hash != current_hash: + # Modified file + changes["updated"].append(path) + else: + # Unchanged (might be a move detection candidate) + changes["unchanged"].append(path) + + # Update cache + set_cached_file_hash(abs_path, current_hash, self.repo_name) + except Exception: + # Skip files that can't be 
read + continue + + # Detect moves by looking for files with same content hash + # but different paths (requires additional tracking) + changes["moved"] = self._detect_moves(changes["created"], changes["deleted"]) + + return changes + + def _detect_moves(self, created_files: List[Path], deleted_files: List[Path]) -> List[Tuple[Path, Path]]: + """ + Detect file moves by matching content hashes between created and deleted files. + + Args: + created_files: List of newly created files + deleted_files: List of deleted files + + Returns: + List of (source, destination) path tuples for detected moves + """ + moves = [] + deleted_hashes = {} + + # Build hash map for deleted files + for deleted_path in deleted_files: + try: + # Try to get cached hash first, fallback to file content + cached_hash = get_cached_file_hash(str(deleted_path), self.repo_name) + if cached_hash: + deleted_hashes[cached_hash] = deleted_path + continue + + # If no cached hash, try to read from file if it still exists + if deleted_path.exists(): + with open(deleted_path, 'rb') as f: + content = f.read() + file_hash = hashlib.sha1(content).hexdigest() + deleted_hashes[file_hash] = deleted_path + except Exception: + continue + + # Match created files with deleted files by hash + for created_path in created_files: + try: + with open(created_path, 'rb') as f: + content = f.read() + file_hash = hashlib.sha1(content).hexdigest() + + if file_hash in deleted_hashes: + source_path = deleted_hashes[file_hash] + moves.append((source_path, created_path)) + # Remove from consideration + del deleted_hashes[file_hash] + except Exception: + continue + + return moves + + def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, Any]]: + """ + Create a delta bundle from detected changes. 
+ + Args: + changes: Dictionary of file changes by type + + Returns: + Tuple of (bundle_path, manifest_metadata) + """ + bundle_id = str(uuid.uuid4()) + # CLI is stateless - server handles sequence numbers + created_at = datetime.now().isoformat() + + # Create temporary directory for bundle + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Create directory structure + files_dir = temp_path / "files" + metadata_dir = temp_path / "metadata" + files_dir.mkdir() + metadata_dir.mkdir() + + # Create subdirectories + (files_dir / "created").mkdir() + (files_dir / "updated").mkdir() + (files_dir / "moved").mkdir() + + operations = [] + total_size = 0 + file_hashes = {} + + # Process created files + for path in changes["created"]: + rel_path = str(path.relative_to(Path(self.workspace_path))) + try: + with open(path, 'rb') as f: + content = f.read() + file_hash = hashlib.sha1(content).hexdigest() + content_hash = f"sha1:{file_hash}" + + # Write file to bundle + bundle_file_path = files_dir / "created" / rel_path + bundle_file_path.parent.mkdir(parents=True, exist_ok=True) + bundle_file_path.write_bytes(content) + + # Get file info + stat = path.stat() + language = idx.CODE_EXTS.get(path.suffix.lower(), "unknown") + + operation = { + "operation": "created", + "path": rel_path, + "relative_path": rel_path, + "absolute_path": str(path.resolve()), + "size_bytes": stat.st_size, + "content_hash": content_hash, + "file_hash": f"sha1:{idx.hash_id(content.decode('utf-8', errors='ignore'), rel_path, 1, len(content.splitlines()))}", + "modified_time": datetime.fromtimestamp(stat.st_mtime).isoformat(), + "language": language + } + operations.append(operation) + file_hashes[rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + + except Exception as e: + print(f"[bundle_create] Error processing created file {path}: {e}") + continue + + # Process updated files + for path in changes["updated"]: + rel_path = 
str(path.relative_to(Path(self.workspace_path))) + try: + with open(path, 'rb') as f: + content = f.read() + file_hash = hashlib.sha1(content).hexdigest() + content_hash = f"sha1:{file_hash}" + previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name) + + # Write file to bundle + bundle_file_path = files_dir / "updated" / rel_path + bundle_file_path.parent.mkdir(parents=True, exist_ok=True) + bundle_file_path.write_bytes(content) + + # Get file info + stat = path.stat() + language = idx.CODE_EXTS.get(path.suffix.lower(), "unknown") + + operation = { + "operation": "updated", + "path": rel_path, + "relative_path": rel_path, + "absolute_path": str(path.resolve()), + "size_bytes": stat.st_size, + "content_hash": content_hash, + "previous_hash": f"sha1:{previous_hash}" if previous_hash else None, + "file_hash": f"sha1:{idx.hash_id(content.decode('utf-8', errors='ignore'), rel_path, 1, len(content.splitlines()))}", + "modified_time": datetime.fromtimestamp(stat.st_mtime).isoformat(), + "language": language + } + operations.append(operation) + file_hashes[rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + + except Exception as e: + print(f"[bundle_create] Error processing updated file {path}: {e}") + continue + + # Process moved files + for source_path, dest_path in changes["moved"]: + dest_rel_path = str(dest_path.relative_to(Path(self.workspace_path))) + source_rel_path = str(source_path.relative_to(Path(self.workspace_path))) + try: + with open(dest_path, 'rb') as f: + content = f.read() + file_hash = hashlib.sha1(content).hexdigest() + content_hash = f"sha1:{file_hash}" + + # Write file to bundle + bundle_file_path = files_dir / "moved" / dest_rel_path + bundle_file_path.parent.mkdir(parents=True, exist_ok=True) + bundle_file_path.write_bytes(content) + + # Get file info + stat = dest_path.stat() + language = idx.CODE_EXTS.get(dest_path.suffix.lower(), "unknown") + + operation = { + "operation": "moved", + "path": dest_rel_path, + 
"relative_path": dest_rel_path, + "absolute_path": str(dest_path.resolve()), + "source_path": source_rel_path, + "source_relative_path": source_rel_path, + "source_absolute_path": str(source_path.resolve()), + "size_bytes": stat.st_size, + "content_hash": content_hash, + "file_hash": f"sha1:{idx.hash_id(content.decode('utf-8', errors='ignore'), dest_rel_path, 1, len(content.splitlines()))}", + "modified_time": datetime.fromtimestamp(stat.st_mtime).isoformat(), + "language": language + } + operations.append(operation) + file_hashes[dest_rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + + except Exception as e: + print(f"[bundle_create] Error processing moved file {source_path} -> {dest_path}: {e}") + continue + + # Process deleted files + for path in changes["deleted"]: + rel_path = str(path.relative_to(Path(self.workspace_path))) + try: + previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name) + + operation = { + "operation": "deleted", + "path": rel_path, + "relative_path": rel_path, + "absolute_path": str(path.resolve()), + "previous_hash": f"sha1:{previous_hash}" if previous_hash else None, + "file_hash": None, + "modified_time": datetime.now().isoformat(), + "language": idx.CODE_EXTS.get(path.suffix.lower(), "unknown") + } + operations.append(operation) + + except Exception as e: + print(f"[bundle_create] Error processing deleted file {path}: {e}") + continue + + # Create manifest + manifest = { + "version": "1.0", + "bundle_id": bundle_id, + "workspace_path": self.workspace_path, + "collection_name": self.collection_name, + "created_at": created_at, + # CLI is stateless - server will assign sequence numbers + "sequence_number": None, # Server will assign + "parent_sequence": None, # Server will determine + "operations": { + "created": len(changes["created"]), + "updated": len(changes["updated"]), + "deleted": len(changes["deleted"]), + "moved": len(changes["moved"]) + }, + "total_files": len(operations), + "total_size_bytes": 
total_size, + "compression": "gzip", + "encoding": "utf-8" + } + + # Write manifest + (temp_path / "manifest.json").write_text(json.dumps(manifest, indent=2)) + + # Write operations metadata + operations_metadata = { + "operations": operations + } + (metadata_dir / "operations.json").write_text(json.dumps(operations_metadata, indent=2)) + + # Write hashes + hashes_metadata = { + "workspace_path": self.workspace_path, + "updated_at": created_at, + "file_hashes": file_hashes + } + (metadata_dir / "hashes.json").write_text(json.dumps(hashes_metadata, indent=2)) + + # Create tarball in temporary directory + temp_bundle_dir = self._get_temp_bundle_dir() + bundle_path = temp_bundle_dir / f"{bundle_id}.tar.gz" + with tarfile.open(bundle_path, "w:gz") as tar: + tar.add(temp_path, arcname=f"{bundle_id}") + + return str(bundle_path), manifest + + def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, Any]: + """ + Upload delta bundle to remote server with exponential backoff retry. + + Args: + bundle_path: Path to the bundle tarball + manifest: Bundle manifest metadata + + Returns: + Server response dictionary + """ + last_error = None + + for attempt in range(self.max_retries + 1): + try: + # Simple exponential backoff + if attempt > 0: + delay = min(2 ** (attempt - 1), 30) # 1, 2, 4, 8... 
capped at 30s + logger.info(f"[remote_upload] Retry attempt {attempt + 1}/{self.max_retries + 1} after {delay}s delay") + time.sleep(delay) + + # Verify bundle exists + if not os.path.exists(bundle_path): + return {"success": False, "error": {"code": "BUNDLE_NOT_FOUND", "message": f"Bundle not found: {bundle_path}"}} + + # Check bundle size (100MB limit) + bundle_size = os.path.getsize(bundle_path) + if bundle_size > 100 * 1024 * 1024: + return {"success": False, "error": {"code": "BUNDLE_TOO_LARGE", "message": f"Bundle too large: {bundle_size} bytes"}} + + with open(bundle_path, 'rb') as bundle_file: + files = { + 'bundle': (f"{manifest['bundle_id']}.tar.gz", bundle_file, 'application/gzip') + } + + data = { + 'workspace_path': self._translate_to_container_path(self.workspace_path), + 'collection_name': self.collection_name, + # CLI is stateless - server handles sequence numbers + 'force': 'false', + 'source_path': self.workspace_path, + } + + logger.info(f"[remote_upload] Uploading bundle {manifest['bundle_id']} (size: {bundle_size} bytes)") + + response = self.session.post( + f"{self.upload_endpoint}/api/v1/delta/upload", + files=files, + data=data, + timeout=self.timeout + ) + + if response.status_code == 200: + result = response.json() + logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") + return result + + # Handle error + error_msg = f"Upload failed with status {response.status_code}" + try: + error_detail = response.json() + error_detail_msg = error_detail.get('error', {}).get('message', 'Unknown error') + error_msg += f": {error_detail_msg}" + error_code = error_detail.get('error', {}).get('code', 'HTTP_ERROR') + except: + error_msg += f": {response.text[:200]}" + error_code = "HTTP_ERROR" + + last_error = {"success": False, "error": {"code": error_code, "message": error_msg, "status_code": response.status_code}} + + # Don't retry on client errors (except 429) + if 400 <= response.status_code < 500 and response.status_code 
!= 429: + return last_error + + logger.warning(f"[remote_upload] Upload attempt {attempt + 1} failed: {error_msg}") + + except requests.exceptions.Timeout as e: + last_error = {"success": False, "error": {"code": "TIMEOUT_ERROR", "message": f"Upload timeout: {str(e)}"}} + logger.warning(f"[remote_upload] Upload timeout on attempt {attempt + 1}: {e}") + + except requests.exceptions.ConnectionError as e: + last_error = {"success": False, "error": {"code": "CONNECTION_ERROR", "message": f"Connection error: {str(e)}"}} + logger.warning(f"[remote_upload] Connection error on attempt {attempt + 1}: {e}") + + except requests.exceptions.RequestException as e: + last_error = {"success": False, "error": {"code": "NETWORK_ERROR", "message": f"Network error: {str(e)}"}} + logger.warning(f"[remote_upload] Network error on attempt {attempt + 1}: {e}") + + except Exception as e: + last_error = {"success": False, "error": {"code": "UPLOAD_ERROR", "message": f"Upload error: {str(e)}"}} + logger.error(f"[remote_upload] Unexpected error on attempt {attempt + 1}: {e}") + + # All retries exhausted + logger.error(f"[remote_upload] All {self.max_retries + 1} upload attempts failed for bundle {manifest.get('bundle_id', 'unknown')}") + return last_error or { + "success": False, + "error": { + "code": "MAX_RETRIES_EXCEEDED", + "message": f"Upload failed after {self.max_retries + 1} attempts" + } + } + + def get_server_status(self) -> Dict[str, Any]: + """Get server status with simplified error handling.""" + try: + container_workspace_path = self._translate_to_container_path(self.workspace_path) + + response = self.session.get( + f"{self.upload_endpoint}/api/v1/delta/status", + params={'workspace_path': container_workspace_path}, + timeout=min(self.timeout, 10) + ) + + if response.status_code == 200: + return response.json() + + # Handle error response + error_msg = f"Status check failed with HTTP {response.status_code}" + try: + error_detail = response.json() + error_msg += f": 
{error_detail.get('error', {}).get('message', 'Unknown error')}" + except: + error_msg += f": {response.text[:100]}" + + return {"success": False, "error": {"code": "STATUS_ERROR", "message": error_msg}} + + except requests.exceptions.Timeout: + return {"success": False, "error": {"code": "STATUS_TIMEOUT", "message": "Status check timeout"}} + except requests.exceptions.ConnectionError: + return {"success": False, "error": {"code": "CONNECTION_ERROR", "message": f"Cannot connect to server"}} + except Exception as e: + return {"success": False, "error": {"code": "STATUS_CHECK_ERROR", "message": f"Status check error: {str(e)}"}} + + def has_meaningful_changes(self, changes: Dict[str, List]) -> bool: + """Check if changes warrant a delta upload.""" + total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged") + return total_changes > 0 + + def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: + """ + Process pre-computed changes and upload delta bundle. + Includes comprehensive error handling and graceful fallback. 
+ + Args: + changes: Dictionary of file changes by type + + Returns: + True if upload was successful, False otherwise + """ + try: + logger.info(f"[remote_upload] Processing pre-computed changes") + + # Validate input + if not changes: + logger.info("[remote_upload] No changes provided") + return True + + if not self.has_meaningful_changes(changes): + logger.info("[remote_upload] No meaningful changes detected, skipping upload") + return True + + # Log change summary + total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged") + logger.info(f"[remote_upload] Detected {total_changes} meaningful changes: " + f"{len(changes['created'])} created, {len(changes['updated'])} updated, " + f"{len(changes['deleted'])} deleted, {len(changes['moved'])} moved") + + # Create delta bundle + bundle_path = None + try: + bundle_path, manifest = self.create_delta_bundle(changes) + logger.info(f"[remote_upload] Created delta bundle: {manifest['bundle_id']} " + f"(size: {manifest['total_size_bytes']} bytes)") + + # Validate bundle was created successfully + if not bundle_path or not os.path.exists(bundle_path): + raise RuntimeError(f"Failed to create bundle at {bundle_path}") + + except Exception as e: + logger.error(f"[remote_upload] Error creating delta bundle: {e}") + # Clean up any temporary files on failure + self.cleanup() + return False + + # Upload bundle with retry logic + try: + response = self.upload_bundle(bundle_path, manifest) + + if response.get("success", False): + processed_ops = response.get('processed_operations', {}) + logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") + logger.info(f"[remote_upload] Processed operations: {processed_ops}") + + # Clean up temporary bundle after successful upload + try: + if os.path.exists(bundle_path): + os.remove(bundle_path) + logger.debug(f"[remote_upload] Cleaned up temporary bundle: {bundle_path}") + # Also clean up the entire temp directory if this is the last bundle 
+ self.cleanup() + except Exception as cleanup_error: + logger.warning(f"[remote_upload] Failed to cleanup bundle {bundle_path}: {cleanup_error}") + + return True + else: + error_msg = response.get('error', {}).get('message', 'Unknown upload error') + logger.error(f"[remote_upload] Upload failed: {error_msg}") + return False + + except Exception as e: + logger.error(f"[remote_upload] Error uploading bundle: {e}") + return False + + except Exception as e: + logger.error(f"[remote_upload] Unexpected error in process_changes_and_upload: {e}") + return False + + def get_all_code_files(self) -> List[Path]: + """Get all code files in the workspace.""" + all_files = [] + try: + workspace_path = Path(self.workspace_path) + for ext in idx.CODE_EXTS: + all_files.extend(workspace_path.rglob(f"*{ext}")) + + # Filter out directories and hidden files + all_files = [ + f for f in all_files + if f.is_file() + and not any(part.startswith('.') for part in f.parts) + and '.codebase' not in str(f) + ] + except Exception as e: + logger.error(f"[watch] Error scanning files: {e}") + + return all_files + + def watch_loop(self, interval: int = 5): + """Main file watching loop using existing detection and upload methods.""" + logger.info(f"[watch] Starting file monitoring (interval: {interval}s)") + logger.info(f"[watch] Monitoring: {self.workspace_path}") + logger.info(f"[watch] Press Ctrl+C to stop") + + try: + while True: + try: + # Use existing change detection (get all files in workspace) + all_files = self.get_all_code_files() + changes = self.detect_file_changes(all_files) + + # Count only meaningful changes (exclude unchanged) + meaningful_changes = len(changes.get("created", [])) + len(changes.get("updated", [])) + len(changes.get("deleted", [])) + len(changes.get("moved", [])) + + if meaningful_changes > 0: + logger.info(f"[watch] Detected {meaningful_changes} changes: { {k: len(v) for k, v in changes.items() if k != 'unchanged'} }") + + # Use existing upload method + success = 
self.process_changes_and_upload(changes) + + if success: + logger.info(f"[watch] Successfully uploaded changes") + else: + logger.error(f"[watch] Failed to upload changes") + else: + logger.debug(f"[watch] No changes detected") # Debug level to avoid spam + + # Sleep until next check + time.sleep(interval) + + except KeyboardInterrupt: + logger.info(f"[watch] Received interrupt signal, stopping...") + break + except Exception as e: + logger.error(f"[watch] Error in watch loop: {e}") + time.sleep(interval) # Continue even after errors + + except KeyboardInterrupt: + logger.info(f"[watch] File monitoring stopped by user") + + def process_and_upload_changes(self, changed_paths: List[Path]) -> bool: + """ + Process changed paths and upload delta bundle if meaningful changes exist. + Includes comprehensive error handling and graceful fallback. + + Args: + changed_paths: List of changed file paths + + Returns: + True if upload was successful, False otherwise + """ + try: + logger.info(f"[remote_upload] Processing {len(changed_paths)} changed paths") + + # Validate input + if not changed_paths: + logger.info("[remote_upload] No changed paths provided") + return True + + # Detect changes + try: + changes = self.detect_file_changes(changed_paths) + except Exception as e: + logger.error(f"[remote_upload] Error detecting file changes: {e}") + return False + + if not self.has_meaningful_changes(changes): + logger.info("[remote_upload] No meaningful changes detected, skipping upload") + return True + + # Log change summary + total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged") + logger.info(f"[remote_upload] Detected {total_changes} meaningful changes: " + f"{len(changes['created'])} created, {len(changes['updated'])} updated, " + f"{len(changes['deleted'])} deleted, {len(changes['moved'])} moved") + + # Create delta bundle + bundle_path = None + try: + bundle_path, manifest = self.create_delta_bundle(changes) + logger.info(f"[remote_upload] 
Created delta bundle: {manifest['bundle_id']} " + f"(size: {manifest['total_size_bytes']} bytes)") + + # Validate bundle was created successfully + if not bundle_path or not os.path.exists(bundle_path): + raise RuntimeError(f"Failed to create bundle at {bundle_path}") + + except Exception as e: + logger.error(f"[remote_upload] Error creating delta bundle: {e}") + # Clean up any temporary files on failure + self.cleanup() + return False + + # Upload bundle with retry logic + try: + response = self.upload_bundle(bundle_path, manifest) + + if response.get("success", False): + processed_ops = response.get('processed_operations', {}) + logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") + logger.info(f"[remote_upload] Processed operations: {processed_ops}") + + # Clean up temporary bundle after successful upload + try: + if os.path.exists(bundle_path): + os.remove(bundle_path) + logger.debug(f"[remote_upload] Cleaned up temporary bundle: {bundle_path}") + # Also clean up the entire temp directory if this is the last bundle + self.cleanup() + except Exception as cleanup_error: + logger.warning(f"[remote_upload] Failed to cleanup bundle {bundle_path}: {cleanup_error}") + + return True + else: + error = response.get("error", {}) + error_code = error.get("code", "UNKNOWN") + error_msg = error.get("message", "Unknown error") + + logger.error(f"[remote_upload] Upload failed: {error_msg}") + + # Handle specific error types + # CLI is stateless - server handles sequence management + if error_code in ["BUNDLE_TOO_LARGE", "BUNDLE_NOT_FOUND"]: + # These are unrecoverable errors + logger.error(f"[remote_upload] Unrecoverable error ({error_code}): {error_msg}") + return False + elif error_code in ["TIMEOUT_ERROR", "CONNECTION_ERROR", "NETWORK_ERROR"]: + # These might be temporary, suggest fallback + logger.warning(f"[remote_upload] Network-related error ({error_code}): {error_msg}") + logger.warning("[remote_upload] Consider falling back to local 
mode if this persists") + return False + else: + # Other errors + logger.error(f"[remote_upload] Upload error ({error_code}): {error_msg}") + return False + + except Exception as e: + logger.error(f"[remote_upload] Unexpected error during upload: {e}") + return False + + except Exception as e: + logger.error(f"[remote_upload] Critical error in process_and_upload_changes: {e}") + logger.exception("[remote_upload] Full traceback:") + return False + +def get_remote_config(cli_path: Optional[str] = None) -> Dict[str, str]: + """Get remote upload configuration from environment variables and command-line arguments.""" + # Use command-line path if provided, otherwise fall back to environment variables + if cli_path: + workspace_path = cli_path + else: + workspace_path = os.environ.get("WATCH_ROOT", os.environ.get("WORKSPACE_PATH", "/work")) + + # Use auto-generated collection name based on repo name + repo_name = _extract_repo_name_from_path(workspace_path) + # Fallback to directory name if repo detection fails + if not repo_name: + repo_name = Path(workspace_path).name + collection_name = get_collection_name(repo_name) + + return { + "upload_endpoint": os.environ.get("REMOTE_UPLOAD_ENDPOINT", "http://localhost:8080"), + "workspace_path": workspace_path, + "collection_name": collection_name, + "max_retries": int(os.environ.get("REMOTE_UPLOAD_MAX_RETRIES", "3")), + "timeout": int(os.environ.get("REMOTE_UPLOAD_TIMEOUT", "30")) + } + + +def main(): + """Main entry point for the remote upload client.""" + parser = argparse.ArgumentParser( + description="Remote upload client for delta bundles in Context-Engine", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Upload from current directory or environment variables + python remote_upload_client.py + + # Upload from specific directory + python remote_upload_client.py --path /path/to/repo + + # Upload from specific directory with custom endpoint + python remote_upload_client.py --path 
/path/to/repo --endpoint http://remote-server:8080 + + # Watch for file changes and upload automatically + python remote_upload_client.py --path /path/to/repo --watch + + # Watch with custom interval (check every 3 seconds) + python remote_upload_client.py --path /path/to/repo --watch --interval 3 + """ + ) + + parser.add_argument( + "--path", + type=str, + help="Path to the directory to upload (overrides WATCH_ROOT/WORKSPACE_PATH environment variables)" + ) + + parser.add_argument( + "--endpoint", + type=str, + help="Remote upload endpoint (overrides REMOTE_UPLOAD_ENDPOINT environment variable)" + ) + + parser.add_argument( + "--max-retries", + type=int, + help="Maximum number of upload retries (overrides REMOTE_UPLOAD_MAX_RETRIES environment variable)" + ) + + parser.add_argument( + "--timeout", + type=int, + help="Request timeout in seconds (overrides REMOTE_UPLOAD_TIMEOUT environment variable)" + ) + + parser.add_argument( + "--force", + action="store_true", + help="Force upload of all files (ignore cached state and treat all files as new)" + ) + + parser.add_argument( + "--show-mapping", + action="store_true", + help="Print collection↔workspace mapping information and exit" + ) + + parser.add_argument( + "--watch", "-w", + action="store_true", + help="Watch for file changes and upload automatically (continuous mode)" + ) + + parser.add_argument( + "--interval", "-i", + type=int, + default=5, + help="Watch interval in seconds (default: 5)" + ) + + args = parser.parse_args() + + # Validate path if provided + if args.path: + if not os.path.exists(args.path): + logger.error(f"Path does not exist: {args.path}") + return 1 + + if not os.path.isdir(args.path): + logger.error(f"Path is not a directory: {args.path}") + return 1 + + args.path = os.path.abspath(args.path) + logger.info(f"Using specified path: {args.path}") + + # Get configuration + config = get_remote_config(args.path) + + # Override with command-line arguments + if args.endpoint: + 
config["upload_endpoint"] = args.endpoint + if args.max_retries is not None: + config["max_retries"] = args.max_retries + if args.timeout is not None: + config["timeout"] = args.timeout + + logger.info(f"Workspace path: {config['workspace_path']}") + logger.info(f"Collection name: {config['collection_name']}") + logger.info(f"Upload endpoint: {config['upload_endpoint']}") + + if args.show_mapping: + with RemoteUploadClient( + upload_endpoint=config["upload_endpoint"], + workspace_path=config["workspace_path"], + collection_name=config["collection_name"], + max_retries=config["max_retries"], + timeout=config["timeout"], + ) as client: + client.log_mapping_summary() + return 0 + + # Handle watch mode + if args.watch: + logger.info("Starting watch mode for continuous file monitoring") + try: + with RemoteUploadClient( + upload_endpoint=config["upload_endpoint"], + workspace_path=config["workspace_path"], + collection_name=config["collection_name"], + max_retries=config["max_retries"], + timeout=config["timeout"] + ) as client: + + logger.info("Remote upload client initialized successfully") + client.log_mapping_summary() + + # Test server connection first + logger.info("Checking server status...") + status = client.get_server_status() + is_success = ( + isinstance(status, dict) and + 'workspace_path' in status and + 'collection_name' in status and + status.get('status') == 'ready' + ) + if not is_success: + error = status.get("error", {}) + logger.error(f"Cannot connect to server: {error.get('message', 'Unknown error')}") + return 1 + + logger.info("Server connection successful") + logger.info(f"Starting file monitoring with {args.interval}s interval") + + # Start the watch loop + client.watch_loop(interval=args.interval) + + return 0 + + except KeyboardInterrupt: + logger.info("Watch mode stopped by user") + return 0 + except Exception as e: + logger.error(f"Watch mode failed: {e}") + return 1 + + # Initialize client with context manager for cleanup + try: + with 
RemoteUploadClient( + upload_endpoint=config["upload_endpoint"], + workspace_path=config["workspace_path"], + collection_name=config["collection_name"], + max_retries=config["max_retries"], + timeout=config["timeout"] + ) as client: + + logger.info("Remote upload client initialized successfully") + + client.log_mapping_summary() + + # Test server connection + logger.info("Checking server status...") + status = client.get_server_status() + # For delta endpoint, success is indicated by having expected fields (not a "success" boolean) + is_success = ( + isinstance(status, dict) and + 'workspace_path' in status and + 'collection_name' in status and + status.get('status') == 'ready' + ) + if not is_success: + error = status.get("error", {}) + logger.error(f"Cannot connect to server: {error.get('message', 'Unknown error')}") + return 1 + + logger.info("Server connection successful") + + # Scan repository and upload files + logger.info("Scanning repository for files...") + workspace_path = Path(config['workspace_path']) + + # Find all files in the repository + all_files = [] + for file_path in workspace_path.rglob('*'): + if file_path.is_file() and not file_path.name.startswith('.'): + rel_path = file_path.relative_to(workspace_path) + # Skip .codebase directory and other metadata + if not str(rel_path).startswith('.codebase'): + all_files.append(file_path) + + logger.info(f"Found {len(all_files)} files to upload") + + if not all_files: + logger.warning("No files found to upload") + return 0 + + # Detect changes (treat all files as changes for initial upload) + if args.force: + # Force mode: treat all files as created + changes = {"created": all_files, "updated": [], "deleted": [], "moved": [], "unchanged": []} + else: + changes = client.detect_file_changes(all_files) + + if not client.has_meaningful_changes(changes): + logger.info("No meaningful changes to upload") + return 0 + + logger.info(f"Changes detected: {len(changes.get('created', []))} created, 
{len(changes.get('updated', []))} updated, {len(changes.get('deleted', []))} deleted") + + # Process and upload changes + logger.info("Uploading files to remote server...") + success = client.process_changes_and_upload(changes) + + if success: + logger.info("Repository upload completed successfully!") + logger.info(f"Collection name: {config['collection_name']}") + logger.info(f"Files uploaded: {len(all_files)}") + else: + logger.error("Repository upload failed!") + return 1 + + return 0 + + except Exception as e: + logger.error(f"Failed to initialize remote upload client: {e}") + return 1 + + +if __name__ == "__main__": + import sys + sys.exit(main()) diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py new file mode 100644 index 00000000..76a42432 --- /dev/null +++ b/scripts/standalone_upload_client.py @@ -0,0 +1,1259 @@ +#!/usr/bin/env python3 +""" +Standalone Remote Upload Client for Context-Engine. + +This is a self-contained version of the remote upload client that doesn't require +the full Context-Engine repository. It includes only the essential functions +needed for delta bundle creation and upload. 
+ +Example usage: + python3 standalone_upload_client.py --path /path/to/your/project --server https://your-server.com +""" + +import os +import json +import time +import uuid +import hashlib +import tarfile +import tempfile +import logging +import argparse +from pathlib import Path +from typing import Dict, List, Any, Optional, Tuple +from datetime import datetime +import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# ============================================================================= +# EMBEDDED DEPENDENCIES (Extracted from Context-Engine) +# ============================================================================= + +# Language detection mapping (from ingest_code.py) +CODE_EXTS = { + ".py": "python", + ".js": "javascript", + ".ts": "typescript", + ".tsx": "typescript", + ".jsx": "javascript", + ".java": "java", + ".go": "go", + ".rs": "rust", + ".rb": "ruby", + ".php": "php", + ".c": "c", + ".h": "c", + ".cpp": "cpp", + ".cc": "cpp", + ".hpp": "cpp", + ".cs": "csharp", + ".kt": "kotlin", + ".swift": "swift", + ".scala": "scala", + ".sh": "shell", + ".ps1": "powershell", + ".psm1": "powershell", + ".psd1": "powershell", + ".sql": "sql", + ".md": "markdown", + ".yml": "yaml", + ".yaml": "yaml", + ".toml": "toml", + ".ini": "ini", + ".cfg": "ini", + ".conf": "ini", + ".xml": "xml", + ".html": "html", + ".htm": "html", + ".css": "css", + ".scss": "scss", + ".sass": "sass", + ".less": "less", + ".json": "json", + "Dockerfile": "dockerfile", + "Makefile": "makefile", + ".tf": "terraform", + ".tfvars": "terraform", + ".hcl": "terraform", + ".vue": "vue", + ".svelte": "svelte", + ".elm": "elm", + ".dart": "dart", + ".lua": "lua", + ".r": "r", + ".R": "r", + ".m": "matlab", + ".pl": "perl", + ".swift": "swift", + ".kt": "kotlin", + ".cljs": "clojure", + ".clj": "clojure", + ".hs": "haskell", + ".ml": "ocaml", + 
".zig": "zig", + ".nim": "nim", + ".v": "verilog", + ".sv": "verilog", + ".vhdl": "vhdl", + ".asm": "assembly", + ".s": "assembly", + ". Dockerfile": "dockerfile", +} + +def hash_id(text: str, path: str, start: int, end: int) -> str: + """Generate hash ID for content (from ingest_code.py).""" + h = hashlib.sha1( + f"{path}:{start}-{end}\n{text}".encode("utf-8", errors="ignore") + ).hexdigest() + return h[:16] + +def get_collection_name(repo_name: Optional[str] = None) -> str: + """Generate collection name with 8-char hash for local workspaces. + + Simplified version from workspace_state.py. + """ + if not repo_name: + return "default-collection" + hash_obj = hashlib.sha256(repo_name.encode()) + short_hash = hash_obj.hexdigest()[:8] + return f"{repo_name}-{short_hash}" + +def _extract_repo_name_from_path(workspace_path: str) -> str: + """Extract repository name from workspace path. + + Simplified version from workspace_state.py. + """ + try: + path = Path(workspace_path).resolve() + # Get the directory name as repo name + return path.name + except Exception: + return "unknown-repo" + +# Simple file-based hash cache (simplified from workspace_state.py) +class SimpleHashCache: + """Simple file-based hash cache for tracking file changes.""" + + def __init__(self, workspace_path: str, repo_name: str): + self.workspace_path = Path(workspace_path).resolve() + self.repo_name = repo_name + self.cache_dir = self.workspace_path / ".context-engine" + self.cache_file = self.cache_dir / "file_cache.json" + self.cache_dir.mkdir(exist_ok=True) + + def _load_cache(self) -> Dict[str, str]: + """Load cache from disk.""" + if not self.cache_file.exists(): + return {} + try: + with open(self.cache_file, 'r', encoding='utf-8') as f: + data = json.load(f) + return data.get("file_hashes", {}) + except Exception: + return {} + + def _save_cache(self, file_hashes: Dict[str, str]): + """Save cache to disk.""" + try: + data = { + "file_hashes": file_hashes, + "updated_at": 
datetime.now().isoformat()
            }
            with open(self.cache_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2)
        except Exception:
            # NOTE(review): write failures are swallowed silently — the cache
            # is best-effort, but a debug log here would aid troubleshooting.
            pass

    def get_hash(self, file_path: str) -> str:
        """Return the cached hash for *file_path* ("" if not cached)."""
        file_hashes = self._load_cache()
        abs_path = str(Path(file_path).resolve())
        return file_hashes.get(abs_path, "")

    def set_hash(self, file_path: str, file_hash: str):
        """Record *file_hash* for *file_path* (full load/save round-trip per call)."""
        file_hashes = self._load_cache()
        abs_path = str(Path(file_path).resolve())
        file_hashes[abs_path] = file_hash
        self._save_cache(file_hashes)

# Create global cache instance (will be initialized in RemoteUploadClient)
# NOTE(review): module-global state — assumes one client per process; a second
# RemoteUploadClient would silently repoint the shared cache. Confirm intended.
_hash_cache: Optional[SimpleHashCache] = None

def get_cached_file_hash(file_path: str, repo_name: Optional[str] = None) -> str:
    """Get cached file hash for tracking changes ("" when cache uninitialized)."""
    global _hash_cache
    if _hash_cache:
        return _hash_cache.get_hash(file_path)
    return ""

def set_cached_file_hash(file_path: str, file_hash: str, repo_name: Optional[str] = None):
    """Set cached file hash for tracking changes (no-op when cache uninitialized)."""
    global _hash_cache
    if _hash_cache:
        _hash_cache.set_hash(file_path, file_hash)


class RemoteUploadClient:
    """Client for uploading delta bundles to remote server."""

    def _translate_to_container_path(self, host_path: str) -> str:
        """Translate host path to container path for API communication."""
        # Use environment variable for path mapping if available
        # NOTE(review): the HOST_ROOT default is a developer-machine-specific
        # absolute path; deployments should set HOST_ROOT/CONTAINER_ROOT
        # explicitly — confirm this default is intentional.
        host_root = os.environ.get("HOST_ROOT", "/home/coder/project/Context-Engine/dev-workspace")
        container_root = os.environ.get("CONTAINER_ROOT", "/work")

        if host_path.startswith(host_root):
            return host_path.replace(host_root, container_root)
        else:
            # Fallback: if path doesn't match expected pattern, use as-is
            return host_path

    def __init__(self, upload_endpoint: str, workspace_path: str, collection_name: str,
                 max_retries: int = 3, timeout: int = 30, metadata_path: Optional[str] = None):
        """Initialize remote upload client.

        Args:
            upload_endpoint: Base URL of the delta-upload server (trailing '/' stripped).
            workspace_path: Local workspace root to upload from.
            collection_name: Target collection on the server.
            max_retries: HTTP retry budget (also used by the session's Retry adapter).
            timeout: Per-request timeout in seconds.
            metadata_path: Unused here; accepted for interface compatibility.
        """
        self.upload_endpoint = upload_endpoint.rstrip('/')
        self.workspace_path = workspace_path
        self.collection_name = collection_name
        self.max_retries = max_retries
        self.timeout = timeout
        self.temp_dir = None

        # Set environment variables for cache functions
        os.environ["WORKSPACE_PATH"] = workspace_path

        # Store repo name and initialize hash cache
        self.repo_name = _extract_repo_name_from_path(workspace_path)
        # Fallback to directory name if repo detection fails (for non-git repos)
        if not self.repo_name:
            self.repo_name = Path(workspace_path).name
        # Rebind the module-global cache to this client's workspace.
        global _hash_cache
        _hash_cache = SimpleHashCache(workspace_path, self.repo_name)

        # Setup HTTP session with simple retry
        self.session = requests.Session()
        retry_strategy = Retry(total=max_retries, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit with cleanup."""
        self.cleanup()

    def cleanup(self):
        """Clean up temporary directories (idempotent; never raises)."""
        if self.temp_dir and os.path.exists(self.temp_dir):
            try:
                import shutil
                shutil.rmtree(self.temp_dir)
                logger.debug(f"[remote_upload] Cleaned up temporary directory: {self.temp_dir}")
            except Exception as e:
                logger.warning(f"[remote_upload] Failed to cleanup temp directory {self.temp_dir}: {e}")
            finally:
                self.temp_dir = None

    def get_mapping_summary(self) -> Dict[str, Any]:
        """Return derived collection mapping details."""
        container_path = self._translate_to_container_path(self.workspace_path)
        return {
            "repo_name": self.repo_name,
            "collection_name": self.collection_name,
            "source_path": self.workspace_path,
            "container_path": container_path,
            "upload_endpoint": self.upload_endpoint,
        }

    def log_mapping_summary(self) -> None:
        """Log
mapping summary for user visibility."""
        info = self.get_mapping_summary()
        logger.info("[remote_upload] Collection mapping:")
        logger.info(f"  repo_name: {info['repo_name']}")
        logger.info(f"  collection_name: {info['collection_name']}")
        logger.info(f"  source_path: {info['source_path']}")
        logger.info(f"  container_path: {info['container_path']}")

    def _get_temp_bundle_dir(self) -> Path:
        """Get or create temporary directory for bundle creation."""
        if not self.temp_dir:
            self.temp_dir = tempfile.mkdtemp(prefix="delta_bundle_")
        return Path(self.temp_dir)
    # CLI is stateless - sequence tracking is handled by server

    def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]:
        """
        Detect what type of changes occurred for each file path.

        Args:
            changed_paths: List of changed file paths

        Returns:
            Dictionary with change types: created, updated, deleted, moved, unchanged
        """
        changes = {
            "created": [],
            "updated": [],
            "deleted": [],
            "moved": [],
            "unchanged": []
        }

        for path in changed_paths:
            abs_path = str(path.resolve())
            cached_hash = get_cached_file_hash(abs_path, self.repo_name)

            if not path.exists():
                # File was deleted
                if cached_hash:
                    changes["deleted"].append(path)
            else:
                # File exists - calculate current hash
                try:
                    with open(path, 'rb') as f:
                        content = f.read()
                        current_hash = hashlib.sha1(content).hexdigest()

                    if not cached_hash:
                        # New file
                        changes["created"].append(path)
                    elif cached_hash != current_hash:
                        # Modified file
                        changes["updated"].append(path)
                    else:
                        # Unchanged (might be a move detection candidate)
                        changes["unchanged"].append(path)

                    # Update cache
                    # NOTE(review): the cache is updated here, during detection —
                    # before any bundle is built or upload confirmed. A later
                    # "previous hash" lookup therefore sees the NEW hash, and a
                    # failed upload means the change is invisible on the next
                    # scan. Confirm this ordering is intentional.
                    set_cached_file_hash(abs_path, current_hash, self.repo_name)
                except Exception:
                    # Skip files that can't be read
                    continue

        # Detect moves by looking for files with same content hash
        # but different paths (requires additional tracking)
        changes["moved"] = self._detect_moves(changes["created"], changes["deleted"])

        return changes

    def _detect_moves(self, created_files: List[Path], deleted_files: List[Path]) -> List[Tuple[Path, Path]]:
        """
        Detect file moves by matching content hashes between created and deleted files.

        Args:
            created_files: List of newly created files
            deleted_files: List of deleted files

        Returns:
            List of (source, destination) path tuples for detected moves
        """
        moves = []
        deleted_hashes = {}

        # Build hash map for deleted files
        for deleted_path in deleted_files:
            try:
                # Try to get cached hash first, fallback to file content
                cached_hash = get_cached_file_hash(str(deleted_path), self.repo_name)
                if cached_hash:
                    deleted_hashes[cached_hash] = deleted_path
                    continue

                # If no cached hash, try to read from file if it still exists
                if deleted_path.exists():
                    with open(deleted_path, 'rb') as f:
                        content = f.read()
                    file_hash = hashlib.sha1(content).hexdigest()
                    deleted_hashes[file_hash] = deleted_path
            except Exception:
                continue

        # Match created files with deleted files by hash
        for created_path in created_files:
            try:
                with open(created_path, 'rb') as f:
                    content = f.read()
                file_hash = hashlib.sha1(content).hexdigest()

                if file_hash in deleted_hashes:
                    source_path = deleted_hashes[file_hash]
                    moves.append((source_path, created_path))
                    # Remove from consideration so one source maps to one dest
                    del deleted_hashes[file_hash]
            except Exception:
                continue

        return moves

    def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, Any]]:
        """
        Create a delta bundle from detected changes.
+ + Args: + changes: Dictionary of file changes by type + + Returns: + Tuple of (bundle_path, manifest_metadata) + """ + bundle_id = str(uuid.uuid4()) + # CLI is stateless - server handles sequence numbers + created_at = datetime.now().isoformat() + + # Create temporary directory for bundle + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Create directory structure + files_dir = temp_path / "files" + metadata_dir = temp_path / "metadata" + files_dir.mkdir() + metadata_dir.mkdir() + + # Create subdirectories + (files_dir / "created").mkdir() + (files_dir / "updated").mkdir() + (files_dir / "moved").mkdir() + + operations = [] + total_size = 0 + file_hashes = {} + + # Process created files + for path in changes["created"]: + rel_path = str(path.relative_to(Path(self.workspace_path))) + try: + with open(path, 'rb') as f: + content = f.read() + file_hash = hashlib.sha1(content).hexdigest() + content_hash = f"sha1:{file_hash}" + + # Write file to bundle + bundle_file_path = files_dir / "created" / rel_path + bundle_file_path.parent.mkdir(parents=True, exist_ok=True) + bundle_file_path.write_bytes(content) + + # Get file info + stat = path.stat() + language = CODE_EXTS.get(path.suffix.lower(), "unknown") + + operation = { + "operation": "created", + "path": rel_path, + "relative_path": rel_path, + "absolute_path": str(path.resolve()), + "size_bytes": stat.st_size, + "content_hash": content_hash, + "file_hash": f"sha1:{hash_id(content.decode('utf-8', errors='ignore'), rel_path, 1, len(content.splitlines()))}", + "modified_time": datetime.fromtimestamp(stat.st_mtime).isoformat(), + "language": language + } + operations.append(operation) + file_hashes[rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + + except Exception as e: + print(f"[bundle_create] Error processing created file {path}: {e}") + continue + + # Process updated files + for path in changes["updated"]: + rel_path = str(path.relative_to(Path(self.workspace_path))) 
+ try: + with open(path, 'rb') as f: + content = f.read() + file_hash = hashlib.sha1(content).hexdigest() + content_hash = f"sha1:{file_hash}" + previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name) + + # Write file to bundle + bundle_file_path = files_dir / "updated" / rel_path + bundle_file_path.parent.mkdir(parents=True, exist_ok=True) + bundle_file_path.write_bytes(content) + + # Get file info + stat = path.stat() + language = CODE_EXTS.get(path.suffix.lower(), "unknown") + + operation = { + "operation": "updated", + "path": rel_path, + "relative_path": rel_path, + "absolute_path": str(path.resolve()), + "size_bytes": stat.st_size, + "content_hash": content_hash, + "previous_hash": f"sha1:{previous_hash}" if previous_hash else None, + "file_hash": f"sha1:{hash_id(content.decode('utf-8', errors='ignore'), rel_path, 1, len(content.splitlines()))}", + "modified_time": datetime.fromtimestamp(stat.st_mtime).isoformat(), + "language": language + } + operations.append(operation) + file_hashes[rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + + except Exception as e: + print(f"[bundle_create] Error processing updated file {path}: {e}") + continue + + # Process moved files + for source_path, dest_path in changes["moved"]: + dest_rel_path = str(dest_path.relative_to(Path(self.workspace_path))) + source_rel_path = str(source_path.relative_to(Path(self.workspace_path))) + try: + with open(dest_path, 'rb') as f: + content = f.read() + file_hash = hashlib.sha1(content).hexdigest() + content_hash = f"sha1:{file_hash}" + + # Write file to bundle + bundle_file_path = files_dir / "moved" / dest_rel_path + bundle_file_path.parent.mkdir(parents=True, exist_ok=True) + bundle_file_path.write_bytes(content) + + # Get file info + stat = dest_path.stat() + language = CODE_EXTS.get(dest_path.suffix.lower(), "unknown") + + operation = { + "operation": "moved", + "path": dest_rel_path, + "relative_path": dest_rel_path, + "absolute_path": 
str(dest_path.resolve()), + "source_path": source_rel_path, + "source_relative_path": source_rel_path, + "source_absolute_path": str(source_path.resolve()), + "size_bytes": stat.st_size, + "content_hash": content_hash, + "file_hash": f"sha1:{idx.hash_id(content.decode('utf-8', errors='ignore'), dest_rel_path, 1, len(content.splitlines()))}", + "modified_time": datetime.fromtimestamp(stat.st_mtime).isoformat(), + "language": language + } + operations.append(operation) + file_hashes[dest_rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + + except Exception as e: + print(f"[bundle_create] Error processing moved file {source_path} -> {dest_path}: {e}") + continue + + # Process deleted files + for path in changes["deleted"]: + rel_path = str(path.relative_to(Path(self.workspace_path))) + try: + previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name) + + operation = { + "operation": "deleted", + "path": rel_path, + "relative_path": rel_path, + "absolute_path": str(path.resolve()), + "previous_hash": f"sha1:{previous_hash}" if previous_hash else None, + "file_hash": None, + "modified_time": datetime.now().isoformat(), + "language": idx.CODE_EXTS.get(path.suffix.lower(), "unknown") + } + operations.append(operation) + + except Exception as e: + print(f"[bundle_create] Error processing deleted file {path}: {e}") + continue + + # Create manifest + manifest = { + "version": "1.0", + "bundle_id": bundle_id, + "workspace_path": self.workspace_path, + "collection_name": self.collection_name, + "created_at": created_at, + # CLI is stateless - server will assign sequence numbers + "sequence_number": None, # Server will assign + "parent_sequence": None, # Server will determine + "operations": { + "created": len(changes["created"]), + "updated": len(changes["updated"]), + "deleted": len(changes["deleted"]), + "moved": len(changes["moved"]) + }, + "total_files": len(operations), + "total_size_bytes": total_size, + "compression": "gzip", + "encoding": 
"utf-8" + } + + # Write manifest + (temp_path / "manifest.json").write_text(json.dumps(manifest, indent=2)) + + # Write operations metadata + operations_metadata = { + "operations": operations + } + (metadata_dir / "operations.json").write_text(json.dumps(operations_metadata, indent=2)) + + # Write hashes + hashes_metadata = { + "workspace_path": self.workspace_path, + "updated_at": created_at, + "file_hashes": file_hashes + } + (metadata_dir / "hashes.json").write_text(json.dumps(hashes_metadata, indent=2)) + + # Create tarball in temporary directory + temp_bundle_dir = self._get_temp_bundle_dir() + bundle_path = temp_bundle_dir / f"{bundle_id}.tar.gz" + with tarfile.open(bundle_path, "w:gz") as tar: + tar.add(temp_path, arcname=f"{bundle_id}") + + return str(bundle_path), manifest + + def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, Any]: + """ + Upload delta bundle to remote server with exponential backoff retry. + + Args: + bundle_path: Path to the bundle tarball + manifest: Bundle manifest metadata + + Returns: + Server response dictionary + """ + last_error = None + + for attempt in range(self.max_retries + 1): + try: + # Simple exponential backoff + if attempt > 0: + delay = min(2 ** (attempt - 1), 30) # 1, 2, 4, 8... 
capped at 30s + logger.info(f"[remote_upload] Retry attempt {attempt + 1}/{self.max_retries + 1} after {delay}s delay") + time.sleep(delay) + + # Verify bundle exists + if not os.path.exists(bundle_path): + return {"success": False, "error": {"code": "BUNDLE_NOT_FOUND", "message": f"Bundle not found: {bundle_path}"}} + + # Check bundle size (100MB limit) + bundle_size = os.path.getsize(bundle_path) + if bundle_size > 100 * 1024 * 1024: + return {"success": False, "error": {"code": "BUNDLE_TOO_LARGE", "message": f"Bundle too large: {bundle_size} bytes"}} + + with open(bundle_path, 'rb') as bundle_file: + files = { + 'bundle': (f"{manifest['bundle_id']}.tar.gz", bundle_file, 'application/gzip') + } + + data = { + 'workspace_path': self._translate_to_container_path(self.workspace_path), + 'collection_name': self.collection_name, + # CLI is stateless - server handles sequence numbers + 'force': 'false', + 'source_path': self.workspace_path, + } + + logger.info(f"[remote_upload] Uploading bundle {manifest['bundle_id']} (size: {bundle_size} bytes)") + + response = self.session.post( + f"{self.upload_endpoint}/api/v1/delta/upload", + files=files, + data=data, + timeout=self.timeout + ) + + if response.status_code == 200: + result = response.json() + logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") + return result + # Handle error + error_msg = f"Upload failed with status {response.status_code}" + try: + error_detail = response.json() + error_detail_msg = error_detail.get('error', {}).get('message', 'Unknown error') + error_msg += f": {error_detail_msg}" + error_code = error_detail.get('error', {}).get('code', 'HTTP_ERROR') + except: + error_msg += f": {response.text[:200]}" + error_code = "HTTP_ERROR" + + last_error = {"success": False, "error": {"code": error_code, "message": error_msg, "status_code": response.status_code}} + + # Don't retry on client errors (except 429) + if 400 <= response.status_code < 500 and response.status_code 
!= 429: + return last_error + + logger.warning(f"[remote_upload] Upload attempt {attempt + 1} failed: {error_msg}") + + except requests.exceptions.Timeout as e: + last_error = {"success": False, "error": {"code": "TIMEOUT_ERROR", "message": f"Upload timeout: {str(e)}"}} + logger.warning(f"[remote_upload] Upload timeout on attempt {attempt + 1}: {e}") + + except requests.exceptions.ConnectionError as e: + last_error = {"success": False, "error": {"code": "CONNECTION_ERROR", "message": f"Connection error: {str(e)}"}} + logger.warning(f"[remote_upload] Connection error on attempt {attempt + 1}: {e}") + + except requests.exceptions.RequestException as e: + last_error = {"success": False, "error": {"code": "NETWORK_ERROR", "message": f"Network error: {str(e)}"}} + logger.warning(f"[remote_upload] Network error on attempt {attempt + 1}: {e}") + + except Exception as e: + last_error = {"success": False, "error": {"code": "UPLOAD_ERROR", "message": f"Upload error: {str(e)}"}} + logger.error(f"[remote_upload] Unexpected error on attempt {attempt + 1}: {e}") + + # All retries exhausted + logger.error(f"[remote_upload] All {self.max_retries + 1} upload attempts failed for bundle {manifest.get('bundle_id', 'unknown')}") + return last_error or { + "success": False, + "error": { + "code": "MAX_RETRIES_EXCEEDED", + "message": f"Upload failed after {self.max_retries + 1} attempts" + } + } + + def get_server_status(self) -> Dict[str, Any]: + """Get server status with simplified error handling.""" + try: + container_workspace_path = self._translate_to_container_path(self.workspace_path) + + response = self.session.get( + f"{self.upload_endpoint}/api/v1/delta/status", + params={'workspace_path': container_workspace_path}, + timeout=min(self.timeout, 10) + ) + + if response.status_code == 200: + return response.json() + + # Handle error response + error_msg = f"Status check failed with HTTP {response.status_code}" + try: + error_detail = response.json() + error_msg += f": 
def has_meaningful_changes(self, changes: Dict[str, List]) -> bool:
    """Return True when the change set contains anything other than 'unchanged' entries."""
    total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged")
    return total_changes > 0


def process_changes_and_upload(self, changes: Dict[str, List]) -> bool:
    """Process pre-computed changes and upload a delta bundle.

    Method of RemoteUploadClient.

    Args:
        changes: Dictionary of file changes keyed by operation type
            ("created"/"updated"/"deleted"/"moved"/"unchanged").

    Returns:
        True if the upload succeeded (or there was nothing to upload),
        False otherwise.
    """
    try:
        logger.info("[remote_upload] Processing pre-computed changes")

        # Validate input.
        if not changes:
            logger.info("[remote_upload] No changes provided")
            return True

        if not self.has_meaningful_changes(changes):
            logger.info("[remote_upload] No meaningful changes detected, skipping upload")
            return True

        # Log change summary.
        total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged")
        # FIX: use .get() so a partially-populated change dict (which the
        # guards above tolerate) cannot raise KeyError in the log line.
        logger.info(f"[remote_upload] Detected {total_changes} meaningful changes: "
                    f"{len(changes.get('created', []))} created, {len(changes.get('updated', []))} updated, "
                    f"{len(changes.get('deleted', []))} deleted, {len(changes.get('moved', []))} moved")

        # Create delta bundle.
        bundle_path = None
        try:
            bundle_path, manifest = self.create_delta_bundle(changes)
            logger.info(f"[remote_upload] Created delta bundle: {manifest['bundle_id']} "
                        f"(size: {manifest['total_size_bytes']} bytes)")
            if not bundle_path or not os.path.exists(bundle_path):
                raise RuntimeError(f"Failed to create bundle at {bundle_path}")
        except Exception as e:
            logger.error(f"[remote_upload] Error creating delta bundle: {e}")
            # Clean up any temporary files on failure.
            self.cleanup()
            return False

        # Upload bundle (upload_bundle implements the retry logic).
        try:
            response = self.upload_bundle(bundle_path, manifest)

            if response.get("success", False):
                processed_ops = response.get('processed_operations', {})
                logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}")
                logger.info(f"[remote_upload] Processed operations: {processed_ops}")

                # Clean up the temporary bundle after a successful upload.
                try:
                    if os.path.exists(bundle_path):
                        os.remove(bundle_path)
                        logger.debug(f"[remote_upload] Cleaned up temporary bundle: {bundle_path}")
                    # Also clean up the entire temp directory.
                    self.cleanup()
                except Exception as cleanup_error:
                    logger.warning(f"[remote_upload] Failed to cleanup bundle {bundle_path}: {cleanup_error}")

                return True

            error_msg = response.get('error', {}).get('message', 'Unknown upload error')
            logger.error(f"[remote_upload] Upload failed: {error_msg}")
            return False

        except Exception as e:
            logger.error(f"[remote_upload] Error uploading bundle: {e}")
            return False

    except Exception as e:
        logger.error(f"[remote_upload] Unexpected error in process_changes_and_upload: {e}")
        return False


def watch_loop(self, interval: int = 5):
    """Poll the workspace every `interval` seconds and upload detected changes.

    Method of RemoteUploadClient.  Runs until KeyboardInterrupt.
    """
    logger.info(f"[watch] Starting file monitoring (interval: {interval}s)")
    logger.info(f"[watch] Monitoring: {self.workspace_path}")
    logger.info("[watch] Press Ctrl+C to stop")

    try:
        while True:
            try:
                # Rescan the workspace and diff against cached state.
                all_files = self.get_all_code_files()
                changes = self.detect_file_changes(all_files)

                # Count only meaningful changes (exclude unchanged).
                meaningful_changes = (len(changes.get("created", [])) + len(changes.get("updated", []))
                                      + len(changes.get("deleted", [])) + len(changes.get("moved", [])))

                if meaningful_changes > 0:
                    logger.info(f"[watch] Detected {meaningful_changes} changes: { {k: len(v) for k, v in changes.items() if k != 'unchanged'} }")
                    success = self.process_changes_and_upload(changes)
                    if success:
                        logger.info("[watch] Successfully uploaded changes")
                    else:
                        logger.error("[watch] Failed to upload changes")
                else:
                    logger.debug("[watch] No changes detected")  # debug level to avoid log spam

                time.sleep(interval)

            except KeyboardInterrupt:
                logger.info("[watch] Received interrupt signal, stopping...")
                break
            except Exception as e:
                logger.error(f"[watch] Error in watch loop: {e}")
                time.sleep(interval)  # keep watching even after errors

    except KeyboardInterrupt:
        logger.info("[watch] File monitoring stopped by user")
def get_all_code_files(self) -> List[Path]:
    """Return all indexable code files under the workspace.

    Method of RemoteUploadClient.

    FIX: hidden-file filtering is applied to path components *relative to the
    workspace root*.  The previous implementation inspected absolute-path
    components, so a dotted directory anywhere above the workspace (e.g.
    /home/user/.local/ws) silently excluded every file.  Results are also
    deduplicated in case extension globs overlap.
    """
    all_files: List[Path] = []
    try:
        workspace_path = Path(self.workspace_path)
        candidates = set()
        for ext in CODE_EXTS:
            candidates.update(workspace_path.rglob(f"*{ext}"))

        for f in sorted(candidates):
            if not f.is_file():
                continue
            rel_parts = f.relative_to(workspace_path).parts
            # Skip hidden files/directories inside the workspace.
            if any(part.startswith('.') for part in rel_parts):
                continue
            # Skip our own metadata directory wherever it appears.
            if '.context-engine' in str(f):
                continue
            all_files.append(f)
    except Exception as e:
        logger.error(f"[watch] Error scanning files: {e}")

    return all_files


def process_and_upload_changes(self, changed_paths: List[Path]) -> bool:
    """Detect changes among `changed_paths` and upload a delta bundle.

    Method of RemoteUploadClient.

    Args:
        changed_paths: List of changed file paths.

    Returns:
        True if the upload succeeded (or there was nothing to upload),
        False otherwise.
    """
    try:
        logger.info(f"[remote_upload] Processing {len(changed_paths)} changed paths")

        if not changed_paths:
            logger.info("[remote_upload] No changed paths provided")
            return True

        # Detect changes.
        try:
            changes = self.detect_file_changes(changed_paths)
        except Exception as e:
            logger.error(f"[remote_upload] Error detecting file changes: {e}")
            return False

        if not self.has_meaningful_changes(changes):
            logger.info("[remote_upload] No meaningful changes detected, skipping upload")
            return True

        # Log change summary.
        total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged")
        # FIX: .get() guards against partially-populated change dicts.
        logger.info(f"[remote_upload] Detected {total_changes} meaningful changes: "
                    f"{len(changes.get('created', []))} created, {len(changes.get('updated', []))} updated, "
                    f"{len(changes.get('deleted', []))} deleted, {len(changes.get('moved', []))} moved")

        # Create delta bundle.
        bundle_path = None
        try:
            bundle_path, manifest = self.create_delta_bundle(changes)
            logger.info(f"[remote_upload] Created delta bundle: {manifest['bundle_id']} "
                        f"(size: {manifest['total_size_bytes']} bytes)")
            if not bundle_path or not os.path.exists(bundle_path):
                raise RuntimeError(f"Failed to create bundle at {bundle_path}")
        except Exception as e:
            logger.error(f"[remote_upload] Error creating delta bundle: {e}")
            self.cleanup()  # drop any temp files from the failed build
            return False

        # Upload bundle (retry logic lives in upload_bundle).
        try:
            response = self.upload_bundle(bundle_path, manifest)

            if response.get("success", False):
                processed_ops = response.get('processed_operations', {})
                logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}")
                logger.info(f"[remote_upload] Processed operations: {processed_ops}")

                # Clean up temporary bundle after successful upload.
                try:
                    if os.path.exists(bundle_path):
                        os.remove(bundle_path)
                        logger.debug(f"[remote_upload] Cleaned up temporary bundle: {bundle_path}")
                    self.cleanup()
                except Exception as cleanup_error:
                    logger.warning(f"[remote_upload] Failed to cleanup bundle {bundle_path}: {cleanup_error}")

                return True

            error = response.get("error", {})
            error_code = error.get("code", "UNKNOWN")
            error_msg = error.get("message", "Unknown error")
            logger.error(f"[remote_upload] Upload failed: {error_msg}")

            # Classify the failure for diagnostics; all paths return False.
            # CLI is stateless - server handles sequence management.
            if error_code in ["BUNDLE_TOO_LARGE", "BUNDLE_NOT_FOUND"]:
                logger.error(f"[remote_upload] Unrecoverable error ({error_code}): {error_msg}")
            elif error_code in ["TIMEOUT_ERROR", "CONNECTION_ERROR", "NETWORK_ERROR"]:
                logger.warning(f"[remote_upload] Network-related error ({error_code}): {error_msg}")
                logger.warning("[remote_upload] Consider falling back to local mode if this persists")
            else:
                logger.error(f"[remote_upload] Upload error ({error_code}): {error_msg}")
            return False

        except Exception as e:
            logger.error(f"[remote_upload] Unexpected error during upload: {e}")
            return False

    except Exception as e:
        logger.error(f"[remote_upload] Critical error in process_and_upload_changes: {e}")
        logger.exception("[remote_upload] Full traceback:")
        return False
mode if this persists") + return False + else: + # Other errors + logger.error(f"[remote_upload] Upload error ({error_code}): {error_msg}") + return False + + except Exception as e: + logger.error(f"[remote_upload] Unexpected error during upload: {e}") + return False + + except Exception as e: + logger.error(f"[remote_upload] Critical error in process_and_upload_changes: {e}") + logger.exception("[remote_upload] Full traceback:") + return False + +def get_remote_config(cli_path: Optional[str] = None) -> Dict[str, str]: + """Get remote upload configuration from environment variables and command-line arguments.""" + # Use command-line path if provided, otherwise fall back to environment variables + if cli_path: + workspace_path = cli_path + else: + workspace_path = os.environ.get("WATCH_ROOT", os.environ.get("WORKSPACE_PATH", "/work")) + + # Use auto-generated collection name based on repo name + repo_name = _extract_repo_name_from_path(workspace_path) + # Fallback to directory name if repo detection fails + if not repo_name: + repo_name = Path(workspace_path).name + collection_name = get_collection_name(repo_name) + + return { + "upload_endpoint": os.environ.get("REMOTE_UPLOAD_ENDPOINT", "http://localhost:8080"), + "workspace_path": workspace_path, + "collection_name": collection_name, + "max_retries": int(os.environ.get("REMOTE_UPLOAD_MAX_RETRIES", "3")), + "timeout": int(os.environ.get("REMOTE_UPLOAD_TIMEOUT", "30")) + } + + +def main(): + """Main entry point for the remote upload client.""" + parser = argparse.ArgumentParser( + description="Remote upload client for delta bundles in Context-Engine", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Upload from current directory or environment variables + python remote_upload_client.py + + # Upload from specific directory + python remote_upload_client.py --path /path/to/repo + + # Upload from specific directory with custom endpoint + python remote_upload_client.py --path 
/path/to/repo --endpoint http://remote-server:8080 + """ + ) + + parser.add_argument( + "--path", + type=str, + help="Path to the directory to upload (overrides WATCH_ROOT/WORKSPACE_PATH environment variables)" + ) + + parser.add_argument( + "--endpoint", + type=str, + help="Remote upload endpoint (overrides REMOTE_UPLOAD_ENDPOINT environment variable)" + ) + + parser.add_argument( + "--max-retries", + type=int, + help="Maximum number of upload retries (overrides REMOTE_UPLOAD_MAX_RETRIES environment variable)" + ) + + parser.add_argument( + "--timeout", + type=int, + help="Request timeout in seconds (overrides REMOTE_UPLOAD_TIMEOUT environment variable)" + ) + + parser.add_argument( + "--force", + action="store_true", + help="Force upload of all files (ignore cached state and treat all files as new)" + ) + + parser.add_argument( + "--show-mapping", + action="store_true", + help="Print collection↔workspace mapping information and exit" + ) + + parser.add_argument( + "--watch", "-w", + action="store_true", + help="Watch for file changes and upload automatically (continuous mode)" + ) + + parser.add_argument( + "--interval", "-i", + type=int, + default=5, + help="Watch interval in seconds (default: 5)" + ) + + args = parser.parse_args() + + # Validate path if provided + if args.path: + if not os.path.exists(args.path): + logger.error(f"Path does not exist: {args.path}") + return 1 + + if not os.path.isdir(args.path): + logger.error(f"Path is not a directory: {args.path}") + return 1 + + args.path = os.path.abspath(args.path) + logger.info(f"Using specified path: {args.path}") + + # Get configuration + config = get_remote_config(args.path) + + # Override config with command-line arguments if provided + if args.endpoint: + config["upload_endpoint"] = args.endpoint + if args.max_retries is not None: + config["max_retries"] = args.max_retries + if args.timeout is not None: + config["timeout"] = args.timeout + + logger.info(f"Workspace path: {config['workspace_path']}") + 
logger.info(f"Collection name: {config['collection_name']}") + logger.info(f"Upload endpoint: {config['upload_endpoint']}") + + if args.show_mapping: + with RemoteUploadClient( + upload_endpoint=config["upload_endpoint"], + workspace_path=config["workspace_path"], + collection_name=config["collection_name"], + max_retries=config["max_retries"], + timeout=config["timeout"], + ) as client: + client.log_mapping_summary() + return 0 + + # Handle watch mode + if args.watch: + logger.info("Starting watch mode for continuous file monitoring") + try: + with RemoteUploadClient( + upload_endpoint=config["upload_endpoint"], + workspace_path=config["workspace_path"], + collection_name=config["collection_name"], + max_retries=config["max_retries"], + timeout=config["timeout"] + ) as client: + + logger.info("Remote upload client initialized successfully") + client.log_mapping_summary() + + # Test server connection first + logger.info("Checking server status...") + status = client.get_server_status() + is_success = ( + isinstance(status, dict) and + 'workspace_path' in status and + 'collection_name' in status and + status.get('status') == 'ready' + ) + if not is_success: + error = status.get("error", {}) + logger.error(f"Cannot connect to server: {error.get('message', 'Unknown error')}") + return 1 + + logger.info("Server connection successful") + logger.info(f"Starting file monitoring with {args.interval}s interval") + + # Start the watch loop + client.watch_loop(interval=args.interval) + + return 0 + + except KeyboardInterrupt: + logger.info("Watch mode stopped by user") + return 0 + except Exception as e: + logger.error(f"Watch mode failed: {e}") + return 1 + + # Single upload mode (original logic) + # Initialize client with context manager for cleanup + try: + with RemoteUploadClient( + upload_endpoint=config["upload_endpoint"], + workspace_path=config["workspace_path"], + collection_name=config["collection_name"], + max_retries=config["max_retries"], + 
timeout=config["timeout"] + ) as client: + + logger.info("Remote upload client initialized successfully") + + client.log_mapping_summary() + + # Test server connection + logger.info("Checking server status...") + status = client.get_server_status() + # For delta endpoint, success is indicated by having expected fields (not a "success" boolean) + is_success = ( + isinstance(status, dict) and + 'workspace_path' in status and + 'collection_name' in status and + status.get('status') == 'ready' + ) + if not is_success: + error = status.get("error", {}) + logger.error(f"Cannot connect to server: {error.get('message', 'Unknown error')}") + return 1 + + logger.info("Server connection successful") + + # Scan repository and upload files + logger.info("Scanning repository for files...") + workspace_path = Path(config['workspace_path']) + + # Find all files in the repository + all_files = [] + for file_path in workspace_path.rglob('*'): + if file_path.is_file() and not file_path.name.startswith('.'): + rel_path = file_path.relative_to(workspace_path) + # Skip .codebase directory and other metadata + if not str(rel_path).startswith('.codebase'): + all_files.append(file_path) + + logger.info(f"Found {len(all_files)} files to upload") + + if not all_files: + logger.warning("No files found to upload") + return 0 + + # Detect changes (treat all files as changes for initial upload) + if args.force: + # Force mode: treat all files as created + changes = {"created": all_files, "updated": [], "deleted": [], "moved": [], "unchanged": []} + else: + changes = client.detect_file_changes(all_files) + + if not client.has_meaningful_changes(changes): + logger.info("No meaningful changes to upload") + return 0 + + logger.info(f"Changes detected: {len(changes.get('created', []))} created, {len(changes.get('updated', []))} updated, {len(changes.get('deleted', []))} deleted") + + # Process and upload changes + logger.info("Uploading files to remote server...") + success = 
client.process_changes_and_upload(changes) + + if success: + logger.info("Repository upload completed successfully!") + logger.info(f"Collection name: {config['collection_name']}") + logger.info(f"Files uploaded: {len(all_files)}") + else: + logger.error("Repository upload failed!") + return 1 + + return 0 + + except Exception as e: + logger.error(f"Failed to initialize remote upload client: {e}") + return 1 + + +if __name__ == "__main__": + import sys + sys.exit(main()) diff --git a/scripts/upload_service.py b/scripts/upload_service.py new file mode 100644 index 00000000..0b5c1589 --- /dev/null +++ b/scripts/upload_service.py @@ -0,0 +1,526 @@ +#!/usr/bin/env python3 +""" +HTTP Upload Service for Delta Bundles in Context-Engine. + +This FastAPI service receives delta bundles from remote upload clients, +processes them, and integrates with the existing indexing pipeline. +""" + +import os +import json +import tarfile +import tempfile +import hashlib +import asyncio +import logging +from pathlib import Path +from typing import Dict, Any, Optional, List +from datetime import datetime + +import uvicorn +from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Request, status +from fastapi.responses import JSONResponse +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel, Field + +# Import existing workspace state and indexing functions +try: + from scripts.workspace_state import ( + log_activity, + get_collection_name, + get_cached_file_hash, + set_cached_file_hash, + _extract_repo_name_from_path, + update_repo_origin, + get_collection_mappings, + ) +except ImportError: + # Fallback for testing without full environment + log_activity = None + get_collection_name = None + get_cached_file_hash = None + set_cached_file_hash = None + _extract_repo_name_from_path = None + update_repo_origin = None + get_collection_mappings = None + + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - 
%(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +# Configuration from environment +QDRANT_URL = os.environ.get("QDRANT_URL", "http://qdrant:6333") +DEFAULT_COLLECTION = os.environ.get("COLLECTION_NAME", "my-collection") +WORK_DIR = os.environ.get("WORK_DIR", "/work") +MAX_BUNDLE_SIZE_MB = int(os.environ.get("MAX_BUNDLE_SIZE_MB", "100")) +UPLOAD_TIMEOUT_SECS = int(os.environ.get("UPLOAD_TIMEOUT_SECS", "300")) + +# FastAPI app +app = FastAPI( + title="Context-Engine Delta Upload Service", + description="HTTP service for receiving and processing delta bundles", + version="1.0.0" +) + +# Add CORS middleware +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# In-memory sequence tracking (in production, use persistent storage) +_sequence_tracker: Dict[str, int] = {} + +class UploadResponse(BaseModel): + success: bool + bundle_id: Optional[str] = None + sequence_number: Optional[int] = None + processed_operations: Optional[Dict[str, int]] = None + processing_time_ms: Optional[int] = None + next_sequence: Optional[int] = None + error: Optional[Dict[str, Any]] = None + +class StatusResponse(BaseModel): + workspace_path: str + collection_name: str + last_sequence: int + last_upload: Optional[str] = None + pending_operations: int + status: str + server_info: Dict[str, Any] + +class HealthResponse(BaseModel): + status: str + timestamp: str + version: str + qdrant_url: str + work_dir: str + +def get_workspace_key(workspace_path: str) -> str: + """Generate 16-char hash for collision avoidance in remote uploads. + + Remote uploads may have identical folder names from different users, + so uses longer hash than local indexing (8-chars) to ensure uniqueness. + + Both host paths (/home/user/project/repo) and container paths (/work/repo) + should generate the same key for the same repository. 
+ """ + repo_name = Path(workspace_path).name + return hashlib.sha256(repo_name.encode('utf-8')).hexdigest()[:16] + +def get_next_sequence(workspace_path: str) -> int: + """Get next sequence number for workspace.""" + key = get_workspace_key(workspace_path) + current = _sequence_tracker.get(key, 0) + next_seq = current + 1 + _sequence_tracker[key] = next_seq + return next_seq + +def get_last_sequence(workspace_path: str) -> int: + """Get last sequence number for workspace.""" + key = get_workspace_key(workspace_path) + return _sequence_tracker.get(key, 0) + +def validate_bundle_format(bundle_path: Path) -> Dict[str, Any]: + """Validate delta bundle format and return manifest.""" + try: + with tarfile.open(bundle_path, "r:gz") as tar: + # Check for required files + required_files = ["manifest.json", "metadata/operations.json", "metadata/hashes.json"] + members = tar.getnames() + + for req_file in required_files: + if not any(req_file in member for member in members): + raise ValueError(f"Missing required file: {req_file}") + + # Extract and validate manifest + manifest_member = None + for member in members: + if member.endswith("manifest.json"): + manifest_member = member + break + + if not manifest_member: + raise ValueError("manifest.json not found in bundle") + + manifest_file = tar.extractfile(manifest_member) + if not manifest_file: + raise ValueError("Cannot extract manifest.json") + + manifest = json.loads(manifest_file.read().decode('utf-8')) + + # Validate manifest structure + required_fields = ["version", "bundle_id", "workspace_path", "created_at", "sequence_number"] + for field in required_fields: + if field not in manifest: + raise ValueError(f"Missing required field in manifest: {field}") + + return manifest + + except Exception as e: + raise ValueError(f"Invalid bundle format: {str(e)}") + +async def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: Dict[str, Any]) -> Dict[str, int]: + """Process delta bundle and return operation 
async def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: Dict[str, Any]) -> Dict[str, int]:
    """Apply a delta bundle to the server-side workspace copy.

    Args:
        workspace_path: Source workspace path reported by the client.
        bundle_path: Path to the uploaded .tar.gz bundle.
        manifest: Parsed bundle manifest (currently unused here; validated upstream).

    Returns:
        Counts per operation outcome: created/updated/deleted/moved/skipped/failed.
    """
    operations_count = {
        "created": 0,
        "updated": 0,
        "deleted": 0,
        "moved": 0,
        "skipped": 0,
        "failed": 0,
    }

    try:
        # Extract repo name and create the workspace under WORK_DIR.
        # (Previously the source workspace_path was used directly, extracting
        # files outside /work so the watcher service never saw uploads.)
        if _extract_repo_name_from_path:
            repo_name = _extract_repo_name_from_path(workspace_path)
            if not repo_name:
                repo_name = Path(workspace_path).name
        else:
            repo_name = Path(workspace_path).name

        workspace_key = get_workspace_key(workspace_path)
        workspace = Path(WORK_DIR) / f"{repo_name}-{workspace_key}"
        workspace.mkdir(parents=True, exist_ok=True)
        workspace_root = workspace.resolve()

        with tarfile.open(bundle_path, "r:gz") as tar:
            # PERF FIX: tar.getnames() was called inside the per-operation loop,
            # making processing O(operations * members); scan the archive once.
            members = tar.getnames()

            ops_member = next((m for m in members if m.endswith("metadata/operations.json")), None)
            if not ops_member:
                raise ValueError("operations.json not found in bundle")

            ops_file = tar.extractfile(ops_member)
            if not ops_file:
                raise ValueError("Cannot extract operations.json")

            operations = json.loads(ops_file.read().decode('utf-8')).get("operations", [])

            def _find_member(subdir: str, rel: str) -> Optional[str]:
                """Locate the bundle member for files/<subdir>/<rel> by suffix."""
                suffix = f"files/{subdir}/{rel}"
                return next((m for m in members if m.endswith(suffix)), None)

            def _extract_to(member_name: str, target: Path) -> bool:
                """Write one bundle member to `target`; True on success."""
                fobj = tar.extractfile(member_name)
                if not fobj:
                    return False
                target.parent.mkdir(parents=True, exist_ok=True)
                target.write_bytes(fobj.read())
                return True

            for operation in operations:
                op_type = operation.get("operation")
                rel_path = operation.get("path")

                if not rel_path:
                    operations_count["skipped"] += 1
                    continue

                target_path = workspace / rel_path

                # SECURITY FIX: reject relative paths that escape the workspace
                # (e.g. "../../etc/passwd" in an untrusted uploaded bundle).
                try:
                    target_resolved = target_path.resolve()
                    if workspace_root not in target_resolved.parents and target_resolved != workspace_root:
                        logger.error(f"Rejecting path escaping workspace: {rel_path}")
                        operations_count["failed"] += 1
                        continue
                except Exception:
                    operations_count["failed"] += 1
                    continue

                try:
                    if op_type in ("created", "updated", "moved"):
                        # The three write operations only differ in which
                        # files/<op>/ subtree holds the payload.
                        member_name = _find_member(op_type, rel_path)
                        if member_name and _extract_to(member_name, target_path):
                            operations_count[op_type] += 1
                        else:
                            operations_count["failed"] += 1

                    elif op_type == "deleted":
                        if target_path.exists():
                            target_path.unlink()
                            operations_count["deleted"] += 1
                        else:
                            operations_count["skipped"] += 1

                    else:
                        operations_count["skipped"] += 1

                except Exception as e:
                    logger.error(f"Error processing operation {op_type} for {rel_path}: {e}")
                    operations_count["failed"] += 1

        return operations_count

    except Exception as e:
        logger.error(f"Error processing delta bundle: {e}")
        raise


@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Liveness endpoint reporting service configuration."""
    return HealthResponse(
        status="healthy",
        timestamp=datetime.now().isoformat(),
        version="1.0.0",
        qdrant_url=QDRANT_URL,
        work_dir=WORK_DIR,
    )
get_status(workspace_path: str): + """Get upload status for workspace.""" + try: + # Get collection name + if get_collection_name: + repo_name = _extract_repo_name_from_path(workspace_path) if _extract_repo_name_from_path else None + collection_name = get_collection_name(repo_name) + else: + collection_name = DEFAULT_COLLECTION + + # Get last sequence + last_sequence = get_last_sequence(workspace_path) + + last_upload = None + + return StatusResponse( + workspace_path=workspace_path, + collection_name=collection_name, + last_sequence=last_sequence, + last_upload=last_upload, + pending_operations=0, + status="ready", + server_info={ + "version": "1.0.0", + "max_bundle_size_mb": MAX_BUNDLE_SIZE_MB, + "supported_formats": ["tar.gz"] + } + ) + + except Exception as e: + logger.error(f"Error getting status: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.post("/api/v1/delta/upload", response_model=UploadResponse) +async def upload_delta_bundle( + request: Request, + bundle: UploadFile = File(...), + workspace_path: str = Form(...), + collection_name: Optional[str] = Form(None), + sequence_number: Optional[int] = Form(None), + force: Optional[bool] = Form(False), + source_path: Optional[str] = Form(None), +): + """Upload and process delta bundle.""" + start_time = datetime.now() + + try: + # Validate workspace path + workspace = Path(workspace_path) + if not workspace.is_absolute(): + workspace = Path(WORK_DIR) / workspace + + workspace_path = str(workspace.resolve()) + + # Get collection name + if not collection_name: + if get_collection_name: + repo_name = _extract_repo_name_from_path(workspace_path) if _extract_repo_name_from_path else None + # Fallback to directory name if repo detection fails + if not repo_name: + repo_name = Path(workspace_path).name + collection_name = get_collection_name(repo_name) + else: + collection_name = DEFAULT_COLLECTION + + # Persist origin metadata for remote lookups + try: + if update_repo_origin and repo_name: + 
workspace_key = get_workspace_key(workspace_path) + container_workspace = str(Path(WORK_DIR) / f"{repo_name}-{workspace_key}") + update_repo_origin( + workspace_path=container_workspace, + repo_name=repo_name, + container_path=container_workspace, + source_path=source_path or workspace_path, + collection_name=collection_name, + ) + except Exception as origin_err: + logger.debug(f"[upload_service] Failed to persist origin info: {origin_err}") + + # Validate bundle size + if bundle.size and bundle.size > MAX_BUNDLE_SIZE_MB * 1024 * 1024: + raise HTTPException( + status_code=413, + detail=f"Bundle too large. Max size: {MAX_BUNDLE_SIZE_MB}MB" + ) + + # Save bundle to temporary file + with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as temp_file: + bundle_path = Path(temp_file.name) + + # Stream upload to file + content = await bundle.read() + bundle_path.write_bytes(content) + + try: + # Validate bundle format + manifest = validate_bundle_format(bundle_path) + bundle_id = manifest.get("bundle_id") + manifest_sequence = manifest.get("sequence_number") + + # Check sequence number + if sequence_number is None: + sequence_number = manifest_sequence + + if not force and sequence_number is not None: + last_sequence = get_last_sequence(workspace_path) + if sequence_number != last_sequence + 1: + return UploadResponse( + success=False, + error={ + "code": "SEQUENCE_MISMATCH", + "message": f"Expected sequence {last_sequence + 1}, got {sequence_number}", + "expected_sequence": last_sequence + 1, + "received_sequence": sequence_number, + "retry_after": 5000 + } + ) + + # Process delta bundle + operations_count = await process_delta_bundle(workspace_path, bundle_path, manifest) + + + # Update sequence tracking + if sequence_number is not None: + key = get_workspace_key(workspace_path) + _sequence_tracker[key] = sequence_number + + # Log activity using cleaned workspace_state function + if log_activity: + log_activity( + 
repo_name=_extract_repo_name_from_path(workspace_path) if _extract_repo_name_from_path else None, + action="uploaded", + file_path=bundle_id, + details={ + "bundle_id": bundle_id, + "operations": operations_count, + "source": "delta_upload" + } + ) + + # Calculate processing time + processing_time = (datetime.now() - start_time).total_seconds() * 1000 + + return UploadResponse( + success=True, + bundle_id=bundle_id, + sequence_number=sequence_number, + processed_operations=operations_count, + processing_time_ms=int(processing_time), + next_sequence=sequence_number + 1 if sequence_number else None + ) + + finally: + # Clean up temporary file + try: + bundle_path.unlink() + except Exception: + pass + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error processing upload: {e}") + return UploadResponse( + success=False, + error={ + "code": "PROCESSING_ERROR", + "message": f"Error processing bundle: {str(e)}" + } + ) + +@app.exception_handler(Exception) +async def global_exception_handler(request: Request, exc: Exception): + """Global exception handler.""" + logger.error(f"Unhandled exception: {exc}") + return JSONResponse( + status_code=500, + content={ + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": "Internal server error" + } + } + ) + +def main(): + """Main entry point for the upload service.""" + host = os.environ.get("UPLOAD_SERVICE_HOST", "0.0.0.0") + port = int(os.environ.get("UPLOAD_SERVICE_PORT", "8002")) + + logger.info(f"Starting upload service on {host}:{port}") + logger.info(f"Qdrant URL: {QDRANT_URL}") + logger.info(f"Work directory: {WORK_DIR}") + logger.info(f"Max bundle size: {MAX_BUNDLE_SIZE_MB}MB") + + uvicorn.run( + app, + host=host, + port=port, + log_level="info", + access_log=True + ) + +if __name__ == "__main__": + main() diff --git a/scripts/wait-for-qdrant.sh b/scripts/wait-for-qdrant.sh index 98f9e859..e26c73cf 100755 --- a/scripts/wait-for-qdrant.sh +++ b/scripts/wait-for-qdrant.sh @@ -1,6 
+1,18 @@ #!/usr/bin/env bash set -euo pipefail -until curl -fsS "${QDRANT_URL:-http://localhost:6333}/" >/dev/null; do +# Use Python stdlib to avoid curl dependency in the container +until python - <<'PY' +import os, sys, urllib.request +url = os.environ.get("QDRANT_URL", "http://localhost:6333") +if not url.endswith("/"): + url += "/" +try: + with urllib.request.urlopen(url, timeout=2) as r: + sys.exit(0 if getattr(r, "status", 200) < 500 else 1) +except Exception: + sys.exit(1) +PY +do echo "Waiting for Qdrant at ${QDRANT_URL:-http://localhost:6333} ..." sleep 1 done diff --git a/scripts/warm_all_collections.py b/scripts/warm_all_collections.py new file mode 100644 index 00000000..0344da82 --- /dev/null +++ b/scripts/warm_all_collections.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +""" +Script to warm all collections in Qdrant +""" +import os +import sys +import subprocess +from qdrant_client import QdrantClient + +def main(): + # Get configuration from environment + qdrant_url = os.environ.get("QDRANT_URL", "http://qdrant:6333") + ef = os.environ.get("EF", "256") + limit = os.environ.get("LIMIT", "3") + + print(f"Connecting to Qdrant at {qdrant_url}") + + # Connect to Qdrant + client = QdrantClient(url=qdrant_url) + + # Get all collections + try: + collections_response = client.get_collections() + collections = [c.name for c in collections_response.collections] + print(f"Found collections: {collections}") + except Exception as e: + print(f"Error getting collections: {e}") + sys.exit(1) + + # Warm each collection + for collection_name in collections: + print(f"Warming collection: {collection_name}") + try: + # Set environment variable for the collection name + env = os.environ.copy() + env["COLLECTION_NAME"] = collection_name + + result = subprocess.run( + [ + "python", + "/app/scripts/warm_start.py", + "--ef", ef, + "--limit", limit + ], + capture_output=True, + text=True, + check=True, + env=env + ) + print(f"Successfully warmed {collection_name}") + except 
subprocess.CalledProcessError as e: + print(f"Error warming {collection_name}: {e}") + print(f"stdout: {e.stdout}") + print(f"stderr: {e.stderr}") + sys.exit(1) + + print("All collections warmed successfully") + +if __name__ == "__main__": + main() diff --git a/scripts/watch_index.py b/scripts/watch_index.py index ab503f61..c9e94c57 100644 --- a/scripts/watch_index.py +++ b/scripts/watch_index.py @@ -3,7 +3,7 @@ import time import threading from pathlib import Path -from typing import Set +from typing import Optional, Set from qdrant_client import QdrantClient, models from fastembed import TextEmbedding @@ -20,28 +20,72 @@ sys.path.insert(0, str(ROOT_DIR)) from scripts.workspace_state import ( - get_workspace_state, - update_indexing_status, - update_last_activity, - update_workspace_state, + _extract_repo_name_from_path, + get_collection_name, + _get_global_state_dir, + is_multi_repo_mode, get_cached_file_hash, set_cached_file_hash, remove_cached_file, + update_indexing_status, + update_workspace_state, ) import hashlib from datetime import datetime import scripts.ingest_code as idx +from scripts.logger import get_logger + + +try: + logger = get_logger(__name__) +except Exception: # pragma: no cover - fallback for logger import issues + import logging + + logger = logging.getLogger(__name__) QDRANT_URL = os.environ.get("QDRANT_URL", "http://qdrant:6333") -COLLECTION = os.environ.get("COLLECTION_NAME", "codebase") MODEL = os.environ.get("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5") ROOT = Path(os.environ.get("WATCH_ROOT", "/work")).resolve() +# Back-compat: legacy modules/tests expect a module-level COLLECTION constant. +# It will be updated in main() once the resolved collection is known. 
+COLLECTION = os.environ.get("COLLECTION_NAME", "my-collection") + # Debounce interval DELAY_SECS = float(os.environ.get("WATCH_DEBOUNCE_SECS", "1.0")) +def _detect_repo_for_file(file_path: Path) -> Optional[Path]: + """Detect repository root for a file under WATCH root.""" + try: + rel_path = file_path.resolve().relative_to(ROOT.resolve()) + except Exception: + return None + if not rel_path.parts: + return ROOT + return ROOT / rel_path.parts[0] + + +def _get_collection_for_repo(repo_path: Path) -> str: + try: + repo_name = _extract_repo_name_from_path(str(repo_path)) + if repo_name: + return get_collection_name(repo_name) + except Exception: + pass + return os.environ.get("COLLECTION_NAME", "my-collection") + + +def _get_collection_for_file(file_path: Path) -> str: + if not is_multi_repo_mode(): + return os.environ.get("COLLECTION_NAME", "my-collection") + repo_path = _detect_repo_for_file(file_path) + if repo_path is not None: + return _get_collection_for_repo(repo_path) + return os.environ.get("COLLECTION_NAME", "my-collection") + + class ChangeQueue: def __init__(self, process_cb): self._lock = threading.Lock() @@ -59,7 +103,10 @@ def add(self, p: Path): try: self._timer.cancel() except Exception as e: - logger.error(f"Failed to cancel timer in ChangeQueue.add: {e}") + logger.error( + "Failed to cancel timer in ChangeQueue.add", + extra={"error": str(e)}, + ) self._timer = threading.Timer(DELAY_SECS, self._flush) self._timer.daemon = True self._timer.start() @@ -88,9 +135,10 @@ def _flush(self): except Exception as e: try: print(f"[watcher_error] processing batch failed: {e}") - except Exception as inner_e: + except Exception as inner_e: # pragma: no cover - logging fallback logger.error( - f"Exception in ChangeQueue._flush during batch processing: {inner_e}" + "Exception in ChangeQueue._flush during batch processing", + extra={"error": str(inner_e)}, ) # drain any pending accumulated during processing with self._lock: @@ -104,25 +152,40 @@ def _flush(self): 
class IndexHandler(FileSystemEventHandler): def __init__( - self, root: Path, queue: ChangeQueue, client: QdrantClient, collection: str + self, + root: Path, + queue: ChangeQueue, + client: Optional[QdrantClient], + default_collection: Optional[str] = None, + *, + collection: Optional[str] = None, ): super().__init__() self.root = root self.queue = queue self.client = client - self.collection = collection + resolved_collection = collection if collection is not None else default_collection + self.default_collection = resolved_collection + self.collection = resolved_collection self.excl = idx._Excluder(root) # Track ignore file for live reloads try: ig_name = os.environ.get("QDRANT_IGNORE_FILE", ".qdrantignore") self._ignore_path = (self.root / ig_name).resolve() - except Exception: + except (OSError, ValueError) as e: + try: + print(f"[ignore_file] Could not resolve ignore file path: {e}") + except Exception: + pass self._ignore_path = None - self._ignore_mtime = ( - self._ignore_path.stat().st_mtime - if self._ignore_path and self._ignore_path.exists() - else 0.0 - ) + try: + self._ignore_mtime = ( + self._ignore_path.stat().st_mtime + if self._ignore_path and self._ignore_path.exists() + else 0.0 + ) + except Exception: + self._ignore_mtime = 0.0 def _maybe_reload_excluder(self): try: @@ -146,7 +209,6 @@ def _maybe_enqueue(self, src_path: str): self._maybe_reload_excluder() p = Path(src_path) try: - # normalize to absolute within root p = p.resolve() except Exception: return @@ -158,6 +220,17 @@ def _maybe_enqueue(self, src_path: str): rel = p.resolve().relative_to(self.root.resolve()) except ValueError: return + + try: + if _get_global_state_dir is not None: + global_state_dir = _get_global_state_dir() + if p.is_relative_to(global_state_dir): + return + except (OSError, ValueError): + pass + + if any(part == ".codebase" for part in p.parts): + return # directory-level excludes (parent dir) rel_dir = "/" + str(rel.parent).replace(os.sep, "/") if rel_dir == "/.": 
@@ -191,19 +264,30 @@ def on_deleted(self, event): # Only attempt deletion for code files we would have indexed if p.suffix.lower() not in idx.CODE_EXTS: return - try: - idx.delete_points_by_path(self.client, self.collection, str(p)) - print(f"[deleted] {p}") - # Drop local cache entry + if self.client is not None: try: - remove_cached_file(str(self.root), str(p)) + collection = self.collection or _get_collection_for_file(p) + idx.delete_points_by_path(self.client, collection, str(p)) + print(f"[deleted] {p} -> {collection}") except Exception: pass + else: + print(f"File deletion detected: {p}") - try: - _log_activity(str(self.root), "deleted", p) - except Exception: - pass + try: + repo_path = _detect_repo_for_file(p) + if repo_path: + repo_name = _extract_repo_name_from_path(str(repo_path)) + remove_cached_file(str(p), repo_name) + else: + root_repo_name = _extract_repo_name_from_path(str(self.root)) + remove_cached_file(str(p), root_repo_name) + except Exception: + pass + + try: + repo_path = _detect_repo_for_file(p) or self.root + _log_activity(str(repo_path), "deleted", p) except Exception as e: try: print(f"[delete_error] {p}: {e}") @@ -240,7 +324,13 @@ def on_moved(self, event): ) print(f"[moved:ignored_dest_deleted_src] {src} -> {dest}") try: - remove_cached_file(str(self.root), str(src)) + src_repo_path = _detect_repo_for_file(src) + src_repo_name = ( + _extract_repo_name_from_path(str(src_repo_path)) + if src_repo_path is not None + else None + ) + remove_cached_file(str(src), src_repo_name) except Exception: pass @@ -249,35 +339,53 @@ def on_moved(self, event): return except Exception: pass - # Try in-place rename (preserve vectors) + src_collection = _get_collection_for_file(src) + dest_collection = _get_collection_for_file(dest) + is_cross_collection = src_collection != dest_collection + if is_cross_collection: + print(f"[cross_collection_move] {src} -> {dest}") + moved_count = -1 - try: - moved_count = _rename_in_store(self.client, self.collection, 
src, dest) - except Exception: - moved_count = -1 + renamed_hash: str | None = None + if self.client is not None: + try: + moved_count, renamed_hash = _rename_in_store( + self.client, src_collection, src, dest, dest_collection + ) + except Exception: + moved_count, renamed_hash = -1, None if moved_count and moved_count > 0: try: - print(f"[moved] {src} -> {dest} ({moved_count} chunk(s) relinked)") - # Update local cache: carry hash from src to dest if present - prev_hash = None - try: - prev_hash = get_cached_file_hash(str(self.root), str(src)) - except Exception: - prev_hash = None - if prev_hash: - try: - set_cached_file_hash(str(self.root), str(dest), prev_hash) - except Exception: - pass - try: - remove_cached_file(str(self.root), str(src)) - except Exception: - pass + print( + f"[moved] {src} -> {dest} ({moved_count} chunk(s) relinked)" + ) + src_repo_path = _detect_repo_for_file(src) + dest_repo_path = _detect_repo_for_file(dest) + src_repo_name = ( + _extract_repo_name_from_path(str(src_repo_path)) + if src_repo_path is not None + else None + ) + dest_repo_name = ( + _extract_repo_name_from_path(str(dest_repo_path)) + if dest_repo_path is not None + else None + ) + src_hash = "" + if src_repo_name: + src_hash = get_cached_file_hash(str(src), src_repo_name) + remove_cached_file(str(src), src_repo_name) + if not src_hash and renamed_hash: + src_hash = renamed_hash + if dest_repo_name and src_hash: + set_cached_file_hash( + str(dest), src_hash, dest_repo_name + ) except Exception: pass try: _log_activity( - str(self.root), + str(dest_repo_path or self.root), "moved", dest, {"from": str(src), "chunks": int(moved_count)}, @@ -285,13 +393,22 @@ def on_moved(self, event): except Exception: pass return - # Fallback: delete old then index new destination - try: - if src.suffix.lower() in idx.CODE_EXTS: - idx.delete_points_by_path(self.client, self.collection, str(src)) - print(f"[moved:deleted_src] {src}") - except Exception: - pass + if self.client is not None: + 
try: + if src.suffix.lower() in idx.CODE_EXTS: + try: + idx.delete_points_by_path(self.client, src_collection, str(src)) + except Exception: + idx.delete_points_by_path( + self.client, + self.collection or src_collection, + str(src), + ) + print(f"[moved:deleted_src] {src}") + except Exception: + pass + else: + print(f"[remote_mode] Move detected: {src} -> {dest}") try: self._maybe_enqueue(str(dest)) except Exception: @@ -301,9 +418,10 @@ def on_moved(self, event): # --- Workspace state helpers --- def _set_status_indexing(workspace_path: str, total_files: int) -> None: try: + repo_name = _extract_repo_name_from_path(workspace_path) update_indexing_status( - workspace_path, - { + repo_name=repo_name, + status={ "state": "indexing", "started_at": datetime.now().isoformat(), "progress": {"files_processed": 0, "total_files": int(total_files)}, @@ -321,9 +439,10 @@ def _update_progress( current_file: Path | None, ) -> None: try: + repo_name = _extract_repo_name_from_path(workspace_path) update_indexing_status( - workspace_path, - { + repo_name=repo_name, + status={ "state": "indexing", "started_at": started_at, "progress": { @@ -341,14 +460,18 @@ def _log_activity( workspace_path: str, action: str, file_path: Path, details: dict | None = None ) -> None: try: - update_last_activity( - workspace_path, - { - "timestamp": datetime.now().isoformat(), - "action": action, - "file_path": str(file_path), - "details": details or {}, - }, + repo_name = _extract_repo_name_from_path(workspace_path) + from scripts.workspace_state import log_activity + + valid_actions = {"indexed", "deleted", "skipped", "scan-completed", "initialized", "moved"} + if action not in valid_actions: + action = "indexed" + + log_activity( + repo_name=repo_name, + action=action, # type: ignore[arg-type] + file_path=str(file_path), + details=details, ) except Exception: pass @@ -356,13 +479,19 @@ def _log_activity( # --- Move/Rename optimization: reuse vectors when file content unchanged --- def 
_rename_in_store( - client: QdrantClient, collection: str, src: Path, dest: Path -) -> int: + client: QdrantClient, + src_collection: str, + src: Path, + dest: Path, + dest_collection: Optional[str] = None, +) -> tuple[int, str | None]: """Best-effort: if dest content hash matches previously indexed src hash, update points in-place to the new path without re-embedding. Returns number of points moved, or -1 if not applicable/failure. """ + if dest_collection is None: + dest_collection = src_collection try: if not dest.exists() or dest.is_dir(): return -1 @@ -371,9 +500,16 @@ def _rename_in_store( except Exception: return -1 dest_hash = hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest() - prev = idx.get_indexed_file_hash(client, collection, str(src)) + prev = idx.get_indexed_file_hash(client, src_collection, str(src)) + logger.debug( + "rename fast-path candidate src=%s dest=%s prev_hash=%s dest_hash=%s", + str(src), + str(dest), + prev, + dest_hash, + ) if not prev or prev != dest_hash: - return -1 + return -1, prev if prev else None moved = 0 next_offset = None @@ -386,7 +522,7 @@ def _rename_in_store( ] ) points, next_offset = client.scroll( - collection_name=collection, + collection_name=src_collection, scroll_filter=filt, with_payload=True, with_vectors=True, @@ -445,16 +581,34 @@ def _rename_in_store( except Exception: continue if new_points: - idx.upsert_points(client, collection, new_points) + logger.debug( + "rename fast-path upserting %d chunk(s) %s -> %s into %s", + len(new_points), + str(src), + str(dest), + dest_collection, + ) + idx.upsert_points(client, dest_collection, new_points) moved += len(new_points) + if next_offset is None: + break try: - idx.delete_points_by_path(client, collection, str(src)) + idx.delete_points_by_path(client, src_collection, str(src)) except Exception: pass - return moved - except Exception: - return -1 + return moved, dest_hash + except Exception as exc: + try: + logger.warning( + "[rename_debug] rename failed 
for %s -> %s: %s", + str(src), + str(dest), + exc, + ) + except Exception: + pass + return -1, None def main(): @@ -463,26 +617,47 @@ def main(): from scripts.workspace_state import get_collection_name as _get_coll except Exception: _get_coll = None - global COLLECTION + + multi_repo_enabled = False try: - if _get_coll: - COLLECTION = _get_coll(str(ROOT)) + multi_repo_enabled = bool(is_multi_repo_mode()) except Exception: - pass + multi_repo_enabled = False + + default_collection = os.environ.get("COLLECTION_NAME", "my-collection") + if _get_coll: + try: + resolved = _get_coll(str(ROOT)) + if resolved: + default_collection = resolved + except Exception: + pass + if multi_repo_enabled: + print("[multi_repo] Multi-repo mode enabled - per-repo collections in use") + else: + print("[single_repo] Single-repo mode enabled - using single collection") + + global COLLECTION + COLLECTION = default_collection print( - f"Watch mode: root={ROOT} qdrant={QDRANT_URL} collection={COLLECTION} model={MODEL}" + f"Watch mode: root={ROOT} qdrant={QDRANT_URL} collection={default_collection} model={MODEL}" ) # Health check: detect and auto-heal cache/collection sync issues try: from scripts.collection_health import auto_heal_if_needed + print("[health_check] Checking collection health...") - heal_result = auto_heal_if_needed(str(ROOT), COLLECTION, QDRANT_URL, dry_run=False) - if heal_result["action_taken"] == "cleared_cache": + heal_result = auto_heal_if_needed( + str(ROOT), default_collection, QDRANT_URL, dry_run=False + ) + if heal_result.get("action_taken") == "cleared_cache": print("[health_check] Cache cleared due to sync issue - files will be reindexed") - elif not heal_result["health_check"]["healthy"]: - print(f"[health_check] Issue detected: {heal_result['health_check']['issue']}") + elif not heal_result.get("health_check", {}).get("healthy", True): + print( + f"[health_check] Issue detected: {heal_result['health_check'].get('issue', 'unknown')}" + ) else: print("[health_check] 
Collection health OK") except Exception as e: @@ -492,23 +667,19 @@ def main(): url=QDRANT_URL, timeout=int(os.environ.get("QDRANT_TIMEOUT", "20") or 20) ) - # Compute embedding dimension first (for deterministic dense vector selection) model = TextEmbedding(model_name=MODEL) - dim = len(next(model.embed(["dimension probe"]))) + model_dim = len(next(model.embed(["dimension probe"]))) - # Determine dense vector name deterministically try: - info = client.get_collection(COLLECTION) + info = client.get_collection(default_collection) cfg = info.config.params.vectors if isinstance(cfg, dict) and cfg: - # Prefer vector whose size matches embedding dim vector_name = None for name, params in cfg.items(): psize = getattr(params, "size", None) or getattr(params, "dim", None) - if psize and int(psize) == int(dim): + if psize and int(psize) == int(model_dim): vector_name = name break - # If LEX vector exists, pick a different name as dense if vector_name is None and getattr(idx, "LEX_VECTOR_NAME", None) in cfg: for name in cfg.keys(): if name != idx.LEX_VECTOR_NAME: @@ -521,24 +692,43 @@ def main(): except Exception: vector_name = idx._sanitize_vector_name(MODEL) - # Ensure collection + payload indexes exist try: - idx.ensure_collection(client, COLLECTION, dim, vector_name) + idx.ensure_collection(client, default_collection, model_dim, vector_name) except Exception: pass - idx.ensure_payload_indexes(client, COLLECTION) + idx.ensure_payload_indexes(client, default_collection) - # Ensure workspace state exists and set collection try: - update_workspace_state(str(ROOT), {"qdrant_collection": COLLECTION}) - update_indexing_status(str(ROOT), {"state": "watching"}) - except Exception: - pass + if multi_repo_enabled: + root_repo_name = _extract_repo_name_from_path(str(ROOT)) + if root_repo_name: + root_collection = get_collection_name(root_repo_name) + update_indexing_status( + repo_name=root_repo_name, + status={"state": "watching"}, + ) + print( + f"[workspace_state] Initialized 
repo state: {root_repo_name} -> {root_collection}" + ) + else: + print( + "[workspace_state] Multi-repo: root path is not a repo; skipping state initialization" + ) + else: + update_workspace_state( + workspace_path=str(ROOT), + updates={"qdrant_collection": default_collection}, + ) + update_indexing_status(status={"state": "watching"}) + except Exception as e: + print(f"[workspace_state] Error initializing workspace state: {e}") q = ChangeQueue( - lambda paths: _process_paths(paths, client, model, vector_name, str(ROOT)) + lambda paths: _process_paths( + paths, client, model, vector_name, model_dim, str(ROOT) + ) ) - handler = IndexHandler(ROOT, q, client, COLLECTION) + handler = IndexHandler(ROOT, q, client, default_collection) obs = Observer() obs.schedule(handler, str(ROOT), recursive=True) @@ -554,58 +744,86 @@ def main(): obs.join() -def _process_paths(paths, client, model, vector_name: str, workspace_path: str): - # Prepare progress +def _process_paths(paths, client, model, vector_name: str, model_dim: int, workspace_path: str): unique_paths = sorted(set(Path(x) for x in paths)) - total = len(unique_paths) + if not unique_paths: + return + started_at = datetime.now().isoformat() - try: - update_indexing_status( - workspace_path, - { - "state": "indexing", - "started_at": started_at, - "progress": {"files_processed": 0, "total_files": total}, - }, - ) - except Exception: - pass - processed = 0 - try: - for p in unique_paths: - current = p - if not p.exists(): - # File was removed; ensure its points and cache are deleted - try: - idx.delete_points_by_path(client, COLLECTION, str(p)) - print(f"[deleted] {p}") - except Exception: - pass + repo_groups: dict[str, list[Path]] = {} + for p in unique_paths: + repo_path = _detect_repo_for_file(p) or Path(workspace_path) + repo_groups.setdefault(str(repo_path), []).append(p) + + for repo_path, repo_files in repo_groups.items(): + try: + repo_name = _extract_repo_name_from_path(repo_path) + update_indexing_status( + 
repo_name=repo_name, + status={ + "state": "indexing", + "started_at": started_at, + "progress": { + "files_processed": 0, + "total_files": len(repo_files), + }, + }, + ) + except Exception: + pass + + repo_progress: dict[str, int] = {key: 0 for key in repo_groups.keys()} + + for p in unique_paths: + repo_path = _detect_repo_for_file(p) or Path(workspace_path) + repo_key = str(repo_path) + repo_files = repo_groups.get(repo_key, []) + repo_name = _extract_repo_name_from_path(repo_key) + collection = _get_collection_for_file(p) + + if not p.exists(): + if client is not None: try: - remove_cached_file(workspace_path, str(p)) + idx.delete_points_by_path(client, collection, str(p)) + print(f"[deleted] {p} -> {collection}") except Exception: pass - _log_activity(workspace_path, "deleted", p) - processed += 1 - _update_progress(workspace_path, started_at, processed, total, current) - continue - # Lazily instantiate model if needed - if model is None: - from fastembed import TextEmbedding - - mname = os.environ.get("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5") - model = TextEmbedding(model_name=mname) + try: + remove_cached_file(str(p), repo_name) + except Exception: + pass + _log_activity(repo_key, "deleted", p) + repo_progress[repo_key] = repo_progress.get(repo_key, 0) + 1 + try: + _update_progress( + repo_key, + started_at, + repo_progress[repo_key], + len(repo_files), + p, + ) + except Exception: + pass + continue + + if client is not None and model is not None: + try: + idx.ensure_collection(client, collection, model_dim, vector_name) + idx.ensure_payload_indexes(client, collection) + except Exception: + pass + ok = False try: ok = idx.index_single_file( client, model, - COLLECTION, + collection, vector_name, p, dedupe=True, - skip_unchanged=True, + skip_unchanged=False, ) except Exception as e: try: @@ -614,23 +832,40 @@ def _process_paths(paths, client, model, vector_name: str, workspace_path: str): pass ok = False status = "indexed" if ok else "skipped" - 
print(f"[{status}] {p}") + print(f"[{status}] {p} -> {collection}") if ok: try: size = int(p.stat().st_size) except Exception: size = None - _log_activity(workspace_path, "indexed", p, {"file_size": size}) + _log_activity(repo_key, "indexed", p, {"file_size": size}) else: _log_activity( - workspace_path, "skipped", p, {"reason": "no-change-or-error"} + repo_key, "skipped", p, {"reason": "no-change-or-error"} ) - processed += 1 - _update_progress(workspace_path, started_at, processed, total, current) - finally: - # Always return to watching state even if processing raised + else: + print(f"Not processing locally: {p}") + _log_activity(repo_key, "skipped", p, {"reason": "remote-mode"}) + + repo_progress[repo_key] = repo_progress.get(repo_key, 0) + 1 try: - update_indexing_status(workspace_path, {"state": "watching"}) + _update_progress( + repo_key, + started_at, + repo_progress[repo_key], + len(repo_files), + p, + ) + except Exception: + pass + + for repo_path in repo_groups.keys(): + try: + repo_name = _extract_repo_name_from_path(repo_path) + update_indexing_status( + repo_name=repo_name, + status={"state": "watching"}, + ) except Exception: pass diff --git a/scripts/workspace_state.py b/scripts/workspace_state.py index dfa6b4fb..e05f80b5 100644 --- a/scripts/workspace_state.py +++ b/scripts/workspace_state.py @@ -6,26 +6,30 @@ - Collection information and indexing status - Progress tracking during indexing operations - Activity logging with structured metadata -- Multi-project support with per-workspace state files - -Based on the codebase-index-cli workspace state pattern but adapted for our Python ecosystem. 
+- Multi-repo support with per-repo state files """ import json import os -import uuid import re -import hashlib +import uuid import subprocess +import hashlib from datetime import datetime from pathlib import Path from typing import Dict, Any, Optional, List, Literal, TypedDict import threading import time -# Type definitions matching codebase-index-cli patterns +# Type definitions IndexingState = Literal['idle', 'initializing', 'scanning', 'indexing', 'watching', 'error'] ActivityAction = Literal['indexed', 'deleted', 'skipped', 'scan-completed', 'initialized', 'moved'] +# Constants +STATE_DIRNAME = ".codebase" +STATE_FILENAME = "state.json" +CACHE_FILENAME = "cache.json" +PLACEHOLDER_COLLECTION_NAMES = {"", "default-collection", "my-collection"} + class IndexingProgress(TypedDict, total=False): files_processed: int total_files: Optional[int] @@ -53,40 +57,77 @@ class LastActivity(TypedDict, total=False): file_path: Optional[str] details: Optional[ActivityDetails] -class QdrantStats(TypedDict, total=False): - total_vectors: int - unique_files: int - vector_dimension: int - last_updated: str - collection_name: str +class OriginInfo(TypedDict, total=False): + repo_name: Optional[str] + container_path: Optional[str] + source_path: Optional[str] + collection_name: Optional[str] + updated_at: Optional[str] + class WorkspaceState(TypedDict, total=False): - workspace_path: str created_at: str updated_at: str qdrant_collection: str indexing_status: Optional[IndexingStatus] last_activity: Optional[LastActivity] - qdrant_stats: Optional[QdrantStats] + qdrant_stats: Optional[Dict[str, Any]] + origin: Optional[OriginInfo] -# Constants -STATE_DIRNAME = ".codebase" -STATE_FILENAME = "state.json" +def is_multi_repo_mode() -> bool: + """Check if multi-repo mode is enabled.""" + return os.environ.get("MULTI_REPO_MODE", "0").strip().lower() in { + "1", "true", "yes", "on" + } -# Thread-safe state management -# Use re-entrant locks to avoid deadlocks when helper functions call each 
other -_state_locks: Dict[str, threading.RLock] = {} _state_lock = threading.Lock() # Track last-used timestamps for cleanup of idle workspace locks +_state_locks: Dict[str, threading.RLock] = {} _state_lock_last_used: Dict[str, float] = {} -def _get_state_lock(workspace_path: str) -> threading.RLock: - """Get or create a thread-safe lock for a specific workspace and record last-used time.""" +def _resolve_workspace_root() -> str: + """Determine the default workspace root path.""" + return os.environ.get("WORKSPACE_PATH") or os.environ.get("WATCH_ROOT") or "/work" + +def _resolve_repo_context( + workspace_path: Optional[str] = None, + repo_name: Optional[str] = None, +) -> tuple[str, Optional[str]]: + """Normalize workspace/repo context, ensuring multi-repo callers map to repo state.""" + resolved_workspace = workspace_path or _resolve_workspace_root() + + if is_multi_repo_mode(): + if repo_name: + return resolved_workspace, repo_name + + if workspace_path: + detected = _detect_repo_name_from_path(Path(workspace_path)) + if detected: + return resolved_workspace, detected + + return resolved_workspace, None + + return resolved_workspace, repo_name + +def _get_state_lock(workspace_path: Optional[str] = None, repo_name: Optional[str] = None) -> threading.RLock: + """Get or create a lock for the workspace or repo state and track usage.""" + if repo_name and is_multi_repo_mode(): + key = f"repo::{repo_name}" + else: + key = str(Path(workspace_path or _resolve_workspace_root()).resolve()) + with _state_lock: - if workspace_path not in _state_locks: - _state_locks[workspace_path] = threading.RLock() - _state_lock_last_used[workspace_path] = time.time() - return _state_locks[workspace_path] + if key not in _state_locks: + _state_locks[key] = threading.RLock() + _state_lock_last_used[key] = time.time() + return _state_locks[key] + +def _get_repo_state_dir(repo_name: str) -> Path: + """Get the state directory for a repository.""" + base_dir = 
Path(os.environ.get("WORKSPACE_PATH") or os.environ.get("WATCH_ROOT") or "/work") + if is_multi_repo_mode(): + return base_dir / STATE_DIRNAME / "repos" / repo_name + return base_dir / STATE_DIRNAME def _get_state_path(workspace_path: str) -> Path: """Get the path to the state.json file for a workspace.""" @@ -94,6 +135,13 @@ def _get_state_path(workspace_path: str) -> Path: state_dir = workspace / STATE_DIRNAME return state_dir / STATE_FILENAME + +def _get_global_state_dir(workspace_path: Optional[str] = None) -> Path: + """Return the root .codebase directory used for workspace metadata.""" + + base_dir = Path(workspace_path or _resolve_workspace_root()).resolve() + return base_dir / STATE_DIRNAME + def _ensure_state_dir(workspace_path: str) -> Path: """Ensure the .codebase directory exists and return the state file path.""" workspace = Path(workspace_path).resolve() @@ -122,13 +170,33 @@ def _sanitize_name(s: str, max_len: int = 64) -> str: def _cross_process_lock(lock_path: Path): """Advisory cross-process exclusive lock using a companion .lock file. Safe across container/process boundaries; pairs with atomic rename writes. + Ensures group-writable permissions so non-root indexers/watchers can operate. 
""" - lock_path.parent.mkdir(exist_ok=True) - f = open(lock_path, "a+") + + lock_path.parent.mkdir(parents=True, exist_ok=True) + + lock_file = None + fd = None + try: + fd = os.open(lock_path, os.O_CREAT | os.O_RDWR, 0o664) + lock_file = os.fdopen(fd, "a+") + except PermissionError: + # If we cannot create or open the requested lock, fall back to /tmp (permissive) + tmp_path = Path("/tmp") / (lock_path.name) + tmp_path.parent.mkdir(parents=True, exist_ok=True) + fd = os.open(tmp_path, os.O_CREAT | os.O_RDWR, 0o664) + lock_file = os.fdopen(fd, "a+") + lock_path = tmp_path + try: + try: + os.chmod(lock_path, 0o664) + except PermissionError: + pass + if fcntl is not None: try: - fcntl.flock(f.fileno(), fcntl.LOCK_EX) + fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX) except Exception: pass yield @@ -136,12 +204,12 @@ def _cross_process_lock(lock_path: Path): try: if fcntl is not None: try: - fcntl.flock(f.fileno(), fcntl.LOCK_UN) + fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN) except Exception: pass finally: try: - f.close() + lock_file.close() except Exception: pass @@ -192,145 +260,324 @@ def _atomic_write_state(state_path: Path, state: WorkspaceState) -> None: pass raise -def get_workspace_state(workspace_path: str) -> WorkspaceState: - """Get the current workspace state, creating it if it doesn't exist. +def get_workspace_state( + workspace_path: Optional[str] = None, repo_name: Optional[str] = None +) -> WorkspaceState: + """Get the current workspace state, creating it if it doesn't exist.""" - Uses a cross-process lock to avoid concurrent read-modify-write races across - multiple containers/processes. 
- """ - lock = _get_state_lock(workspace_path) + workspace_path, repo_name = _resolve_repo_context(workspace_path, repo_name) + + if is_multi_repo_mode() and repo_name is None: + print( + f"[workspace_state] Multi-repo: Skipping state read for workspace={workspace_path} without repo_name" + ) + return {} + + lock = _get_state_lock(workspace_path, repo_name) with lock: - state_path = _get_state_path(workspace_path) - lock_path = state_path.with_suffix(state_path.suffix + ".lock") + state_path: Path + lock_scope_path: Path + + if is_multi_repo_mode() and repo_name: + state_dir = _get_repo_state_dir(repo_name) + state_dir.mkdir(parents=True, exist_ok=True) + state_path = state_dir / STATE_FILENAME + lock_scope_path = state_dir + else: + try: + state_path = _ensure_state_dir(workspace_path) + lock_scope_path = state_path.parent + except PermissionError: + lock_scope_path = _get_global_state_dir(workspace_path) + lock_scope_path.mkdir(parents=True, exist_ok=True) + state_path = lock_scope_path / STATE_FILENAME + + lock_path = lock_scope_path / (STATE_FILENAME + ".lock") with _cross_process_lock(lock_path): if state_path.exists(): try: - with open(state_path, 'r', encoding='utf-8') as f: + with open(state_path, "r", encoding="utf-8") as f: state = json.load(f) - # Ensure required fields exist - if not isinstance(state, dict): - raise ValueError("Invalid state format") - return state + if isinstance(state, dict): + return state except (json.JSONDecodeError, ValueError, OSError): - # Corrupted or invalid state file, recreate pass - # Create new state now = datetime.now().isoformat() - env_coll = os.environ.get("COLLECTION_NAME", "").strip() - # Use env var if set, otherwise default to "codebase" - collection_name = env_coll if env_coll else "codebase" + collection_name = get_collection_name(repo_name) state: WorkspaceState = { - "workspace_path": str(Path(workspace_path).resolve()), + "workspace_path": str(Path(workspace_path or _resolve_workspace_root()).resolve()), 
"created_at": now, "updated_at": now, "qdrant_collection": collection_name, - "indexing_status": { - "state": "idle" - } + "indexing_status": {"state": "idle"}, } - # Ensure directory exists and write state - state_path = _ensure_state_dir(workspace_path) _atomic_write_state(state_path, state) return state -def update_workspace_state(workspace_path: str, updates: Dict[str, Any]) -> WorkspaceState: - """Update workspace state with the given changes. - Cross-process safe using an advisory lock file. - """ - lock = _get_state_lock(workspace_path) +def update_workspace_state( + workspace_path: Optional[str] = None, + updates: Optional[Dict[str, Any]] = None, + repo_name: Optional[str] = None, +) -> WorkspaceState: + """Update workspace state with the given changes.""" + + workspace_path, repo_name = _resolve_repo_context(workspace_path, repo_name) + updates = updates or {} + + if is_multi_repo_mode() and repo_name is None: + print( + f"[workspace_state] Multi-repo: Skipping state update for workspace={workspace_path} without repo_name" + ) + return {} + + lock = _get_state_lock(workspace_path, repo_name) with lock: - state_path = _ensure_state_dir(workspace_path) - lock_path = state_path.with_suffix(state_path.suffix + ".lock") + state = get_workspace_state(workspace_path, repo_name) + for key, value in updates.items(): + if key in state or key in WorkspaceState.__annotations__: + state[key] = value + + state["updated_at"] = datetime.now().isoformat() + + if is_multi_repo_mode() and repo_name: + state_dir = _get_repo_state_dir(repo_name) + state_dir.mkdir(parents=True, exist_ok=True) + state_path = state_dir / STATE_FILENAME + else: + try: + state_path = _ensure_state_dir(workspace_path) + except PermissionError: + state_dir = _get_global_state_dir(workspace_path) + state_dir.mkdir(parents=True, exist_ok=True) + state_path = state_dir / STATE_FILENAME + + _atomic_write_state(state_path, state) + return state + +def update_indexing_status( + workspace_path: 
Optional[str] = None, + status: Optional[IndexingStatus] = None, + repo_name: Optional[str] = None, +) -> WorkspaceState: + """Update indexing status in workspace state.""" + workspace_path, repo_name = _resolve_repo_context(workspace_path, repo_name) + + if is_multi_repo_mode() and repo_name is None: + print( + f"[workspace_state] Multi-repo: Skipping indexing status update for workspace={workspace_path} without repo_name" + ) + return {} + + if status is None: + status = {"state": "idle"} + + return update_workspace_state( + workspace_path=workspace_path, + updates={"indexing_status": status}, + repo_name=repo_name, + ) + + +def update_repo_origin( + workspace_path: Optional[str] = None, + repo_name: Optional[str] = None, + *, + container_path: Optional[str] = None, + source_path: Optional[str] = None, + collection_name: Optional[str] = None, +) -> WorkspaceState: + """Update origin metadata for a repository/workspace.""" + + resolved_workspace, resolved_repo = _resolve_repo_context(workspace_path, repo_name) + + if is_multi_repo_mode() and resolved_repo is None: + return {} + + state = get_workspace_state(resolved_workspace, resolved_repo) + if not state: + state = {} + + origin: OriginInfo = dict(state.get("origin", {})) # type: ignore[arg-type] + if resolved_repo: + origin["repo_name"] = resolved_repo + if container_path or workspace_path: + origin["container_path"] = container_path or workspace_path + if source_path: + origin["source_path"] = source_path + if collection_name: + origin["collection_name"] = collection_name + origin["updated_at"] = datetime.now().isoformat() + + updates: Dict[str, Any] = {"origin": origin} + if collection_name: + updates.setdefault("qdrant_collection", collection_name) + + return update_workspace_state( + workspace_path=resolved_workspace, + updates=updates, + repo_name=resolved_repo, + ) + + +def log_activity( + repo_name: Optional[str] = None, + action: Optional[ActivityAction] = None, + file_path: Optional[str] = None, + 
details: Optional[ActivityDetails] = None, + workspace_path: Optional[str] = None, +) -> None: + """Log activity to workspace state.""" + + if not action: + return + + activity = { + "timestamp": datetime.now().isoformat(), + "action": action, + "file_path": file_path, + "details": details or {}, + } + + resolved_workspace = workspace_path or _resolve_workspace_root() + + if is_multi_repo_mode() and repo_name: + state_dir = _get_repo_state_dir(repo_name) + state_dir.mkdir(parents=True, exist_ok=True) + state_path = state_dir / STATE_FILENAME + lock_path = state_path.with_suffix(".lock") + with _cross_process_lock(lock_path): - # Read current state (best-effort) try: - with open(state_path, 'r', encoding='utf-8') as f: - state = json.load(f) - if not isinstance(state, dict): - state = {} + if state_path.exists(): + with open(state_path, "r", encoding="utf-8") as f: + state = json.load(f) + else: + state = {"created_at": datetime.now().isoformat()} except Exception: - state = {} - - # Apply updates (preserve prior behavior: only known or existing keys) - for key, value in updates.items(): - if key in state or key in WorkspaceState.__annotations__: - state[key] = value + state = {"created_at": datetime.now().isoformat()} - # Always update timestamp + state["last_activity"] = activity state["updated_at"] = datetime.now().isoformat() - - # Write back to file atomically _atomic_write_state(state_path, state) - return state + else: + update_workspace_state( + workspace_path=resolved_workspace, + updates={"last_activity": activity}, + repo_name=repo_name, + ) -def update_indexing_status(workspace_path: str, status: IndexingStatus) -> WorkspaceState: - """Update the indexing status in workspace state.""" - return update_workspace_state(workspace_path, {"indexing_status": status}) -def update_last_activity(workspace_path: str, activity: LastActivity) -> WorkspaceState: - """Update the last activity in workspace state.""" - return update_workspace_state(workspace_path, 
{"last_activity": activity}) +def _generate_collection_name_from_repo(repo_name: str) -> str: + """Generate collection name with 8-char hash for local workspaces. -def update_qdrant_stats(workspace_path: str, stats: QdrantStats) -> WorkspaceState: - """Update Qdrant statistics in workspace state.""" - stats["last_updated"] = datetime.now().isoformat() - return update_workspace_state(workspace_path, {"qdrant_stats": stats}) + Used by local indexer/watcher. Remote uploads use 16+8 char pattern + for collision avoidance when folder names may be identical. + """ + hash_obj = hashlib.sha256(repo_name.encode()) + short_hash = hash_obj.hexdigest()[:8] + return f"{repo_name}-{short_hash}" -def get_collection_name(workspace_path: str) -> str: - """Get the Qdrant collection name for a workspace. +def get_collection_name(repo_name: Optional[str] = None) -> str: + """Get collection name for repository or workspace.""" + # In multi-repo mode, prioritize repo-specific collection names + if is_multi_repo_mode() and repo_name: + return _generate_collection_name_from_repo(repo_name) - Seamless single-collection mode: - - Defaults to "codebase" for unified cross-repo search - - All your code goes into one collection - - Override via COLLECTION_NAME env var if you need isolation - """ + # Check environment for single-repo mode or fallback env_coll = os.environ.get("COLLECTION_NAME", "").strip() + if env_coll and env_coll not in PLACEHOLDER_COLLECTION_NAMES: + return env_coll - # Use env var if set, otherwise default to unified "codebase" collection - coll = env_coll if env_coll else "codebase" + # Use repo name if provided (for single-repo mode with repo name) + if repo_name: + return _generate_collection_name_from_repo(repo_name) - # Persist to state for consistency - update_workspace_state(workspace_path, {"qdrant_collection": coll}) - return coll + # Default fallback + return "global-collection" -# --- Persistent file-hash cache (.codebase/cache.json) --- -CACHE_FILENAME = 
"cache.json" +def _detect_repo_name_from_path(path: Path) -> str: + """Detect repository name from path. Clean, robust implementation.""" + try: + resolved_path = path.resolve() + except Exception: + return None + + candidate_roots: List[Path] = [] + for root_str in ( + os.environ.get("WATCH_ROOT"), + os.environ.get("WORKSPACE_PATH"), + "/work", + os.environ.get("HOST_ROOT"), + "/home/coder/project/Context-Engine/dev-workspace", + ): + if not root_str: + continue + try: + root_path = Path(root_str).resolve() + except Exception: + continue + if root_path not in candidate_roots: + candidate_roots.append(root_path) + + for base in candidate_roots: + try: + rel_path = resolved_path.relative_to(base) + except ValueError: + continue + + if not rel_path.parts: + continue + + repo_name = rel_path.parts[0] + if repo_name in (".codebase", ".git", "__pycache__"): + continue + repo_path = base / repo_name + if repo_path.exists() or str(resolved_path).startswith(str(repo_path) + os.sep): + return repo_name + return None + +def _extract_repo_name_from_path(workspace_path: str) -> str: + """Extract repository name from workspace path.""" + return _detect_repo_name_from_path(Path(workspace_path)) + +# Cache functions for file hash tracking def _get_cache_path(workspace_path: str) -> Path: - ws = Path(workspace_path).resolve() - return ws / STATE_DIRNAME / CACHE_FILENAME + """Get the path to the cache.json file.""" + workspace = Path(workspace_path).resolve() + return workspace / STATE_DIRNAME / CACHE_FILENAME def _read_cache(workspace_path: str) -> Dict[str, Any]: - """Best-effort load of the workspace cache (file hashes keyed by absolute path).""" + """Read cache file, return empty dict if it doesn't exist or is invalid.""" + + cache_path = _get_cache_path(workspace_path) + if not cache_path.exists(): + return {"file_hashes": {}, "updated_at": datetime.now().isoformat()} + try: - p = _get_cache_path(workspace_path) - if not p.exists(): - return {"file_hashes": {}, "updated_at": 
datetime.now().isoformat()} - with open(p, "r", encoding="utf-8") as f: + with open(cache_path, "r", encoding="utf-8") as f: obj = json.load(f) if isinstance(obj, dict) and isinstance(obj.get("file_hashes"), dict): return obj - return {"file_hashes": {}, "updated_at": datetime.now().isoformat()} except Exception: - return {"file_hashes": {}, "updated_at": datetime.now().isoformat()} + pass + + return {"file_hashes": {}, "updated_at": datetime.now().isoformat()} def _write_cache(workspace_path: str, cache: Dict[str, Any]) -> None: - """Atomic write of cache file to avoid corruption under concurrency. + """Atomic write of cache file with cross-process locking.""" - Uses both an in-process lock and a cross-process lock file to serialize writers. - """ lock = _get_state_lock(workspace_path) with lock: - state_dir = Path(workspace_path).resolve() / STATE_DIRNAME - state_dir.mkdir(exist_ok=True) cache_path = _get_cache_path(workspace_path) + cache_path.parent.mkdir(parents=True, exist_ok=True) lock_path = cache_path.with_suffix(cache_path.suffix + ".lock") with _cross_process_lock(lock_path): tmp = cache_path.with_suffix(f".tmp.{uuid.uuid4().hex[:8]}") @@ -345,70 +592,90 @@ def _write_cache(workspace_path: str, cache: Dict[str, Any]) -> None: pass -def get_cached_file_hash(workspace_path: str, file_path: str) -> str: - """Return cached content hash for an absolute file path, or empty string.""" - cache = _read_cache(workspace_path) - try: - return str((cache.get("file_hashes") or {}).get(str(Path(file_path).resolve()), "")) - except Exception: - return "" +def get_cached_file_hash(file_path: str, repo_name: Optional[str] = None) -> str: + """Get cached file hash for tracking changes.""" + if is_multi_repo_mode() and repo_name: + state_dir = _get_repo_state_dir(repo_name) + cache_path = state_dir / CACHE_FILENAME + if cache_path.exists(): + try: + with open(cache_path, 'r', encoding='utf-8') as f: + cache = json.load(f) + file_hashes = cache.get("file_hashes", {}) + 
return file_hashes.get(str(Path(file_path).resolve()), "") + except Exception: + pass + else: + cache = _read_cache(_resolve_workspace_root()) + return cache.get("file_hashes", {}).get(str(Path(file_path).resolve()), "") -def set_cached_file_hash(workspace_path: str, file_path: str, file_hash: str) -> None: - """Set cached content hash for an absolute file path and persist immediately.""" - lock = _get_state_lock(workspace_path) - with lock: - cache = _read_cache(workspace_path) - fh = cache.setdefault("file_hashes", {}) - fh[str(Path(file_path).resolve())] = str(file_hash) - cache["updated_at"] = datetime.now().isoformat() - _write_cache(workspace_path, cache) + return "" -def remove_cached_file(workspace_path: str, file_path: str) -> None: - """Remove a file entry from the cache and persist.""" - lock = _get_state_lock(workspace_path) - with lock: - cache = _read_cache(workspace_path) - fh = cache.setdefault("file_hashes", {}) +def set_cached_file_hash(file_path: str, file_hash: str, repo_name: Optional[str] = None) -> None: + """Set cached file hash for tracking changes.""" + + fp = str(Path(file_path).resolve()) + + if is_multi_repo_mode() and repo_name: + state_dir = _get_repo_state_dir(repo_name) + cache_path = state_dir / CACHE_FILENAME + state_dir.mkdir(parents=True, exist_ok=True) + try: - fp = str(Path(file_path).resolve()) - except Exception: - fp = str(file_path) - if fp in fh: - fh.pop(fp, None) + if cache_path.exists(): + with open(cache_path, "r", encoding="utf-8") as f: + cache = json.load(f) + else: + cache = {"file_hashes": {}, "created_at": datetime.now().isoformat()} + + cache.setdefault("file_hashes", {})[fp] = file_hash cache["updated_at"] = datetime.now().isoformat() - _write_cache(workspace_path, cache) -def list_workspaces(search_root: Optional[str] = None) -> List[Dict[str, Any]]: - """Find all workspaces with .codebase/state.json files.""" - if search_root is None: - search_root = os.getcwd() + _atomic_write_state(cache_path, cache) # 
reuse atomic writer for files + except Exception: + pass + return + + cache = _read_cache(_resolve_workspace_root()) + cache.setdefault("file_hashes", {})[fp] = file_hash + cache["updated_at"] = datetime.now().isoformat() + _write_cache(_resolve_workspace_root(), cache) - workspaces = [] - search_path = Path(search_root).resolve() - # Search for .codebase directories - for state_dir in search_path.rglob(STATE_DIRNAME): - state_file = state_dir / STATE_FILENAME - if state_file.exists(): +def remove_cached_file(file_path: str, repo_name: Optional[str] = None) -> None: + """Remove file entry from cache.""" + if is_multi_repo_mode() and repo_name: + state_dir = _get_repo_state_dir(repo_name) + cache_path = state_dir / CACHE_FILENAME + + if cache_path.exists(): try: - workspace_path = str(state_dir.parent) - state = get_workspace_state(workspace_path) - workspaces.append({ - "workspace_path": workspace_path, - "collection_name": state.get("qdrant_collection"), - "last_updated": state.get("updated_at"), - "indexing_state": state.get("indexing_status", {}).get("state", "unknown") - }) + with open(cache_path, 'r', encoding='utf-8') as f: + cache = json.load(f) + file_hashes = cache.get("file_hashes", {}) + + fp = str(Path(file_path).resolve()) + if fp in file_hashes: + file_hashes.pop(fp, None) + cache["updated_at"] = datetime.now().isoformat() + + _atomic_write_state(cache_path, cache) except Exception: - # Skip corrupted state files - continue + pass + return + + cache = _read_cache(_resolve_workspace_root()) + fp = str(Path(file_path).resolve()) + if fp in cache.get("file_hashes", {}): + cache["file_hashes"].pop(fp, None) + cache["updated_at"] = datetime.now().isoformat() + _write_cache(_resolve_workspace_root(), cache) - return sorted(workspaces, key=lambda x: x.get("last_updated", ""), reverse=True) -def cleanup_old_state_locks(max_idle_seconds: int = 900) -> int: - """Best-effort cleanup of idle workspace locks. 
+def cleanup_old_cache_locks(max_idle_seconds: int = 900) -> int: + """Best-effort cleanup of idle cache locks. Removes locks that have been idle (not requested via _get_state_lock) for longer than max_idle_seconds and whose lock can be acquired without blocking (i.e., not held). @@ -446,14 +713,67 @@ def cleanup_old_state_locks(max_idle_seconds: int = 900) -> int: removed += 1 return removed -if __name__ == "__main__": - # Simple CLI for testing - import sys - if len(sys.argv) > 1: - workspace = sys.argv[1] - state = get_workspace_state(workspace) - print(json.dumps(state, indent=2)) - else: - workspaces = list_workspaces() - for ws in workspaces: - print(f"{ws['workspace_path']}: {ws['collection_name']} ({ws['indexing_state']})") + +def get_collection_mappings(search_root: Optional[str] = None) -> List[Dict[str, Any]]: + """Enumerate collection mappings with origin metadata.""" + + root_path = Path(search_root or _resolve_workspace_root()).resolve() + mappings: List[Dict[str, Any]] = [] + + try: + if is_multi_repo_mode(): + repos_root = root_path / STATE_DIRNAME / "repos" + if repos_root.exists(): + for repo_dir in sorted(p for p in repos_root.iterdir() if p.is_dir()): + repo_name = repo_dir.name + state_path = repo_dir / STATE_FILENAME + if not state_path.exists(): + continue + try: + with open(state_path, "r", encoding="utf-8") as f: + state = json.load(f) or {} + except Exception: + continue + + origin = state.get("origin", {}) or {} + mappings.append( + { + "repo_name": repo_name, + "collection_name": state.get("qdrant_collection") + or get_collection_name(repo_name), + "container_path": origin.get("container_path") + or str((Path(_resolve_workspace_root()) / repo_name).resolve()), + "source_path": origin.get("source_path"), + "state_file": str(state_path), + "updated_at": state.get("updated_at"), + } + ) + else: + state_path = root_path / STATE_DIRNAME / STATE_FILENAME + if state_path.exists(): + try: + with open(state_path, "r", encoding="utf-8") as f: + 
state = json.load(f) or {} + except Exception: + state = {} + + origin = state.get("origin", {}) or {} + repo_name = origin.get("repo_name") or Path(root_path).name + mappings.append( + { + "repo_name": repo_name, + "collection_name": state.get("qdrant_collection") + or get_collection_name(repo_name), + "container_path": origin.get("container_path") + or str(root_path), + "source_path": origin.get("source_path"), + "state_file": str(state_path), + "updated_at": state.get("updated_at"), + } + ) + except Exception: + return mappings + + return mappings + +# Add missing functions that callers expect (already defined above) \ No newline at end of file diff --git a/tests/test_change_history_for_path.py b/tests/test_change_history_for_path.py index a0f9c046..52be7592 100644 --- a/tests/test_change_history_for_path.py +++ b/tests/test_change_history_for_path.py @@ -16,7 +16,13 @@ def _decorator(fn): return fn return _decorator +class _Context: + def __init__(self, *args, **kwargs): + # Tests only access .session when present; keep permissive defaults + self.session = kwargs.get("session") + setattr(fastmcp_pkg, "FastMCP", _FastMCP) +setattr(fastmcp_pkg, "Context", _Context) sys.modules.setdefault("mcp", mcp_pkg) sys.modules.setdefault("mcp.server", server_pkg) sys.modules.setdefault("mcp.server.fastmcp", fastmcp_pkg) diff --git a/tests/test_hybrid_cli_json.py b/tests/test_hybrid_cli_json.py index 5fb5be79..4a3cf480 100644 --- a/tests/test_hybrid_cli_json.py +++ b/tests/test_hybrid_cli_json.py @@ -28,7 +28,7 @@ def __init__(self, *args, **kwargs): self.args = args self.kwargs = kwargs - def fake_dense_query(client, vec_name, vector, flt, per_query): + def fake_dense_query(client, vec_name, vector, flt, per_query, collection_name=None): md = { "path": "/work/pkg/a.py", "symbol": "foo", diff --git a/tests/test_reranker_verification.py b/tests/test_reranker_verification.py index b7c24123..2642c65f 100644 --- a/tests/test_reranker_verification.py +++ 
b/tests/test_reranker_verification.py @@ -17,7 +17,12 @@ def _decorator(fn): return fn return _decorator +class _Context: + def __init__(self, *args, **kwargs): + self.session = kwargs.get("session") + setattr(fastmcp_pkg, "FastMCP", _FastMCP) +setattr(fastmcp_pkg, "Context", _Context) sys.modules.setdefault("mcp", mcp_pkg) sys.modules.setdefault("mcp.server", server_pkg) sys.modules.setdefault("mcp.server.fastmcp", fastmcp_pkg) @@ -25,6 +30,20 @@ def _decorator(fn): srv = importlib.import_module("scripts.mcp_indexer_server") +def _make_hybrid_stub(fake_run): + mod = types.ModuleType("scripts.hybrid_search") + mod.run_hybrid_search = fake_run + mod.lang_matches_path = lambda path, lang=None: True + mod._merge_and_budget_spans = lambda spans, *args, **kwargs: spans + mod.TextEmbedding = object + mod.QdrantClient = object + return mod + + +def _fake_embedding_model(*args, **kwargs): + return object() + + @pytest.mark.service @pytest.mark.anyio async def test_rerank_inproc_changes_order(monkeypatch): @@ -58,17 +77,32 @@ def fake_rerank_local(pairs): # Patch hybrid and rerank monkeypatch.setenv("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5") + monkeypatch.setitem(sys.modules, "scripts.hybrid_search", _make_hybrid_stub(fake_run_hybrid_search)) + monkeypatch.delitem(sys.modules, "scripts.mcp_indexer_server", raising=False) + server = importlib.import_module("scripts.mcp_indexer_server") + monkeypatch.setattr( + server, + "_get_embedding_model", + _fake_embedding_model, + ) + monkeypatch.setattr( + server, + "run_hybrid_search", + fake_run_hybrid_search, + raising=False, + ) monkeypatch.setattr( - importlib.import_module("scripts.hybrid_search"), "run_hybrid_search", fake_run_hybrid_search + importlib.import_module("scripts.rerank_local"), + "rerank_local", + fake_rerank_local, ) - monkeypatch.setattr(importlib.import_module("scripts.rerank_local"), "rerank_local", fake_rerank_local) # Baseline (rerank disabled) preserves hybrid order A then B - base = await 
srv.repo_search(query="q", limit=2, per_path=2, rerank_enabled=False, compact=True) + base = await server.repo_search(query="q", limit=2, per_path=2, rerank_enabled=False, compact=True) assert [r["path"] for r in base["results"]] == ["/work/a.py", "/work/b.py"] # With rerank enabled, order should flip to B then A; counters should show inproc_hybrid - rr = await srv.repo_search(query="q", limit=2, per_path=2, rerank_enabled=True, compact=True) + rr = await server.repo_search(query="q", limit=2, per_path=2, rerank_enabled=True, compact=True) assert rr.get("used_rerank") is True assert rr.get("rerank_counters", {}).get("inproc_hybrid", 0) >= 1 assert [r["path"] for r in rr["results"]] == ["/work/b.py", "/work/a.py"] @@ -91,12 +125,19 @@ async def fake_run_async(cmd, env=None, timeout=None): # Simulate subprocess reranker timing out return {"ok": False, "code": -1, "stdout": "", "stderr": f"Command timed out after {timeout}s"} + monkeypatch.setitem(sys.modules, "scripts.hybrid_search", _make_hybrid_stub(fake_run_hybrid_search)) + monkeypatch.delitem(sys.modules, "scripts.mcp_indexer_server", raising=False) + server = importlib.import_module("scripts.mcp_indexer_server") monkeypatch.setattr( - importlib.import_module("scripts.hybrid_search"), "run_hybrid_search", fake_run_hybrid_search + server, + "run_hybrid_search", + fake_run_hybrid_search, + raising=False, ) - monkeypatch.setattr(srv, "_run_async", fake_run_async) + monkeypatch.setattr(server, "_get_embedding_model", _fake_embedding_model) + monkeypatch.setattr(server, "_run_async", fake_run_async) - rr = await srv.repo_search(query="q", limit=2, per_path=2, rerank_enabled=True, compact=True) + rr = await server.repo_search(query="q", limit=2, per_path=2, rerank_enabled=True, compact=True) # Fallback should keep original order from hybrid; timeout counter incremented assert rr.get("used_rerank") is False assert rr.get("rerank_counters", {}).get("timeout", 0) >= 1