From 621c4fc10c6c9141d6e741bfe04ef76950f123b1 Mon Sep 17 00:00:00 2001 From: voarsh2 <60437695+voarsh2@users.noreply.github.com> Date: Sat, 25 Oct 2025 01:06:07 +0100 Subject: [PATCH 01/16] Add Claude Code GitHub Actions workflow --- .github/workflows/claude.yaml | 68 +++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 .github/workflows/claude.yaml diff --git a/.github/workflows/claude.yaml b/.github/workflows/claude.yaml new file mode 100644 index 00000000..732de78c --- /dev/null +++ b/.github/workflows/claude.yaml @@ -0,0 +1,68 @@ +name: Claude Code + +on: + issue_comment: + types: [created] + pull_request_review_comment: + types: [created] + issues: + types: [opened] + pull_request_review: + types: [submitted] + pull_request_target: + types: [opened, synchronize] + +jobs: + claude: + # This simplified condition is more robust and correctly checks permissions. + if: > + (contains(github.event.comment.body, '@claude') || + contains(github.event.review.body, '@claude') || + contains(github.event.issue.body, '@claude') || + contains(github.event.pull_request.body, '@claude')) && + (github.event.sender.type == 'User' && ( + github.event.comment.author_association == 'OWNER' || + github.event.comment.author_association == 'MEMBER' || + github.event.comment.author_association == 'COLLABORATOR' + )) + runs-on: ubuntu-latest + permissions: + # CRITICAL: Write permissions are required for the action to push branches and update issues/PRs. + contents: write + pull-requests: write + issues: write + id-token: write # Required for OIDC token exchange + actions: read # Required for Claude to read CI results on PRs + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + # This correctly checks out the PR's head commit for pull_request_target events. 
+ ref: ${{ github.event.pull_request.head.sha }} + + - name: Create Claude settings file + run: | + mkdir -p /home/runner/.claude + cat > /home/runner/.claude/settings.json << 'EOF' + { + "env": { + "ANTHROPIC_BASE_URL": "https://api.z.ai/api/anthropic", + "ANTHROPIC_AUTH_TOKEN": "${{ secrets.CUSTOM_ENDPOINT_API_KEY }}" + } + } + EOF + + - name: Run Claude Code + id: claude + uses: anthropics/claude-code-action@v1 + with: + # Still need this to satisfy the action's validation + anthropic_api_key: ${{ secrets.CUSTOM_ENDPOINT_API_KEY }} + + # Use the same variable names as your local setup + settings: '{"env": {"ANTHROPIC_BASE_URL": "https://api.z.ai/api/anthropic", "ANTHROPIC_AUTH_TOKEN": "${{ secrets.CUSTOM_ENDPOINT_API_KEY }}"}}' + + track_progress: true + claude_args: | + --allowedTools "Bash,Edit,Read,Write,Glob,Grep" From 39bc52b11959701f78ba20b4f0305a6922b755a2 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Sat, 25 Oct 2025 00:21:08 +0000 Subject: [PATCH 02/16] feat: Add Kubernetes deployment manifests Add comprehensive Kubernetes deployment configuration for Context-Engine: - Complete service manifests converted from docker-compose - Persistent storage for Qdrant database - ConfigMaps with environment variables (local-first defaults) - NodePort services for external access - Optional Ingress configuration for domain-based access - Automated deployment and cleanup scripts - Makefile for development and management - Comprehensive documentation and troubleshooting guide Key features: - Maintains local development defaults - Optional remote hosting capabilities - Health checks and resource limits - Scalable MCP server deployments - Support for both SSE and HTTP transports - Optional Llama.cpp integration Co-authored-by: voarsh2 --- deploy/kubernetes/Makefile | 207 ++++++++++++++++ deploy/kubernetes/README.md | 284 ++++++++++++++++++++++ deploy/kubernetes/cleanup.sh | 239 +++++++++++++++++++ 
deploy/kubernetes/configmap.yaml | 119 ++++++++++ deploy/kubernetes/deploy.sh | 300 ++++++++++++++++++++++++ deploy/kubernetes/indexer-services.yaml | 203 ++++++++++++++++ deploy/kubernetes/ingress.yaml | 75 ++++++ deploy/kubernetes/kustomization.yaml | 87 +++++++ deploy/kubernetes/llamacpp.yaml | 102 ++++++++ deploy/kubernetes/mcp-http.yaml | 245 +++++++++++++++++++ deploy/kubernetes/mcp-indexer.yaml | 110 +++++++++ deploy/kubernetes/mcp-memory.yaml | 146 ++++++++++++ deploy/kubernetes/namespace.yaml | 7 + deploy/kubernetes/qdrant.yaml | 125 ++++++++++ 14 files changed, 2249 insertions(+) create mode 100644 deploy/kubernetes/Makefile create mode 100644 deploy/kubernetes/README.md create mode 100755 deploy/kubernetes/cleanup.sh create mode 100644 deploy/kubernetes/configmap.yaml create mode 100755 deploy/kubernetes/deploy.sh create mode 100644 deploy/kubernetes/indexer-services.yaml create mode 100644 deploy/kubernetes/ingress.yaml create mode 100644 deploy/kubernetes/kustomization.yaml create mode 100644 deploy/kubernetes/llamacpp.yaml create mode 100644 deploy/kubernetes/mcp-http.yaml create mode 100644 deploy/kubernetes/mcp-indexer.yaml create mode 100644 deploy/kubernetes/mcp-memory.yaml create mode 100644 deploy/kubernetes/namespace.yaml create mode 100644 deploy/kubernetes/qdrant.yaml diff --git a/deploy/kubernetes/Makefile b/deploy/kubernetes/Makefile new file mode 100644 index 00000000..42274f9f --- /dev/null +++ b/deploy/kubernetes/Makefile @@ -0,0 +1,207 @@ +# Context-Engine Kubernetes Deployment Makefile + +# Configuration +NAMESPACE ?= context-engine +IMAGE_REGISTRY ?= context-engine +IMAGE_TAG ?= latest + +# Default target +.PHONY: help +help: ## Show this help message + @echo "Context-Engine Kubernetes Deployment Commands" + @echo "" + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' + +# Prerequisites +.PHONY: check-kubectl +check-kubectl: ## Check if kubectl is 
available and cluster is accessible + @which kubectl > /dev/null || (echo "kubectl not found. Please install kubectl." && exit 1) + @kubectl cluster-info > /dev/null || (echo "Cannot connect to Kubernetes cluster." && exit 1) + @echo "✓ Kubernetes connection verified" + +# Deployment targets +.PHONY: deploy +deploy: check-kubectl ## Deploy all Context-Engine services + ./deploy.sh --namespace $(NAMESPACE) --registry $(IMAGE_REGISTRY) --tag $(IMAGE_TAG) + +.PHONY: deploy-core +deploy-core: check-kubectl ## Deploy only core services (Qdrant + MCP servers) + @echo "Deploying core services..." + kubectl apply -f namespace.yaml + kubectl apply -f configmap.yaml + kubectl apply -f qdrant.yaml + kubectl apply -f mcp-memory.yaml + kubectl apply -f mcp-indexer.yaml + +.PHONY: deploy-full +deploy-full: check-kubectl ## Deploy all services including optional ones + ./deploy.sh --namespace $(NAMESPACE) --registry $(IMAGE_REGISTRY) --tag $(IMAGE_TAG) --deploy-ingress + +.PHONY: deploy-minimal +deploy-minimal: check-kubectl ## Deploy minimal setup (skip Llama.cpp and Ingress) + ./deploy.sh --namespace $(NAMESPACE) --registry $(IMAGE_REGISTRY) --tag $(IMAGE_TAG) --skip-llamacpp + +# Management targets +.PHONY: status +status: check-kubectl ## Show deployment status + @echo "=== Namespace Status ===" + kubectl get namespace $(NAMESPACE) || echo "Namespace $(NAMESPACE) not found" + @echo "" + @echo "=== Pods ===" + kubectl get pods -n $(NAMESPACE) -o wide || echo "No pods found" + @echo "" + @echo "=== Services ===" + kubectl get services -n $(NAMESPACE) || echo "No services found" + @echo "" + @echo "=== Deployments ===" + kubectl get deployments -n $(NAMESPACE) || echo "No deployments found" + @echo "" + @echo "=== StatefulSets ===" + kubectl get statefulsets -n $(NAMESPACE) || echo "No statefulsets found" + @echo "" + @echo "=== PersistentVolumeClaims ===" + kubectl get pvc -n $(NAMESPACE) || echo "No PVCs found" + @echo "" + @echo "=== Jobs ===" + kubectl get jobs -n 
$(NAMESPACE) || echo "No jobs found" + +.PHONY: logs +logs: check-kubectl ## Show logs for all services + @echo "=== Qdrant Logs ===" + kubectl logs -f statefulset/qdrant -n $(NAMESPACE) || echo "Qdrant logs not available" + @echo "" + @echo "=== MCP Memory Logs ===" + kubectl logs -f deployment/mcp-memory -n $(NAMESPACE) || echo "MCP Memory logs not available" + @echo "" + @echo "=== MCP Indexer Logs ===" + kubectl logs -f deployment/mcp-indexer -n $(NAMESPACE) || echo "MCP Indexer logs not available" + @echo "" + @echo "=== Watcher Logs ===" + kubectl logs -f deployment/watcher -n $(NAMESPACE) || echo "Watcher logs not available" + +.PHONY: logs-service +logs-service: check-kubectl ## Show logs for specific service (usage: make logs-service SERVICE=mcp-memory) + @if [ -z "$(SERVICE)" ]; then echo "Usage: make logs-service SERVICE=<service>"; exit 1; fi + kubectl logs -f deployment/$(SERVICE) -n $(NAMESPACE) || kubectl logs -f statefulset/$(SERVICE) -n $(NAMESPACE) || kubectl logs -f job/$(SERVICE) -n $(NAMESPACE) || echo "Service $(SERVICE) not found" + +.PHONY: shell +shell: check-kubectl ## Get a shell in a running pod (usage: make shell POD=mcp-memory-xxx) + @if [ -z "$(POD)" ]; then echo "Usage: make shell POD=<pod-name>"; echo "Available pods:"; kubectl get pods -n $(NAMESPACE); exit 1; fi + kubectl exec -it $(POD) -n $(NAMESPACE) -- /bin/bash || kubectl exec -it $(POD) -n $(NAMESPACE) -- /bin/sh + +# Cleanup targets +.PHONY: cleanup +cleanup: check-kubectl ## Remove all Context-Engine resources + ./cleanup.sh --namespace $(NAMESPACE) + +.PHONY: clean-force +clean-force: check-kubectl ## Force cleanup without confirmation + ./cleanup.sh --namespace $(NAMESPACE) --force + +# Development targets +.PHONY: restart +restart: check-kubectl ## Restart all deployments + kubectl rollout restart deployment -n $(NAMESPACE) + kubectl rollout restart statefulset -n $(NAMESPACE) + +.PHONY: restart-service
restart-service: check-kubectl ## Restart specific service (usage: make
restart-service SERVICE=mcp-memory) + @if [ -z "$(SERVICE)" ]; then echo "Usage: make restart-service SERVICE=<service>"; exit 1; fi + kubectl rollout restart deployment/$(SERVICE) -n $(NAMESPACE) || kubectl rollout restart statefulset/$(SERVICE) -n $(NAMESPACE) + +.PHONY: scale +scale: check-kubectl ## Scale a deployment (usage: make scale SERVICE=mcp-memory REPLICAS=3) + @if [ -z "$(SERVICE)" ] || [ -z "$(REPLICAS)" ]; then echo "Usage: make scale SERVICE=<service> REPLICAS=<count>"; exit 1; fi + kubectl scale deployment $(SERVICE) -n $(NAMESPACE) --replicas=$(REPLICAS) + +# Port forwarding targets +.PHONY: port-forward +port-forward: check-kubectl ## Port forward all services + @echo "Opening port forwards in background..." + @kubectl port-forward -n $(NAMESPACE) service/qdrant 6333:6333 & + @kubectl port-forward -n $(NAMESPACE) service/mcp-memory 8000:8000 & + @kubectl port-forward -n $(NAMESPACE) service/mcp-indexer 8001:8001 & + @echo "Port forwards started. Use 'make stop-port-forward' to stop." + +.PHONY: port-forward-service +port-forward-service: check-kubectl ## Port forward specific service (usage: make port-forward-service SERVICE=qdrant LOCAL=6333 REMOTE=6333) + @if [ -z "$(SERVICE)" ] || [ -z "$(LOCAL)" ] || [ -z "$(REMOTE)" ]; then echo "Usage: make port-forward-service SERVICE=<service> LOCAL=<local-port> REMOTE=<remote-port>"; exit 1; fi + kubectl port-forward -n $(NAMESPACE) service/$(SERVICE) $(LOCAL):$(REMOTE) + +.PHONY: stop-port-forward +stop-port-forward: ## Stop all port forwards + pkill -f "kubectl port-forward" || echo "No port forwards found" + +# Build and push targets (if you're building your own images) +.PHONY: build-image +build-image: ## Build Docker image (requires Docker) + docker build -t $(IMAGE_REGISTRY)/context-engine:$(IMAGE_TAG) ../../ + +.PHONY: push-image +push-image: build-image ## Push Docker image to registry + docker push $(IMAGE_REGISTRY)/context-engine:$(IMAGE_TAG) + +# Kustomize targets +.PHONY: kustomize-build +kustomize-build: ## Build manifests with Kustomize +
kustomize build . + +.PHONY: kustomize-apply +kustomize-apply: check-kubectl ## Apply manifests with Kustomize + kustomize build . | kubectl apply -f - + +.PHONY: kustomize-delete +kustomize-delete: check-kubectl ## Delete manifests with Kustomize + kustomize build . | kubectl delete -f - + +# Test targets +.PHONY: test-connection +test-connection: check-kubectl ## Test connectivity to all services + @echo "Testing service connectivity..." + @echo "Qdrant:" + @kubectl run qdrant-test --image=curlimages/curl --rm -i --restart=Never -- curl -f http://qdrant.$(NAMESPACE).svc.cluster.local:6333/health || echo "Qdrant test failed" + @echo "MCP Memory:" + @kubectl run memory-test --image=curlimages/curl --rm -i --restart=Never -- curl -f http://mcp-memory.$(NAMESPACE).svc.cluster.local:8000/health || echo "MCP Memory test failed" + @echo "MCP Indexer:" + @kubectl run indexer-test --image=curlimages/curl --rm -i --restart=Never -- curl -f http://mcp-indexer.$(NAMESPACE).svc.cluster.local:8001/health || echo "MCP Indexer test failed" + +# Configuration targets +.PHONY: show-config +show-config: ## Show current configuration + @echo "Configuration:" + @echo " NAMESPACE: $(NAMESPACE)" + @echo " IMAGE_REGISTRY: $(IMAGE_REGISTRY)" + @echo " IMAGE_TAG: $(IMAGE_TAG)" + @echo "" + @echo "Quick start commands:" + @echo " make deploy # Deploy all services" + @echo " make status # Show deployment status" + @echo " make logs # Show all logs" + @echo " make cleanup # Remove everything" + +.PHONY: show-urls +show-urls: check-kubectl ## Show access URLs for services + @echo "Service URLs (via NodePort):" + @echo " Qdrant: http://<node-ip>:30333" + @echo " MCP Memory (SSE): http://<node-ip>:30800" + @echo " MCP Indexer (SSE): http://<node-ip>:30802" + @echo " MCP Memory (HTTP): http://<node-ip>:30804" + @echo " MCP Indexer (HTTP): http://<node-ip>:30806" + @echo " Llama.cpp: http://<node-ip>:30808" + @echo "" + @echo "Service URLs (via port-forward):" + @echo " make port-forward # Then access via localhost ports" + +# Advanced targets
+.PHONY: watch-deployment +watch-deployment: check-kubectl ## Watch deployment progress + watch kubectl get pods,services,deployments -n $(NAMESPACE) + +.PHONY: describe-service +describe-service: check-kubectl ## Describe a service (usage: make describe-service SERVICE=mcp-memory) + @if [ -z "$(SERVICE)" ]; then echo "Usage: make describe-service SERVICE=<service>"; echo "Available services:"; kubectl get services -n $(NAMESPACE); exit 1; fi + kubectl describe service $(SERVICE) -n $(NAMESPACE) + +.PHONY: events +events: check-kubectl ## Show recent events + kubectl get events -n $(NAMESPACE) --sort-by=.metadata.creationTimestamp \ No newline at end of file diff --git a/deploy/kubernetes/README.md b/deploy/kubernetes/README.md new file mode 100644 index 00000000..7a5db376 --- /dev/null +++ b/deploy/kubernetes/README.md @@ -0,0 +1,284 @@ +# Context-Engine Kubernetes Deployment + +This directory contains Kubernetes manifests for deploying Context-Engine on a Kubernetes cluster. The deployment maintains local-first defaults while providing optional remote hosting capabilities.
+ +## Architecture Overview + +### Services Deployed + +| Service | Port(s) | Description | Protocol | +|---------|---------|-------------|----------| +| **qdrant** | 6333, 6334 | Vector database | HTTP/gRPC | +| **mcp-memory** | 8000, 18000 | Memory server (SSE) | SSE | +| **mcp-memory-http** | 8002, 18002 | Memory server (HTTP) | HTTP | +| **mcp-indexer** | 8001, 18001 | Indexer server (SSE) | SSE | +| **mcp-indexer-http** | 8003, 18003 | Indexer server (HTTP) | HTTP | +| **watcher** | - | File change monitoring | - | +| **llamacpp** (optional) | 8080 | Text generation | HTTP | + +### NodePort Mappings + +For local development or direct access, services are exposed via NodePort: + +| Service | NodePort | Local Access | +|---------|----------|--------------| +| qdrant | 30333, 30334 | `http://<node-ip>:30333` | +| mcp-memory | 30800, 30801 | `http://<node-ip>:30800` | +| mcp-indexer | 30802, 30803 | `http://<node-ip>:30802` | +| mcp-memory-http | 30804, 30805 | `http://<node-ip>:30804` | +| mcp-indexer-http | 30806, 30807 | `http://<node-ip>:30806` | +| llamacpp | 30808 | `http://<node-ip>:30808` | + +## Prerequisites + +1. **Kubernetes Cluster** (v1.20+) +2. **kubectl** configured to access your cluster +3. **Storage Class** named `fast-ssd` (or modify `qdrant.yaml`) +4. **Docker image** built and pushed to registry: + ```bash + # Build and tag the image + docker build -t context-engine:latest . + + # Tag for your registry + docker tag context-engine:latest your-registry/context-engine:latest + + # Push to registry + docker push your-registry/context-engine:latest + ``` + +## Quick Start + +### 1. Deploy Core Services + +```bash +# Deploy namespace and configuration +kubectl apply -f namespace.yaml +kubectl apply -f configmap.yaml + +# Deploy Qdrant database +kubectl apply -f qdrant.yaml + +# Wait for Qdrant to be ready +kubectl wait --for=condition=ready pod -l component=qdrant -n context-engine --timeout=300s + +# Initialize indexes +kubectl apply -f indexer-services.yaml +``` + +### 2.
Deploy MCP Servers + +```bash +# Deploy MCP Memory and Indexer servers (SSE) +kubectl apply -f mcp-memory.yaml +kubectl apply -f mcp-indexer.yaml + +# Deploy HTTP versions (optional) +kubectl apply -f mcp-http.yaml +``` + +### 3. Deploy Optional Services + +```bash +# Deploy Llama.cpp (optional, for text generation) +kubectl apply -f llamacpp.yaml + +# Deploy Ingress (optional, for domain-based access) +kubectl apply -f ingress.yaml +``` + +### 4. Verify Deployment + +```bash +# Check all pods +kubectl get pods -n context-engine + +# Check services +kubectl get services -n context-engine + +# Check logs for any service +kubectl logs -f deployment/mcp-memory -n context-engine +``` + +## Configuration + +### Environment Variables + +All configuration is managed through the `context-engine-config` ConfigMap in `configmap.yaml`. Key variables include: + +- **QDRANT_URL**: Database connection (automatically set to Kubernetes service) +- **COLLECTION_NAME**: Default collection name (`my-collection`) +- **EMBEDDING_MODEL**: Embedding model (`BAAI/bge-base-en-v1.5`) +- **EMBEDDING_PROVIDER**: Provider (`fastembed`) + +### Persistent Storage + +- **Qdrant data**: 20Gi persistent volume claim +- **Work directory**: HostPath mounted to `/tmp/context-engine-work` +- **Models directory**: HostPath mounted to `/tmp/context-engine-models` + +### Customization + +1. **Storage Class**: Modify `qdrant.yaml` to use your cluster's storage class +2. **Resources**: Adjust memory/CPU limits in each deployment +3. **Host Paths**: Update volume mounts to match your environment +4. **Ingress**: Configure `ingress.yaml` with your domain and SSL + +## Development Workflow + +### Local Development + +For local development, you can continue using `docker-compose.yml` as before: + +```bash +# Local development (unchanged) +docker-compose up -d +``` + +### Kubernetes Development + +For Kubernetes-based development: + +1. 
**Build and Push Image**: + ```bash + docker build -t your-registry/context-engine:latest . + docker push your-registry/context-engine:latest + ``` + +2. **Update Image References**: + ```bash + # Update all manifests to use your image + sed -i 's|context-engine:latest|your-registry/context-engine:latest|g' *.yaml + ``` + +3. **Deploy Changes**: + ```bash + kubectl apply -f . + ``` + +### File Synchronization + +The Kubernetes deployment uses HostPath volumes to sync files: + +```bash +# Mount your local code directory +sudo mkdir -p /tmp/context-engine-work +sudo cp -r /path/to/your/code/* /tmp/context-engine-work/ + +# Mount models (if using Llama.cpp) +sudo mkdir -p /tmp/context-engine-models +sudo cp /path/to/your/models/* /tmp/context-engine-models/ +``` + +## Monitoring and Troubleshooting + +### Health Checks + +All services include liveness and readiness probes: + +- **HTTP Services**: `/health` endpoint +- **Qdrant**: `/ready` and `/health` endpoints + +### Logs + +```bash +# View all logs +kubectl logs -f deployment/mcp-memory -n context-engine + +# View watcher logs for indexing activity +kubectl logs -f deployment/watcher -n context-engine + +# View Qdrant logs +kubectl logs -f statefulset/qdrant -n context-engine +``` + +### Common Issues + +1. **Storage Class Not Found**: + ```bash + kubectl get storageclass + # Update qdrant.yaml to use available storage class + ``` + +2. **HostPath Permissions**: + ```bash + # Ensure host directories are accessible + sudo chmod -R 755 /tmp/context-engine-work + ``` + +3. **Image Pull Errors**: + ```bash + # Check image registry access + kubectl describe pod -n context-engine + ``` + +## Scaling and High Availability + +### Scaling MCP Servers + +```bash +# Scale memory servers +kubectl scale deployment mcp-memory --replicas=3 -n context-engine + +# Scale indexer servers +kubectl scale deployment mcp-indexer --replicas=2 -n context-engine +``` + +### High Availability Qdrant + +For production, consider: + +1. 
**Qdrant Cloud**: Managed service with automatic scaling +2. **Multi-replica StatefulSet**: Configure Qdrant clustering +3. **External Database**: Use managed vector database + +## Security Considerations + +1. **Network Policies**: Restrict inter-service communication +2. **RBAC**: Implement proper role-based access control +3. **Secrets Management**: Use Kubernetes Secrets for sensitive data +4. **TLS**: Configure Ingress with SSL/TLS certificates + +## Migration from Docker Compose + +### Data Migration + +1. **Export Qdrant Data**: + ```bash + docker exec qdrant-db python -c " + import requests + response = requests.get('http://localhost:6333/collections/my-collection') + print(response.json()) + " + ``` + +2. **Import to Kubernetes**: + ```bash + # Copy data to Kubernetes PVC + kubectl cp qdrant-backup.json qdrant-0:/qdrant/storage/ -n context-engine + ``` + +### Configuration Migration + +The Kubernetes deployment maintains the same environment variables as Docker Compose. Most settings should work without changes. + +## Production Deployment Checklist + +- [ ] Use LoadBalancer services instead of NodePort +- [ ] Configure proper Ingress with SSL certificates +- [ ] Set up monitoring and logging (Prometheus, Grafana) +- [ ] Implement backup strategy for Qdrant data +- [ ] Configure resource limits and requests appropriately +- [ ] Set up horizontal pod autoscaling +- [ ] Implement security policies and network segmentation +- [ ] Configure proper secrets management +- [ ] Set up CI/CD pipeline for automated deployments + +## Support + +For issues with Kubernetes deployment: + +1. Check the service logs: `kubectl logs -f <service> -n context-engine` +2. Verify resource usage: `kubectl top pods -n context-engine` +3. Check events: `kubectl get events -n context-engine --sort-by=.metadata.creationTimestamp` + +For application issues, refer to the main project documentation.
\ No newline at end of file diff --git a/deploy/kubernetes/cleanup.sh b/deploy/kubernetes/cleanup.sh new file mode 100755 index 00000000..6aae38b4 --- /dev/null +++ b/deploy/kubernetes/cleanup.sh @@ -0,0 +1,239 @@ +#!/bin/bash + +# Context-Engine Kubernetes Cleanup Script +# This script removes all Context-Engine resources from Kubernetes + +set -e + +# Configuration +NAMESPACE="context-engine" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Logging functions +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Check if kubectl is available +check_kubectl() { + if ! command -v kubectl &> /dev/null; then + log_error "kubectl is not installed or not in PATH" + exit 1 + fi + + if ! kubectl cluster-info &> /dev/null; then + log_error "Cannot connect to Kubernetes cluster" + exit 1 + fi + + log_success "Kubernetes connection verified" +} + +# Check if namespace exists +check_namespace() { + if ! 
kubectl get namespace $NAMESPACE &> /dev/null; then + log_warning "Namespace $NAMESPACE does not exist" + return 1 + fi + return 0 +} + +# Show what will be deleted +show_deletion_plan() { + log_info "The following resources will be deleted:" + echo + + # Show current resources + echo "Pods:" + kubectl get pods -n $NAMESPACE 2>/dev/null || echo " No pods found" + echo + echo "Services:" + kubectl get services -n $NAMESPACE 2>/dev/null || echo " No services found" + echo + echo "Deployments:" + kubectl get deployments -n $NAMESPACE 2>/dev/null || echo " No deployments found" + echo + echo "StatefulSets:" + kubectl get statefulsets -n $NAMESPACE 2>/dev/null || echo " No statefulsets found" + echo + echo "Jobs:" + kubectl get jobs -n $NAMESPACE 2>/dev/null || echo " No jobs found" + echo + echo "PersistentVolumeClaims:" + kubectl get pvc -n $NAMESPACE 2>/dev/null || echo " No PVCs found" + echo + echo "ConfigMaps:" + kubectl get configmaps -n $NAMESPACE 2>/dev/null || echo " No configmaps found" + echo + if kubectl get ingress -n $NAMESPACE &> /dev/null; then + echo "Ingress:" + kubectl get ingress -n $NAMESPACE + echo + fi + + log_warning "This will permanently delete all data in Qdrant and any other persistent storage!" +} + +# Delete namespace and all resources +delete_namespace() { + log_info "Deleting namespace: $NAMESPACE" + kubectl delete namespace $NAMESPACE --ignore-not-found=true + log_success "Namespace deleted" +} + +# Wait for namespace deletion +wait_for_deletion() { + log_info "Waiting for namespace deletion to complete..." + + local timeout=60 + local count=0 + + while kubectl get namespace $NAMESPACE &> /dev/null; do + if [[ $count -ge $timeout ]]; then + log_warning "Namespace deletion is taking longer than expected" + log_info "You may need to manually delete remaining resources" + return 1 + fi + + echo -n "." 
+ sleep 1 + ((count++)) + done + + echo + log_success "Namespace deletion completed" +} + +# Force delete if needed +force_delete() { + log_warning "Attempting to force delete remaining resources..." + + # Force delete any remaining pods + kubectl delete pods --all -n $NAMESPACE --grace-period=0 --force 2>/dev/null || true + + # Force delete any remaining PVCs + kubectl delete pvc --all -n $NAMESPACE --grace-period=0 --force 2>/dev/null || true + + log_success "Force delete completed" +} + +# Verify cleanup +verify_cleanup() { + log_info "Verifying cleanup..." + + if kubectl get namespace $NAMESPACE &> /dev/null; then + log_error "Namespace $NAMESPACE still exists" + return 1 + fi + + log_success "Cleanup completed successfully" +} + +# Main cleanup function +main() { + log_info "Starting Context-Engine Kubernetes cleanup" + + # Check prerequisites + check_kubectl + + # Check if namespace exists + if ! check_namespace; then + log_success "Nothing to clean up - namespace $NAMESPACE does not exist" + exit 0 + fi + + # Show what will be deleted + show_deletion_plan + + # Ask for confirmation + echo + read -p "Are you sure you want to delete all Context-Engine resources? (yes/no): " -r + if [[ ! $REPLY =~ ^[Yy][Ee][Ss]$ ]]; then + log_info "Cleanup cancelled" + exit 0 + fi + + # Delete namespace + delete_namespace + + # Wait for deletion + if ! wait_for_deletion; then + log_warning "Standard deletion incomplete, attempting force delete..." + force_delete + fi + + # Verify cleanup + verify_cleanup + + log_success "Context-Engine cleanup completed!" 
+} + +# Help function +show_help() { + echo "Context-Engine Kubernetes Cleanup Script" + echo + echo "Usage: $0 [OPTIONS]" + echo + echo "Options:" + echo " -h, --help Show this help message" + echo " -n, --namespace NAMESPACE Kubernetes namespace (default: context-engine)" + echo " -f, --force Skip confirmation prompt" + echo + echo "Environment variables:" + echo " NAMESPACE=context-engine Kubernetes namespace" + echo + echo "Examples:" + echo " $0 # Interactive cleanup with confirmation" + echo " $0 --force # Cleanup without confirmation" + echo " $0 -n my-namespace # Cleanup different namespace" +} + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + show_help + exit 0 + ;; + -n|--namespace) + NAMESPACE="$2" + shift 2 + ;; + -f|--force) + FORCE=true + shift + ;; + *) + log_error "Unknown option: $1" + show_help + exit 1 + ;; + esac +done + +# Check if we're in the right directory +if [[ ! -f "qdrant.yaml" ]]; then + log_error "Please run this script from the deploy/kubernetes directory" + exit 1 +fi + +# Run main cleanup +main \ No newline at end of file diff --git a/deploy/kubernetes/configmap.yaml b/deploy/kubernetes/configmap.yaml new file mode 100644 index 00000000..1d2b5802 --- /dev/null +++ b/deploy/kubernetes/configmap.yaml @@ -0,0 +1,119 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: context-engine-config + namespace: context-engine + labels: + app: context-engine + component: configuration +data: + # Default collection used by the MCP server (auto-created if missing) + COLLECTION_NAME: "my-collection" + + # Embedding settings (FastEmbed model) + EMBEDDING_MODEL: "BAAI/bge-base-en-v1.5" + EMBEDDING_PROVIDER: "fastembed" + + # FastMCP server settings (SSE transport) + FASTMCP_HOST: "0.0.0.0" + FASTMCP_PORT: "8000" + FASTMCP_INDEXER_PORT: "8001" + + # Optional: customize tool descriptions + TOOL_STORE_DESCRIPTION: "Store reusable code snippets for later retrieval. 
The 'information' is a clear NL description; include the actual code in 'metadata.code' and add 'metadata.language' (e.g., python, typescript) and 'metadata.path' when known. Use this whenever you generate or refine a code snippet." + TOOL_FIND_DESCRIPTION: "Search for relevant code snippets using multiple phrasings of the query (multi-query). Prefer results where metadata.language matches the target file and metadata.path is relevant. You may pass optional filters (language, path_prefix, kind) which the server applies server-side. Include 'metadata.code', 'metadata.path', and 'metadata.language' in responses." + + # Reranker toggles and tuning + RERANKER_ENABLED: "1" + RERANKER_TOPN: "100" + RERANKER_RETURN_M: "20" + RERANKER_TIMEOUT_MS: "3000" + RERANK_TIMEOUT_FLOOR_MS: "1000" + + # Optional warmups (disabled by default) + EMBEDDING_WARMUP: "0" + RERANK_WARMUP: "0" + + # In-process execution (faster; falls back to subprocess on failure) + HYBRID_IN_PROCESS: "1" + RERANK_IN_PROCESS: "1" + + # Tree-sitter parsing (enable for more accurate symbols/scopes) + USE_TREE_SITTER: "1" + + # Hybrid/rerank quick-win defaults (can override via flags) + HYBRID_EXPAND: "1" + HYBRID_PER_PATH: "1" + HYBRID_SYMBOL_BOOST: "0.35" + HYBRID_RECENCY_WEIGHT: "0.1" + RERANK_EXPAND: "1" + + # Disable semantic chunking to use micro-chunking instead + INDEX_SEMANTIC_CHUNKS: "0" + + # Memory integration (SSE + Qdrant) + MEMORY_SSE_ENABLED: "true" + MEMORY_MCP_URL: "http://mcp:8000/sse" + MEMORY_MCP_TIMEOUT: "6" + + # Local LLM expansion via Ollama (mini model) + LLM_PROVIDER: "ollama" + OLLAMA_HOST: "http://ollama:11434" + LLM_EXPAND_MODEL: "phi3:mini" + LLM_EXPAND_MAX: "4" + PRF_ENABLED: "1" + + # ReFRAG mode: compact gating + micro-chunking + REFRAG_MODE: "1" + MINI_VECTOR_NAME: "mini" + MINI_VEC_DIM: "64" + MINI_VEC_SEED: "1337" + HYBRID_MINI_WEIGHT: "1.0" + + # Micro-chunking controls (token-based) + INDEX_MICRO_CHUNKS: "1" + MICRO_CHUNK_TOKENS: "8" + MICRO_CHUNK_STRIDE: "4" + 
REFRAG_GATE_FIRST: "1" + REFRAG_CANDIDATES: "200" + + # Output shaping for micro spans + MICRO_OUT_MAX_SPANS: "3" + MICRO_MERGE_LINES: "4" + MICRO_BUDGET_TOKENS: "1500" + MICRO_TOKENS_PER_LINE: "32" + + # Answer shaping (enforce concise responses from context_answer) + CTX_SUMMARY_CHARS: "0" + + # Decoder-path ReFRAG (feature-flagged; off by default) + REFRAG_DECODER: "1" + REFRAG_RUNTIME: "llamacpp" + REFRAG_ENCODER_MODEL: "BAAI/bge-base-en-v1.5" + REFRAG_PHI_PATH: "/work/models/refrag_phi_768_to_dmodel.bin" + REFRAG_SENSE: "heuristic" + + # Llama.cpp sidecar (optional) + LLAMACPP_URL: "http://llamacpp:8080" + LLAMACPP_TIMEOUT_SEC: "120" + DECODER_MAX_TOKENS: "150" + REFRAG_DECODER_MODE: "prompt" + REFRAG_SOFT_SCALE: "1.0" + + # Operational safeguards and timeouts + MAX_MICRO_CHUNKS_PER_FILE: "200" + QDRANT_TIMEOUT: "20" + MEMORY_AUTODETECT: "1" + MEMORY_COLLECTION_TTL_SECS: "300" + + # Duplicate Streamable HTTP MCP instances + FASTMCP_HTTP_TRANSPORT: "http" + FASTMCP_HTTP_PORT: "8002" + FASTMCP_HTTP_HEALTH_PORT: "18002" + FASTMCP_INDEXER_HTTP_PORT: "8003" + FASTMCP_INDEXER_HTTP_HEALTH_PORT: "18003" + + # Watcher-specific settings + WATCH_DEBOUNCE_SECS: "1.5" + INDEX_UPSERT_BATCH: "128" + INDEX_UPSERT_RETRIES: "5" \ No newline at end of file diff --git a/deploy/kubernetes/deploy.sh b/deploy/kubernetes/deploy.sh new file mode 100755 index 00000000..5dc366dc --- /dev/null +++ b/deploy/kubernetes/deploy.sh @@ -0,0 +1,300 @@ +#!/bin/bash + +# Context-Engine Kubernetes Deployment Script +# This script deploys Context-Engine services to Kubernetes + +set -e + +# Configuration +NAMESPACE="context-engine" +IMAGE_REGISTRY="context-engine" # Change to your registry +IMAGE_TAG="latest" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Logging functions +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e 
"${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Check if kubectl is available +check_kubectl() { + if ! command -v kubectl &> /dev/null; then + log_error "kubectl is not installed or not in PATH" + exit 1 + fi + + if ! kubectl cluster-info &> /dev/null; then + log_error "Cannot connect to Kubernetes cluster" + exit 1 + fi + + log_success "Kubernetes connection verified" +} + +# Create namespace if it doesn't exist +create_namespace() { + log_info "Creating namespace: $NAMESPACE" + kubectl apply -f namespace.yaml + log_success "Namespace created/verified" +} + +# Deploy configuration +deploy_config() { + log_info "Deploying configuration" + kubectl apply -f configmap.yaml + log_success "Configuration deployed" +} + +# Deploy core services +deploy_core() { + log_info "Deploying core services" + + # Deploy Qdrant + log_info "Deploying Qdrant database..." + kubectl apply -f qdrant.yaml + + # Wait for Qdrant to be ready + log_info "Waiting for Qdrant to be ready..." + kubectl wait --for=condition=ready pod -l component=qdrant -n $NAMESPACE --timeout=300s + + log_success "Core services deployed" +} + +# Deploy MCP servers +deploy_mcp_servers() { + log_info "Deploying MCP servers" + + # Deploy SSE versions + kubectl apply -f mcp-memory.yaml + kubectl apply -f mcp-indexer.yaml + + # Wait for MCP servers to be ready + log_info "Waiting for MCP servers to be ready..." 
+ kubectl wait --for=condition=ready pod -l component=mcp-memory -n $NAMESPACE --timeout=300s + kubectl wait --for=condition=ready pod -l component=mcp-indexer -n $NAMESPACE --timeout=300s + + log_success "MCP servers deployed" +} + +# Deploy HTTP servers (optional) +deploy_http_servers() { + log_info "Deploying HTTP servers (optional)" + kubectl apply -f mcp-http.yaml + + # Wait for HTTP servers to be ready + kubectl wait --for=condition=ready pod -l component=mcp-memory-http -n $NAMESPACE --timeout=300s + kubectl wait --for=condition=ready pod -l component=mcp-indexer-http -n $NAMESPACE --timeout=300s + + log_success "HTTP servers deployed" +} + +# Deploy indexer services +deploy_indexer_services() { + log_info "Deploying indexer services" + kubectl apply -f indexer-services.yaml + + log_success "Indexer services deployed" +} + +# Deploy optional Llama.cpp service +deploy_llamacpp() { + if [[ "$SKIP_LLAMACPP" != "true" ]]; then + log_info "Deploying Llama.cpp service (optional)" + kubectl apply -f llamacpp.yaml + log_success "Llama.cpp service deployed" + else + log_warning "Skipping Llama.cpp deployment" + fi +} + +# Deploy Ingress (optional) +deploy_ingress() { + if [[ "$DEPLOY_INGRESS" == "true" ]]; then + log_info "Deploying Ingress" + kubectl apply -f ingress.yaml + log_success "Ingress deployed" + else + log_warning "Skipping Ingress deployment (set DEPLOY_INGRESS=true to enable)" + fi +} + +# Update image references in manifests +update_images() { + if [[ "$IMAGE_REGISTRY" != "context-engine" ]]; then + log_info "Updating image references to: $IMAGE_REGISTRY/context-engine:$IMAGE_TAG" + + # Update all YAML files + for file in *.yaml; do + if [[ "$file" != "namespace.yaml" && "$file" != "configmap.yaml" ]]; then + sed -i.tmp "s|context-engine:latest|$IMAGE_REGISTRY/context-engine:$IMAGE_TAG|g" "$file" + rm -f "$file.tmp" + fi + done + + log_success "Image references updated" + fi +} + +# Show deployment status +show_status() { + log_info "Deployment 
status:" + echo + echo "Namespace: $NAMESPACE" + echo + echo "Pods:" + kubectl get pods -n $NAMESPACE -o wide + echo + echo "Services:" + kubectl get services -n $NAMESPACE + echo + echo "Persistent Volumes:" + kubectl get pvc -n $NAMESPACE + echo + + if [[ "$SHOW_INGRESS" == "true" ]]; then + echo "Ingress:" + kubectl get ingress -n $NAMESPACE + echo + fi + + log_success "Deployment complete!" + echo + log_info "Access URLs:" + echo " Qdrant: http://:30333" + echo " MCP Memory (SSE): http://:30800" + echo " MCP Memory (HTTP): http://:30804" + echo " MCP Indexer (SSE): http://:30802" + echo " MCP Indexer (HTTP): http://:30806" + if [[ "$SKIP_LLAMACPP" != "true" ]]; then + echo " Llama.cpp: http://:30808" + fi +} + +# Cleanup function +cleanup() { + log_warning "Cleaning up failed deployment..." + kubectl delete namespace $NAMESPACE --ignore-not-found=true +} + +# Main deployment function +main() { + log_info "Starting Context-Engine Kubernetes deployment" + + # Set up error handling + trap cleanup ERR + + # Check prerequisites + check_kubectl + + # Update image references if needed + update_images + + # Deploy in order + create_namespace + deploy_config + deploy_core + deploy_mcp_servers + deploy_http_servers + deploy_indexer_services + deploy_llamacpp + deploy_ingress + + # Show status + show_status +} + +# Help function +show_help() { + echo "Context-Engine Kubernetes Deployment Script" + echo + echo "Usage: $0 [OPTIONS]" + echo + echo "Options:" + echo " -h, --help Show this help message" + echo " -r, --registry REGISTRY Docker image registry (default: context-engine)" + echo " -t, --tag TAG Docker image tag (default: latest)" + echo " --skip-llamacpp Skip Llama.cpp deployment" + echo " --deploy-ingress Deploy Ingress configuration" + echo " --show-ingress Show Ingress status" + echo " --namespace NAMESPACE Kubernetes namespace (default: context-engine)" + echo + echo "Environment variables:" + echo " SKIP_LLAMACPP=true Skip Llama.cpp deployment" + echo " 
DEPLOY_INGRESS=true Deploy Ingress configuration" + echo " SHOW_INGRESS=true Show Ingress status" + echo " IMAGE_REGISTRY=registry Docker image registry" + echo " IMAGE_TAG=tag Docker image tag" + echo " NAMESPACE=context-engine Kubernetes namespace" + echo + echo "Examples:" + echo " $0 # Basic deployment" + echo " $0 --skip-llamacpp # Skip Llama.cpp" + echo " $0 --deploy-ingress # Deploy with Ingress" + echo " $0 -r myregistry.com -t v1.0 # Use custom image" +} + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + show_help + exit 0 + ;; + -r|--registry) + IMAGE_REGISTRY="$2" + shift 2 + ;; + -t|--tag) + IMAGE_TAG="$2" + shift 2 + ;; + --skip-llamacpp) + SKIP_LLAMACPP=true + shift + ;; + --deploy-ingress) + DEPLOY_INGRESS=true + shift + ;; + --show-ingress) + SHOW_INGRESS=true + shift + ;; + --namespace) + NAMESPACE="$2" + shift 2 + ;; + *) + log_error "Unknown option: $1" + show_help + exit 1 + ;; + esac +done + +# Check if we're in the right directory +if [[ ! 
-f "qdrant.yaml" ]]; then + log_error "Please run this script from the deploy/kubernetes directory" + exit 1 +fi + +# Run main deployment +main \ No newline at end of file diff --git a/deploy/kubernetes/indexer-services.yaml b/deploy/kubernetes/indexer-services.yaml new file mode 100644 index 00000000..69363a1a --- /dev/null +++ b/deploy/kubernetes/indexer-services.yaml @@ -0,0 +1,203 @@ +--- +# Indexer Job (One-shot code indexing) +apiVersion: batch/v1 +kind: Job +metadata: + name: indexer-job + namespace: context-engine + labels: + app: context-engine + component: indexer +spec: + template: + metadata: + labels: + app: context-engine + component: indexer + spec: + restartPolicy: OnFailure + containers: + - name: indexer + image: context-engine:latest # Replace with actual image after building + imagePullPolicy: IfNotPresent + command: ["python", "/app/scripts/ingest_code.py"] + workingDir: /work + env: + - name: QDRANT_URL + value: "http://qdrant:6333" + - name: COLLECTION_NAME + valueFrom: + configMapKeyRef: + name: context-engine-config + key: COLLECTION_NAME + - name: EMBEDDING_MODEL + valueFrom: + configMapKeyRef: + name: context-engine-config + key: EMBEDDING_MODEL + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "4Gi" + cpu: "2000m" + volumeMounts: + - name: work-volume + mountPath: /work + readOnly: true + - name: codebase-volume + mountPath: /work/.codebase + volumes: + - name: work-volume + hostPath: + path: /tmp/context-engine-work # Adjust for your environment + type: DirectoryOrCreate + - name: codebase-volume + hostPath: + path: /tmp/context-engine-work/.codebase # Adjust for your environment + type: DirectoryOrCreate + +--- +# Watcher Service (File change monitoring and reindexing) +apiVersion: apps/v1 +kind: Deployment +metadata: + name: watcher + namespace: context-engine + labels: + app: context-engine + component: watcher +spec: + replicas: 1 + selector: + matchLabels: + app: context-engine + component: watcher + 
template: + metadata: + labels: + app: context-engine + component: watcher + spec: + containers: + - name: watcher + image: context-engine:latest # Replace with actual image after building + imagePullPolicy: IfNotPresent + command: ["python", "/app/scripts/watch_index.py"] + workingDir: /work + env: + - name: QDRANT_URL + value: "http://qdrant:6333" + - name: COLLECTION_NAME + valueFrom: + configMapKeyRef: + name: context-engine-config + key: COLLECTION_NAME + - name: EMBEDDING_MODEL + valueFrom: + configMapKeyRef: + name: context-engine-config + key: EMBEDDING_MODEL + - name: WATCH_ROOT + value: "/work" + # Watcher-specific settings with safer defaults + - name: QDRANT_TIMEOUT + value: "60" + - name: MAX_MICRO_CHUNKS_PER_FILE + valueFrom: + configMapKeyRef: + name: context-engine-config + key: MAX_MICRO_CHUNKS_PER_FILE + - name: INDEX_UPSERT_BATCH + valueFrom: + configMapKeyRef: + name: context-engine-config + key: INDEX_UPSERT_BATCH + - name: INDEX_UPSERT_RETRIES + valueFrom: + configMapKeyRef: + name: context-engine-config + key: INDEX_UPSERT_RETRIES + - name: WATCH_DEBOUNCE_SECS + valueFrom: + configMapKeyRef: + name: context-engine-config + key: WATCH_DEBOUNCE_SECS + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "2Gi" + cpu: "1000m" + volumeMounts: + - name: work-volume + mountPath: /work + readOnly: true + - name: codebase-volume + mountPath: /work/.codebase + volumes: + - name: work-volume + hostPath: + path: /tmp/context-engine-work # Adjust for your environment + type: DirectoryOrCreate + - name: codebase-volume + hostPath: + path: /tmp/context-engine-work/.codebase # Adjust for your environment + type: DirectoryOrCreate + +--- +# Index Initialization Job +apiVersion: batch/v1 +kind: Job +metadata: + name: init-payload + namespace: context-engine + labels: + app: context-engine + component: init +spec: + template: + metadata: + labels: + app: context-engine + component: init + spec: + restartPolicy: OnFailure + containers: + - 
name: init-payload + image: context-engine:latest # Replace with actual image after building + imagePullPolicy: IfNotPresent + command: ["python", "/app/scripts/create_indexes.py"] + workingDir: /work + env: + - name: QDRANT_URL + value: "http://qdrant:6333" + - name: COLLECTION_NAME + valueFrom: + configMapKeyRef: + name: context-engine-config + key: COLLECTION_NAME + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "1Gi" + cpu: "500m" + volumeMounts: + - name: work-volume + mountPath: /work + readOnly: true + - name: codebase-volume + mountPath: /work/.codebase + volumes: + - name: work-volume + hostPath: + path: /tmp/context-engine-work # Adjust for your environment + type: DirectoryOrCreate + - name: codebase-volume + hostPath: + path: /tmp/context-engine-work/.codebase # Adjust for your environment + type: DirectoryOrCreate \ No newline at end of file diff --git a/deploy/kubernetes/ingress.yaml b/deploy/kubernetes/ingress.yaml new file mode 100644 index 00000000..d8b21cd0 --- /dev/null +++ b/deploy/kubernetes/ingress.yaml @@ -0,0 +1,75 @@ +--- +# Optional Ingress for external access via domain +# Note: Requires an Ingress controller (e.g., nginx-ingress, traefik) +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: context-engine-ingress + namespace: context-engine + annotations: + # nginx-ingress annotations (adjust based on your ingress controller) + nginx.ingress.kubernetes.io/rewrite-target: / + nginx.ingress.kubernetes.io/ssl-redirect: "false" + # Enable large request bodies for embeddings + nginx.ingress.kubernetes.io/proxy-body-size: "100m" + # WebSocket support for SSE + nginx.ingress.kubernetes.io/proxy-read-timeout: "3600" + nginx.ingress.kubernetes.io/proxy-send-timeout: "3600" + labels: + app: context-engine + component: ingress +spec: + ingressClassName: nginx # Adjust based on your ingress controller + rules: + # Replace with your actual domain + - host: context-engine.local # Adjust to your domain + http: 
+ paths: + # Qdrant HTTP API + - path: /qdrant + pathType: Prefix + backend: + service: + name: qdrant + port: + number: 6333 + # MCP Memory Server (SSE) + - path: /memory-sse + pathType: Prefix + backend: + service: + name: mcp-memory + port: + number: 8000 + # MCP Memory Server (HTTP) + - path: /memory-http + pathType: Prefix + backend: + service: + name: mcp-memory-http + port: + number: 8002 + # MCP Indexer Server (SSE) + - path: /indexer-sse + pathType: Prefix + backend: + service: + name: mcp-indexer + port: + number: 8001 + # MCP Indexer Server (HTTP) + - path: /indexer-http + pathType: Prefix + backend: + service: + name: mcp-indexer-http + port: + number: 8003 + # Optional Llama.cpp + - path: /llamacpp + pathType: Prefix + backend: + service: + name: llamacpp + port: + number: 8080 \ No newline at end of file diff --git a/deploy/kubernetes/kustomization.yaml b/deploy/kubernetes/kustomization.yaml new file mode 100644 index 00000000..315ba40e --- /dev/null +++ b/deploy/kubernetes/kustomization.yaml @@ -0,0 +1,87 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +metadata: + name: context-engine + namespace: context-engine + +resources: + # Namespace and configuration + - namespace.yaml + - configmap.yaml + + # Core services + - qdrant.yaml + + # MCP servers + - mcp-memory.yaml + - mcp-indexer.yaml + - mcp-http.yaml + + # Indexer services + - indexer-services.yaml + + # Optional services + - llamacpp.yaml + - ingress.yaml + +# Common labels +commonLabels: + app.kubernetes.io/name: context-engine + app.kubernetes.io/component: kubernetes-deployment + app.kubernetes.io/managed-by: kustomize + +# Patches for production customization +patchesStrategicMerge: + # Uncomment and create patches for production + # - patches/production-storage.yaml + # - patches/production-resources.yaml + # - patches/production-ingress.yaml + +# ConfigMap generator (optional - for overrides) +configMapGenerator: + - name: context-engine-overrides + literals: + # 
Override specific values here + # COLLECTION_NAME=production-collection + # EMBEDDING_MODEL=BAAI/bge-large-en-v1.5 + +# Secret generator (optional - for sensitive data) +secretGenerator: + - name: context-engine-secrets + literals: + # Add secrets here (recommended to use existing secrets instead) + # QDRANT_API_KEY=your-api-key + # GEMINI_API_KEY=your-gemini-key + +# Images configuration (customize for your registry) +images: + - name: context-engine + newTag: latest + # newTag: v1.0.0 + # newName: your-registry/context-engine + +# Namespace override +namespace: context-engine + +# Replicas configuration +replicas: + # Scale MCP servers for high availability + - name: mcp-memory + count: 1 # Set to 2+ for production + - name: mcp-indexer + count: 1 # Set to 2+ for production + +# Resource patches +patches: + # Example resource customization + - patch: |- + - op: replace + path: /spec/resources/requests/memory + value: "1Gi" + - op: replace + path: /spec/resources/limits/memory + value: "4Gi" + target: + kind: Deployment + name: mcp-memory \ No newline at end of file diff --git a/deploy/kubernetes/llamacpp.yaml b/deploy/kubernetes/llamacpp.yaml new file mode 100644 index 00000000..44227c57 --- /dev/null +++ b/deploy/kubernetes/llamacpp.yaml @@ -0,0 +1,102 @@ +--- +# Optional Llama.cpp Service (Text Generation) +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llamacpp + namespace: context-engine + labels: + app: context-engine + component: llamacpp +spec: + replicas: 1 # Set to 0 if not needed + selector: + matchLabels: + app: context-engine + component: llamacpp + template: + metadata: + labels: + app: context-engine + component: llamacpp + spec: + containers: + - name: llamacpp + image: ghcr.io/ggerganov/llama.cpp:server + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 8080 + protocol: TCP + env: + - name: LLAMA_ARG_MODEL + value: "/models/model.gguf" + - name: LLAMA_ARG_CTX_SIZE + value: "8192" + - name: LLAMA_ARG_HOST + value: 
"0.0.0.0" + - name: LLAMA_ARG_PORT + value: "8080" + command: ["llama-server"] + args: + - "--model" + - "/models/model.gguf" + - "--host" + - "0.0.0.0" + - "--port" + - "8080" + - "--ctx-size" + - "8192" + - "--no-warmup" + resources: + requests: + memory: "2Gi" + cpu: "1000m" + limits: + memory: "8Gi" + cpu: "4000m" + volumeMounts: + - name: models-volume + mountPath: /models + readOnly: true + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 60 + periodSeconds: 30 + timeoutSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + volumes: + - name: models-volume + hostPath: + path: /tmp/context-engine-models # Adjust for your environment + type: DirectoryOrCreate + +--- +# Llama.cpp Service +apiVersion: v1 +kind: Service +metadata: + name: llamacpp + namespace: context-engine + labels: + app: context-engine + component: llamacpp +spec: + type: NodePort # Change to LoadBalancer for external access + ports: + - name: http + port: 8080 + targetPort: http + nodePort: 30808 # Optional: specify node port + protocol: TCP + selector: + app: context-engine + component: llamacpp \ No newline at end of file diff --git a/deploy/kubernetes/mcp-http.yaml b/deploy/kubernetes/mcp-http.yaml new file mode 100644 index 00000000..fb15715b --- /dev/null +++ b/deploy/kubernetes/mcp-http.yaml @@ -0,0 +1,245 @@ +--- +# MCP Memory Server (HTTP) Deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mcp-memory-http + namespace: context-engine + labels: + app: context-engine + component: mcp-memory-http +spec: + replicas: 1 + selector: + matchLabels: + app: context-engine + component: mcp-memory-http + template: + metadata: + labels: + app: context-engine + component: mcp-memory-http + spec: + containers: + - name: mcp-memory-http + image: context-engine:latest # Replace with actual image after building + imagePullPolicy: IfNotPresent + command: ["python", "-m", 
"mcp.server.fastmcp"] + args: ["--server-name", "context-engine-http", "--host", "0.0.0.0", "--port", "8000", "--transport", "http", "/app/scripts/memory_server.py"] + ports: + - name: http + containerPort: 8000 + protocol: TCP + - name: health + containerPort: 18000 + protocol: TCP + env: + - name: QDRANT_URL + value: "http://qdrant:6333" + - name: FASTMCP_HOST + valueFrom: + configMapKeyRef: + name: context-engine-config + key: FASTMCP_HOST + - name: FASTMCP_PORT + value: "8000" + - name: FASTMCP_TRANSPORT + valueFrom: + configMapKeyRef: + name: context-engine-config + key: FASTMCP_HTTP_TRANSPORT + - name: COLLECTION_NAME + valueFrom: + configMapKeyRef: + name: context-engine-config + key: COLLECTION_NAME + - name: EMBEDDING_MODEL + valueFrom: + configMapKeyRef: + name: context-engine-config + key: EMBEDDING_MODEL + - name: EMBEDDING_PROVIDER + valueFrom: + configMapKeyRef: + name: context-engine-config + key: EMBEDDING_PROVIDER + - name: TOOL_STORE_DESCRIPTION + valueFrom: + configMapKeyRef: + name: context-engine-config + key: TOOL_STORE_DESCRIPTION + - name: TOOL_FIND_DESCRIPTION + valueFrom: + configMapKeyRef: + name: context-engine-config + key: TOOL_FIND_DESCRIPTION + - name: FASTMCP_HEALTH_PORT + value: "18000" + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "2Gi" + cpu: "1000m" + volumeMounts: + - name: work-volume + mountPath: /work + readOnly: true + livenessProbe: + httpGet: + path: /health + port: health + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: health + initialDelaySeconds: 10 + periodSeconds: 5 + volumes: + - name: work-volume + hostPath: + path: /tmp/context-engine-work # Adjust for your environment + type: DirectoryOrCreate + +--- +# MCP Memory Server (HTTP) Service +apiVersion: v1 +kind: Service +metadata: + name: mcp-memory-http + namespace: context-engine + labels: + app: context-engine + component: mcp-memory-http +spec: + type: NodePort # Change to 
LoadBalancer for external access + ports: + - name: http + port: 8002 + targetPort: http + nodePort: 30804 # Optional: specify node port + protocol: TCP + - name: health + port: 18002 + targetPort: health + nodePort: 30805 # Optional: specify node port + protocol: TCP + selector: + app: context-engine + component: mcp-memory-http + +--- +# MCP Indexer Server (HTTP) Deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mcp-indexer-http + namespace: context-engine + labels: + app: context-engine + component: mcp-indexer-http +spec: + replicas: 1 + selector: + matchLabels: + app: context-engine + component: mcp-indexer-http + template: + metadata: + labels: + app: context-engine + component: mcp-indexer-http + spec: + containers: + - name: mcp-indexer-http + image: context-engine:latest # Replace with actual image after building + imagePullPolicy: IfNotPresent + command: ["python", "-m", "mcp.server.fastmcp"] + args: ["--server-name", "context-engine-indexer-http", "--host", "0.0.0.0", "--port", "8001", "--transport", "http", "/app/scripts/indexer_server.py"] + ports: + - name: http + containerPort: 8001 + protocol: TCP + - name: health + containerPort: 18001 + protocol: TCP + env: + - name: QDRANT_URL + value: "http://qdrant:6333" + - name: FASTMCP_HOST + valueFrom: + configMapKeyRef: + name: context-engine-config + key: FASTMCP_HOST + - name: FASTMCP_INDEXER_PORT + value: "8001" + - name: FASTMCP_TRANSPORT + valueFrom: + configMapKeyRef: + name: context-engine-config + key: FASTMCP_HTTP_TRANSPORT + - name: FASTMCP_HEALTH_PORT + value: "18001" + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "2Gi" + cpu: "1000m" + volumeMounts: + - name: work-volume + mountPath: /work + - name: codebase-volume + mountPath: /work/.codebase + livenessProbe: + httpGet: + path: /health + port: health + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: health + initialDelaySeconds: 10 + 
periodSeconds: 5 + volumes: + - name: work-volume + hostPath: + path: /tmp/context-engine-work # Adjust for your environment + type: DirectoryOrCreate + - name: codebase-volume + hostPath: + path: /tmp/context-engine-work/.codebase # Adjust for your environment + type: DirectoryOrCreate + +--- +# MCP Indexer Server (HTTP) Service +apiVersion: v1 +kind: Service +metadata: + name: mcp-indexer-http + namespace: context-engine + labels: + app: context-engine + component: mcp-indexer-http +spec: + type: NodePort # Change to LoadBalancer for external access + ports: + - name: http + port: 8003 + targetPort: http + nodePort: 30806 # Optional: specify node port + protocol: TCP + - name: health + port: 18003 + targetPort: health + nodePort: 30807 # Optional: specify node port + protocol: TCP + selector: + app: context-engine + component: mcp-indexer-http \ No newline at end of file diff --git a/deploy/kubernetes/mcp-indexer.yaml b/deploy/kubernetes/mcp-indexer.yaml new file mode 100644 index 00000000..59d2cf5e --- /dev/null +++ b/deploy/kubernetes/mcp-indexer.yaml @@ -0,0 +1,110 @@ +--- +# MCP Indexer Server (SSE) Deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mcp-indexer + namespace: context-engine + labels: + app: context-engine + component: mcp-indexer +spec: + replicas: 1 + selector: + matchLabels: + app: context-engine + component: mcp-indexer + template: + metadata: + labels: + app: context-engine + component: mcp-indexer + spec: + containers: + - name: mcp-indexer + image: context-engine:latest # Replace with actual image after building + imagePullPolicy: IfNotPresent + command: ["python", "-m", "mcp.server.fastmcp"] + args: ["--server-name", "context-engine-indexer", "--host", "0.0.0.0", "--port", "8001", "--transport", "sse", "/app/scripts/indexer_server.py"] + ports: + - name: sse + containerPort: 8001 + protocol: TCP + - name: health + containerPort: 18001 + protocol: TCP + env: + - name: QDRANT_URL + value: "http://qdrant:6333" + - name: 
FASTMCP_HOST + valueFrom: + configMapKeyRef: + name: context-engine-config + key: FASTMCP_HOST + - name: FASTMCP_INDEXER_PORT + valueFrom: + configMapKeyRef: + name: context-engine-config + key: FASTMCP_INDEXER_PORT + - name: FASTMCP_HEALTH_PORT + value: "18001" + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "2Gi" + cpu: "1000m" + volumeMounts: + - name: work-volume + mountPath: /work + - name: codebase-volume + mountPath: /work/.codebase + livenessProbe: + httpGet: + path: /health + port: health + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: health + initialDelaySeconds: 10 + periodSeconds: 5 + volumes: + - name: work-volume + hostPath: + path: /tmp/context-engine-work # Adjust for your environment + type: DirectoryOrCreate + - name: codebase-volume + hostPath: + path: /tmp/context-engine-work/.codebase # Adjust for your environment + type: DirectoryOrCreate + +--- +# MCP Indexer Server (SSE) Service +apiVersion: v1 +kind: Service +metadata: + name: mcp-indexer + namespace: context-engine + labels: + app: context-engine + component: mcp-indexer +spec: + type: NodePort # Change to LoadBalancer for external access + ports: + - name: sse + port: 8001 + targetPort: sse + nodePort: 30802 # Optional: specify node port + protocol: TCP + - name: health + port: 18001 + targetPort: health + nodePort: 30803 # Optional: specify node port + protocol: TCP + selector: + app: context-engine + component: mcp-indexer \ No newline at end of file diff --git a/deploy/kubernetes/mcp-memory.yaml b/deploy/kubernetes/mcp-memory.yaml new file mode 100644 index 00000000..79e40980 --- /dev/null +++ b/deploy/kubernetes/mcp-memory.yaml @@ -0,0 +1,146 @@ +--- +# MCP Memory Server (SSE) Deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mcp-memory + namespace: context-engine + labels: + app: context-engine + component: mcp-memory +spec: + replicas: 1 + selector: + matchLabels: + app: 
context-engine + component: mcp-memory + template: + metadata: + labels: + app: context-engine + component: mcp-memory + spec: + containers: + - name: mcp-memory + image: context-engine:latest # Replace with actual image after building + imagePullPolicy: IfNotPresent + command: ["python", "-m", "mcp.server.fastmcp"] + args: ["--server-name", "context-engine", "--host", "0.0.0.0", "--port", "8000", "--transport", "sse", "/app/scripts/memory_server.py"] + ports: + - name: sse + containerPort: 8000 + protocol: TCP + - name: health + containerPort: 18000 + protocol: TCP + env: + - name: QDRANT_URL + value: "http://qdrant:6333" + - name: FASTMCP_HOST + valueFrom: + configMapKeyRef: + name: context-engine-config + key: FASTMCP_HOST + - name: FASTMCP_PORT + valueFrom: + configMapKeyRef: + name: context-engine-config + key: FASTMCP_PORT + - name: COLLECTION_NAME + valueFrom: + configMapKeyRef: + name: context-engine-config + key: COLLECTION_NAME + - name: EMBEDDING_MODEL + valueFrom: + configMapKeyRef: + name: context-engine-config + key: EMBEDDING_MODEL + - name: EMBEDDING_PROVIDER + valueFrom: + configMapKeyRef: + name: context-engine-config + key: EMBEDDING_PROVIDER + - name: TOOL_STORE_DESCRIPTION + valueFrom: + configMapKeyRef: + name: context-engine-config + key: TOOL_STORE_DESCRIPTION + - name: TOOL_FIND_DESCRIPTION + valueFrom: + configMapKeyRef: + name: context-engine-config + key: TOOL_FIND_DESCRIPTION + - name: FASTMCP_HEALTH_PORT + value: "18000" + # Additional environment variables from configmap + - name: MEMORY_SSE_ENABLED + valueFrom: + configMapKeyRef: + name: context-engine-config + key: MEMORY_SSE_ENABLED + - name: MEMORY_MCP_URL + valueFrom: + configMapKeyRef: + name: context-engine-config + key: MEMORY_MCP_URL + - name: MEMORY_MCP_TIMEOUT + valueFrom: + configMapKeyRef: + name: context-engine-config + key: MEMORY_MCP_TIMEOUT + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "2Gi" + cpu: "1000m" + volumeMounts: + - name: 
work-volume + mountPath: /work + readOnly: true + livenessProbe: + httpGet: + path: /health + port: health + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: health + initialDelaySeconds: 10 + periodSeconds: 5 + volumes: + - name: work-volume + hostPath: + path: /tmp/context-engine-work # Adjust for your environment + type: DirectoryOrCreate + +--- +# MCP Memory Server (SSE) Service +apiVersion: v1 +kind: Service +metadata: + name: mcp-memory + namespace: context-engine + labels: + app: context-engine + component: mcp-memory +spec: + type: NodePort # Change to LoadBalancer for external access + ports: + - name: sse + port: 8000 + targetPort: sse + nodePort: 30800 # Optional: specify node port + protocol: TCP + - name: health + port: 18000 + targetPort: health + nodePort: 30801 # Optional: specify node port + protocol: TCP + selector: + app: context-engine + component: mcp-memory \ No newline at end of file diff --git a/deploy/kubernetes/namespace.yaml b/deploy/kubernetes/namespace.yaml new file mode 100644 index 00000000..f64416dc --- /dev/null +++ b/deploy/kubernetes/namespace.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: context-engine + labels: + app: context-engine + component: infrastructure \ No newline at end of file diff --git a/deploy/kubernetes/qdrant.yaml b/deploy/kubernetes/qdrant.yaml new file mode 100644 index 00000000..ebd8bf09 --- /dev/null +++ b/deploy/kubernetes/qdrant.yaml @@ -0,0 +1,125 @@ +--- +# Qdrant StatefulSet +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: qdrant + namespace: context-engine + labels: + app: context-engine + component: qdrant +spec: + serviceName: qdrant + replicas: 1 + selector: + matchLabels: + app: context-engine + component: qdrant + template: + metadata: + labels: + app: context-engine + component: qdrant + spec: + containers: + - name: qdrant + image: qdrant/qdrant:latest + imagePullPolicy: IfNotPresent + ports: + - name: http + 
containerPort: 6333 + protocol: TCP + - name: grpc + containerPort: 6334 + protocol: TCP + env: + - name: QDRANT__SERVICE__HTTP_PORT + value: "6333" + - name: QDRANT__SERVICE__GRPC_PORT + value: "6334" + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "2Gi" + cpu: "1000m" + volumeMounts: + - name: qdrant-storage + mountPath: /qdrant/storage + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /ready + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + volumeClaimTemplates: + - metadata: + name: qdrant-storage + labels: + app: context-engine + component: qdrant + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: fast-ssd # Adjust based on your cluster + resources: + requests: + storage: 20Gi + +--- +# Qdrant Service +apiVersion: v1 +kind: Service +metadata: + name: qdrant + namespace: context-engine + labels: + app: context-engine + component: qdrant +spec: + type: ClusterIP + ports: + - name: http + port: 6333 + targetPort: http + protocol: TCP + - name: grpc + port: 6334 + targetPort: grpc + protocol: TCP + selector: + app: context-engine + component: qdrant + +--- +# Optional: Qdrant External Service (for direct access) +apiVersion: v1 +kind: Service +metadata: + name: qdrant-external + namespace: context-engine + labels: + app: context-engine + component: qdrant +spec: + type: NodePort # Change to LoadBalancer if your cluster supports it + ports: + - name: http + port: 6333 + targetPort: http + nodePort: 30333 # Optional: specify node port + protocol: TCP + - name: grpc + port: 6334 + targetPort: grpc + nodePort: 30334 # Optional: specify node port + protocol: TCP + selector: + app: context-engine + component: qdrant \ No newline at end of file From 277aa6c8dd9ef42be5ed4b9ebb69752801a7e729 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Sat, 25 Oct 2025 00:39:05 +0000 Subject: [PATCH 
03/16] fix: Add QDRANT_URL to ConfigMap and fix healthcheck paths - Add missing QDRANT_URL to ConfigMap for proper service discovery - Fix healthcheck paths from /health to /readyz to match MCP server endpoints - Standardize QDRANT_URL environment variable references across all deployments - Update mcp-memory, mcp-indexer, mcp-http, and indexer-services manifests Resolves localhost fallback issues in Kubernetes deployment where services were defaulting to localhost:6333 instead of using proper service names. Co-authored-by: voarsh2 --- deploy/kubernetes/configmap.yaml | 5 ++++- deploy/kubernetes/indexer-services.yaml | 15 ++++++++++++--- deploy/kubernetes/mcp-http.yaml | 18 ++++++++++++------ deploy/kubernetes/mcp-indexer.yaml | 9 ++++++--- deploy/kubernetes/mcp-memory.yaml | 9 ++++++--- 5 files changed, 40 insertions(+), 16 deletions(-) diff --git a/deploy/kubernetes/configmap.yaml b/deploy/kubernetes/configmap.yaml index 1d2b5802..72d97e40 100644 --- a/deploy/kubernetes/configmap.yaml +++ b/deploy/kubernetes/configmap.yaml @@ -116,4 +116,7 @@ data: # Watcher-specific settings WATCH_DEBOUNCE_SECS: "1.5" INDEX_UPSERT_BATCH: "128" - INDEX_UPSERT_RETRIES: "5" \ No newline at end of file + INDEX_UPSERT_RETRIES: "5" + + # Service URLs for Kubernetes deployment + QDRANT_URL: "http://qdrant:6333" \ No newline at end of file diff --git a/deploy/kubernetes/indexer-services.yaml b/deploy/kubernetes/indexer-services.yaml index 69363a1a..2565c948 100644 --- a/deploy/kubernetes/indexer-services.yaml +++ b/deploy/kubernetes/indexer-services.yaml @@ -24,7 +24,10 @@ spec: workingDir: /work env: - name: QDRANT_URL - value: "http://qdrant:6333" + valueFrom: + configMapKeyRef: + name: context-engine-config + key: QDRANT_URL - name: COLLECTION_NAME valueFrom: configMapKeyRef: @@ -88,7 +91,10 @@ spec: workingDir: /work env: - name: QDRANT_URL - value: "http://qdrant:6333" + valueFrom: + configMapKeyRef: + name: context-engine-config + key: QDRANT_URL - name: COLLECTION_NAME valueFrom: 
configMapKeyRef: @@ -173,7 +179,10 @@ spec: workingDir: /work env: - name: QDRANT_URL - value: "http://qdrant:6333" + valueFrom: + configMapKeyRef: + name: context-engine-config + key: QDRANT_URL - name: COLLECTION_NAME valueFrom: configMapKeyRef: diff --git a/deploy/kubernetes/mcp-http.yaml b/deploy/kubernetes/mcp-http.yaml index fb15715b..7d4e9a11 100644 --- a/deploy/kubernetes/mcp-http.yaml +++ b/deploy/kubernetes/mcp-http.yaml @@ -35,7 +35,10 @@ spec: protocol: TCP env: - name: QDRANT_URL - value: "http://qdrant:6333" + valueFrom: + configMapKeyRef: + name: context-engine-config + key: QDRANT_URL - name: FASTMCP_HOST valueFrom: configMapKeyRef: @@ -88,13 +91,13 @@ spec: readOnly: true livenessProbe: httpGet: - path: /health + path: /readyz port: health initialDelaySeconds: 30 periodSeconds: 10 readinessProbe: httpGet: - path: /health + path: /readyz port: health initialDelaySeconds: 10 periodSeconds: 5 @@ -168,7 +171,10 @@ spec: protocol: TCP env: - name: QDRANT_URL - value: "http://qdrant:6333" + valueFrom: + configMapKeyRef: + name: context-engine-config + key: QDRANT_URL - name: FASTMCP_HOST valueFrom: configMapKeyRef: @@ -197,13 +203,13 @@ spec: mountPath: /work/.codebase livenessProbe: httpGet: - path: /health + path: /readyz port: health initialDelaySeconds: 30 periodSeconds: 10 readinessProbe: httpGet: - path: /health + path: /readyz port: health initialDelaySeconds: 10 periodSeconds: 5 diff --git a/deploy/kubernetes/mcp-indexer.yaml b/deploy/kubernetes/mcp-indexer.yaml index 59d2cf5e..6bb2f328 100644 --- a/deploy/kubernetes/mcp-indexer.yaml +++ b/deploy/kubernetes/mcp-indexer.yaml @@ -35,7 +35,10 @@ spec: protocol: TCP env: - name: QDRANT_URL - value: "http://qdrant:6333" + valueFrom: + configMapKeyRef: + name: context-engine-config + key: QDRANT_URL - name: FASTMCP_HOST valueFrom: configMapKeyRef: @@ -62,13 +65,13 @@ spec: mountPath: /work/.codebase livenessProbe: httpGet: - path: /health + path: /readyz port: health initialDelaySeconds: 30 
periodSeconds: 10 readinessProbe: httpGet: - path: /health + path: /readyz port: health initialDelaySeconds: 10 periodSeconds: 5 diff --git a/deploy/kubernetes/mcp-memory.yaml b/deploy/kubernetes/mcp-memory.yaml index 79e40980..3346580f 100644 --- a/deploy/kubernetes/mcp-memory.yaml +++ b/deploy/kubernetes/mcp-memory.yaml @@ -35,7 +35,10 @@ spec: protocol: TCP env: - name: QDRANT_URL - value: "http://qdrant:6333" + valueFrom: + configMapKeyRef: + name: context-engine-config + key: QDRANT_URL - name: FASTMCP_HOST valueFrom: configMapKeyRef: @@ -102,13 +105,13 @@ spec: readOnly: true livenessProbe: httpGet: - path: /health + path: /readyz port: health initialDelaySeconds: 30 periodSeconds: 10 readinessProbe: httpGet: - path: /health + path: /readyz port: health initialDelaySeconds: 10 periodSeconds: 5 From afa7f4166ef68cf7e7f7c2b112d92e512afe681b Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Sat, 25 Oct 2025 00:53:45 +0000 Subject: [PATCH 04/16] feat: Add missing environment variables to Kubernetes ConfigMap Add 4 missing environment variables from docker-compose.yml to Kubernetes ConfigMap: - QDRANT_API_KEY: For Qdrant Cloud/remote authentication (optional) - REPO_NAME: Repository name for payload tracking - FASTMCP_SERVER_NAME: MCP server identifier - HOST_INDEX_PATH: Work directory mounting path This ensures full compatibility between docker-compose and Kubernetes deployments, allowing all services to reference the same environment variables regardless of deployment method. 
Co-authored-by: voarsh2 --- deploy/kubernetes/configmap.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/deploy/kubernetes/configmap.yaml b/deploy/kubernetes/configmap.yaml index 1d2b5802..1ab47ad8 100644 --- a/deploy/kubernetes/configmap.yaml +++ b/deploy/kubernetes/configmap.yaml @@ -116,4 +116,10 @@ data: # Watcher-specific settings WATCH_DEBOUNCE_SECS: "1.5" INDEX_UPSERT_BATCH: "128" - INDEX_UPSERT_RETRIES: "5" \ No newline at end of file + INDEX_UPSERT_RETRIES: "5" + + # Additional environment variables for full compatibility + QDRANT_API_KEY: "" # For Qdrant Cloud/remote authentication (optional) + REPO_NAME: "workspace" # Repository name for payload tracking + FASTMCP_SERVER_NAME: "qdrant-mcp" # MCP server identifier + HOST_INDEX_PATH: "/work" # Work directory mounting path \ No newline at end of file From 0a1bbb987371579a73beff3ac3a66dcff8ae99ce Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Sat, 25 Oct 2025 01:16:32 +0000 Subject: [PATCH 05/16] feat: Add Docker build script and update Kubernetes manifests for service-specific images - Add comprehensive build-images.sh script with registry support - Update all deployment manifests to use service-specific image names - Replace hardcoded context-engine:latest with proper image names - Add image override generation for Kubernetes deployment - Support separate images for better maintainability and scaling Co-authored-by: voarsh2 --- build-images.sh | 226 ++++++++++++++++++++++++ deploy/kubernetes/indexer-services.yaml | 6 +- deploy/kubernetes/mcp-http.yaml | 4 +- deploy/kubernetes/mcp-indexer.yaml | 2 +- deploy/kubernetes/mcp-memory.yaml | 2 +- 5 files changed, 233 insertions(+), 7 deletions(-) create mode 100644 build-images.sh diff --git a/build-images.sh b/build-images.sh new file mode 100644 index 00000000..2cc3cd3a --- /dev/null +++ b/build-images.sh @@ -0,0 +1,226 @@ +#!/bin/bash +# Docker Build Script for Context-Engine +# 
Builds all service images with custom registry tagging + +set -euo pipefail + +# Configuration +REGISTRY="192.168.96.61:30009/library" +PROJECT_NAME="context-engine" +TAG="${TAG:-latest}" + +# Service mapping (service_name:dockerfile:final_image_name) +declare -A SERVICES=( + ["memory"]="Dockerfile.mcp:${PROJECT_NAME}-memory" + ["indexer"]="Dockerfile.mcp-indexer:${PROJECT_NAME}-indexer" + ["indexer-service"]="Dockerfile.indexer:${PROJECT_NAME}-indexer-service" + ["llamacpp"]="Dockerfile.llamacpp:${PROJECT_NAME}-llamacpp" +) + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Logging functions +log_info() { echo -e "${GREEN}[INFO]${NC} $1"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } +log_error() { echo -e "${RED}[ERROR]${NC} $1"; } + +# Build function +build_image() { + local service=$1 + local dockerfile=$2 + local image_name=$3 + local full_image="${REGISTRY}/${image_name}:${TAG}" + + log_info "Building ${service} service..." + log_info "Dockerfile: ${dockerfile}" + log_info "Image: ${full_image}" + + if ! docker build \ + -f "${dockerfile}" \ + -t "${full_image}" \ + --build-arg BUILDKIT_INLINE_CACHE=1 \ + .; then + log_error "Failed to build ${service} image" + return 1 + fi + + log_info "Successfully built ${service} image: ${full_image}" + + # Push if registry is accessible + if [[ "${PUSH_IMAGES:-false}" == "true" ]]; then + log_info "Pushing ${service} image..." + if ! docker push "${full_image}"; then + log_warn "Failed to push ${service} image (registry may be inaccessible)" + return 1 + fi + log_info "Successfully pushed ${service} image" + fi + + echo "${full_image}" +} + +# Main build process +main() { + log_info "Starting Context-Engine Docker build process..." + log_info "Registry: ${REGISTRY}" + log_info "Tag: ${TAG}" + log_info "Push enabled: ${PUSH_IMAGES:-false}" + echo + + # Check if Docker is running + if ! 
docker info >/dev/null 2>&1; then + log_error "Docker is not running or not accessible" + exit 1 + fi + + # Check if Dockerfiles exist + for service in "${!SERVICES[@]}"; do + IFS=':' read -r dockerfile image_name <<< "${SERVICES[$service]}" + if [[ ! -f "${dockerfile}" ]]; then + log_error "Dockerfile not found: ${dockerfile}" + exit 1 + fi + done + + local built_images=() + local failed_services=() + + # Build each service + for service in "${!SERVICES[@]}"; do + IFS=':' read -r dockerfile image_name <<< "${SERVICES[$service]}" + + if built_image=$(build_image "$service" "$dockerfile" "$image_name"); then + built_images+=("$built_image") + else + failed_services+=("$service") + fi + echo + done + + # Summary + log_info "Build Summary:" + log_info "Successfully built: ${#built_images[@]} images" + for img in "${built_images[@]}"; do + log_info " ✓ ${img}" + done + + if [[ ${#failed_services[@]} -gt 0 ]]; then + log_error "Failed to build: ${#failed_services[@]} services" + for service in "${failed_services[@]}"; do + log_error " ✗ ${service}" + done + exit 1 + fi + + log_info "All images built successfully!" + + # Generate updated kustomization.yaml + cat > "deploy/kubernetes/kustomization-images.yaml" << 'EOF' +# Image overrides for Context-Engine Kubernetes deployment +# Use this with: kustomize build . 
--load-restrictor=LoadRestrictionsNone | kubectl apply -f - +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - namespace.yaml + - configmap.yaml + - qdrant.yaml + - mcp-memory.yaml + - mcp-indexer.yaml + - mcp-http.yaml + - indexer-services.yaml + - llamacpp.yaml + - ingress.yaml + +images: +EOF + + # Add images to kustomization + for service in "${!SERVICES[@]}"; do + IFS=':' read -r dockerfile image_name <<< "${SERVICES[$service]}" + full_image="${REGISTRY}/${image_name}:${TAG}" + cat >> "deploy/kubernetes/kustomization-images.yaml" << EOF + - name: ${image_name} + newName: ${full_image%:*} # Remove tag + newTag: ${TAG} +EOF + done + + cat >> "deploy/kubernetes/kustomization-images.yaml" << 'EOF' + +# Common labels +commonLabels: + app.kubernetes.io/name: context-engine + app.kubernetes.io/component: kubernetes-deployment + app.kubernetes.io/managed-by: kustomize + +# Namespace override +namespace: context-engine +EOF + + log_info "Generated deploy/kubernetes/kustomization-images.yaml" + log_info "To deploy: kustomize build deploy/kubernetes/ | kubectl apply -f -" +} + +# Help function +show_help() { + cat << EOF +Context-Engine Docker Build Script + +Usage: $0 [OPTIONS] + +Options: + -t, --tag TAG Set image tag (default: latest) + -p, --push Push images to registry after build + -h, --help Show this help message + +Examples: + $0 # Build with default tag + $0 -t v1.0.0 # Build with custom tag + $0 --push # Build and push to registry + TAG=dev-branch $0 # Build using environment variable + +Environment Variables: + TAG Image tag to use + PUSH_IMAGES Set to 'true' to push after build + +Registry Configuration: + Current registry: ${REGISTRY} + To change: modify REGISTRY variable in script + +Generated Files: + - deploy/kubernetes/kustomization-images.yaml + Contains image references for Kubernetes deployment + +EOF +} + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + -t|--tag) + TAG="$2" + shift 2 + ;; + -p|--push) + 
export PUSH_IMAGES=true + shift + ;; + -h|--help) + show_help + exit 0 + ;; + *) + log_error "Unknown option: $1" + show_help + exit 1 + ;; + esac +done + +# Run main function +main "$@" \ No newline at end of file diff --git a/deploy/kubernetes/indexer-services.yaml b/deploy/kubernetes/indexer-services.yaml index 69363a1a..59747100 100644 --- a/deploy/kubernetes/indexer-services.yaml +++ b/deploy/kubernetes/indexer-services.yaml @@ -18,7 +18,7 @@ spec: restartPolicy: OnFailure containers: - name: indexer - image: context-engine:latest # Replace with actual image after building + image: context-engine-indexer-service # Use service-specific image name imagePullPolicy: IfNotPresent command: ["python", "/app/scripts/ingest_code.py"] workingDir: /work @@ -82,7 +82,7 @@ spec: spec: containers: - name: watcher - image: context-engine:latest # Replace with actual image after building + image: context-engine-indexer-service # Use service-specific image name imagePullPolicy: IfNotPresent command: ["python", "/app/scripts/watch_index.py"] workingDir: /work @@ -167,7 +167,7 @@ spec: restartPolicy: OnFailure containers: - name: init-payload - image: context-engine:latest # Replace with actual image after building + image: context-engine-indexer-service # Use service-specific image name imagePullPolicy: IfNotPresent command: ["python", "/app/scripts/create_indexes.py"] workingDir: /work diff --git a/deploy/kubernetes/mcp-http.yaml b/deploy/kubernetes/mcp-http.yaml index fb15715b..58e758b2 100644 --- a/deploy/kubernetes/mcp-http.yaml +++ b/deploy/kubernetes/mcp-http.yaml @@ -22,7 +22,7 @@ spec: spec: containers: - name: mcp-memory-http - image: context-engine:latest # Replace with actual image after building + image: context-engine-memory # Use service-specific image name imagePullPolicy: IfNotPresent command: ["python", "-m", "mcp.server.fastmcp"] args: ["--server-name", "context-engine-http", "--host", "0.0.0.0", "--port", "8000", "--transport", "http", 
"/app/scripts/memory_server.py"] @@ -155,7 +155,7 @@ spec: spec: containers: - name: mcp-indexer-http - image: context-engine:latest # Replace with actual image after building + image: context-engine-indexer # Use service-specific image name imagePullPolicy: IfNotPresent command: ["python", "-m", "mcp.server.fastmcp"] args: ["--server-name", "context-engine-indexer-http", "--host", "0.0.0.0", "--port", "8001", "--transport", "http", "/app/scripts/indexer_server.py"] diff --git a/deploy/kubernetes/mcp-indexer.yaml b/deploy/kubernetes/mcp-indexer.yaml index 59d2cf5e..66d5dc07 100644 --- a/deploy/kubernetes/mcp-indexer.yaml +++ b/deploy/kubernetes/mcp-indexer.yaml @@ -22,7 +22,7 @@ spec: spec: containers: - name: mcp-indexer - image: context-engine:latest # Replace with actual image after building + image: context-engine-indexer # Use service-specific image name imagePullPolicy: IfNotPresent command: ["python", "-m", "mcp.server.fastmcp"] args: ["--server-name", "context-engine-indexer", "--host", "0.0.0.0", "--port", "8001", "--transport", "sse", "/app/scripts/indexer_server.py"] diff --git a/deploy/kubernetes/mcp-memory.yaml b/deploy/kubernetes/mcp-memory.yaml index 79e40980..27ba6fbb 100644 --- a/deploy/kubernetes/mcp-memory.yaml +++ b/deploy/kubernetes/mcp-memory.yaml @@ -22,7 +22,7 @@ spec: spec: containers: - name: mcp-memory - image: context-engine:latest # Replace with actual image after building + image: context-engine-memory # Use service-specific image name imagePullPolicy: IfNotPresent command: ["python", "-m", "mcp.server.fastmcp"] args: ["--server-name", "context-engine", "--host", "0.0.0.0", "--port", "8000", "--transport", "sse", "/app/scripts/memory_server.py"] From 26a657c35e7ffed19a2a9166084a5a1a154aaf6f Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Sat, 25 Oct 2025 22:45:56 +0000 Subject: [PATCH 06/16] fix: Remove hardcoded storage-class to use cluster default - Replace hardcoded 'fast-ssd' 
storageClassName with commented configuration - QDRANT StatefulSet will now use cluster's default storage class - Users can uncomment and specify custom storage class if needed - Ensures better compatibility across different Kubernetes clusters Co-authored-by: voarsh2 --- deploy/kubernetes/qdrant.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/kubernetes/qdrant.yaml b/deploy/kubernetes/qdrant.yaml index ebd8bf09..00363e7b 100644 --- a/deploy/kubernetes/qdrant.yaml +++ b/deploy/kubernetes/qdrant.yaml @@ -67,7 +67,7 @@ spec: component: qdrant spec: accessModes: ["ReadWriteOnce"] - storageClassName: fast-ssd # Adjust based on your cluster + # storageClassName: "" # Uncomment and set if you want to specify a storage class resources: requests: storage: 20Gi From 265c5498f47a83c8e29a6220bdd3632eec206a24 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Sun, 26 Oct 2025 00:51:20 +0000 Subject: [PATCH 07/16] feat: Add Git sync sidecar solution for remote source code access Implements comprehensive Git-based source code synchronization to solve the critical issue of source code distribution in remote Kubernetes deployments. 
### Key Features: - Git sync sidecar containers for automatic source code synchronization - Flexible deployment modes: local (hostPath) vs Git-based - Support for public and private Git repositories - SSH and HTTPS authentication methods - Automated deployment script with mode selection - Comprehensive documentation and setup guides ### Files Added: - deploy/kubernetes/deploy-with-source.sh - Smart deployment script - deploy/kubernetes/mcp-indexer-git.yaml - Git-enabled indexer deployment - deploy/kubernetes/mcp-memory-git.yaml - Git-enabled memory server deployment - deploy/kubernetes/GIT_SYNC_SETUP.md - Comprehensive setup documentation ### Files Modified: - deploy/kubernetes/configmap.yaml - Added Git configuration variables - deploy/kubernetes/README.md - Updated with Git sync documentation ### Configuration Variables Added: - SOURCE_CODE_MODE: Switch between 'local' and 'git' modes - GIT_REPO_URL: Git repository URL for synchronization - GIT_BRANCH: Git branch to checkout - GIT_SYNC_PERIOD: Synchronization frequency - GIT_USERNAME/GIT_PASSWORD: HTTPS authentication - GIT_SSH_KEY: SSH authentication configuration This solution enables production-ready Kubernetes deployments with automatic source code management, eliminating the need for manual code distribution across cluster nodes while maintaining compatibility with existing local development workflows. Resolves the critical remote source code access issue identified in issue #1. 
Co-authored-by: voarsh2 --- deploy/kubernetes/GIT_SYNC_SETUP.md | 356 ++++++++++++++++++++++++ deploy/kubernetes/README.md | 67 ++++- deploy/kubernetes/configmap.yaml | 14 +- deploy/kubernetes/deploy-with-source.sh | 310 +++++++++++++++++++++ deploy/kubernetes/mcp-indexer-git.yaml | 216 ++++++++++++++ deploy/kubernetes/mcp-memory-git.yaml | 209 ++++++++++++++ 6 files changed, 1162 insertions(+), 10 deletions(-) create mode 100644 deploy/kubernetes/GIT_SYNC_SETUP.md create mode 100755 deploy/kubernetes/deploy-with-source.sh create mode 100644 deploy/kubernetes/mcp-indexer-git.yaml create mode 100644 deploy/kubernetes/mcp-memory-git.yaml diff --git a/deploy/kubernetes/GIT_SYNC_SETUP.md b/deploy/kubernetes/GIT_SYNC_SETUP.md new file mode 100644 index 00000000..17ccb05f --- /dev/null +++ b/deploy/kubernetes/GIT_SYNC_SETUP.md @@ -0,0 +1,356 @@ +# Git Sync Source Code Management for Context-Engine + +This guide explains how to set up and configure Git-based source code synchronization for Context-Engine in Kubernetes deployments. + +## Overview + +The Git sync solution uses **Git sync sidecar containers** that automatically pull source code from a Git repository into the application pods. This solves the critical issue of source code distribution in remote Kubernetes clusters. + +### Architecture + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Kubernetes Pod │ +│ ┌─────────────┐ ┌───────────────┐ ┌───────────────────┐ │ +│ │ Main App │ │ Git Sync │ │ Shared Volume │ │ +│ │ Container │ │ Sidecar │ │ (emptyDir) │ │ +│ │ │ │ │ │ │ │ +│ │ /work │←─→│ /git → /work │←─→│ Source Code │ │ +│ └─────────────┘ └───────────────┘ └───────────────────┘ │ +│ ↕ │ +│ Git Repository (GitHub/GitLab/Bitbucket) │ +└─────────────────────────────────────────────────────────────┘ +``` + +## Quick Start + +### 1. 
Deploy with Git Sync + +```bash +# Public repository +./deploy-with-source.sh git https://github.com/your-org/your-repo.git main + +# Private repository with HTTPS +./deploy-with-source.sh git https://github.com/your-org/your-repo.git main + +# Private repository with SSH +./deploy-with-source.sh git git@github.com:your-org/your-repo.git main +``` + +### 2. Deploy with Local Mode (Alternative) + +```bash +# Use this if source code is already on cluster nodes +./deploy-with-source.sh local +``` + +## Configuration Options + +### ConfigMap Settings + +Update `deploy/kubernetes/configmap.yaml` with your Git configuration: + +```yaml +# Source Code Configuration +SOURCE_CODE_MODE: "git" # Options: "local" or "git" + +# Git repository configuration (only used when SOURCE_CODE_MODE=git) +GIT_REPO_URL: "https://github.com/your-org/your-repo.git" +GIT_BRANCH: "main" +GIT_SYNC_PERIOD: "60" # Sync every 60 seconds +GIT_USERNAME: "" # For private repos (optional) +GIT_PASSWORD: "" # For private repos (optional) +``` + +### Git Sync Sidecar Configuration + +The Git sync sidecar uses the following environment variables: + +| Variable | Description | Default | +|----------|-------------|---------| +| `GITSYNC_REPO` | Git repository URL | From ConfigMap | +| `GITSYNC_BRANCH` | Git branch to checkout | From ConfigMap | +| `GITSYNC_ROOT` | Directory to clone into | `/git` | +| `GITSYNC_SYNC_PERIOD` | Sync frequency in seconds | From ConfigMap | +| `GITSYNC_ONE_TIME` | Sync once and exit | `false` | +| `GITSYNC_LINK` | Create symlink to latest commit | `latest` | +| `GITSYNC_MAX_FAILURES` | Max sync failures before giving up | `5` | +| `GITSYNC_USERNAME` | Username for HTTP basic auth | From ConfigMap | +| `GITSYNC_PASSWORD` | Password/token for HTTP basic auth | From ConfigMap | + +## Authentication Setup + +### Public Repositories + +No additional setup required. The Git sync sidecar will clone public repositories automatically. + +### Private Repositories (HTTPS) + +1. 
**Using Personal Access Token:** + +```bash +# Create ConfigMap with credentials +kubectl patch configmap context-engine-config -n context-engine --patch '{"data":{"GIT_USERNAME":"your-username","GIT_PASSWORD":"your-personal-access-token"}}' +``` + +2. **Alternative: Create Secret:** + +```bash +kubectl create secret generic git-https-credentials \ + --from-literal=username=your-username \ + --from-literal=password=your-personal-access-token \ + -n context-engine +``` + +### Private Repositories (SSH) + +1. **Generate SSH Key:** + +```bash +ssh-keygen -t rsa -b 4096 -C "git-sync@context-engine" -f ~/.ssh/context_engine_git +``` + +2. **Add SSH Key to Git Repository:** + + - Copy the public key (`~/.ssh/context_engine_git.pub`) + - Add it as a deploy key in your Git repository settings + +3. **Create Kubernetes Secret:** + +```bash +kubectl create secret generic git-ssh-key \ + --from-file=ssh-private-key=~/.ssh/context_engine_git \ + -n context-engine +``` + +4. **Update Git Sync Configuration:** + +The manifests are already configured to use SSH when the `git-ssh-key` secret exists. + +## Deployment Options + +### Option 1: Automated Deployment Script + +```bash +# Deploy with automated script +cd deploy/kubernetes +./deploy-with-source.sh git https://github.com/your-org/your-repo.git main +``` + +### Option 2: Manual Deployment with Kustomize + +1. **Update ConfigMap:** + +```yaml +# kustomization.yaml patches +apiVersion: v1 +kind: ConfigMap +metadata: + name: context-engine-config +data: + SOURCE_CODE_MODE: "git" + GIT_REPO_URL: "https://github.com/your-org/your-repo.git" + GIT_BRANCH: "main" +``` + +2. 
**Apply Git-enabled manifests:** + +```bash +kubectl apply -f mcp-memory-git.yaml +kubectl apply -f mcp-indexer-git.yaml +``` + +### Option 3: Switching Between Modes + +To switch from local to Git mode (or vice versa): + +```bash +# Update ConfigMap +kubectl patch configmap context-engine-config -n context-engine --type merge --patch '{"data":{"SOURCE_CODE_MODE":"git"}}' + +# Redeploy affected services +kubectl rollout restart deployment/mcp-memory -n context-engine +kubectl rollout restart deployment/mcp-indexer -n context-engine +``` + +## Monitoring and Troubleshooting + +### Check Git Sync Status + +```bash +# Check Git sync logs for indexer +kubectl logs deployment/mcp-indexer -c git-sync -n context-engine + +# Check Git sync logs for memory server +kubectl logs deployment/mcp-memory -c git-sync -n context-engine +``` + +### Common Issues + +#### 1. Authentication Failures + +**Error:** `authentication failed` + +**Solution:** +- Verify SSH key is correctly configured +- Check that the deploy key has read access +- Ensure the SSH key format is correct + +#### 2. Network Connectivity + +**Error:** `unable to access '...'` + +**Solution:** +- Check cluster network policies +- Verify firewall rules allow Git access +- Test connectivity from a pod in the cluster + +#### 3. Repository Not Found + +**Error:** `repository not found` + +**Solution:** +- Verify the repository URL is correct +- Check that the repository exists +- Ensure the Git branch exists + +#### 4. 
Sync Loop Issues + +**Error:** Continuous sync failures + +**Solution:** +- Check `GITSYNC_MAX_FAILURES` setting +- Examine Git sync logs for specific errors +- Verify repository permissions + +### Health Checks + +The Git sync sidecar doesn't have built-in health endpoints, but you can monitor: + +```bash +# Check if source code is present +kubectl exec deployment/mcp-indexer -c mcp-indexer -n context-engine -- ls -la /work + +# Check Git sync status +kubectl exec deployment/mcp-indexer -c git-sync -n context-engine -- cat /git/.git-sync +``` + +## Best Practices + +### 1. Repository Management + +- **Use specific branches:** Pin to specific branches for production +- **Tag releases:** Use Git tags for release deployments +- **Clean repository:** Avoid including large binary files in the repository + +### 2. Security + +- **Use read-only deploy keys:** Don't use SSH keys with write access +- **Rotate credentials:** Regularly rotate personal access tokens +- **Network policies:** Restrict pod network access as needed + +### 3. Performance + +- **Optimize sync frequency:** Adjust `GIT_SYNC_PERIOD` based on update frequency +- **Repository size:** Keep repository size reasonable for faster clones +- **Shallow clones:** Consider using `--depth 1` for large repositories + +### 4. 
High Availability + +- **Multiple replicas:** Git sync works with multiple pod replicas +- **Regional repositories:** Use Git mirrors for global deployments +- **Fallback strategies:** Consider local mode as fallback + +## Advanced Configuration + +### Custom Git Sync Options + +You can customize the Git sync sidecar by editing the manifests: + +```yaml +env: +- name: GITSYNC_DEPTH + value: "1" # Shallow clone for faster sync +- name: GITSYNC_GARBAGE_COLLECTION + value: "true" # Clean up old commits +- name: GITSYNC_ADD_USER + value: "true" # Set .gitconfig user info +``` + +### Webhook Integration + +For instant updates, consider using webhooks with a custom controller: + +```yaml +# This would require a custom webhook receiver +apiVersion: v1 +kind: Service +metadata: + name: git-webhook-receiver +spec: + selector: + app: git-webhook-receiver + ports: + - port: 8080 + targetPort: 8080 +``` + +### Multi-Repository Setup + +For complex projects requiring multiple repositories: + +```yaml +# Add multiple Git sync sidecars +- name: git-sync-main + env: + - name: GITSYNC_REPO + value: "https://github.com/your-org/main-repo.git" + volumeMounts: + - name: main-volume + mountPath: /git-main + +- name: git-sync-config + env: + - name: GITSYNC_REPO + value: "https://github.com/your-org/config-repo.git" + volumeMounts: + - name: config-volume + mountPath: /git-config +``` + +## Migration Guide + +### From Local Mode to Git Mode + +1. **Backup current data:** +```bash +kubectl exec deployment/mcp-indexer -c mcp-indexer -n context-engine -- tar czf /tmp/backup.tar.gz -C /work . +``` + +2. **Update configuration:** +```bash +kubectl patch configmap context-engine-config -n context-engine --type merge --patch '{"data":{"SOURCE_CODE_MODE":"git","GIT_REPO_URL":"https://github.com/your-org/your-repo.git"}}' +``` + +3. 
**Restart deployments:** +```bash +kubectl rollout restart deployment/mcp-indexer -n context-engine +kubectl rollout restart deployment/mcp-memory -n context-engine +``` + +4. **Verify sync:** +```bash +kubectl logs deployment/mcp-indexer -c git-sync -n context-engine -f +``` + +## Support + +For issues with Git sync setup: + +1. Check the [Git sync documentation](https://github.com/kubernetes/git-sync) +2. Review Kubernetes pod logs +3. Verify network connectivity to Git repository +4. Check authentication configuration +5. Validate ConfigMap settings + +Remember that Git sync provides **automatic source code distribution** for remote Kubernetes deployments, eliminating the need for manual code synchronization across cluster nodes. \ No newline at end of file diff --git a/deploy/kubernetes/README.md b/deploy/kubernetes/README.md index 7a5db376..e337207b 100644 --- a/deploy/kubernetes/README.md +++ b/deploy/kubernetes/README.md @@ -33,21 +33,35 @@ For local development or direct access, services are exposed via NodePort: 1. **Kubernetes Cluster** (v1.20+) 2. **kubectl** configured to access your cluster -3. **Storage Class** named `fast-ssd` (or modify `qdrant.yaml`) -4. **Docker image** built and pushed to registry: +3. **Docker images** built and pushed to registry: ```bash - # Build and tag the image - docker build -t context-engine:latest . + # Build all service images + ./build-images.sh --push - # Tag for your registry - docker tag context-engine:latest your-registry/context-engine:latest - - # Push to registry - docker push your-registry/context-engine:latest + # Or build individually + docker build -f Dockerfile.mcp -t context-engine-memory:latest . + docker build -f Dockerfile.mcp-indexer -t context-engine-indexer:latest . + docker build -f Dockerfile.indexer -t context-engine-indexer-service:latest . ``` +4. 
**Source Code Access** (choose one): + - **Local Mode**: Source code pre-distributed to all cluster nodes at `/tmp/context-engine-work` + - **Git Mode**: Git repository accessible from cluster with proper authentication configured + ## Quick Start +### Option 1: Automated Deployment with Source Code Management + +```bash +# Deploy with Git-based source code synchronization (recommended) +./deploy-with-source.sh git https://github.com/your-org/your-repo.git main + +# Or deploy with local source code (requires pre-distribution) +./deploy-with-source.sh local +``` + +### Option 2: Manual Deployment + ### 1. Deploy Core Services ```bash @@ -123,6 +137,41 @@ All configuration is managed through the `context-engine-config` ConfigMap in `c 3. **Host Paths**: Update volume mounts to match your environment 4. **Ingress**: Configure `ingress.yaml` with your domain and SSL +## Source Code Management + +### Local Mode vs Git Mode + +The deployment supports two source code access strategies: + +#### **Local Mode** (Default) +- Uses hostPath volumes to access source code on cluster nodes +- **Pros**: Simple, no external dependencies +- **Cons**: Requires manual source code distribution to all nodes +- **Use Case**: Single-node clusters, development environments + +#### **Git Mode** (Recommended for Production) +- Uses Git sync sidecars to automatically pull source code from repositories +- **Pros**: Automatic source code synchronization, CI/CD integration +- **Cons**: Requires Git repository access from cluster +- **Use Case**: Multi-node clusters, production deployments + +### Git Sync Setup + +For Git mode setup, see [GIT_SYNC_SETUP.md](./GIT_SYNC_SETUP.md) for detailed instructions. 
+
+**Quick Git Mode Setup:**
+
+```bash
+# Public repository
+./deploy-with-source.sh git https://github.com/your-org/your-repo.git main
+
+# Private repository (requires SSH key setup)
+kubectl create secret generic git-ssh-key \
+  --from-file=ssh-private-key=~/.ssh/id_rsa \
+  -n context-engine
+./deploy-with-source.sh git git@github.com:your-org/your-repo.git main
+```
+
 ## Development Workflow
 
 ### Local Development
diff --git a/deploy/kubernetes/configmap.yaml b/deploy/kubernetes/configmap.yaml
index 1d2b5802..5d503030 100644
--- a/deploy/kubernetes/configmap.yaml
+++ b/deploy/kubernetes/configmap.yaml
@@ -116,4 +116,16 @@ data:
   # Watcher-specific settings
   WATCH_DEBOUNCE_SECS: "1.5"
   INDEX_UPSERT_BATCH: "128"
-  INDEX_UPSERT_RETRIES: "5"
\ No newline at end of file
+  INDEX_UPSERT_RETRIES: "5"
+
+  # Source Code Configuration
+  # Set to "git" to use Git sync sidecar, or "local" to use hostPath (default)
+  SOURCE_CODE_MODE: "local"
+
+  # Git repository configuration (only used when SOURCE_CODE_MODE=git)
+  GIT_REPO_URL: "" # e.g., "https://github.com/your-org/your-repo.git"
+  GIT_BRANCH: "main"
+  GIT_SYNC_PERIOD: "60s" # Sync every 60 seconds (git-sync v4 --period is a Go duration; bare "60" fails to parse)
+  GIT_SSH_KEY: "" # SSH private key for private repos (optional)
+  GIT_USERNAME: "" # Username for private repos (optional)
+  GIT_PASSWORD: "" # Password/token for private repos (optional)
\ No newline at end of file
diff --git a/deploy/kubernetes/deploy-with-source.sh b/deploy/kubernetes/deploy-with-source.sh
new file mode 100755
index 00000000..3e38bfe6
--- /dev/null
+++ b/deploy/kubernetes/deploy-with-source.sh
@@ -0,0 +1,310 @@
+#!/bin/bash
+
+# Context-Engine Kubernetes Deployment with Source Code Management
+# Supports both local (hostPath) and Git-based source code access
+
+set -e
+
+# Configuration
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+NAMESPACE="context-engine"
+SOURCE_MODE="${1:-local}" # Options: local, git
+GIT_REPO_URL="${2:-}"
+GIT_BRANCH="${3:-main}"
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Helper functions +log() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +# Print usage +usage() { + cat << EOF +Usage: $0 [git-repo-url] [git-branch] + +Source Modes: + local - Use hostPath volumes (source code must be pre-distributed to nodes) + git - Use Git sync sidecars (automatic source code synchronization) + +Examples: + # Local deployment (requires source code on nodes) + $0 local + + # Git-based deployment + $0 git https://github.com/your-org/your-repo.git main + + # Git-based deployment with private repo (requires SSH key setup) + $0 git git@github.com:your-org/your-repo.git main + +Environment Variables: + REGISTRY - Docker registry prefix (default: context-engine) + TAG - Docker image tag (default: latest) + +Requirements for Git Mode: + - Git repository must be accessible from the cluster + - For private repos: create git-ssh-key secret or configure credentials + - Sufficient network access to clone the repository + +Requirements for Local Mode: + - Source code must exist at /tmp/context-engine-work on ALL nodes + - Node access required for code updates +EOF +} + +# Validate input +validate_input() { + if [[ "$SOURCE_MODE" != "local" && "$SOURCE_MODE" != "git" ]]; then + error "Invalid source mode: $SOURCE_MODE. Must be 'local' or 'git'." + usage + exit 1 + fi + + if [[ "$SOURCE_MODE" == "git" && -z "$GIT_REPO_URL" ]]; then + error "Git repository URL is required when using git mode." + usage + exit 1 + fi +} + +# Check prerequisites +check_prerequisites() { + log "Checking prerequisites..." + + # Check kubectl + if ! command -v kubectl &> /dev/null; then + error "kubectl is not installed or not in PATH" + exit 1 + fi + + # Check cluster access + if ! 
kubectl cluster-info &> /dev/null; then + error "Cannot connect to Kubernetes cluster" + exit 1 + fi + + # Check if namespace exists + if ! kubectl get namespace "$NAMESPACE" &> /dev/null; then + log "Creating namespace: $NAMESPACE" + kubectl create namespace "$NAMESPACE" + fi + + success "Prerequisites check passed" +} + +# Update ConfigMap with source code configuration +update_configmap() { + log "Updating ConfigMap with source code configuration..." + + # Create a temporary configmap with updated values + kubectl create configmap context-engine-config-temp \ + --from-env-file <(cat << EOF +SOURCE_CODE_MODE=$SOURCE_MODE +GIT_REPO_URL=$GIT_REPO_URL +GIT_BRANCH=$GIT_BRANCH +GIT_SYNC_PERIOD=60 +GIT_USERNAME="" +GIT_PASSWORD="" +EOF +) \ + --namespace "$NAMESPACE" \ + --dry-run=client -o yaml | kubectl apply -f - + + # Merge with existing configmap (preserving other settings) + log "Merging configuration with existing ConfigMap..." + # Note: This is a simplified approach. In production, you might want to use + # kustomize or a more sophisticated config management tool +} + +# Deploy based on source mode +deploy_services() { + log "Deploying Context-Engine services in $SOURCE_MODE mode..." + + # Deploy core infrastructure (always needed) + log "Deploying core infrastructure..." + kubectl apply -f "$SCRIPT_DIR/qdrant.yaml" + + # Wait for Qdrant to be ready + log "Waiting for Qdrant to be ready..." + kubectl wait --for=condition=ready pod -l app=qdrant -n "$NAMESPACE" --timeout=300s + + if [[ "$SOURCE_MODE" == "local" ]]; then + deploy_local_mode + else + deploy_git_mode + fi + + # Deploy remaining services + log "Deploying remaining services..." + kubectl apply -f "$SCRIPT_DIR/mcp-http.yaml" + kubectl apply -f "$SCRIPT_DIR/indexer-services.yaml" + + # Deploy optional services + if [[ -f "$SCRIPT_DIR/llamacpp.yaml" ]]; then + log "Deploying optional Llama.cpp service..." 
+ kubectl apply -f "$SCRIPT_DIR/llamacpp.yaml" + fi +} + +# Deploy in local mode (using hostPath) +deploy_local_mode() { + log "Deploying in LOCAL mode (hostPath volumes)..." + + # Apply hostPath-based deployments + kubectl apply -f "$SCRIPT_DIR/mcp-memory.yaml" + kubectl apply -f "$SCRIPT_DIR/mcp-indexer.yaml" + + warn "⚠️ LOCAL MODE REQUIREMENTS:" + warn " - Source code must exist at /tmp/context-engine-work on ALL cluster nodes" + warn " - Code updates require manual synchronization across nodes" + warn " - This mode is suitable for single-node clusters or development" + + read -p "Do you want to continue? (y/N): " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + log "Deployment cancelled" + exit 1 + fi +} + +# Deploy in Git mode (using Git sync sidecars) +deploy_git_mode() { + log "Deploying in GIT mode (automatic source code synchronization)..." + + # Setup Git authentication if needed + setup_git_auth + + # Apply Git-enabled deployments + kubectl apply -f "$SCRIPT_DIR/mcp-memory-git.yaml" + kubectl apply -f "$SCRIPT_DIR/mcp-indexer-git.yaml" + + success "✅ Git sync enabled - source code will be automatically synchronized" + log " Repository: $GIT_REPO_URL" + log " Branch: $GIT_BRANCH" + log " Sync Period: 60 seconds" +} + +# Setup Git authentication +setup_git_auth() { + # Check if this is a private repository requiring authentication + if [[ "$GIT_REPO_URL" =~ ^git@ ]] || [[ "$GIT_REPO_URL" =~ \.git$ && ! "$GIT_REPO_URL" =~ ^https://github\.com/[^/]+/[^/]+\.git$ ]]; then + warn "Private repository detected. Please ensure authentication is configured:" + warn " 1. For SSH: Create git-ssh-key secret with your SSH private key" + warn " 2. For HTTPS: Set GIT_USERNAME and GIT_PASSWORD in ConfigMap" + + # Check if SSH secret exists + if ! 
kubectl get secret git-ssh-key -n "$NAMESPACE" &> /dev/null; then + log "Creating placeholder SSH secret (please update with your actual SSH key)" + kubectl create secret generic git-ssh-key \ + --from-literal=ssh-private-key="" \ + --namespace "$NAMESPACE" \ + --dry-run=client -o yaml | kubectl apply -f - + warn "⚠️ Please update the git-ssh-key secret with your actual SSH private key:" + warn " kubectl delete secret git-ssh-key -n $NAMESPACE" + warn " kubectl create secret generic git-ssh-key --from-file=ssh-private-key=~/.ssh/id_rsa -n $NAMESPACE" + fi + fi +} + +# Wait for deployment to be ready +wait_for_ready() { + log "Waiting for all deployments to be ready..." + + # List of deployments to wait for + local deployments=("mcp-memory" "mcp-indexer" "mcp-memory-http" "mcp-indexer-http") + + for deployment in "${deployments[@]}"; do + if kubectl get deployment "$deployment" -n "$NAMESPACE" &> /dev/null; then + log "Waiting for $deployment to be ready..." + kubectl wait --for=condition=available deployment/"$deployment" -n "$NAMESPACE" --timeout=300s + fi + done + + success "All deployments are ready" +} + +# Show deployment status and access information +show_status() { + log "Deployment completed successfully!" 
+ echo + echo "=== Context-Engine Status ===" + echo "Namespace: $NAMESPACE" + echo "Source Mode: $SOURCE_MODE" + if [[ "$SOURCE_MODE" == "git" ]]; then + echo "Git Repository: $GIT_REPO_URL" + echo "Git Branch: $GIT_BRANCH" + fi + echo + echo "=== Services ===" + kubectl get services -n "$NAMESPACE" + echo + echo "=== Pods ===" + kubectl get pods -n "$NAMESPACE" + echo + echo "=== Access Information ===" + + # Get service access information + local cluster_ip=$(kubectl get svc qdrant -n "$NAMESPACE" -o jsonpath='{.spec.clusterIP}' 2>/dev/null || echo "N/A") + echo "Qdrant: $cluster_ip:6333" + + if kubectl get svc mcp-memory -n "$NAMESPACE" &> /dev/null; then + local memory_nodeport=$(kubectl get svc mcp-memory -n "$NAMESPACE" -o jsonpath='{.spec.ports[?(@.name=="sse")].nodePort}' 2>/dev/null || echo "N/A") + echo "MCP Memory (SSE): NodePort $memory_nodeport" + fi + + if kubectl get svc mcp-indexer -n "$NAMESPACE" &> /dev/null; then + local indexer_nodeport=$(kubectl get svc mcp-indexer -n "$NAMESPACE" -o jsonpath='{.spec.ports[?(@.name=="sse")].nodePort}' 2>/dev/null || echo "N/A") + echo "MCP Indexer (SSE): NodePort $indexer_nodeport" + fi + + echo + echo "=== Next Steps ===" + echo "1. Test the deployment:" + echo " curl http://:30800/sse # MCP Memory" + echo " curl http://:30802/sse # MCP Indexer" + echo "2. Call indexing tool:" + echo " curl -X POST http://:30802/sse -H 'Content-Type: application/json' \\" + echo " -d '{\"jsonrpc\": \"2.0\", \"id\": 1, \"method\": \"tools/call\", \"params\": {\"name\": \"qdrant_index_root\", \"arguments\": {}}}'" + + if [[ "$SOURCE_MODE" == "git" ]]; then + echo "3. 
Monitor Git sync:" + echo " kubectl logs deployment/mcp-indexer -c git-sync -n $NAMESPACE" + echo " kubectl logs deployment/mcp-memory -c git-sync -n $NAMESPACE" + fi +} + +# Main execution +main() { + if [[ "$1" == "-h" || "$1" == "--help" ]]; then + usage + exit 0 + fi + + validate_input + check_prerequisites + update_configmap + deploy_services + wait_for_ready + show_status +} + +# Run main function with all arguments +main "$@" \ No newline at end of file diff --git a/deploy/kubernetes/mcp-indexer-git.yaml b/deploy/kubernetes/mcp-indexer-git.yaml new file mode 100644 index 00000000..a6c13b8c --- /dev/null +++ b/deploy/kubernetes/mcp-indexer-git.yaml @@ -0,0 +1,216 @@ +--- +# MCP Indexer Server with Git Sync Sidecar +# This demonstrates both local (hostPath) and Git-based source code access +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mcp-indexer + namespace: context-engine + labels: + app: context-engine + component: mcp-indexer +spec: + replicas: 1 + selector: + matchLabels: + app: context-engine + component: mcp-indexer + template: + metadata: + labels: + app: context-engine + component: mcp-indexer + spec: + # Use init container to determine source code mode + initContainers: + - name: source-mode-check + image: busybox:1.36 + command: ["sh", "-c", "echo 'Source Code Mode: $SOURCE_CODE_MODE'"] + env: + - name: SOURCE_CODE_MODE + valueFrom: + configMapKeyRef: + name: context-engine-config + key: SOURCE_CODE_MODE + + containers: + # Main MCP Indexer Server Container + - name: mcp-indexer + image: context-engine-indexer:latest + imagePullPolicy: IfNotPresent + command: ["python", "-m", "mcp.server.fastmcp"] + args: ["--server-name", "context-engine-indexer", "--host", "0.0.0.0", "--port", "8001", "--transport", "sse", "/app/scripts/indexer_server.py"] + ports: + - name: sse + containerPort: 8001 + protocol: TCP + - name: health + containerPort: 18001 + protocol: TCP + env: + - name: QDRANT_URL + valueFrom: + configMapKeyRef: + name: 
context-engine-config
+              key: QDRANT_URL
+        - name: FASTMCP_HOST
+          valueFrom:
+            configMapKeyRef:
+              name: context-engine-config
+              key: FASTMCP_HOST
+        - name: FASTMCP_INDEXER_PORT
+          valueFrom:
+            configMapKeyRef:
+              name: context-engine-config
+              key: FASTMCP_INDEXER_PORT
+        - name: FASTMCP_HEALTH_PORT
+          value: "18001"
+        - name: SOURCE_CODE_MODE
+          valueFrom:
+            configMapKeyRef:
+              name: context-engine-config
+              key: SOURCE_CODE_MODE
+        resources:
+          requests:
+            memory: "512Mi"
+            cpu: "250m"
+          limits:
+            memory: "2Gi"
+            cpu: "1000m"
+        volumeMounts:
+        - name: work-volume
+          mountPath: /work
+        - name: codebase-volume
+          mountPath: /work/.codebase
+        livenessProbe:
+          httpGet:
+            path: /readyz
+            port: health
+          initialDelaySeconds: 30
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /readyz
+            port: health
+          initialDelaySeconds: 10
+          periodSeconds: 5
+
+      # Git Sync Sidecar Container (only active when SOURCE_CODE_MODE=git)
+      - name: git-sync
+        image: registry.k8s.io/git-sync/git-sync:v4.1.0
+        imagePullPolicy: IfNotPresent
+        env:
+        - name: GITSYNC_REPO
+          valueFrom:
+            configMapKeyRef:
+              name: context-engine-config
+              key: GIT_REPO_URL
+        - name: GITSYNC_REF # git-sync v4 replaced --branch/GITSYNC_BRANCH with --ref
+          valueFrom:
+            configMapKeyRef:
+              name: context-engine-config
+              key: GIT_BRANCH
+        - name: GITSYNC_ROOT
+          value: "/git"
+        - name: GITSYNC_PERIOD # v4 env name; GITSYNC_SYNC_PERIOD is not recognized
+          valueFrom:
+            configMapKeyRef:
+              name: context-engine-config
+              key: GIT_SYNC_PERIOD
+        - name: GITSYNC_ONE_TIME
+          value: "false"
+        - name: GITSYNC_LINK
+          value: "latest"
+        - name: GITSYNC_MAX_FAILURES
+          value: "5"
+        # NOTE: git-sync v4 removed --ssh/GITSYNC_SSH; SSH auth is inferred
+        # from the repo URL scheme and the mounted key, and unknown GITSYNC_*
+        # variables are fatal in v4, so the old GITSYNC_SSH=false was dropped.
+        # For HTTP basic auth (optional)
+        - name: GITSYNC_USERNAME
+          valueFrom:
+            configMapKeyRef:
+              name: context-engine-config
+              key: GIT_USERNAME
+        - name: GITSYNC_PASSWORD
+          valueFrom:
+            configMapKeyRef:
+              name: context-engine-config
+              key: GIT_PASSWORD
+        resources:
+          requests:
+            memory: "64Mi"
+            cpu: "50m"
+          limits:
+            memory: "128Mi"
+            cpu: "100m"
+        volumeMounts:
+        - name: work-volume
+          mountPath: /git
+        # 
Mount SSH key if using SSH authentication
+        - name: git-ssh-key
+          mountPath: /etc/git-secret/ssh
+          readOnly: true
+        securityContext:
+          runAsUser: 65533 # git-sync runs as non-root user
+        # This container will start but do nothing when SOURCE_CODE_MODE=local
+        # We could add a more sophisticated mechanism here if needed
+
+      volumes:
+      # Shared work directory for both main container and git-sync
+      - name: work-volume
+        emptyDir: {}
+      - name: codebase-volume
+        emptyDir: {}
+      # SSH key volume for private repositories (optional)
+      - name: git-ssh-key
+        secret:
+          secretName: git-ssh-key
+          optional: true
+          items:
+          - key: ssh-private-key
+            path: id_rsa
+
+---
+# MCP Indexer Server Service
+apiVersion: v1
+kind: Service
+metadata:
+  name: mcp-indexer
+  namespace: context-engine
+  labels:
+    app: context-engine
+    component: mcp-indexer
+spec:
+  type: NodePort # Change to LoadBalancer for external access
+  ports:
+  - name: sse
+    port: 8001
+    targetPort: sse
+    nodePort: 30802 # Optional: specify node port
+    protocol: TCP
+  - name: health
+    port: 18001
+    targetPort: health
+    nodePort: 30803 # Optional: specify node port
+    protocol: TCP
+  selector:
+    app: context-engine
+    component: mcp-indexer
+
+---
+# Example Secret for SSH Git Access (Optional)
+# Create with: kubectl create secret generic git-ssh-key --from-file=ssh-private-key=~/.ssh/id_rsa
+apiVersion: v1
+kind: Secret
+metadata:
+  name: git-ssh-key
+  namespace: context-engine
+  labels:
+    app: context-engine
+    component: git-auth
+type: Opaque
+data:
+  # Base64 encoded SSH private key
+  # ssh-private-key: <base64-encoded-private-key>
+# NOTE: removed duplicate "type: Opaque" key here (duplicate mapping keys are invalid YAML)
\ No newline at end of file
diff --git a/deploy/kubernetes/mcp-memory-git.yaml b/deploy/kubernetes/mcp-memory-git.yaml
new file mode 100644
index 00000000..dab93a9c
--- /dev/null
+++ b/deploy/kubernetes/mcp-memory-git.yaml
@@ -0,0 +1,209 @@
+---
+# MCP Memory Server with Git Sync Sidecar
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: mcp-memory
+  namespace: context-engine
+  labels:
+    app: 
context-engine + component: mcp-memory +spec: + replicas: 1 + selector: + matchLabels: + app: context-engine + component: mcp-memory + template: + metadata: + labels: + app: context-engine + component: mcp-memory + spec: + containers: + # Main MCP Memory Server Container + - name: mcp-memory + image: context-engine-memory:latest + imagePullPolicy: IfNotPresent + command: ["python", "-m", "mcp.server.fastmcp"] + args: ["--server-name", "context-engine-memory", "--host", "0.0.0.0", "--port", "8000", "--transport", "sse", "/app/scripts/memory_server.py"] + ports: + - name: sse + containerPort: 8000 + protocol: TCP + - name: health + containerPort: 18000 + protocol: TCP + env: + - name: QDRANT_URL + valueFrom: + configMapKeyRef: + name: context-engine-config + key: QDRANT_URL + - name: FASTMCP_HOST + valueFrom: + configMapKeyRef: + name: context-engine-config + key: FASTMCP_HOST + - name: FASTMCP_PORT + valueFrom: + configMapKeyRef: + name: context-engine-config + key: FASTMCP_PORT + - name: FASTMCP_HEALTH_PORT + value: "18000" + - name: COLLECTION_NAME + valueFrom: + configMapKeyRef: + name: context-engine-config + key: COLLECTION_NAME + - name: MEMORY_SSE_ENABLED + valueFrom: + configMapKeyRef: + name: context-engine-config + key: MEMORY_SSE_ENABLED + - name: MEMORY_MCP_URL + valueFrom: + configMapKeyRef: + name: context-engine-config + key: MEMORY_MCP_URL + - name: MEMORY_MCP_TIMEOUT + valueFrom: + configMapKeyRef: + name: context-engine-config + key: MEMORY_MCP_TIMEOUT + - name: MEMORY_AUTODETECT + valueFrom: + configMapKeyRef: + name: context-engine-config + key: MEMORY_AUTODETECT + - name: MEMORY_COLLECTION_TTL_SECS + valueFrom: + configMapKeyRef: + name: context-engine-config + key: MEMORY_COLLECTION_TTL_SECS + - name: TOOL_STORE_DESCRIPTION + valueFrom: + configMapKeyRef: + name: context-engine-config + key: TOOL_STORE_DESCRIPTION + - name: TOOL_FIND_DESCRIPTION + valueFrom: + configMapKeyRef: + name: context-engine-config + key: TOOL_FIND_DESCRIPTION + 
resources:
+          requests:
+            memory: "256Mi"
+            cpu: "200m"
+          limits:
+            memory: "1Gi"
+            cpu: "500m"
+        volumeMounts:
+        - name: work-volume
+          mountPath: /work
+          readOnly: true # Memory server only needs read access
+        livenessProbe:
+          httpGet:
+            path: /readyz
+            port: health
+          initialDelaySeconds: 30
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /readyz
+            port: health
+          initialDelaySeconds: 10
+          periodSeconds: 5
+
+      # Git Sync Sidecar Container
+      - name: git-sync
+        image: registry.k8s.io/git-sync/git-sync:v4.1.0
+        imagePullPolicy: IfNotPresent
+        env:
+        - name: GITSYNC_REPO
+          valueFrom:
+            configMapKeyRef:
+              name: context-engine-config
+              key: GIT_REPO_URL
+        - name: GITSYNC_REF # git-sync v4 replaced --branch/GITSYNC_BRANCH with --ref
+          valueFrom:
+            configMapKeyRef:
+              name: context-engine-config
+              key: GIT_BRANCH
+        - name: GITSYNC_ROOT
+          value: "/git"
+        - name: GITSYNC_PERIOD # v4 env name; GITSYNC_SYNC_PERIOD is not recognized
+          valueFrom:
+            configMapKeyRef:
+              name: context-engine-config
+              key: GIT_SYNC_PERIOD
+        - name: GITSYNC_ONE_TIME
+          value: "false"
+        - name: GITSYNC_LINK
+          value: "latest"
+        - name: GITSYNC_MAX_FAILURES
+          value: "5"
+        - name: GITSYNC_USERNAME
+          valueFrom:
+            configMapKeyRef:
+              name: context-engine-config
+              key: GIT_USERNAME
+        - name: GITSYNC_PASSWORD
+          valueFrom:
+            configMapKeyRef:
+              name: context-engine-config
+              key: GIT_PASSWORD
+        resources:
+          requests:
+            memory: "64Mi"
+            cpu: "50m"
+          limits:
+            memory: "128Mi"
+            cpu: "100m"
+        volumeMounts:
+        - name: work-volume
+          mountPath: /git
+        - name: git-ssh-key
+          mountPath: /etc/git-secret/ssh
+          readOnly: true
+        securityContext:
+          runAsUser: 65533
+
+      volumes:
+      - name: work-volume
+        emptyDir: {}
+      - name: git-ssh-key
+        secret:
+          secretName: git-ssh-key
+          optional: true
+          items:
+          - key: ssh-private-key
+            path: id_rsa
+
+---
+# MCP Memory Server Service
+apiVersion: v1
+kind: Service
+metadata:
+  name: mcp-memory
+  namespace: context-engine
+  labels:
+    app: context-engine
+    component: mcp-memory
+spec:
+  type: NodePort # Change to LoadBalancer for external access
+  ports:
+  - name: sse
+    port: 8000
+    targetPort: sse
+    
nodePort: 30800 # Optional: specify node port + protocol: TCP + - name: health + port: 18000 + targetPort: health + nodePort: 30801 # Optional: specify node port + protocol: TCP + selector: + app: context-engine + component: mcp-memory \ No newline at end of file From aca0c86bccfdc60607a302111151fb4c41263b62 Mon Sep 17 00:00:00 2001 From: Reese Date: Sun, 26 Oct 2025 02:39:22 +0000 Subject: [PATCH 08/16] feat(delta-upload): add comprehensive remote delta upload system Add complete delta upload system enabling real-time code synchronization across distributed environments. The system includes: - **Upload Service**: FastAPI-based HTTP service for receiving and processing delta bundles with integration to existing indexing pipeline - **Remote Upload Client**: Python client for creating delta bundles, detecting file changes (create/update/delete/move), and uploading with retry logic and sequence tracking - **Enhanced Watch System**: Extended watch_index.py to support both local and remote modes with automatic fallback - **Development Environment**: Complete docker-compose.dev-remote.yml setup simulating Kubernetes CephFS RWX behavior with shared volumes - **Kubernetes Deployment**: Production-ready manifests with persistent volumes, health checks, and proper resource limits - **Comprehensive Documentation**: Architecture docs, design specifications, setup guides, and usage documentation - **Build Tooling**: Development setup script and Make targets for remote upload workflows The delta upload system uses efficient tarball bundles with JSON metadata to transmit only changed files, supporting move detection, hash-based change tracking, and robust error handling with exponential backoff retries. 
--- Dockerfile.upload-service | 55 ++ Makefile | 65 +- delta_upload_architecture.md | 249 ++++++++ delta_upload_design.md | 651 ++++++++++++++++++++ deploy/kubernetes/upload-pvc.yaml | 47 ++ deploy/kubernetes/upload-service.yaml | 125 ++++ docker-compose.dev-remote.yml | 338 +++++++++++ docs/dev-remote-setup.md | 341 +++++++++++ docs/remote_upload.md | 219 +++++++ docs/upload_service.md | 261 ++++++++ docs/usage_guide.md | 597 +++++++++++++++++++ requirements.txt | 3 + scripts/dev-setup.sh | 169 ++++++ scripts/remote_upload_client.py | 828 ++++++++++++++++++++++++++ scripts/upload_service.py | 544 +++++++++++++++++ scripts/watch_index.py | 269 ++++++--- 16 files changed, 4672 insertions(+), 89 deletions(-) create mode 100644 Dockerfile.upload-service create mode 100644 delta_upload_architecture.md create mode 100644 delta_upload_design.md create mode 100644 deploy/kubernetes/upload-pvc.yaml create mode 100644 deploy/kubernetes/upload-service.yaml create mode 100644 docker-compose.dev-remote.yml create mode 100644 docs/dev-remote-setup.md create mode 100644 docs/remote_upload.md create mode 100644 docs/upload_service.md create mode 100644 docs/usage_guide.md create mode 100644 scripts/dev-setup.sh create mode 100644 scripts/remote_upload_client.py create mode 100644 scripts/upload_service.py diff --git a/Dockerfile.upload-service b/Dockerfile.upload-service new file mode 100644 index 00000000..359bdc04 --- /dev/null +++ b/Dockerfile.upload-service @@ -0,0 +1,55 @@ +# Dockerfile for Context-Engine Delta Upload Service +FROM python:3.11-slim + +# Set environment variables +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + git \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Create app directory +WORKDIR /app + +# Copy requirements first for better caching +COPY requirements.txt . 
+ +# Install Python dependencies +RUN pip install --upgrade pip && \ + pip install -r requirements.txt + +# Copy application code +COPY scripts/ ./scripts/ +COPY . . + +# Create work directory for repositories +RUN mkdir -p /work && \ + chmod 755 /work + +# Create non-root user for security +RUN useradd --create-home --shell /bin/bash app && \ + chown -R app:app /app /work +USER app + +# Expose port +EXPOSE 8002 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:8002/health || exit 1 + +# Default environment variables +ENV UPLOAD_SERVICE_HOST=0.0.0.0 \ + UPLOAD_SERVICE_PORT=8002 \ + QDRANT_URL=http://qdrant:6333 \ + WORK_DIR=/work \ + MAX_BUNDLE_SIZE_MB=100 \ + UPLOAD_TIMEOUT_SECS=300 + +# Run the upload service +CMD ["python", "scripts/upload_service.py"] \ No newline at end of file diff --git a/Makefile b/Makefile index 85c5c1ab..403a0a0a 100644 --- a/Makefile +++ b/Makefile @@ -4,8 +4,8 @@ SHELL := /bin/bash # An empty export forces docker to use its default context/socket. export DOCKER_HOST = -.PHONY: help up down logs ps restart rebuild index reindex watch env hybrid bootstrap history rerank-local setup-reranker prune warm health -.PHONY: venv venv-install +.PHONY: help up down logs ps restart rebuild index reindex watch watch-remote env hybrid bootstrap history rerank-local setup-reranker prune warm health test-e2e +.PHONY: venv venv-install dev-remote-up dev-remote-down dev-remote-logs dev-remote-restart dev-remote-bootstrap dev-remote-test dev-remote-client dev-remote-clean venv: ## create local virtualenv .venv python3 -m venv .venv && . 
.venv/bin/activate && pip install -U pip @@ -70,6 +70,23 @@ index-here: ## index the current directory: make index-here [RECREATE=1] [REPO_N watch: ## watch mode: reindex changed files on save (Ctrl+C to stop) docker compose run --rm --entrypoint python indexer /work/scripts/watch_index.py +watch-remote: ## remote watch mode: upload delta bundles to remote server (Ctrl+C to stop) + @echo "Starting remote watch mode..." + @if [ -z "$(REMOTE_UPLOAD_ENDPOINT)" ]; then \ + echo "Error: REMOTE_UPLOAD_ENDPOINT is required"; \ + echo "Usage: make watch-remote REMOTE_UPLOAD_ENDPOINT=http://your-server:8080 [REMOTE_UPLOAD_MAX_RETRIES=3] [REMOTE_UPLOAD_TIMEOUT=30]"; \ + exit 1; \ + fi + @echo "Remote upload endpoint: $(REMOTE_UPLOAD_ENDPOINT)" + @echo "Max retries: $${REMOTE_UPLOAD_MAX_RETRIES:-3}" + @echo "Timeout: $${REMOTE_UPLOAD_TIMEOUT:-30} seconds" + docker compose run --rm --entrypoint python \ + -e REMOTE_UPLOAD_ENABLED=1 \ + -e REMOTE_UPLOAD_ENDPOINT=$(REMOTE_UPLOAD_ENDPOINT) \ + -e REMOTE_UPLOAD_MAX_RETRIES=$${REMOTE_UPLOAD_MAX_RETRIES:-3} \ + -e REMOTE_UPLOAD_TIMEOUT=$${REMOTE_UPLOAD_TIMEOUT:-30} \ + indexer /work/scripts/watch_index.py + rerank: ## multi-query re-ranker helper example docker compose run --rm --entrypoint python indexer /work/scripts/rerank_query.py \ --query "chunk code by lines with overlap for indexing" \ @@ -210,11 +227,53 @@ llamacpp-build-image: ## build custom llama.cpp image with baked model (override # Download a tokenizer.json for micro-chunking (default: BAAI/bge-base-en-v1.5) TOKENIZER_URL ?= https://huggingface.co/BAAI/bge-base-en-v1.5/resolve/main/tokenizer.json TOKENIZER_PATH ?= models/tokenizer.json - tokenizer: ## download tokenizer.json to models/tokenizer.json (override with TOKENIZER_URL/TOKENIZER_PATH) @mkdir -p $(dir $(TOKENIZER_PATH)) @echo "Downloading: $(TOKENIZER_URL) -> $(TOKENIZER_PATH)" && \ curl -L --fail --retry 3 -C - "$(TOKENIZER_URL)" -o "$(TOKENIZER_PATH)" +# --- Development Remote Upload System Targets --- + 
+dev-remote-up: ## start dev-remote stack with upload service + @echo "Starting development remote upload system..." + @mkdir -p dev-workspace/.codebase + docker compose -f docker-compose.dev-remote.yml up -d --build + +dev-remote-down: ## stop dev-remote stack + @echo "Stopping development remote upload system..." + docker compose -f docker-compose.dev-remote.yml down + +dev-remote-logs: ## follow logs for dev-remote stack + docker compose -f docker-compose.dev-remote.yml logs -f --tail=100 + +dev-remote-restart: ## restart dev-remote stack (rebuild) + docker compose -f docker-compose.dev-remote.yml down && docker compose -f docker-compose.dev-remote.yml up -d --build + +dev-remote-bootstrap: env dev-remote-up ## bootstrap dev-remote: up -> wait -> init -> index -> warm + @echo "Bootstrapping development remote upload system..." + ./scripts/wait-for-qdrant.sh + docker compose -f docker-compose.dev-remote.yml run --rm init_payload || true + $(MAKE) tokenizer + docker compose -f docker-compose.dev-remote.yml run --rm indexer --root /work --recreate + $(MAKE) warm || true + $(MAKE) health + +dev-remote-test: ## test remote upload workflow + @echo "Testing remote upload workflow..." + @echo "Upload service should be accessible at http://localhost:8004" + @echo "Health check: curl http://localhost:8004/health" + @echo "Status check: curl 'http://localhost:8004/api/v1/delta/status?workspace_path=/work/test-repo'" + @echo "Test upload: curl -X POST -F 'bundle=@test-bundle.tar.gz' -F 'workspace_path=/work/test-repo' http://localhost:8004/api/v1/delta/upload" + +dev-remote-client: ## start remote upload client for testing + @echo "Starting remote upload client..." + docker compose -f docker-compose.dev-remote.yml --profile client up -d remote_upload_client + +dev-remote-clean: ## clean up dev-remote volumes and containers + @echo "Cleaning up development remote upload system..." 
+ docker compose -f docker-compose.dev-remote.yml down -v + docker volume rm context-engine_shared_workspace context-engine_shared_codebase context-engine_upload_temp context-engine_qdrant_storage_dev_remote 2>/dev/null || true + rm -rf dev-workspace + diff --git a/delta_upload_architecture.md b/delta_upload_architecture.md new file mode 100644 index 00000000..2b9a93ce --- /dev/null +++ b/delta_upload_architecture.md @@ -0,0 +1,249 @@ +# Delta Upload System Architecture + +## System Overview + +```mermaid +graph TB + subgraph "Local Environment" + FS[File System] + W[watch_index.py] + CQ[ChangeQueue] + DC[Delta Creator] + LC[Local Cache] + end + + subgraph "Delta Upload Service" + API[HTTP API] + BP[Bundle Processor] + Q[Qdrant Client] + WS[Workspace State] + end + + subgraph "Storage" + S3[Bundle Storage] + QDR[Qdrant DB] + end + + FS --> W + W --> CQ + CQ --> DC + DC --> LC + DC --> API + API --> BP + BP --> Q + BP --> WS + Q --> QDR + BP --> S3 +``` + +## Delta Bundle Creation Flow + +```mermaid +sequenceDiagram + participant FS as File System + participant W as watch_index.py + participant CQ as ChangeQueue + participant DC as Delta Creator + participant LC as Local Cache + participant API as Upload API + + FS->>W: File change event + W->>CQ: Add path to queue + CQ->>CQ: Debounce changes + CQ->>DC: Flush batched changes + DC->>LC: Check cached hashes + LC-->>DC: Return cached hashes + DC->>FS: Read file contents + DC->>DC: Detect change types + DC->>DC: Create delta bundle + DC->>LC: Save bundle locally + DC->>API: Upload bundle + API-->>DC: Acknowledge receipt + DC->>LC: Mark as acknowledged +``` + +## Change Detection Algorithm + +```mermaid +flowchart TD + A[Start: File Changes Detected] --> B[Get Cached Hashes] + B --> C{File Exists?} + C -->|No| D[File Deleted] + C -->|Yes| E[Calculate Current Hash] + E --> F{Hash Changed?} + F -->|No| G[Unchanged] + F -->|Yes| H{Has Cached Hash?} + H -->|No| I[File Created] + H -->|Yes| J[File Updated] + D --> K[Add to 
Deleted List] + I --> L[Add to Created List] + J --> M[Add to Updated List] + G --> N[Skip] + K --> O[Detect Moves] + L --> O + M --> O + N --> O + O --> P[Create Delta Bundle] +``` + +## Error Recovery Flow + +```mermaid +stateDiagram-v2 + [*] --> UploadBundle + UploadBundle --> Success: Upload OK + UploadBundle --> RetryableError: Network/Temp Error + UploadBundle --> SequenceError: Sequence Mismatch + UploadBundle --> FatalError: Permanent Failure + + RetryableError --> WaitRetry + WaitRetry --> UploadBundle: Retry + + SequenceError --> RequestRecovery + RequestRecovery --> ApplyRecovered: Recovery OK + RequestRecovery --> FatalError: Recovery Failed + ApplyRecovered --> UploadBundle + + Success --> [*] + FatalError --> [*] +``` + +## Integration Points + +```mermaid +graph LR + subgraph "Existing Components" + WI[watch_index.py] + IC[ingest_code.py] + WS[workspace_state.py] + Q[Qdrant Client] + end + + subgraph "New Delta Components" + DQ[DeltaChangeQueue] + DC[DeltaCreator] + DS[DeltaService] + DP[DeltaProcessor] + end + + WI --> DQ + DQ --> DC + DC --> DS + DS --> DP + DP --> IC + DP --> WS + IC --> Q +``` + +## Data Flow Architecture + +```mermaid +graph TB + subgraph "Client Side" + A[File Changes] --> B[Change Detection] + B --> C[Delta Bundle Creation] + C --> D[Local Persistence] + D --> E[HTTP Upload] + end + + subgraph "Server Side" + E --> F[Bundle Reception] + F --> G[Validation] + G --> H[Processing Queue] + H --> I[File Operations] + I --> J[Qdrant Updates] + I --> K[State Updates] + end + + subgraph "Recovery Flow" + L[Sequence Mismatch] --> M[Recovery Request] + M --> N[Missing Bundles] + N --> O[Replay Operations] + end + + F -.-> L +``` + +## Component Interactions + +```mermaid +classDiagram + class ChangeQueue { + +add(Path) + +_flush() + -_lock: threading.Lock + -_paths: Set[Path] + -_timer: threading.Timer + } + + class DeltaChangeQueue { + +add(Path) + +_flush() + -detect_changes() + -create_bundle() + -upload_bundle() + } + + class 
DeltaCreator { + +create_bundle(changes) + +detect_file_changes() + +detect_moves() + -calculate_hashes() + } + + class DeltaService { + +upload_bundle() + +get_status() + +recover_bundles() + -validate_bundle() + -process_operations() + } + + class DeltaProcessor { + +process_bundle() + +process_created() + +process_updated() + +process_deleted() + +process_moved() + } + + ChangeQueue <|-- DeltaChangeQueue + DeltaChangeQueue --> DeltaCreator + DeltaCreator --> DeltaService + DeltaService --> DeltaProcessor +``` + +## Deployment Architecture + +```mermaid +graph TB + subgraph "Development Environment" + DEV_FS[Local File System] + DEV_WATCH[watch_index.py] + DEV_DELTA[Delta Client] + DEV_API[Local Delta API] + end + + subgraph "Production Environment" + PROD_FS[Shared File System] + PROD_WATCH[watch_index.py Cluster] + PROD_DELTA[Delta Client Cluster] + LB[Load Balancer] + PROD_API[Delta API Cluster] + REDIS[Redis Cache] + S3[Object Storage] + end + + DEV_FS --> DEV_WATCH + DEV_WATCH --> DEV_DELTA + DEV_DELTA --> DEV_API + + PROD_FS --> PROD_WATCH + PROD_WATCH --> PROD_DELTA + PROD_DELTA --> LB + LB --> PROD_API + PROD_API --> REDIS + PROD_API --> S3 +``` + +This architecture provides a comprehensive view of how the delta upload system integrates with the existing Context-Engine infrastructure while providing scalability, reliability, and efficient change detection. \ No newline at end of file diff --git a/delta_upload_design.md b/delta_upload_design.md new file mode 100644 index 00000000..e1e0c12d --- /dev/null +++ b/delta_upload_design.md @@ -0,0 +1,651 @@ +# Delta Upload Format and Protocol Design + +## Overview + +This document specifies a delta upload format and protocol for real-time code ingestion in Context-Engine, designed to efficiently transmit only changed files from a local watch client to a remote upload service. + +## 1. 
Delta Bundle Format Specification + +### 1.1 Bundle Structure + +A delta bundle is a tarball (`.tar.gz`) containing: + +``` +delta-bundle.tar.gz +├── manifest.json # Bundle metadata and file operations +├── files/ # Directory containing file content +│ ├── created/ # New files +│ ├── updated/ # Modified files +│ └── moved/ # Moved files (at destination) +└── metadata/ # File metadata and hashes + ├── hashes.json # Content hashes for all files + └── operations.json # Detailed operation metadata +``` + +### 1.2 Manifest Format (`manifest.json`) + +```json +{ + "version": "1.0", + "bundle_id": "uuid-v4", + "workspace_path": "/absolute/path/to/workspace", + "collection_name": "workspace-collection", + "created_at": "2025-01-26T01:55:00.000Z", + "sequence_number": 42, + "parent_sequence": 41, + "operations": { + "created": 5, + "updated": 3, + "deleted": 2, + "moved": 1 + }, + "total_files": 11, + "total_size_bytes": 1048576, + "compression": "gzip", + "encoding": "utf-8" +} +``` + +### 1.3 File Operations Format + +#### Created Files (`files/created/`) +- Path: `files/created/relative/path/to/file.ext` +- Content: Full file content +- Metadata: Stored in `metadata/operations.json` + +#### Updated Files (`files/updated/`) +- Path: `files/updated/relative/path/to/file.ext` +- Content: Full file content (simpler than diff-based approach) +- Metadata: Stored in `metadata/operations.json` + +#### Moved Files (`files/moved/`) +- Path: `files/moved/destination/path/to/file.ext` +- Content: Full file content at destination +- Metadata: Includes source path in `metadata/operations.json` + +#### Deleted Files +- No content in bundle +- Metadata only in `metadata/operations.json` + +### 1.4 Operations Metadata (`metadata/operations.json`) + +```json +{ + "operations": [ + { + "operation": "created", + "path": "src/new_file.py", + "relative_path": "src/new_file.py", + "absolute_path": "/workspace/src/new_file.py", + "size_bytes": 1024, + "content_hash": 
"sha1:6dcd4ce23d88e2ee9568ba546c007c63d9131c1b",
HTTP API Contract + +### 2.1 Upload Endpoint + +``` +POST /api/v1/delta/upload +Content-Type: multipart/form-data +``` + +#### Request Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| bundle | File | Yes | Delta bundle tarball | +| workspace_path | String | Yes | Absolute workspace path | +| collection_name | String | No | Override collection name | +| sequence_number | Integer | No | Expected sequence number | +| force | Boolean | No | Force upload even if sequence mismatch | + +#### Response Format + +```json +{ + "success": true, + "bundle_id": "uuid-v4", + "sequence_number": 42, + "processed_operations": { + "created": 5, + "updated": 3, + "deleted": 2, + "moved": 1, + "skipped": 0, + "failed": 0 + }, + "processing_time_ms": 1250, + "indexed_points": 156, + "collection_name": "workspace-collection", + "next_sequence": 43 +} +``` + +#### Error Response + +```json +{ + "success": false, + "error": { + "code": "SEQUENCE_MISMATCH", + "message": "Expected sequence 41, got 43", + "expected_sequence": 41, + "received_sequence": 43, + "retry_after": 5000 + } +} +``` + +### 2.2 Status Endpoint + +``` +GET /api/v1/delta/status?workspace_path=/workspace +``` + +#### Response + +```json +{ + "workspace_path": "/workspace", + "collection_name": "workspace-collection", + "last_sequence": 41, + "last_upload": "2025-01-26T01:50:00.000Z", + "pending_operations": 0, + "status": "ready", + "server_info": { + "version": "1.0.0", + "max_bundle_size_mb": 100, + "supported_formats": ["tar.gz"] + } +} +``` + +### 2.3 Recovery Endpoint + +``` +POST /api/v1/delta/recover +Content-Type: application/json +``` + +#### Request + +```json +{ + "workspace_path": "/workspace", + "from_sequence": 38, + "to_sequence": 42 +} +``` + +#### Response + +```json +{ + "success": true, + "recovered_bundles": [ + { + "sequence": 39, + "bundle_id": "uuid-39", + "operations": {"created": 2, "updated": 1} + } + ], + "next_sequence": 43 +} +``` + 
+## 3. Change Detection Algorithm + +### 3.1 Integration with Existing Hash Cache + +The delta system leverages the existing hash-based caching in [`workspace_state.py`](scripts/workspace_state.py:304-310): + +```python +def detect_file_changes(workspace_path: str, changed_paths: List[Path]) -> Dict[str, Any]: + """ + Detect what type of changes occurred for each file path. + + Returns: + { + "created": [Path], + "updated": [Path], + "deleted": [Path], + "moved": [(source: Path, dest: Path)], + "unchanged": [Path] + } + """ + changes = { + "created": [], + "updated": [], + "deleted": [], + "moved": [], + "unchanged": [] + } + + for path in changed_paths: + abs_path = str(path.resolve()) + cached_hash = get_cached_file_hash(workspace_path, abs_path) + + if not path.exists(): + # File was deleted + if cached_hash: + changes["deleted"].append(path) + else: + # File exists - calculate current hash + try: + with open(path, 'rb') as f: + content = f.read() + current_hash = hashlib.sha1(content).hexdigest() + + if not cached_hash: + # New file + changes["created"].append(path) + elif cached_hash != current_hash: + # Modified file + changes["updated"].append(path) + else: + # Unchanged (might be a move detection candidate) + changes["unchanged"].append(path) + + # Update cache + set_cached_file_hash(workspace_path, abs_path, current_hash) + except Exception: + # Skip files that can't be read + continue + + # Detect moves by looking for files with same content hash + # but different paths (requires additional tracking) + changes["moved"] = detect_moves(changes["created"], changes["deleted"]) + + return changes +``` + +### 3.2 Move Detection Algorithm + +```python +def detect_moves(created_files: List[Path], deleted_files: List[Path]) -> List[Tuple[Path, Path]]: + """ + Detect file moves by matching content hashes between created and deleted files. 
+ """ + moves = [] + deleted_hashes = {} + + # Build hash map for deleted files + for deleted_path in deleted_files: + try: + with open(deleted_path, 'rb') as f: + content = f.read() + file_hash = hashlib.sha1(content).hexdigest() + deleted_hashes[file_hash] = deleted_path + except Exception: + continue + + # Match created files with deleted files by hash + for created_path in created_files: + try: + with open(created_path, 'rb') as f: + content = f.read() + file_hash = hashlib.sha1(content).hexdigest() + + if file_hash in deleted_hashes: + source_path = deleted_hashes[file_hash] + moves.append((source_path, created_path)) + # Remove from consideration + del deleted_hashes[file_hash] + except Exception: + continue + + return moves +``` + +### 3.3 Integration with ChangeQueue + +The delta system integrates with the existing [`ChangeQueue`](scripts/watch_index.py:45-66) debouncing pattern: + +```python +class DeltaChangeQueue(ChangeQueue): + """Extended ChangeQueue that creates delta bundles.""" + + def __init__(self, process_cb, workspace_path: str, upload_endpoint: str): + super().__init__(process_cb) + self.workspace_path = workspace_path + self.upload_endpoint = upload_endpoint + self.sequence_number = self._get_last_sequence() + + def _flush(self): + """Override to create delta bundle before processing.""" + with self._lock: + paths = list(self._paths) + self._paths.clear() + self._timer = None + + # Detect changes and create delta bundle + changes = detect_file_changes(self.workspace_path, paths) + if self._has_meaningful_changes(changes): + bundle = self._create_delta_bundle(changes) + self._upload_bundle(bundle) + + # Call original processing + self._process_cb(paths) + + def _has_meaningful_changes(self, changes: Dict[str, List]) -> bool: + """Check if changes warrant a delta upload.""" + total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged") + return total_changes > 0 +``` + +## 4. 
Error Handling and Recovery Strategy + +### 4.1 Retry Mechanism + +```python +class DeltaUploadClient: + def __init__(self, endpoint: str, max_retries: int = 3): + self.endpoint = endpoint + self.max_retries = max_retries + self.retry_delays = [1000, 2000, 5000] # ms + + def upload_bundle(self, bundle_path: str, metadata: Dict) -> bool: + for attempt in range(self.max_retries + 1): + try: + response = self._send_bundle(bundle_path, metadata) + if response["success"]: + return True + + # Handle specific error cases + if response["error"]["code"] == "SEQUENCE_MISMATCH": + return self._handle_sequence_mismatch(response, metadata) + + except Exception as e: + if attempt == self.max_retries: + self._log_failure(e, metadata) + return False + + # Wait before retry + if attempt < len(self.retry_delays): + time.sleep(self.retry_delays[attempt] / 1000) + + return False +``` + +### 4.2 Sequence Number Recovery + +```python +def _handle_sequence_mismatch(self, error_response: Dict, metadata: Dict) -> bool: + """Handle sequence number mismatch by recovering missing bundles.""" + expected_seq = error_response["error"]["expected_sequence"] + current_seq = metadata["sequence_number"] + + # Try to recover missing bundles + recovery_response = self._request_recovery( + metadata["workspace_path"], + from_sequence=expected_seq, + to_sequence=current_seq - 1 + ) + + if recovery_response["success"]: + # Apply recovered bundles locally + for bundle_info in recovery_response["recovered_bundles"]: + if not self._apply_recovered_bundle(bundle_info): + return False + + # Retry original upload + return self._send_bundle(metadata["bundle_path"], metadata)["success"] + + return False +``` + +### 4.3 Bundle Persistence + +```python +class BundlePersistence: + """Local persistence for delta bundles to enable recovery.""" + + def __init__(self, workspace_path: str): + self.workspace_path = workspace_path + self.bundle_dir = Path(workspace_path) / ".codebase" / "delta_bundles" + 
self.bundle_dir.mkdir(exist_ok=True) + + def save_bundle(self, bundle_path: str, metadata: Dict) -> str: + """Save bundle locally with metadata.""" + bundle_id = metadata["bundle_id"] + saved_path = self.bundle_dir / f"{bundle_id}.tar.gz" + metadata_path = self.bundle_dir / f"{bundle_id}.json" + + shutil.copy2(bundle_path, saved_path) + with open(metadata_path, 'w') as f: + json.dump(metadata, f, indent=2) + + return str(saved_path) + + def get_pending_bundles(self) -> List[Dict]: + """Get bundles that haven't been acknowledged by server.""" + pending = [] + for metadata_file in self.bundle_dir.glob("*.json"): + try: + with open(metadata_file) as f: + metadata = json.load(f) + if not metadata.get("acknowledged", False): + pending.append(metadata) + except Exception: + continue + return pending +``` + +## 5. Integration Points with Existing Code + +### 5.1 Integration with watch_index.py + +```python +# Modified IndexHandler to support delta uploads +class DeltaIndexHandler(IndexHandler): + def __init__(self, root: Path, queue: ChangeQueue, client: QdrantClient, + collection: str, delta_client: DeltaUploadClient): + super().__init__(root, queue, client, collection) + self.delta_client = delta_client + + def _maybe_enqueue(self, src_path: str): + """Override to add delta queue processing.""" + super()._maybe_enqueue(src_path) + # Delta processing happens in the extended ChangeQueue + + def on_deleted(self, event): + """Override to handle deletions in delta system.""" + super().on_deleted(event) + # Delta queue will handle the deletion processing +``` + +### 5.2 Integration with ingest_code.py + +```python +# Extend ingest_code.py to process delta bundles +def process_delta_bundle(bundle_path: str, workspace_path: str, + collection: str) -> Dict[str, Any]: + """Process a delta bundle and update Qdrant collection.""" + + # Extract bundle + with tempfile.TemporaryDirectory() as temp_dir: + extract_path = Path(temp_dir) + with tarfile.open(bundle_path, 'r:gz') as tar: + 
tar.extractall(extract_path, filter="data")  # "data" filter (Python 3.12+) rejects path-traversal members (CVE-2007-4559) in untrusted bundles
Implement basic error handling and response formats +4. Add unit tests for bundle format validation + +### Phase 2: Change Detection Integration (Week 2) +1. Integrate with existing hash cache system +2. Implement move detection algorithm +3. Extend ChangeQueue for delta bundle creation +4. Add integration tests with watch_index.py + +### Phase 3: Error Handling and Recovery (Week 3) +1. Implement retry mechanism with exponential backoff +2. Add sequence number recovery +3. Implement bundle persistence +4. Add comprehensive error logging and monitoring + +### Phase 4: Production Integration (Week 4) +1. Integrate with ingest_code.py for bundle processing +2. Extend workspace_state.py for delta tracking +3. Add performance optimization and batching +4. Implement monitoring and alerting +5. Add end-to-end integration tests + +### Phase 5: Performance and Scaling (Week 5) +1. Optimize bundle compression and size +2. Implement parallel processing for large bundles +3. Add bandwidth optimization for remote uploads +4. Performance testing and tuning + +## 7. Configuration and Environment Variables + +```bash +# Delta upload configuration +DELTA_UPLOAD_ENABLED=true +DELTA_UPLOAD_ENDPOINT=http://delta-server:8002/api/v1/delta +DELTA_MAX_BUNDLE_SIZE_MB=100 +DELTA_BATCH_SIZE_FILES=50 +DELTA_DEBOUNCE_SECS=2.0 + +# Retry and recovery +DELTA_MAX_RETRIES=3 +DELTA_RETRY_DELAYS_MS=1000,2000,5000 +DELTA_PERSIST_BUNDLES=true +DELTA_BUNDLE_RETENTION_DAYS=7 + +# Performance tuning +DELTA_COMPRESSION_LEVEL=6 +DELTA_PARALLEL_UPLOADS=2 +DELTA_CHUNK_SIZE_BYTES=1048576 +``` + +## 8. Security Considerations + +1. **Authentication**: Add API key or token-based authentication +2. **Authorization**: Validate workspace access permissions +3. **Input Validation**: Validate bundle format and file paths +4. **Rate Limiting**: Implement upload rate limits per workspace +5. **Audit Logging**: Log all delta operations for compliance + +## 9. Monitoring and Observability + +1. 
**Metrics**: Track bundle size, processing time, success rates +2. **Logging**: Structured logging for all delta operations +3. **Health Checks**: Endpoint health monitoring +4. **Alerting**: Alert on failed uploads or processing errors +5. **Dashboards**: Visual monitoring of delta system performance + +This design provides a comprehensive foundation for implementing delta uploads in Context-Engine while leveraging existing infrastructure and maintaining compatibility with current file processing workflows. \ No newline at end of file diff --git a/deploy/kubernetes/upload-pvc.yaml b/deploy/kubernetes/upload-pvc.yaml new file mode 100644 index 00000000..b149e8f3 --- /dev/null +++ b/deploy/kubernetes/upload-pvc.yaml @@ -0,0 +1,47 @@ +--- +# Persistent Volume Claim for workspace storage (CephFS RWX) +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: upload-work-pvc + namespace: context-engine + labels: + app: context-engine + component: upload-service + type: storage +spec: + accessModes: + - ReadWriteMany # CephFS supports RWX for multiple pods + storageClassName: rook-cephfs # Adjust based on your CephFS storage class + resources: + requests: + storage: 10Gi # Adjust size based on your needs + # Optional: selector for specific PV + # selector: + # matchLabels: + # app: context-engine + # component: upload-work + +--- +# Persistent Volume Claim for codebase metadata storage (CephFS RWX) +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: upload-codebase-pvc + namespace: context-engine + labels: + app: context-engine + component: upload-service + type: storage +spec: + accessModes: + - ReadWriteMany # CephFS supports RWX for multiple pods + storageClassName: rook-cephfs # Adjust based on your CephFS storage class + resources: + requests: + storage: 5Gi # Smaller size for metadata/cache + # Optional: selector for specific PV + # selector: + # matchLabels: + # app: context-engine + # component: upload-codebase diff --git 
a/deploy/kubernetes/upload-service.yaml b/deploy/kubernetes/upload-service.yaml new file mode 100644 index 00000000..0e457e4c --- /dev/null +++ b/deploy/kubernetes/upload-service.yaml @@ -0,0 +1,125 @@ +--- +# Delta Upload Service Deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: upload-service + namespace: context-engine + labels: + app: context-engine + component: upload-service +spec: + replicas: 1 + selector: + matchLabels: + app: context-engine + component: upload-service + template: + metadata: + labels: + app: context-engine + component: upload-service + spec: + containers: + - name: upload-service + image: context-engine-upload-service # Use service-specific image name + imagePullPolicy: IfNotPresent + command: ["python", "scripts/upload_service.py"] + ports: + - name: http + containerPort: 8002 + protocol: TCP + env: + - name: QDRANT_URL + valueFrom: + configMapKeyRef: + name: context-engine-config + key: QDRANT_URL + - name: COLLECTION_NAME + valueFrom: + configMapKeyRef: + name: context-engine-config + key: COLLECTION_NAME + - name: UPLOAD_SERVICE_HOST + value: "0.0.0.0" + - name: UPLOAD_SERVICE_PORT + value: "8002" + - name: WORK_DIR + value: "/work" + - name: MAX_BUNDLE_SIZE_MB + value: "100" + - name: UPLOAD_TIMEOUT_SECS + value: "300" + - name: EMBEDDING_MODEL + valueFrom: + configMapKeyRef: + name: context-engine-config + key: EMBEDDING_MODEL + - name: USE_TREE_SITTER + valueFrom: + configMapKeyRef: + name: context-engine-config + key: USE_TREE_SITTER + - name: INDEX_SEMANTIC_CHUNKS + valueFrom: + configMapKeyRef: + name: context-engine-config + key: INDEX_SEMANTIC_CHUNKS + - name: INDEX_MICRO_CHUNKS + valueFrom: + configMapKeyRef: + name: context-engine-config + key: INDEX_MICRO_CHUNKS + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "2Gi" + cpu: "1000m" + volumeMounts: + - name: work-volume + mountPath: /work + - name: codebase-volume + mountPath: /work/.codebase + livenessProbe: + httpGet: + path: 
/health + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 10 + periodSeconds: 5 + volumes: + - name: work-volume + persistentVolumeClaim: + claimName: upload-work-pvc + - name: codebase-volume + persistentVolumeClaim: + claimName: upload-codebase-pvc + +--- +# Delta Upload Service Service +apiVersion: v1 +kind: Service +metadata: + name: upload-service + namespace: context-engine + labels: + app: context-engine + component: upload-service +spec: + type: NodePort # Change to LoadBalancer for external access + ports: + - name: http + port: 8002 + targetPort: http + nodePort: 30804 # Optional: specify node port + protocol: TCP + selector: + app: context-engine + component: upload-service \ No newline at end of file diff --git a/docker-compose.dev-remote.yml b/docker-compose.dev-remote.yml new file mode 100644 index 00000000..9dcd9604 --- /dev/null +++ b/docker-compose.dev-remote.yml @@ -0,0 +1,338 @@ +# Development Docker Compose for Remote Upload System Testing +# This file simulates the Kubernetes environment with shared volumes (CephFS RWX) +# for local testing of the complete remote upload workflow + +version: '3.8' + +services: + # Qdrant vector database - same as base compose + qdrant: + image: qdrant/qdrant:latest + container_name: qdrant-db-dev-remote + ports: + - "6333:6333" + - "6334:6334" + volumes: + - qdrant_storage_dev_remote:/qdrant/storage + networks: + - dev-remote-network + + # MCP search service - same as base compose + mcp: + build: + context: . 
+ dockerfile: Dockerfile.mcp + container_name: mcp-search-dev-remote + depends_on: + - qdrant + env_file: + - .env + environment: + - FASTMCP_HOST=${FASTMCP_HOST} + - FASTMCP_PORT=${FASTMCP_PORT} + - QDRANT_URL=${QDRANT_URL} + - COLLECTION_NAME=${COLLECTION_NAME} + - EMBEDDING_MODEL=${EMBEDDING_MODEL} + - EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER} + - TOOL_STORE_DESCRIPTION=${TOOL_STORE_DESCRIPTION} + - TOOL_FIND_DESCRIPTION=${TOOL_FIND_DESCRIPTION} + - FASTMCP_HEALTH_PORT=18000 + ports: + - "18000:18000" + - "8000:8000" + volumes: + - shared_workspace:/work:ro + networks: + - dev-remote-network + + # MCP indexer service - same as base compose + mcp_indexer: + build: + context: . + dockerfile: Dockerfile.mcp-indexer + container_name: mcp-indexer-dev-remote + depends_on: + - qdrant + env_file: + - .env + environment: + - FASTMCP_HEALTH_PORT=18001 + - FASTMCP_HOST=${FASTMCP_HOST} + - FASTMCP_INDEXER_PORT=${FASTMCP_INDEXER_PORT} + - QDRANT_URL=${QDRANT_URL} + ports: + - "${FASTMCP_INDEXER_PORT:-8001}:8001" + - "18001:18001" + volumes: + - shared_workspace:/work + - shared_codebase:/work/.codebase + networks: + - dev-remote-network + + # MCP HTTP search service - same as base compose + mcp_http: + build: + context: . 
+ dockerfile: Dockerfile.mcp + container_name: mcp-search-http-dev-remote + depends_on: + - qdrant + env_file: + - .env + environment: + - FASTMCP_HOST=${FASTMCP_HOST} + - FASTMCP_PORT=8000 + - FASTMCP_TRANSPORT=${FASTMCP_HTTP_TRANSPORT} + - QDRANT_URL=${QDRANT_URL} + - COLLECTION_NAME=${COLLECTION_NAME} + - EMBEDDING_MODEL=${EMBEDDING_MODEL} + - EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER} + - TOOL_STORE_DESCRIPTION=${TOOL_STORE_DESCRIPTION} + - TOOL_FIND_DESCRIPTION=${TOOL_FIND_DESCRIPTION} + - FASTMCP_HEALTH_PORT=18000 + ports: + - "${FASTMCP_HTTP_HEALTH_PORT:-18002}:18000" + - "${FASTMCP_HTTP_PORT:-8002}:8000" + volumes: + - shared_workspace:/work:ro + networks: + - dev-remote-network + + # MCP HTTP indexer service - same as base compose + mcp_indexer_http: + build: + context: . + dockerfile: Dockerfile.mcp-indexer + container_name: mcp-indexer-http-dev-remote + depends_on: + - qdrant + env_file: + - .env + environment: + - FASTMCP_HOST=${FASTMCP_HOST} + - FASTMCP_INDEXER_PORT=8001 + - FASTMCP_TRANSPORT=${FASTMCP_HTTP_TRANSPORT} + - QDRANT_URL=${QDRANT_URL} + - FASTMCP_HEALTH_PORT=18001 + ports: + - "${FASTMCP_INDEXER_HTTP_PORT:-8003}:8001" + - "${FASTMCP_INDEXER_HTTP_HEALTH_PORT:-18003}:18001" + volumes: + - shared_workspace:/work + - shared_codebase:/work/.codebase + networks: + - dev-remote-network + + # Llama.cpp decoder service - same as base compose + llamacpp: + image: ghcr.io/ggerganov/llama.cpp:server + container_name: llama-decoder-dev-remote + environment: + - LLAMA_ARG_MODEL=/models/model.gguf + - LLAMA_ARG_CTX_SIZE=8192 + - LLAMA_ARG_HOST=0.0.0.0 + - LLAMA_ARG_PORT=8080 + ports: + - "8080:8080" + volumes: + - ./models:/models:ro + command: ["--model", "/models/model.gguf", "--host", "0.0.0.0", "--port", "8080", "--no-warmup"] + networks: + - dev-remote-network + + # Indexer service - modified for shared volumes + indexer: + build: + context: . 
+ dockerfile: Dockerfile.indexer + container_name: indexer-dev-remote + depends_on: + - qdrant + env_file: + - .env + environment: + - QDRANT_URL=${QDRANT_URL} + - COLLECTION_NAME=${COLLECTION_NAME} + - EMBEDDING_MODEL=${EMBEDDING_MODEL} + working_dir: /work + volumes: + - shared_workspace:/work:ro + - shared_codebase:/work/.codebase:rw + entrypoint: ["python", "/app/scripts/ingest_code.py"] + networks: + - dev-remote-network + + # Watcher service - modified for shared volumes + watcher: + build: + context: . + dockerfile: Dockerfile.indexer + container_name: watcher-dev-remote + depends_on: + - qdrant + env_file: + - .env + environment: + - QDRANT_URL=${QDRANT_URL} + - COLLECTION_NAME=${COLLECTION_NAME} + - EMBEDDING_MODEL=${EMBEDDING_MODEL} + - WATCH_ROOT=/work + - QDRANT_TIMEOUT=60 + - MAX_MICRO_CHUNKS_PER_FILE=200 + - INDEX_UPSERT_BATCH=128 + - INDEX_UPSERT_RETRIES=5 + - WATCH_DEBOUNCE_SECS=1.5 + working_dir: /work + volumes: + - shared_workspace:/work:ro + - shared_codebase:/work/.codebase:rw + entrypoint: ["python", "/app/scripts/watch_index.py"] + networks: + - dev-remote-network + + # Init payload service - modified for shared volumes + init_payload: + build: + context: . + dockerfile: Dockerfile.indexer + container_name: init-payload-dev-remote + depends_on: + - qdrant + env_file: + - .env + environment: + - QDRANT_URL=${QDRANT_URL} + - COLLECTION_NAME=${COLLECTION_NAME} + working_dir: /work + volumes: + - shared_workspace:/work:ro + - shared_codebase:/work/.codebase:rw + entrypoint: ["python", "/app/scripts/create_indexes.py"] + networks: + - dev-remote-network + + # NEW: Upload Service for Remote Upload System + upload_service: + build: + context: . 
+ dockerfile: Dockerfile.upload-service + container_name: upload-service-dev-remote + depends_on: + - qdrant + env_file: + - .env + environment: + # Upload service configuration + - UPLOAD_SERVICE_HOST=0.0.0.0 + - UPLOAD_SERVICE_PORT=8002 + - QDRANT_URL=${QDRANT_URL} + - WORK_DIR=/work + - MAX_BUNDLE_SIZE_MB=100 + - UPLOAD_TIMEOUT_SECS=300 + + # Indexing configuration + - COLLECTION_NAME=${COLLECTION_NAME} + - EMBEDDING_MODEL=${EMBEDDING_MODEL} + - EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER} + - USE_TREE_SITTER=${USE_TREE_SITTER} + - INDEX_SEMANTIC_CHUNKS=${INDEX_SEMANTIC_CHUNKS} + - INDEX_MICRO_CHUNKS=${INDEX_MICRO_CHUNKS} + + # Remote upload mode configuration + - REMOTE_UPLOAD_ENABLED=1 + - REMOTE_UPLOAD_MODE=development + - REMOTE_UPLOAD_DEBUG=1 + + # Qdrant configuration + - QDRANT_TIMEOUT=${QDRANT_TIMEOUT} + - MAX_MICRO_CHUNKS_PER_FILE=${MAX_MICRO_CHUNKS_PER_FILE} + - INDEX_UPSERT_BATCH=${INDEX_UPSERT_BATCH} + - INDEX_UPSERT_RETRIES=${INDEX_UPSERT_RETRIES} + ports: + - "8004:8002" # Map to different host port to avoid conflicts + - "18004:18000" # Health check port + volumes: + - shared_workspace:/work + - shared_codebase:/work/.codebase + - upload_temp:/tmp/uploads + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8002/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + restart: unless-stopped + networks: + - dev-remote-network + + # NEW: Remote Upload Client for Testing + remote_upload_client: + build: + context: . 
+ dockerfile: Dockerfile.indexer # Reuse indexer image + container_name: remote-upload-client-dev-remote + depends_on: + - upload_service + env_file: + - .env + environment: + # Remote upload client configuration + - REMOTE_UPLOAD_ENABLED=1 + - REMOTE_UPLOAD_ENDPOINT=http://upload_service:8002 + - REMOTE_UPLOAD_MAX_RETRIES=3 + - REMOTE_UPLOAD_TIMEOUT=30 + - REMOTE_UPLOAD_DEBUG=1 + + # Watcher configuration for remote mode + - WATCH_ROOT=/work + - QDRANT_URL=${QDRANT_URL} + - COLLECTION_NAME=${COLLECTION_NAME} + - EMBEDDING_MODEL=${EMBEDDING_MODEL} + - QDRANT_TIMEOUT=60 + - MAX_MICRO_CHUNKS_PER_FILE=200 + - INDEX_UPSERT_BATCH=128 + - INDEX_UPSERT_RETRIES=5 + - WATCH_DEBOUNCE_SECS=1.5 + working_dir: /work + volumes: + - shared_workspace:/work:ro + - shared_codebase:/work/.codebase:rw + entrypoint: ["python", "/app/scripts/remote_upload_client.py"] + profiles: + - client # Only start when explicitly requested + networks: + - dev-remote-network + +# Shared volumes to simulate CephFS RWX PVC behavior +volumes: + # Main workspace volume - simulates CephFS RWX for repository storage + shared_workspace: + driver: local + driver_opts: + type: none + o: bind + device: ${HOST_INDEX_PATH:-./dev-workspace} + + # Codebase metadata volume - simulates CephFS RWX for indexing metadata + shared_codebase: + driver: local + driver_opts: + type: none + o: bind + device: ${HOST_INDEX_PATH:-./dev-workspace}/.codebase + + # Temporary upload storage + upload_temp: + driver: local + + # Qdrant storage - separate from base compose to avoid conflicts + qdrant_storage_dev_remote: + driver: local + +# Custom network for service discovery +networks: + dev-remote-network: + driver: bridge + ipam: + config: + - subnet: 172.20.0.0/16 \ No newline at end of file diff --git a/docs/dev-remote-setup.md b/docs/dev-remote-setup.md new file mode 100644 index 00000000..ac2090d9 --- /dev/null +++ b/docs/dev-remote-setup.md @@ -0,0 +1,341 @@ +# Development Remote Upload System Setup + +This guide covers 
setting up and using the development environment for testing the Context-Engine remote upload system with shared volumes that simulate the Kubernetes CephFS RWX PVC behavior. + +## Overview + +The `docker-compose.dev-remote.yml` file provides a complete local development environment that simulates the Kubernetes deployment with: + +- **Shared Volumes**: Simulates CephFS ReadWriteMany (RWX) PVC behavior +- **Upload Service**: HTTP service for receiving delta bundles +- **All Existing Services**: Qdrant, MCP servers, indexer, watcher, etc. +- **Service Discovery**: Proper networking between all services +- **Development Tools**: Easy testing and debugging capabilities + +## Quick Start + +### 1. Initial Setup + +```bash +# Run the development setup script +./scripts/dev-setup.sh + +# Or manually: +mkdir -p dev-workspace/.codebase +cp .env.example .env # if not exists +``` + +### 2. Start the System + +```bash +# Bootstrap the complete system (recommended) +make dev-remote-bootstrap + +# Or start services step by step: +make dev-remote-up +``` + +### 3. Test Your Repository + +```bash +# 1. Copy your repository to the workspace +cp -r /path/to/your/repo dev-workspace/your-repo + +# 2. Test the upload service +make dev-remote-test + +# 3. Check service health +curl http://localhost:8004/health +``` + +## Architecture + +### Shared Volume Structure + +The development environment uses shared volumes to simulate Kubernetes CephFS behavior: + +``` +dev-workspace/ # Main workspace (simulates CephFS RWX) +├── your-repo/ # Your repository code +├── .codebase/ # Indexing metadata and cache +└── ... 
# Other repositories +``` + +### Service Configuration + +| Service | Port | Purpose | Volumes | +|---------|------|---------|---------| +| upload_service | 8004 | Delta upload HTTP API | shared_workspace, shared_codebase | +| qdrant | 6333/6334 | Vector database | qdrant_storage_dev_remote | +| mcp | 8000 | MCP search server (SSE) | shared_workspace (ro) | +| mcp_indexer | 8001 | MCP indexer server (SSE) | shared_workspace, shared_codebase | +| mcp_http | 8002 | MCP search server (HTTP) | shared_workspace (ro) | +| mcp_indexer_http | 8003 | MCP indexer server (HTTP) | shared_workspace, shared_codebase | +| llamacpp | 8080 | LLM decoder service | ./models (ro) | + +### Network Configuration + +All services communicate via the `dev-remote-network` bridge network (172.20.0.0/16), ensuring proper service discovery and isolation. + +## Available Commands + +### Development Environment Commands + +```bash +# Environment setup +make dev-remote-up # Start all services +make dev-remote-down # Stop all services +make dev-remote-restart # Restart with rebuild +make dev-remote-logs # Follow service logs +make dev-remote-clean # Clean up volumes and containers + +# Bootstrap and testing +make dev-remote-bootstrap # Complete system setup +make dev-remote-test # Test upload workflow +make dev-remote-client # Start remote upload client + +# Individual service management +docker compose -f docker-compose.dev-remote.yml ps +docker compose -f docker-compose.dev-remote.yml logs upload_service +``` + +### Remote Upload Testing + +```bash +# Test upload service health +curl http://localhost:8004/health + +# Check workspace status +curl 'http://localhost:8004/api/v1/delta/status?workspace_path=/work/your-repo' + +# Test file upload (requires delta bundle) +curl -X POST \ + -F 'bundle=@test-bundle.tar.gz' \ + -F 'workspace_path=/work/your-repo' \ + http://localhost:8004/api/v1/delta/upload +``` + +## Workflow Examples + +### 1. Local Development Workflow + +```bash +# 1. 
Setup environment +./scripts/dev-setup.sh + +# 2. Add your repository +cp -r ~/my-project dev-workspace/my-project + +# 3. Start the system +make dev-remote-bootstrap + +# 4. Test indexing +docker compose -f docker-compose.dev-remote.yml run --rm indexer --root /work/my-project + +# 5. Start watcher for live updates +docker compose -f docker-compose.dev-remote.yml run --rm watcher +``` + +### 2. Remote Upload Testing Workflow + +```bash +# 1. Start upload service +make dev-remote-up + +# 2. Test remote upload from another directory +cd ~/my-project +make watch-remote REMOTE_UPLOAD_ENDPOINT=http://localhost:8004 + +# 3. Make changes to your code +# Files will be automatically uploaded and indexed +``` + +### 3. Multiple Repository Testing + +```bash +# 1. Setup multiple repositories +mkdir -p dev-workspace/{repo1,repo2,repo3} +cp -r ~/project1/* dev-workspace/repo1/ +cp -r ~/project2/* dev-workspace/repo2/ +cp -r ~/project3/* dev-workspace/repo3/ + +# 2. Start system +make dev-remote-bootstrap + +# 3. 
Index each repository +docker compose -f docker-compose.dev-remote.yml run --rm indexer --root /work/repo1 --collection repo1 +docker compose -f docker-compose.dev-remote.yml run --rm indexer --root /work/repo2 --collection repo2 +docker compose -f docker-compose.dev-remote.yml run --rm indexer --root /work/repo3 --collection repo3 +``` + +## Environment Variables + +### Development-Specific Variables + +```bash +# Workspace configuration +HOST_INDEX_PATH=./dev-workspace # Local workspace path +DEV_REMOTE_MODE=1 # Enable dev-remote mode +DEV_REMOTE_DEBUG=1 # Enable debug logging + +# Upload service configuration +UPLOAD_SERVICE_HOST=0.0.0.0 # Service bind address +UPLOAD_SERVICE_PORT=8002 # Service port (internal) +UPLOAD_SERVICE_DEBUG=1 # Enable debug mode + +# Remote upload client configuration +REMOTE_UPLOAD_ENABLED=1 # Enable remote upload +REMOTE_UPLOAD_ENDPOINT=http://upload_service:8002 # Upload endpoint +REMOTE_UPLOAD_MAX_RETRIES=3 # Max retry attempts +REMOTE_UPLOAD_TIMEOUT=30 # Request timeout (seconds) +REMOTE_UPLOAD_DEBUG=1 # Enable debug logging +``` + +### Standard Variables (from .env.example) + +All standard Context-Engine variables are supported and can be overridden for development: + +```bash +QDRANT_URL=http://qdrant:6333 +COLLECTION_NAME=my-collection +EMBEDDING_MODEL=BAAI/bge-base-en-v1.5 +EMBEDDING_PROVIDER=fastembed +# ... other standard variables +``` + +## Troubleshooting + +### Common Issues + +1. **Port Conflicts** + ```bash + # Check what's using ports + netstat -tulpn | grep :8004 + # Stop conflicting services + make dev-remote-down + ``` + +2. **Volume Permission Issues** + ```bash + # Fix workspace permissions + sudo chown -R $USER:$USER dev-workspace + chmod -R 755 dev-workspace + ``` + +3. 
**Service Not Ready** + ```bash + # Check service status + make dev-remote-logs + docker compose -f docker-compose.dev-remote.yml ps + + # Restart specific service + docker compose -f docker-compose.dev-remote.yml restart upload_service + ``` + +4. **Upload Failures** + ```bash + # Check upload service logs + docker compose -f docker-compose.dev-remote.yml logs upload_service + + # Test upload service directly + curl -v http://localhost:8004/health + ``` + +### Debug Mode + +Enable debug logging for detailed troubleshooting: + +```bash +# Add to .env +DEV_REMOTE_DEBUG=1 +UPLOAD_SERVICE_DEBUG=1 +REMOTE_UPLOAD_DEBUG=1 + +# Restart services +make dev-remote-restart +``` + +### Clean Reset + +For a complete reset: + +```bash +# Clean everything +make dev-remote-clean + +# Remove workspace +rm -rf dev-workspace + +# Start fresh +./scripts/dev-setup.sh +make dev-remote-bootstrap +``` + +## Integration with Existing Workflows + +### Using with Existing Make Targets + +The dev-remote environment integrates with existing Make targets: + +```bash +# Use dev-remote environment with existing targets +HOST_INDEX_PATH=./dev-workspace docker compose -f docker-compose.dev-remote.yml run --rm indexer --root /work/my-repo + +# Test with dev-remote stack +make health # Uses dev-remote stack if running +make hybrid # Uses dev-remote Qdrant instance +``` + +### MCP Client Configuration + +Configure your MCP clients (Cursor, Windsurf, etc.): + +```json +{ + "mcpServers": { + "qdrant": { + "type": "sse", + "url": "http://localhost:8000/sse", + "disabled": false + }, + "qdrant-indexer": { + "type": "sse", + "url": "http://localhost:8001/sse", + "disabled": false + } + } +} +``` + +## Performance Considerations + +### Resource Allocation + +The dev-remote environment is configured for development: + +- **Memory**: Moderate allocation suitable for development +- **CPU**: Shared allocation with reasonable limits +- **Storage**: Local volumes for fast I/O + +### Optimization Tips + +1. 
**Use SSD Storage**: Place `dev-workspace` on SSD for better performance +2. **Limit Repository Size**: Test with smaller repositories first +3. **Adjust Batch Sizes**: Tune `INDEX_UPSERT_BATCH` for your hardware +4. **Monitor Resources**: Use `docker stats` to monitor resource usage + +## Next Steps + +1. **Test Your Repository**: Add your code to `dev-workspace` and test the workflow +2. **Experiment with Remote Upload**: Try the remote upload client with your changes +3. **Integrate with IDE**: Configure your MCP client for the development environment +4. **Contribute**: Report issues and contribute improvements to the dev-remote setup + +## Support + +For issues with the dev-remote environment: + +1. Check the troubleshooting section above +2. Review service logs: `make dev-remote-logs` +3. Check the main documentation: `docs/remote_upload.md` +4. Open an issue with details about your setup and the problem \ No newline at end of file diff --git a/docs/remote_upload.md b/docs/remote_upload.md new file mode 100644 index 00000000..95d31497 --- /dev/null +++ b/docs/remote_upload.md @@ -0,0 +1,219 @@ +# Remote Upload Client for Context-Engine + +This document describes the remote upload client functionality that extends the existing watch_index.py for remote delta uploads. + +## Overview + +The remote upload client enables real-time code synchronization by uploading delta bundles to a remote server instead of processing files locally. This is useful for distributed development environments where multiple instances need to stay synchronized. + +## Architecture + +The system consists of: + +1. **RemoteUploadClient** - Handles delta bundle creation and HTTP uploads +2. **Extended ChangeQueue** - Integrates with remote client for delta processing +3. **Enhanced watch_index.py** - Supports both local and remote modes +4. 
**Delta Bundle Format** - Standardized tarball format with metadata + +## Configuration + +The remote upload client is configured via environment variables: + +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `REMOTE_UPLOAD_ENABLED` | Enable remote mode | `false` | `1` | +| `REMOTE_UPLOAD_ENDPOINT` | Upload server URL | `http://localhost:8080` | `https://api.example.com` | +| `REMOTE_UPLOAD_MAX_RETRIES` | Max upload retries | `3` | `5` | +| `REMOTE_UPLOAD_TIMEOUT` | Request timeout (seconds) | `30` | `60` | + +## Usage + +### Local Mode (Default) +```bash +make watch +``` + +### Remote Mode +```bash +# Set environment variables +export REMOTE_UPLOAD_ENABLED=1 +export REMOTE_UPLOAD_ENDPOINT=https://your-server.com:8080 +export REMOTE_UPLOAD_MAX_RETRIES=5 +export REMOTE_UPLOAD_TIMEOUT=60 + +# Or use the convenience target +make watch-remote REMOTE_UPLOAD_ENDPOINT=https://your-server.com:8080 +``` + +## Delta Bundle Format + +Delta bundles are tarballs (`.tar.gz`) containing: + +``` +delta-bundle.tar.gz +├── manifest.json # Bundle metadata and file operations +├── files/ # Directory containing file content +│ ├── created/ # New files +│ ├── updated/ # Modified files +│ └── moved/ # Moved files (at destination) +└── metadata/ # File metadata and hashes + ├── hashes.json # Content hashes for all files + └── operations.json # Detailed operation metadata +``` + +### Manifest Format +```json +{ + "version": "1.0", + "bundle_id": "uuid-v4", + "workspace_path": "/absolute/path/to/workspace", + "collection_name": "workspace-collection", + "created_at": "2025-01-26T01:55:00.000Z", + "sequence_number": 42, + "parent_sequence": 41, + "operations": { + "created": 5, + "updated": 3, + "deleted": 2, + "moved": 1 + }, + "total_files": 11, + "total_size_bytes": 1048576, + "compression": "gzip", + "encoding": "utf-8" +} +``` + +## Features + +### Change Detection +- **Hash-based detection** - Uses SHA1 hashes to detect file changes +- 
**Move detection** - Identifies file moves by matching content hashes +- **Efficient caching** - Leverages existing workspace state cache +- **Debouncing** - Integrates with existing ChangeQueue debouncing + +### Error Handling +- **Automatic retry** - Exponential backoff for network failures +- **Sequence recovery** - Handles sequence number mismatches +- **Fallback mode** - Falls back to local processing on upload failures +- **Bundle persistence** - Stores bundles locally for recovery + +### Integration +- **Backward compatible** - Existing local mode unchanged +- **Same logging** - Uses existing logging patterns +- **Same filtering** - Leverages existing file exclusion logic +- **Same debouncing** - Integrates with existing ChangeQueue + +## API Endpoints + +### Upload Endpoint +``` +POST /api/v1/delta/upload +Content-Type: multipart/form-data + +Parameters: +- bundle: Delta bundle tarball +- workspace_path: Absolute workspace path +- collection_name: Override collection name +- sequence_number: Expected sequence number +- force: Force upload even if sequence mismatch +``` + +### Status Endpoint +``` +GET /api/v1/delta/status?workspace_path=/workspace + +Response: +{ + "workspace_path": "/workspace", + "collection_name": "workspace-collection", + "last_sequence": 41, + "last_upload": "2025-01-26T01:50:00.000Z", + "pending_operations": 0, + "status": "ready", + "server_info": { + "version": "1.0.0", + "max_bundle_size_mb": 100, + "supported_formats": ["tar.gz"] + } +} +``` + +## Testing + +Run the basic tests to verify functionality: + +```bash +python scripts/test_remote_basic.py +``` + +This tests: +- Remote configuration detection +- Delta bundle structure creation +- Sequence number tracking + +## Implementation Notes + +### File Structure +- `scripts/remote_upload_client.py` - Main remote upload client +- `scripts/watch_index.py` - Extended with remote mode support +- `Makefile` - Added `watch-remote` target + +### Key Classes +- `RemoteUploadClient` - Core 
upload functionality +- `ChangeQueue` - Extended with remote client support +- `IndexHandler` - Updated for optional client (remote mode) + +### Integration Points +- Uses existing `get_cached_file_hash()` for change detection +- Leverages existing file filtering from `IndexHandler._maybe_enqueue()` +- Integrates with existing debouncing in `ChangeQueue` +- Maintains same logging and progress reporting patterns + +## Troubleshooting + +### Common Issues + +1. **"No module named 'qdrant_client'"** + - Install dependencies: `pip install qdrant-client fastembed watchdog requests` + +2. **"Remote mode not enabled"** + - Set `REMOTE_UPLOAD_ENABLED=1` in environment + +3. **"Upload failed"** + - Check `REMOTE_UPLOAD_ENDPOINT` is accessible + - Verify server supports delta upload API + - Check network connectivity + +4. **"Sequence mismatch"** + - Server will attempt automatic recovery + - Can force upload with `force=true` parameter + +### Debug Mode + +Enable debug logging: +```bash +export PYTHONPATH=. +python -c " +import logging +logging.basicConfig(level=logging.DEBUG) +from scripts.remote_upload_client import RemoteUploadClient +# ... your debug code +" +``` + +## Security Considerations + +For this PoC implementation: +- No authentication is required (development mode) +- No encryption is applied to bundles +- Server endpoint validation is basic +- Production deployment should add proper authentication + +## Future Enhancements + +1. **Authentication** - Add API key or token-based auth +2. **Compression** - Add support for different compression algorithms +3. **Incremental uploads** - Support for large file incremental sync +4. **Conflict resolution** - Handle concurrent modifications +5. 
**Batch optimization** - Bundle multiple changes together \ No newline at end of file diff --git a/docs/upload_service.md b/docs/upload_service.md new file mode 100644 index 00000000..4edb4dfb --- /dev/null +++ b/docs/upload_service.md @@ -0,0 +1,261 @@ +# Delta Upload Service + +This document describes the HTTP upload service for receiving and processing delta bundles in Context-Engine. + +## Overview + +The delta upload service is a FastAPI-based HTTP service that: +- Receives delta bundles from remote upload clients +- Extracts and processes file operations (create, update, delete, move) +- Integrates with existing indexing pipeline via `ingest_code.py` +- Provides health checks and status monitoring +- Supports CephFS persistent storage for Kubernetes deployment + +## API Endpoints + +### Health Check +``` +GET /health +``` + +Returns service health status and configuration. + +### Status +``` +GET /api/v1/delta/status?workspace_path=/path/to/workspace +``` + +Returns upload status for a specific workspace. 
+ +### Upload Delta Bundle +``` +POST /api/v1/delta/upload +Content-Type: multipart/form-data +``` + +Parameters: +- `bundle`: Delta bundle tarball file +- `workspace_path`: Target workspace path +- `collection_name`: Override collection name (optional) +- `sequence_number`: Expected sequence number (optional) +- `force`: Force upload even if sequence mismatch (optional) + +## Delta Bundle Format + +Delta bundles are tar.gz archives with the following structure: + +``` +delta-bundle.tar.gz +├── manifest.json # Bundle metadata +├── files/ # File content +│ ├── created/ # New files +│ ├── updated/ # Modified files +│ └── moved/ # Moved files (at destination) +└── metadata/ # File metadata + ├── hashes.json # Content hashes + └── operations.json # Detailed operation metadata +``` + +### Manifest Format + +```json +{ + "version": "1.0", + "bundle_id": "uuid-v4", + "workspace_path": "/absolute/path/to/workspace", + "collection_name": "workspace-collection", + "created_at": "2025-01-26T02:00:00.000Z", + "sequence_number": 42, + "parent_sequence": 41, + "operations": { + "created": 5, + "updated": 3, + "deleted": 2, + "moved": 1 + }, + "total_files": 11, + "total_size_bytes": 1048576, + "compression": "gzip", + "encoding": "utf-8" +} +``` + +## Deployment + +### Local Development + +1. Install dependencies: +```bash +pip install -r requirements.txt +``` + +2. Run the service: +```bash +python scripts/upload_service.py +``` + +The service will start on `http://localhost:8002` by default. + +### Docker + +Build the image: +```bash +docker build -f Dockerfile.upload-service -t context-engine-upload-service . +``` + +Run the container: +```bash +docker run -p 8002:8002 \ + -e QDRANT_URL=http://qdrant:6333 \ + -e WORK_DIR=/work \ + -v /path/to/work:/work \ + context-engine-upload-service +``` + +### Kubernetes + +1. Apply the namespace and config: +```bash +kubectl apply -f deploy/kubernetes/namespace.yaml +kubectl apply -f deploy/kubernetes/configmap.yaml +``` + +2. 
Create persistent volumes (adjust storage class as needed): +```bash +kubectl apply -f deploy/kubernetes/upload-pvc.yaml +``` + +3. Deploy the service: +```bash +kubectl apply -f deploy/kubernetes/upload-service.yaml +``` + +The service will be available at: +- Internal: `http://upload-service.context-engine.svc.cluster.local:8002` +- External: `http://<node-ip>:30804` (NodePort) + +## Configuration + +Environment variables: + +| Variable | Default | Description | +|----------|----------|-------------| +| `UPLOAD_SERVICE_HOST` | `0.0.0.0` | Service bind address | +| `UPLOAD_SERVICE_PORT` | `8002` | Service port | +| `QDRANT_URL` | `http://qdrant:6333` | Qdrant server URL | +| `COLLECTION_NAME` | `my-collection` | Default collection name | +| `WORK_DIR` | `/work` | Workspace directory | +| `MAX_BUNDLE_SIZE_MB` | `100` | Maximum bundle size | +| `UPLOAD_TIMEOUT_SECS` | `300` | Upload timeout | + +## Integration + +### With Remote Upload Client + +The upload service integrates with the remote upload client in `scripts/remote_upload_client.py`: + +```python +from scripts.remote_upload_client import RemoteUploadClient + +client = RemoteUploadClient( + upload_endpoint="http://upload-service:8002", + workspace_path="/path/to/workspace", + collection_name="my-collection" +) + +# Upload changes +success = client.process_and_upload_changes(changed_files) +``` + +### With Existing Indexing Pipeline + +The service reuses the existing indexing pipeline: + +- Calls `ingest_code.index_repo()` for changed files +- Uses `workspace_state.py` for state management +- Integrates with existing Qdrant connection patterns +- Supports hash-based caching and change detection + +## Testing + +Run the test suite: + +```bash +python scripts/test_upload_service.py --url http://localhost:8002 +``` + +This will test: +- Health check endpoint +- Status endpoint +- Upload endpoint with sample delta bundle + +## Monitoring + +### Health Checks + +The service provides liveness and readiness probes: +- 
Liveness: `/health` every 10 seconds after 30s delay +- Readiness: `/health` every 5 seconds after 10s delay + +### Logging + +Logs include: +- Request/response details +- Bundle processing status +- Error details and stack traces +- Integration with existing logging patterns + +### Metrics + +The service tracks: +- Upload success/failure rates +- Processing times +- Operation counts (create, update, delete, move) +- Indexed points count + +## Security Considerations + +For production deployment: + +1. **Authentication**: Add API key or JWT authentication +2. **Authorization**: Implement workspace-based access control +3. **Input Validation**: Enhanced bundle validation and sanitization +4. **Rate Limiting**: Add request rate limiting +5. **TLS**: Enable HTTPS for production + +## Troubleshooting + +### Common Issues + +1. **Bundle Too Large**: Increase `MAX_BUNDLE_SIZE_MB` or optimize bundles +2. **Sequence Mismatch**: Check client sequence tracking or use `force=true` +3. **Indexing Failures**: Verify Qdrant connectivity and collection exists +4. 
**Storage Issues**: Check PVC status and CephFS connectivity + +### Debug Mode + +Enable debug logging: +```bash +export UPLOAD_SERVICE_LOG_LEVEL=debug +python scripts/upload_service.py +``` + +### Health Check + +Verify service status: +```bash +curl http://localhost:8002/health +``` + +## Architecture + +The upload service follows the delta upload architecture defined in: +- `delta_upload_design.md` - Format specification +- `delta_upload_architecture.md` - System design + +Key components: +- **FastAPI HTTP Server**: Handles incoming requests +- **Bundle Processor**: Extracts and validates delta bundles +- **File Operations**: Applies create/update/delete/move operations +- **Indexing Integration**: Calls existing indexing pipeline +- **State Management**: Tracks sequences and workspace state \ No newline at end of file diff --git a/docs/usage_guide.md b/docs/usage_guide.md new file mode 100644 index 00000000..35037c4b --- /dev/null +++ b/docs/usage_guide.md @@ -0,0 +1,597 @@ +# Context-Engine Real-Time Code Ingestion: Usage Guide + +This guide provides comprehensive instructions for using the Context-Engine real-time code ingestion system with both local and remote upload capabilities. + +## Table of Contents + +1. [Quick Start](#quick-start) +2. [Local vs Remote Mode](#local-vs-remote-mode) +3. [Configuration](#configuration) +4. [Usage Examples](#usage-examples) +5. [Deployment](#deployment) +6. [Troubleshooting](#troubleshooting) +7. [Advanced Configuration](#advanced-configuration) + +## Quick Start + +### Prerequisites + +- Docker and Docker Compose installed +- Python 3.8+ with required dependencies +- Access to a Qdrant instance (local or remote) + +### Basic Local Mode Setup + +1. **Clone and setup the repository:** +```bash +git clone <repository-url> +cd Context-Engine +cp .env.example .env +``` + +2. **Start the services:** +```bash +make up +``` + +3. **Index your codebase:** +```bash +make index +``` + +4. 
**Start watching for changes:** +```bash +make watch +``` + +### Basic Remote Mode Setup + +1. **Deploy the upload service:** +```bash +# Deploy to Kubernetes +kubectl apply -f deploy/kubernetes/upload-pvc.yaml +kubectl apply -f deploy/kubernetes/upload-service.yaml +``` + +2. **Start remote watching:** +```bash +make watch-remote REMOTE_UPLOAD_ENDPOINT=http://your-upload-service:8002 +``` + +## Local vs Remote Mode + +### Local Mode + +**Use Case:** Single developer, local development environment + +**How it works:** +- Files are processed directly on the local machine +- Changes are indexed directly into local Qdrant instance +- No network dependencies for indexing + +**Pros:** +- ✅ Fast response time (no network latency) +- ✅ Works offline +- ✅ Simple setup +- ✅ No additional infrastructure needed + +**Cons:** +- ❌ Limited to single machine +- ❌ No collaboration features +- ❌ Each developer maintains separate index + +**Command:** +```bash +make watch +``` + +### Remote Mode + +**Use Case:** Team collaboration, distributed development, centralized indexing + +**How it works:** +- Files are packaged into delta bundles +- Bundles are uploaded to remote upload service +- Remote service processes and indexes changes +- All clients sync from the same central index + +**Pros:** +- ✅ Centralized index for team collaboration +- ✅ Consistent search results across team +- ✅ Reduced local resource usage +- ✅ Better for large codebases +- ✅ Supports distributed teams + +**Cons:** +- ❌ Requires network connectivity +- ❌ Additional infrastructure +- ❌ Network latency +- ❌ More complex setup + +**Command:** +```bash +make watch-remote REMOTE_UPLOAD_ENDPOINT=http://your-server:8002 +``` + +## Configuration + +### Environment Variables + +#### Core Configuration +```bash +# Qdrant connection +QDRANT_URL=http://qdrant:6333 +COLLECTION_NAME=my-collection + +# Workspace configuration +WATCH_ROOT=/work +WORKSPACE_PATH=/work + +# Embedding model +EMBEDDING_MODEL=BAAI/bge-base-en-v1.5 
+``` + +#### Remote Upload Configuration +```bash +# Enable remote mode +REMOTE_UPLOAD_ENABLED=1 + +# Upload service endpoint +REMOTE_UPLOAD_ENDPOINT=http://your-server:8002 + +# Upload behavior +REMOTE_UPLOAD_MAX_RETRIES=3 +REMOTE_UPLOAD_TIMEOUT=30 + +# Watch behavior +WATCH_DEBOUNCE_SECS=1.0 +``` + +#### File Filtering +```bash +# Ignore file location +QDRANT_IGNORE_FILE=.qdrantignore +``` + +### Example .env Files + +#### Local Development (.env.local) +```bash +# Local development configuration +QDRANT_URL=http://localhost:6333 +COLLECTION_NAME=my-dev-collection +WATCH_ROOT=/Users/developer/my-project +EMBEDDING_MODEL=BAAI/bge-base-en-v1.5 +WATCH_DEBOUNCE_SECS=0.5 +``` + +#### Team Collaboration (.env.remote) +```bash +# Remote team configuration +QDRANT_URL=http://qdrant.team.svc.cluster.local:6333 +COLLECTION_NAME=team-shared-collection +WATCH_ROOT=/workspace +EMBEDDING_MODEL=BAAI/bge-base-en-v1.5 + +# Remote upload settings +REMOTE_UPLOAD_ENABLED=1 +REMOTE_UPLOAD_ENDPOINT=http://upload-service.team.svc.cluster.local:8002 +REMOTE_UPLOAD_MAX_RETRIES=5 +REMOTE_UPLOAD_TIMEOUT=60 +WATCH_DEBOUNCE_SECS=2.0 +``` + +#### Production (.env.prod) +```bash +# Production configuration +QDRANT_URL=https://qdrant.production.com +COLLECTION_NAME=prod-codebase +WATCH_ROOT=/app/workspace +EMBEDDING_MODEL=BAAI/bge-base-en-v1.5 + +# Remote upload with high reliability +REMOTE_UPLOAD_ENABLED=1 +REMOTE_UPLOAD_ENDPOINT=https://upload-api.production.com +REMOTE_UPLOAD_MAX_RETRIES=10 +REMOTE_UPLOAD_TIMEOUT=120 +WATCH_DEBOUNCE_SECS=3.0 +``` + +## Usage Examples + +### Basic Development Workflow + +```bash +# 1. Start services +make up + +# 2. Initial indexing +make reindex + +# 3. Start watching (local mode) +make watch + +# In another terminal, make changes to your code... +# Changes will be automatically indexed +``` + +### Team Collaboration Workflow + +```bash +# 1. Deploy infrastructure (once) +kubectl apply -f deploy/kubernetes/ + +# 2. 
Each developer starts remote watching +make watch-remote REMOTE_UPLOAD_ENDPOINT=https://upload.team.com:8002 + +# 3. Developers make changes... +# All changes are synchronized across the team +``` + +### Hybrid Workflow (Local + Remote) + +```bash +# Use local mode for fast iteration +make watch + +# Switch to remote mode when ready to share +make watch-remote REMOTE_UPLOAD_ENDPOINT=https://upload.team.com:8002 +``` + +### Advanced Indexing + +```bash +# Index specific path +make index-path REPO_PATH=/path/to/repo RECREATE=1 + +# Index current directory with custom collection +make index-here REPO_NAME=my-project COLLECTION=my-project-collection + +# Warm up search caches +make warm + +# Run health checks +make health +``` + +## Deployment + +### Local Development + +1. **Using Docker Compose:** +```bash +# Start all services +make up + +# View logs +make logs + +# Check status +make ps +``` + +2. **Manual Setup:** +```bash +# Install dependencies +pip install -r requirements.txt + +# Start Qdrant +docker run -p 6333:6333 qdrant/qdrant + +# Start watching +python scripts/watch_index.py +``` + +### Kubernetes Deployment + +1. **Prerequisites:** +```bash +# Kubernetes cluster with storage support +kubectl cluster-info + +# Install required manifests +kubectl apply -f deploy/kubernetes/namespace.yaml +kubectl apply -f deploy/kubernetes/configmap.yaml +``` + +2. **Deploy Core Services:** +```bash +# Deploy Qdrant +kubectl apply -f deploy/kubernetes/qdrant.yaml + +# Deploy upload service with persistent storage +kubectl apply -f deploy/kubernetes/upload-pvc.yaml +kubectl apply -f deploy/kubernetes/upload-service.yaml + +# Deploy indexer services +kubectl apply -f deploy/kubernetes/indexer-services.yaml +``` + +3. 
**Configure Access:** +```bash +# Check service status +kubectl get pods -n context-engine + +# Get upload service endpoint +kubectl get svc upload-service -n context-engine + +# Port forward for local testing +kubectl port-forward svc/upload-service 8002:8002 -n context-engine +``` + +### Production Considerations + +1. **High Availability:** +```yaml +# Example: Multiple replicas for upload service +spec: + replicas: 3 + selector: + matchLabels: + app: upload-service +``` + +2. **Resource Limits:** +```yaml +# Example: Resource constraints +resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "2Gi" + cpu: "1000m" +``` + +3. **Monitoring:** +```yaml +# Example: Health checks +livenessProbe: + httpGet: + path: /health + port: 8002 + initialDelaySeconds: 30 + periodSeconds: 10 +readinessProbe: + httpGet: + path: /health + port: 8002 + initialDelaySeconds: 10 + periodSeconds: 5 +``` + +## Troubleshooting + +### Common Issues + +#### 1. "No module named 'qdrant_client'" +**Solution:** +```bash +pip install qdrant-client fastembed watchdog requests +``` + +#### 2. "Remote mode not enabled" +**Solution:** +```bash +export REMOTE_UPLOAD_ENABLED=1 +# Or add to .env file +echo "REMOTE_UPLOAD_ENABLED=1" >> .env +``` + +#### 3. "Upload failed: Connection refused" +**Solutions:** +- Check upload service is running: `kubectl get pods` +- Verify endpoint URL: `curl http://your-endpoint:8002/health` +- Check network connectivity: `telnet your-endpoint 8002` + +#### 4. "Sequence mismatch" errors +**Solutions:** +- Client will attempt automatic recovery +- Force upload if needed: Set `force=true` in upload request +- Reset sequence: Delete `.codebase/delta_bundles/last_sequence.txt` + +#### 5. "Bundle too large" errors +**Solutions:** +- Increase `MAX_BUNDLE_SIZE_MB` on upload service +- Reduce number of changes before upload (adjust debounce) +- Split large changes into smaller commits + +#### 6. 
"Indexing is slow" +**Solutions:** +- Use faster embedding model +- Increase `WATCH_DEBOUNCE_SECS` to reduce frequency +- Upgrade hardware (more CPU/RAM) +- Use remote mode to offload processing + +### Debug Mode + +#### Enable Debug Logging +```bash +# Set log level +export PYTHONPATH=. +export UPLOAD_SERVICE_LOG_LEVEL=debug + +# Run with debug output +python -c " +import logging +logging.basicConfig(level=logging.DEBUG) +from scripts.remote_upload_client import RemoteUploadClient +# Your debug code here +" +``` + +#### Check System Status +```bash +# Check Qdrant +curl http://localhost:6333/collections + +# Check upload service +curl http://localhost:8002/health + +# Check workspace state +ls -la .codebase/ +cat .codebase/workspace_state.json +``` + +#### Monitor File Changes +```bash +# Watch file system events (Linux) +inotifywait -m -r -e modify,create,delete,move /path/to/watch + +# Watch file system events (macOS) +fswatch -r /path/to/watch +``` + +### Performance Tuning + +#### Optimize for Large Codebases +```bash +# Increase debounce to reduce processing frequency +WATCH_DEBOUNCE_SECS=5.0 + +# Use larger batch sizes +BATCH_SIZE=1000 + +# Increase timeouts +REMOTE_UPLOAD_TIMEOUT=120 +QDRANT_TIMEOUT=60 +``` + +#### Optimize for Real-time Response +```bash +# Reduce debounce for faster response +WATCH_DEBOUNCE_SECS=0.1 + +# Use smaller batches for faster processing +BATCH_SIZE=100 + +# Reduce timeouts +REMOTE_UPLOAD_TIMEOUT=30 +QDRANT_TIMEOUT=20 +``` + +## Advanced Configuration + +### Custom File Filtering + +Create `.qdrantignore` in your workspace root: + +``` +# Ignore patterns +*.log +*.tmp +node_modules/ +.git/ +build/ +dist/ +*.min.js +*.min.css + +# Ignore specific directories +tests/fixtures/ +docs/generated/ +``` + +### Custom Embedding Models + +```bash +# Use different model +EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2 + +# Custom model (local path) +EMBEDDING_MODEL=/path/to/custom/model + +# Model-specific settings 
+EMBEDDING_DEVICE=cuda +EMBEDDING_BATCH_SIZE=32 +``` + +### Multi-Collection Setup + +```bash +# Different collections for different projects +COLLECTION_NAME=project-alpha + +# Or use environment-specific collections +COLLECTION_NAME=${PROJECT_NAME}-${ENVIRONMENT} +``` + +### Integration with CI/CD + +#### GitHub Actions Example +```yaml +name: Index Code Changes +on: [push] + +jobs: + index: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: '3.9' + - name: Install dependencies + run: pip install -r requirements.txt + - name: Index changes + env: + REMOTE_UPLOAD_ENABLED: 1 + REMOTE_UPLOAD_ENDPOINT: ${{ secrets.UPLOAD_ENDPOINT }} + run: | + python scripts/watch_index.py --once +``` + +#### Jenkins Pipeline Example +```groovy +pipeline { + agent any + environment { + REMOTE_UPLOAD_ENABLED = '1' + REMOTE_UPLOAD_ENDPOINT = credentials('upload-endpoint') + } + stages { + stage('Index') { + steps { + sh 'python scripts/watch_index.py --once' + } + } + } +} +``` + +### Monitoring and Alerting + +#### Prometheus Metrics +```yaml +# Example Prometheus configuration +scrape_configs: + - job_name: 'context-engine' + static_configs: + - targets: ['upload-service:8002'] + metrics_path: '/metrics' +``` + +#### Grafana Dashboard +- Upload success rate +- Processing time +- Queue depth +- Error rates +- Resource usage + +#### Alerting Rules +```yaml +# Example alerting rules +groups: + - name: context-engine + rules: + - alert: HighErrorRate + expr: upload_error_rate > 0.1 + for: 5m + labels: + severity: warning + annotations: + summary: "High upload error rate detected" +``` + +This comprehensive guide should help you get the most out of the Context-Engine real-time code ingestion system. For more specific issues or advanced use cases, refer to the individual component documentation or reach out to the development team. 
\ No newline at end of file diff --git a/requirements.txt b/requirements.txt index bf87ceee..603f67ca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,6 +8,9 @@ tree_sitter>=0.25.2 tree_sitter_languages; python_version < "3.13" mcp==1.17.0 fastmcp==2.12.4 +fastapi +uvicorn[standard] +python-multipart # Test-only pytest diff --git a/scripts/dev-setup.sh b/scripts/dev-setup.sh new file mode 100644 index 00000000..1a7c5553 --- /dev/null +++ b/scripts/dev-setup.sh @@ -0,0 +1,169 @@ +#!/bin/bash + +# Development Environment Setup Script for Remote Upload System +# This script sets up the development environment for testing the remote upload workflow + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Configuration +DEV_WORKSPACE="${DEV_WORKSPACE:-./dev-workspace}" + +# Functions +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Check if Docker is running +check_docker() { + log_info "Checking Docker installation..." + if ! command -v docker &> /dev/null; then + log_error "Docker is not installed or not in PATH" + exit 1 + fi + + if ! docker info &> /dev/null; then + log_error "Docker daemon is not running" + exit 1 + fi + + log_success "Docker is available and running" +} + +# Check if Docker Compose is available +check_docker_compose() { + log_info "Checking Docker Compose installation..." + if ! command -v docker-compose &> /dev/null && ! docker compose version &> /dev/null; then + log_error "Docker Compose is not installed" + exit 1 + fi + + log_success "Docker Compose is available" +} + +# Create development workspace directory structure +setup_workspace() { + log_info "Setting up development workspace..." 
+ + # Create main workspace directory + mkdir -p "$DEV_WORKSPACE" + mkdir -p "$DEV_WORKSPACE/.codebase" + + log_success "Development workspace created at $DEV_WORKSPACE" + log_info "You can mount your existing repositories here for testing" +} + +# Create environment file +create_env_file() { + log_info "Creating environment configuration..." + + if [ ! -f ".env" ]; then + cp .env.example .env + log_success "Created .env from .env.example" + else + log_warning ".env file already exists, skipping creation" + fi + + # Add dev-remote specific configurations if not already present + if ! grep -q "HOST_INDEX_PATH=./dev-workspace" .env; then + cat >> .env << 'EOF' + +# Development Remote Upload Configuration +HOST_INDEX_PATH=./dev-workspace +DEV_REMOTE_MODE=1 +DEV_REMOTE_DEBUG=1 + +# Upload Service Configuration (Development) +UPLOAD_SERVICE_HOST=0.0.0.0 +UPLOAD_SERVICE_PORT=8002 +UPLOAD_SERVICE_DEBUG=1 + +# Remote Upload Client Configuration +REMOTE_UPLOAD_ENABLED=1 +REMOTE_UPLOAD_ENDPOINT=http://upload_service:8002 +REMOTE_UPLOAD_MAX_RETRIES=3 +REMOTE_UPLOAD_TIMEOUT=30 +REMOTE_UPLOAD_DEBUG=1 + +# Development-specific settings +QDRANT_TIMEOUT=60 +MAX_MICRO_CHUNKS_PER_FILE=200 +INDEX_UPSERT_BATCH=128 +INDEX_UPSERT_RETRIES=5 +WATCH_DEBOUNCE_SECS=1.5 +EOF + log_success "Added dev-remote configuration to .env" + else + log_warning "Dev-remote configuration already exists in .env" + fi +} + +# Print usage information +print_usage() { + log_info "Development environment setup complete!" + echo + echo "Quick Start:" + echo " 1. Copy your repository to dev-workspace/your-repo-name" + echo " 2. Run: make dev-remote-bootstrap" + echo " 3. 
Test with: make dev-remote-test" + echo + echo "Available commands:" + echo " make dev-remote-up - Start the dev-remote stack" + echo " make dev-remote-down - Stop the dev-remote stack" + echo " make dev-remote-bootstrap - Bootstrap the complete system" + echo " make dev-remote-test - Test the remote upload workflow" + echo " make dev-remote-client - Start remote upload client" + echo " make dev-remote-clean - Clean up all dev-remote resources" + echo + echo "Service URLs:" + echo " Upload Service: http://localhost:8004" + echo " Qdrant Dashboard: http://localhost:6333" + echo " MCP Search: http://localhost:8000" + echo " MCP Indexer: http://localhost:8001" + echo + echo "Testing Workflow:" + echo " 1. Place your code in: $DEV_WORKSPACE/your-repo" + echo " 2. Start the stack: make dev-remote-bootstrap" + echo " 3. Test upload: curl http://localhost:8004/health" + echo " 4. Check status: curl 'http://localhost:8004/api/v1/delta/status?workspace_path=/work/your-repo'" + echo + echo "For remote upload testing:" + echo " 1. Set REMOTE_UPLOAD_ENDPOINT=http://localhost:8004" + echo " 2. Run: make watch-remote REMOTE_UPLOAD_ENDPOINT=http://localhost:8004" + echo + log_success "Ready to test the remote upload system!" +} + +# Main execution +main() { + log_info "Setting up development environment for remote upload system..." + + check_docker + check_docker_compose + setup_workspace + create_env_file + print_usage + + log_success "Development environment setup completed successfully!" +} + +# Run main function +main "$@" \ No newline at end of file diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py new file mode 100644 index 00000000..5623d371 --- /dev/null +++ b/scripts/remote_upload_client.py @@ -0,0 +1,828 @@ +#!/usr/bin/env python3 +""" +Remote upload client for delta bundles in Context-Engine. 
+ +This module provides functionality to create and upload delta bundles to a remote +server, enabling real-time code synchronization across distributed environments. +""" + +import os +import json +import time +import uuid +import hashlib +import tarfile +import tempfile +import threading +import logging +from pathlib import Path +from typing import Dict, List, Any, Optional, Tuple +from datetime import datetime +import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# Import existing workspace state functions +from scripts.workspace_state import ( + get_cached_file_hash, + set_cached_file_hash, + remove_cached_file, +) + +# Import existing hash function +import scripts.ingest_code as idx + + +class RemoteUploadClient: + """Client for uploading delta bundles to remote server.""" + + def __init__(self, + upload_endpoint: str, + workspace_path: str, + collection_name: str, + max_retries: int = 3, + timeout: int = 30): + """ + Initialize remote upload client. 
+ + Args: + upload_endpoint: HTTP endpoint for delta uploads + workspace_path: Absolute path to workspace + collection_name: Target collection name + max_retries: Maximum number of upload retries + timeout: Request timeout in seconds + """ + self.upload_endpoint = upload_endpoint.rstrip('/') + self.workspace_path = workspace_path + self.collection_name = collection_name + self.max_retries = max_retries + self.timeout = timeout + + # Bundle persistence directory (initialize before sequence tracking) + self.bundle_dir = Path(workspace_path) / ".codebase" / "delta_bundles" + self.bundle_dir.mkdir(parents=True, exist_ok=True) + + # Sequence number tracking + self._sequence_lock = threading.Lock() + self._sequence_number = self._get_last_sequence() + + # Setup HTTP session with retry strategy + self.session = requests.Session() + retry_strategy = Retry( + total=max_retries, + backoff_factor=1, + status_forcelist=[429, 500, 502, 503, 504], + ) + adapter = HTTPAdapter(max_retries=retry_strategy) + self.session.mount("http://", adapter) + self.session.mount("https://", adapter) + + def _get_last_sequence(self) -> int: + """Get the last sequence number from local state.""" + seq_file = self.bundle_dir / "last_sequence.txt" + try: + if seq_file.exists(): + return int(seq_file.read_text().strip()) + except (ValueError, IOError): + pass + return 0 + + def _set_last_sequence(self, sequence: int) -> None: + """Persist the last sequence number.""" + seq_file = self.bundle_dir / "last_sequence.txt" + try: + seq_file.write_text(str(sequence)) + except IOError: + pass + + def _get_next_sequence(self) -> int: + """Get the next sequence number atomically.""" + with self._sequence_lock: + self._sequence_number += 1 + self._set_last_sequence(self._sequence_number) + return self._sequence_number + + def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: + """ + Detect what type of changes occurred for each file path. 
+ + Args: + changed_paths: List of changed file paths + + Returns: + Dictionary with change types: created, updated, deleted, moved, unchanged + """ + changes = { + "created": [], + "updated": [], + "deleted": [], + "moved": [], + "unchanged": [] + } + + for path in changed_paths: + abs_path = str(path.resolve()) + cached_hash = get_cached_file_hash(self.workspace_path, abs_path) + + if not path.exists(): + # File was deleted + if cached_hash: + changes["deleted"].append(path) + else: + # File exists - calculate current hash + try: + with open(path, 'rb') as f: + content = f.read() + current_hash = hashlib.sha1(content).hexdigest() + + if not cached_hash: + # New file + changes["created"].append(path) + elif cached_hash != current_hash: + # Modified file + changes["updated"].append(path) + else: + # Unchanged (might be a move detection candidate) + changes["unchanged"].append(path) + + # Update cache + set_cached_file_hash(self.workspace_path, abs_path, current_hash) + except Exception: + # Skip files that can't be read + continue + + # Detect moves by looking for files with same content hash + # but different paths (requires additional tracking) + changes["moved"] = self._detect_moves(changes["created"], changes["deleted"]) + + return changes + + def _detect_moves(self, created_files: List[Path], deleted_files: List[Path]) -> List[Tuple[Path, Path]]: + """ + Detect file moves by matching content hashes between created and deleted files. 
+ + Args: + created_files: List of newly created files + deleted_files: List of deleted files + + Returns: + List of (source, destination) path tuples for detected moves + """ + moves = [] + deleted_hashes = {} + + # Build hash map for deleted files + for deleted_path in deleted_files: + try: + # Try to get cached hash first, fallback to file content + cached_hash = get_cached_file_hash(self.workspace_path, str(deleted_path)) + if cached_hash: + deleted_hashes[cached_hash] = deleted_path + continue + + # If no cached hash, try to read from file if it still exists + if deleted_path.exists(): + with open(deleted_path, 'rb') as f: + content = f.read() + file_hash = hashlib.sha1(content).hexdigest() + deleted_hashes[file_hash] = deleted_path + except Exception: + continue + + # Match created files with deleted files by hash + for created_path in created_files: + try: + with open(created_path, 'rb') as f: + content = f.read() + file_hash = hashlib.sha1(content).hexdigest() + + if file_hash in deleted_hashes: + source_path = deleted_hashes[file_hash] + moves.append((source_path, created_path)) + # Remove from consideration + del deleted_hashes[file_hash] + except Exception: + continue + + return moves + + def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, Any]]: + """ + Create a delta bundle from detected changes. 
+ + Args: + changes: Dictionary of file changes by type + + Returns: + Tuple of (bundle_path, manifest_metadata) + """ + bundle_id = str(uuid.uuid4()) + sequence_number = self._get_next_sequence() + created_at = datetime.now().isoformat() + + # Create temporary directory for bundle + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Create directory structure + files_dir = temp_path / "files" + metadata_dir = temp_path / "metadata" + files_dir.mkdir() + metadata_dir.mkdir() + + # Create subdirectories + (files_dir / "created").mkdir() + (files_dir / "updated").mkdir() + (files_dir / "moved").mkdir() + + operations = [] + total_size = 0 + file_hashes = {} + + # Process created files + for path in changes["created"]: + rel_path = str(path.relative_to(Path(self.workspace_path))) + try: + with open(path, 'rb') as f: + content = f.read() + file_hash = hashlib.sha1(content).hexdigest() + content_hash = f"sha1:{file_hash}" + + # Write file to bundle + bundle_file_path = files_dir / "created" / rel_path + bundle_file_path.parent.mkdir(parents=True, exist_ok=True) + bundle_file_path.write_bytes(content) + + # Get file info + stat = path.stat() + language = idx.CODE_EXTS.get(path.suffix.lower(), "unknown") + + operation = { + "operation": "created", + "path": rel_path, + "relative_path": rel_path, + "absolute_path": str(path.resolve()), + "size_bytes": stat.st_size, + "content_hash": content_hash, + "file_hash": f"sha1:{idx.hash_id(content.decode('utf-8', errors='ignore'), rel_path, 1, len(content.splitlines()))}", + "modified_time": datetime.fromtimestamp(stat.st_mtime).isoformat(), + "language": language + } + operations.append(operation) + file_hashes[rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + + except Exception as e: + print(f"[bundle_create] Error processing created file {path}: {e}") + continue + + # Process updated files + for path in changes["updated"]: + rel_path = str(path.relative_to(Path(self.workspace_path))) 
+ try: + with open(path, 'rb') as f: + content = f.read() + file_hash = hashlib.sha1(content).hexdigest() + content_hash = f"sha1:{file_hash}" + previous_hash = get_cached_file_hash(self.workspace_path, str(path.resolve())) + + # Write file to bundle + bundle_file_path = files_dir / "updated" / rel_path + bundle_file_path.parent.mkdir(parents=True, exist_ok=True) + bundle_file_path.write_bytes(content) + + # Get file info + stat = path.stat() + language = idx.CODE_EXTS.get(path.suffix.lower(), "unknown") + + operation = { + "operation": "updated", + "path": rel_path, + "relative_path": rel_path, + "absolute_path": str(path.resolve()), + "size_bytes": stat.st_size, + "content_hash": content_hash, + "previous_hash": f"sha1:{previous_hash}" if previous_hash else None, + "file_hash": f"sha1:{idx.hash_id(content.decode('utf-8', errors='ignore'), rel_path, 1, len(content.splitlines()))}", + "modified_time": datetime.fromtimestamp(stat.st_mtime).isoformat(), + "language": language + } + operations.append(operation) + file_hashes[rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + + except Exception as e: + print(f"[bundle_create] Error processing updated file {path}: {e}") + continue + + # Process moved files + for source_path, dest_path in changes["moved"]: + dest_rel_path = str(dest_path.relative_to(Path(self.workspace_path))) + source_rel_path = str(source_path.relative_to(Path(self.workspace_path))) + try: + with open(dest_path, 'rb') as f: + content = f.read() + file_hash = hashlib.sha1(content).hexdigest() + content_hash = f"sha1:{file_hash}" + + # Write file to bundle + bundle_file_path = files_dir / "moved" / dest_rel_path + bundle_file_path.parent.mkdir(parents=True, exist_ok=True) + bundle_file_path.write_bytes(content) + + # Get file info + stat = dest_path.stat() + language = idx.CODE_EXTS.get(dest_path.suffix.lower(), "unknown") + + operation = { + "operation": "moved", + "path": dest_rel_path, + "relative_path": dest_rel_path, + "absolute_path": 
str(dest_path.resolve()), + "source_path": source_rel_path, + "source_relative_path": source_rel_path, + "source_absolute_path": str(source_path.resolve()), + "size_bytes": stat.st_size, + "content_hash": content_hash, + "file_hash": f"sha1:{idx.hash_id(content.decode('utf-8', errors='ignore'), dest_rel_path, 1, len(content.splitlines()))}", + "modified_time": datetime.fromtimestamp(stat.st_mtime).isoformat(), + "language": language + } + operations.append(operation) + file_hashes[dest_rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + + except Exception as e: + print(f"[bundle_create] Error processing moved file {source_path} -> {dest_path}: {e}") + continue + + # Process deleted files + for path in changes["deleted"]: + rel_path = str(path.relative_to(Path(self.workspace_path))) + try: + previous_hash = get_cached_file_hash(self.workspace_path, str(path.resolve())) + + operation = { + "operation": "deleted", + "path": rel_path, + "relative_path": rel_path, + "absolute_path": str(path.resolve()), + "previous_hash": f"sha1:{previous_hash}" if previous_hash else None, + "file_hash": None, + "modified_time": datetime.now().isoformat(), + "language": idx.CODE_EXTS.get(path.suffix.lower(), "unknown") + } + operations.append(operation) + + except Exception as e: + print(f"[bundle_create] Error processing deleted file {path}: {e}") + continue + + # Create manifest + manifest = { + "version": "1.0", + "bundle_id": bundle_id, + "workspace_path": self.workspace_path, + "collection_name": self.collection_name, + "created_at": created_at, + "sequence_number": sequence_number, + "parent_sequence": sequence_number - 1, + "operations": { + "created": len(changes["created"]), + "updated": len(changes["updated"]), + "deleted": len(changes["deleted"]), + "moved": len(changes["moved"]) + }, + "total_files": len(operations), + "total_size_bytes": total_size, + "compression": "gzip", + "encoding": "utf-8" + } + + # Write manifest + (temp_path / 
"manifest.json").write_text(json.dumps(manifest, indent=2)) + + # Write operations metadata + operations_metadata = { + "operations": operations + } + (metadata_dir / "operations.json").write_text(json.dumps(operations_metadata, indent=2)) + + # Write hashes + hashes_metadata = { + "workspace_path": self.workspace_path, + "updated_at": created_at, + "file_hashes": file_hashes + } + (metadata_dir / "hashes.json").write_text(json.dumps(hashes_metadata, indent=2)) + + # Create tarball + bundle_path = self.bundle_dir / f"{bundle_id}.tar.gz" + with tarfile.open(bundle_path, "w:gz") as tar: + tar.add(temp_path, arcname=f"{bundle_id}") + + return str(bundle_path), manifest + + def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, Any]: + """ + Upload delta bundle to remote server with exponential backoff retry. + + Args: + bundle_path: Path to the bundle tarball + manifest: Bundle manifest metadata + + Returns: + Server response dictionary + """ + last_error = None + + for attempt in range(self.max_retries + 1): + try: + # Calculate backoff delay (exponential with jitter) + if attempt > 0: + base_delay = 2 ** (attempt - 1) # 1, 2, 4, 8... 
+ jitter = base_delay * 0.1 * (0.5 + (hash(str(time.time())) % 100) / 100) + delay = min(base_delay + jitter, 30) # Cap at 30 seconds + logger.info(f"[remote_upload] Retry attempt {attempt + 1}/{self.max_retries + 1} after {delay:.2f}s delay") + time.sleep(delay) + + # Verify bundle exists before attempting upload + if not os.path.exists(bundle_path): + return { + "success": False, + "error": { + "code": "BUNDLE_NOT_FOUND", + "message": f"Bundle file not found: {bundle_path}" + } + } + + # Check bundle size + bundle_size = os.path.getsize(bundle_path) + max_size_mb = 100 # Default max size + max_size_bytes = max_size_mb * 1024 * 1024 + + if bundle_size > max_size_bytes: + return { + "success": False, + "error": { + "code": "BUNDLE_TOO_LARGE", + "message": f"Bundle size {bundle_size} bytes exceeds maximum {max_size_bytes} bytes" + } + } + + with open(bundle_path, 'rb') as bundle_file: + files = { + 'bundle': (f"{manifest['bundle_id']}.tar.gz", bundle_file, 'application/gzip') + } + + data = { + 'workspace_path': self.workspace_path, + 'collection_name': self.collection_name, + 'sequence_number': str(manifest['sequence_number']), + 'force': 'false' + } + + logger.info(f"[remote_upload] Uploading bundle {manifest['bundle_id']} (size: {bundle_size} bytes)") + + response = self.session.post( + f"{self.upload_endpoint}/api/v1/delta/upload", + files=files, + data=data, + timeout=self.timeout + ) + + if response.status_code == 200: + result = response.json() + logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") + return result + else: + error_msg = f"Upload failed with status {response.status_code}" + try: + error_detail = response.json() + error_detail_msg = error_detail.get('error', {}).get('message', 'Unknown error') + error_msg += f": {error_detail_msg}" + error_code = error_detail.get('error', {}).get('code', 'HTTP_ERROR') + except: + error_msg += f": {response.text[:200]}" # Truncate long responses + error_code = "HTTP_ERROR" + + 
last_error = { + "success": False, + "error": { + "code": error_code, + "message": error_msg, + "status_code": response.status_code + } + } + + # Don't retry on client errors (4xx) + if 400 <= response.status_code < 500 and response.status_code != 429: + logger.warning(f"[remote_upload] Client error {response.status_code}, not retrying: {error_msg}") + return last_error + + logger.warning(f"[remote_upload] Upload attempt {attempt + 1} failed: {error_msg}") + + except requests.exceptions.Timeout as e: + last_error = { + "success": False, + "error": { + "code": "TIMEOUT_ERROR", + "message": f"Upload timeout after {self.timeout}s: {str(e)}" + } + } + logger.warning(f"[remote_upload] Upload timeout on attempt {attempt + 1}: {e}") + + except requests.exceptions.ConnectionError as e: + last_error = { + "success": False, + "error": { + "code": "CONNECTION_ERROR", + "message": f"Connection error during upload: {str(e)}" + } + } + logger.warning(f"[remote_upload] Connection error on attempt {attempt + 1}: {e}") + + except requests.exceptions.RequestException as e: + last_error = { + "success": False, + "error": { + "code": "NETWORK_ERROR", + "message": f"Network error during upload: {str(e)}" + } + } + logger.warning(f"[remote_upload] Network error on attempt {attempt + 1}: {e}") + + except Exception as e: + last_error = { + "success": False, + "error": { + "code": "UPLOAD_ERROR", + "message": f"Unexpected error during upload: {str(e)}" + } + } + logger.error(f"[remote_upload] Unexpected error on attempt {attempt + 1}: {e}") + + # All retries exhausted + logger.error(f"[remote_upload] All {self.max_retries + 1} upload attempts failed for bundle {manifest.get('bundle_id', 'unknown')}") + return last_error or { + "success": False, + "error": { + "code": "MAX_RETRIES_EXCEEDED", + "message": f"Upload failed after {self.max_retries + 1} attempts" + } + } + + def get_server_status(self) -> Dict[str, Any]: + """Get server status and last sequence number with enhanced error 
handling.""" + try: + logger.debug(f"[remote_upload] Checking server status at {self.upload_endpoint}") + + response = self.session.get( + f"{self.upload_endpoint}/api/v1/delta/status", + params={'workspace_path': self.workspace_path}, + timeout=min(self.timeout, 10) # Use shorter timeout for status checks + ) + + if response.status_code == 200: + status_data = response.json() + logger.debug(f"[remote_upload] Server status: {status_data}") + return status_data + else: + error_msg = f"Status check failed with HTTP {response.status_code}" + try: + error_detail = response.json() + error_detail_msg = error_detail.get('error', {}).get('message', 'Unknown error') + error_msg += f": {error_detail_msg}" + except: + error_msg += f": {response.text[:100]}" + + logger.warning(f"[remote_upload] {error_msg}") + return { + "success": False, + "error": { + "code": "STATUS_ERROR", + "message": error_msg, + "status_code": response.status_code + } + } + + except requests.exceptions.Timeout as e: + error_msg = f"Status check timeout after {min(self.timeout, 10)}s" + logger.warning(f"[remote_upload] {error_msg}: {e}") + return { + "success": False, + "error": { + "code": "STATUS_TIMEOUT", + "message": error_msg + } + } + except requests.exceptions.ConnectionError as e: + error_msg = f"Cannot connect to server at {self.upload_endpoint}" + logger.warning(f"[remote_upload] {error_msg}: {e}") + return { + "success": False, + "error": { + "code": "CONNECTION_ERROR", + "message": error_msg + } + } + except requests.exceptions.RequestException as e: + error_msg = f"Network error during status check: {str(e)}" + logger.warning(f"[remote_upload] {error_msg}") + return { + "success": False, + "error": { + "code": "NETWORK_ERROR", + "message": error_msg + } + } + except Exception as e: + error_msg = f"Unexpected error during status check: {str(e)}" + logger.error(f"[remote_upload] {error_msg}") + return { + "success": False, + "error": { + "code": "STATUS_CHECK_ERROR", + "message": error_msg + } 
+ } + + def has_meaningful_changes(self, changes: Dict[str, List]) -> bool: + """Check if changes warrant a delta upload.""" + total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged") + return total_changes > 0 + + def process_and_upload_changes(self, changed_paths: List[Path]) -> bool: + """ + Process changed paths and upload delta bundle if meaningful changes exist. + Includes comprehensive error handling and graceful fallback. + + Args: + changed_paths: List of changed file paths + + Returns: + True if upload was successful, False otherwise + """ + try: + logger.info(f"[remote_upload] Processing {len(changed_paths)} changed paths") + + # Validate input + if not changed_paths: + logger.info("[remote_upload] No changed paths provided") + return True + + # Detect changes + try: + changes = self.detect_file_changes(changed_paths) + except Exception as e: + logger.error(f"[remote_upload] Error detecting file changes: {e}") + return False + + if not self.has_meaningful_changes(changes): + logger.info("[remote_upload] No meaningful changes detected, skipping upload") + return True + + # Log change summary + total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged") + logger.info(f"[remote_upload] Detected {total_changes} meaningful changes: " + f"{len(changes['created'])} created, {len(changes['updated'])} updated, " + f"{len(changes['deleted'])} deleted, {len(changes['moved'])} moved") + + # Create delta bundle + bundle_path = None + try: + bundle_path, manifest = self.create_delta_bundle(changes) + logger.info(f"[remote_upload] Created delta bundle: {manifest['bundle_id']} " + f"(seq: {manifest['sequence_number']}, size: {manifest['total_size_bytes']} bytes)") + + # Validate bundle was created successfully + if not bundle_path or not os.path.exists(bundle_path): + raise RuntimeError(f"Failed to create bundle at {bundle_path}") + + except Exception as e: + logger.error(f"[remote_upload] Error creating delta 
bundle: {e}") + return False + + # Upload bundle with retry logic + try: + response = self.upload_bundle(bundle_path, manifest) + + if response.get("success", False): + processed_ops = response.get('processed_operations', {}) + logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") + logger.info(f"[remote_upload] Processed operations: {processed_ops}") + + # Clean up local bundle after successful upload + try: + if os.path.exists(bundle_path): + os.remove(bundle_path) + logger.debug(f"[remote_upload] Cleaned up local bundle: {bundle_path}") + except Exception as cleanup_error: + logger.warning(f"[remote_upload] Failed to cleanup bundle {bundle_path}: {cleanup_error}") + + return True + else: + error = response.get("error", {}) + error_code = error.get("code", "UNKNOWN") + error_msg = error.get("message", "Unknown error") + + logger.error(f"[remote_upload] Upload failed: {error_msg}") + + # Handle specific error types + if error_code == "SEQUENCE_MISMATCH": + logger.info("[remote_upload] Attempting to handle sequence mismatch") + return self._handle_sequence_mismatch(response, manifest) + elif error_code in ["BUNDLE_TOO_LARGE", "BUNDLE_NOT_FOUND"]: + # These are unrecoverable errors + logger.error(f"[remote_upload] Unrecoverable error ({error_code}): {error_msg}") + return False + elif error_code in ["TIMEOUT_ERROR", "CONNECTION_ERROR", "NETWORK_ERROR"]: + # These might be temporary, suggest fallback + logger.warning(f"[remote_upload] Network-related error ({error_code}): {error_msg}") + logger.warning("[remote_upload] Consider falling back to local mode if this persists") + return False + else: + # Other errors + logger.error(f"[remote_upload] Upload error ({error_code}): {error_msg}") + return False + + except Exception as e: + logger.error(f"[remote_upload] Unexpected error during upload: {e}") + return False + + except Exception as e: + logger.error(f"[remote_upload] Critical error in process_and_upload_changes: {e}") + 
logger.exception("[remote_upload] Full traceback:") + return False + + def _handle_sequence_mismatch(self, error_response: Dict[str, Any], manifest: Dict[str, Any]) -> bool: + """Handle sequence number mismatch by recovering missing bundles.""" + try: + expected_seq = error_response["error"]["expected_sequence"] + current_seq = manifest["sequence_number"] + + print(f"[remote_upload] Sequence mismatch: expected {expected_seq}, got {current_seq}") + + # For PoC, we'll just force upload with the expected sequence + # In a production system, we would implement proper recovery + print(f"[remote_upload] Forcing upload with sequence {expected_seq}") + + # Update our sequence number + with self._sequence_lock: + self._sequence_number = expected_seq + self._set_last_sequence(expected_seq) + + # Retry upload with force=true + bundle_path = self.bundle_dir / f"{manifest['bundle_id']}.tar.gz" + if bundle_path.exists(): + data = { + 'workspace_path': self.workspace_path, + 'collection_name': self.collection_name, + 'sequence_number': str(expected_seq), + 'force': 'true' + } + + with open(bundle_path, 'rb') as bundle_file: + files = { + 'bundle': (f"{manifest['bundle_id']}.tar.gz", bundle_file, 'application/gzip') + } + + response = self.session.post( + f"{self.upload_endpoint}/api/v1/delta/upload", + files=files, + data=data, + timeout=self.timeout + ) + + if response.status_code == 200: + result = response.json() + if result.get("success", False): + print(f"[remote_upload] Force upload successful for bundle {manifest['bundle_id']}") + return True + + print(f"[remote_upload] Force upload failed for bundle {manifest['bundle_id']}") + return False + + except Exception as e: + print(f"[remote_upload] Error handling sequence mismatch: {e}") + return False + + +def is_remote_mode_enabled() -> bool: + """Check if remote upload mode is enabled via environment variables.""" + return os.environ.get("REMOTE_UPLOAD_ENABLED", "").lower() in {"1", "true", "yes", "on"} + + +def 
def get_remote_config() -> Dict[str, Any]:
    """Build the remote-upload client configuration from environment variables.

    Returns:
        Dict with keys: ``upload_endpoint`` (str), ``workspace_path`` (str),
        ``collection_name`` (str), ``max_retries`` (int), ``timeout`` (int).

    Notes:
        - The original annotated the return as ``Dict[str, str]`` even though
          two values are ints; the annotation is corrected here.
        - Malformed integer env values fall back to their defaults instead of
          raising ValueError during watcher startup.
    """
    def _int_env(name: str, default: int) -> int:
        # Tolerate unset or malformed values rather than crashing the watcher.
        try:
            return int(os.environ.get(name, default))
        except (TypeError, ValueError):
            return default

    return {
        "upload_endpoint": os.environ.get("REMOTE_UPLOAD_ENDPOINT", "http://localhost:8080"),
        # WATCH_ROOT takes precedence over WORKSPACE_PATH, then /work.
        "workspace_path": os.environ.get("WATCH_ROOT", os.environ.get("WORKSPACE_PATH", "/work")),
        "collection_name": os.environ.get("COLLECTION_NAME", "my-collection"),
        "max_retries": _int_env("REMOTE_UPLOAD_MAX_RETRIES", 3),
        "timeout": _int_env("REMOTE_UPLOAD_TIMEOUT", 30),
    }
# In-memory sequence tracking (in production, use persistent storage)
_sequence_tracker: Dict[str, int] = {}


def get_workspace_key(workspace_path: str) -> str:
    """Return a short, stable identifier for a workspace path.

    First 16 hex chars of SHA-256 of the path — deterministic across
    processes, safe as a dict key.
    """
    return hashlib.sha256(workspace_path.encode('utf-8')).hexdigest()[:16]


def get_next_sequence(workspace_path: str) -> int:
    """Allocate and return the next sequence number for a workspace.

    Note: mutates the in-memory tracker — this is not a pure read.
    """
    key = get_workspace_key(workspace_path)
    next_seq = _sequence_tracker.get(key, 0) + 1
    _sequence_tracker[key] = next_seq
    return next_seq


def get_last_sequence(workspace_path: str) -> int:
    """Return the last recorded sequence number for a workspace (0 if none)."""
    return _sequence_tracker.get(get_workspace_key(workspace_path), 0)


def validate_bundle_format(bundle_path: Path) -> Dict[str, Any]:
    """Validate a delta bundle archive and return its parsed manifest.

    Checks that the tar.gz contains manifest.json,
    metadata/operations.json and metadata/hashes.json, then parses the
    manifest and verifies its required fields.

    Args:
        bundle_path: Path to the .tar.gz bundle on disk.

    Returns:
        The parsed manifest dict.

    Raises:
        ValueError: if the archive is malformed or a required file/field
            is missing.
    """
    try:
        with tarfile.open(bundle_path, "r:gz") as tar:
            members = tar.getnames()

            # Match exact relative paths (optionally under one top-level
            # directory) rather than loose substrings: the original
            # `req_file in member` test would accept e.g.
            # "old_manifest.json" as "manifest.json".
            def _find(rel: str) -> Optional[str]:
                for member in members:
                    if member == rel or member.endswith("/" + rel):
                        return member
                return None

            required_files = ["manifest.json", "metadata/operations.json", "metadata/hashes.json"]
            for req_file in required_files:
                if _find(req_file) is None:
                    raise ValueError(f"Missing required file: {req_file}")

            manifest_member = _find("manifest.json")
            manifest_file = tar.extractfile(manifest_member)
            if not manifest_file:
                raise ValueError("Cannot extract manifest.json")

            manifest = json.loads(manifest_file.read().decode('utf-8'))

            # Structural validation of the manifest itself.
            required_fields = ["version", "bundle_id", "workspace_path", "created_at", "sequence_number"]
            for field_name in required_fields:
                if field_name not in manifest:
                    raise ValueError(f"Missing required field in manifest: {field_name}")

            return manifest

    except Exception as e:
        # Normalize every failure mode to ValueError, preserving the cause.
        raise ValueError(f"Invalid bundle format: {str(e)}") from e
async def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: Dict[str, Any]) -> Dict[str, int]:
    """Apply the file operations contained in a delta bundle to the workspace.

    Reads metadata/operations.json from the bundle and, per operation,
    creates/updates/moves files (extracting content from files/<kind>/<path>
    inside the archive) or deletes files from the workspace.

    Args:
        workspace_path: Absolute path of the workspace to apply changes to.
        bundle_path: Path to the .tar.gz delta bundle.
        manifest: Parsed bundle manifest (currently unused here; validation
            happens in validate_bundle_format).

    Returns:
        Operation counters: created/updated/deleted/moved/skipped/failed.

    Raises:
        Propagates any error outside per-operation handling (e.g. a
        corrupt archive or missing operations.json).
    """
    # Local logger keeps this function self-contained.
    log = logging.getLogger(__name__)

    operations_count = {
        "created": 0,
        "updated": 0,
        "deleted": 0,
        "moved": 0,
        "skipped": 0,
        "failed": 0,
    }

    try:
        # Ensure workspace directory exists; resolve once for the
        # containment check below.
        workspace = Path(workspace_path).resolve()
        workspace.mkdir(parents=True, exist_ok=True)

        with tarfile.open(bundle_path, "r:gz") as tar:
            # Snapshot member names ONCE. The original called
            # tar.getnames() inside the per-operation loop, which is
            # O(operations x members).
            member_names = tar.getnames()

            ops_member = next(
                (m for m in member_names if m.endswith("metadata/operations.json")), None
            )
            if not ops_member:
                raise ValueError("operations.json not found in bundle")

            ops_file = tar.extractfile(ops_member)
            if not ops_file:
                raise ValueError("Cannot extract operations.json")

            operations_data = json.loads(ops_file.read().decode('utf-8'))
            operations = operations_data.get("operations", [])

            def _safe_target(rel: str) -> Optional[Path]:
                # SECURITY: the bundle is untrusted input. Reject paths that
                # would escape the workspace (absolute paths, ".." tricks).
                # The original joined rel_path into the workspace unchecked.
                candidate = (workspace / rel).resolve()
                if candidate == workspace or workspace not in candidate.parents:
                    return None
                return candidate

            def _extract_to(kind: str, rel: str, target: Path) -> bool:
                # Pull files/<kind>/<rel> out of the archive into target.
                member = next(
                    (m for m in member_names if m.endswith(f"files/{kind}/{rel}")), None
                )
                if not member:
                    return False
                fobj = tar.extractfile(member)
                if not fobj:
                    return False
                target.parent.mkdir(parents=True, exist_ok=True)
                target.write_bytes(fobj.read())
                return True

            for operation in operations:
                op_type = operation.get("operation")
                rel_path = operation.get("path")

                if not rel_path:
                    operations_count["skipped"] += 1
                    continue

                target_path = _safe_target(rel_path)
                if target_path is None:
                    log.warning(f"Rejecting unsafe path in bundle: {rel_path}")
                    operations_count["failed"] += 1
                    continue

                try:
                    if op_type in ("created", "updated", "moved"):
                        # All three write the payload to the destination;
                        # only the archive subdirectory differs.
                        if _extract_to(op_type, rel_path, target_path):
                            operations_count[op_type] += 1
                        else:
                            operations_count["failed"] += 1
                    elif op_type == "deleted":
                        if target_path.exists():
                            target_path.unlink()
                            operations_count["deleted"] += 1
                        else:
                            # Already gone — not an error.
                            operations_count["skipped"] += 1
                    else:
                        # Unknown operation type.
                        operations_count["skipped"] += 1
                except Exception as e:
                    log.error(f"Error processing operation {op_type} for {rel_path}: {e}")
                    operations_count["failed"] += 1

        return operations_count

    except Exception as e:
        log.error(f"Error processing delta bundle: {e}")
        raise
@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Liveness endpoint: reports service version and key configuration."""
    return HealthResponse(
        status="healthy",
        timestamp=datetime.now().isoformat(),
        version="1.0.0",
        qdrant_url=QDRANT_URL,
        work_dir=WORK_DIR
    )

@app.get("/api/v1/delta/status", response_model=StatusResponse)
async def get_status(workspace_path: str):
    """Report upload status for a workspace.

    Returns the resolved collection name, the last acknowledged sequence
    number, and (when workspace_state is importable) the timestamp of the
    last recorded activity. Errors surface as HTTP 500.
    """
    try:
        # Get collection name — fall back to the env default when the
        # workspace_state helpers were not importable (test environments).
        if get_collection_name:
            collection_name = get_collection_name(workspace_path)
        else:
            collection_name = DEFAULT_COLLECTION

        # Last sequence comes from the in-memory tracker (process-local;
        # resets on restart).
        last_sequence = get_last_sequence(workspace_path)

        # Get workspace state if available; best-effort — any failure
        # just leaves last_upload as None.
        last_upload = None
        if get_workspace_state:
            try:
                state = get_workspace_state(workspace_path)
                last_activity = state.get("last_activity")
                if last_activity:
                    last_upload = last_activity.get("timestamp")
            except Exception:
                pass

        return StatusResponse(
            workspace_path=workspace_path,
            collection_name=collection_name,
            last_sequence=last_sequence,
            last_upload=last_upload,
            # NOTE(review): pending_operations is hard-coded to 0 — there is
            # no server-side queue to report on yet.
            pending_operations=0,
            status="ready",
            server_info={
                "version": "1.0.0",
                "max_bundle_size_mb": MAX_BUNDLE_SIZE_MB,
                "supported_formats": ["tar.gz"]
            }
        )

    except Exception as e:
        logger.error(f"Error getting status: {e}")
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/api/v1/delta/upload", response_model=UploadResponse)
async def upload_delta_bundle(
    request: Request,
    bundle: UploadFile = File(...),
    workspace_path: str = Form(...),
    collection_name: Optional[str] = Form(None),
    sequence_number: Optional[int] = Form(None),
    force: Optional[bool] = Form(False)
):
    """Receive, validate, apply, and index a delta bundle.

    Flow: resolve workspace path → resolve collection → size check →
    spool upload to a temp file → validate bundle format → sequence check
    (unless force) → apply file operations → reindex → update tracker and
    workspace activity. Processing errors are returned as a structured
    UploadResponse rather than an HTTP error.
    """
    start_time = datetime.now()

    try:
        # Relative workspace paths are anchored under WORK_DIR.
        workspace = Path(workspace_path)
        if not workspace.is_absolute():
            workspace = Path(WORK_DIR) / workspace

        workspace_path = str(workspace.resolve())

        # Get collection name: explicit form field wins, then the
        # workspace-state mapping, then the env default.
        if not collection_name:
            if get_collection_name:
                collection_name = get_collection_name(workspace_path)
            else:
                collection_name = DEFAULT_COLLECTION

        # Validate bundle size. NOTE(review): UploadFile.size may be None
        # (then this check is skipped and only disk spooling bounds usage).
        if bundle.size and bundle.size > MAX_BUNDLE_SIZE_MB * 1024 * 1024:
            raise HTTPException(
                status_code=413,
                detail=f"Bundle too large. Max size: {MAX_BUNDLE_SIZE_MB}MB"
            )

        # Spool the upload to a temp file; delete=False because the path is
        # reused after the context manager closes the handle, and cleanup
        # happens in the finally block below.
        with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as temp_file:
            bundle_path = Path(temp_file.name)

            # NOTE(review): reads the whole upload into memory before
            # writing — acceptable given MAX_BUNDLE_SIZE_MB, not streaming.
            content = await bundle.read()
            bundle_path.write_bytes(content)

        try:
            # Validate bundle format (raises ValueError on bad bundles,
            # caught by the generic handler below).
            manifest = validate_bundle_format(bundle_path)
            bundle_id = manifest.get("bundle_id")
            manifest_sequence = manifest.get("sequence_number")

            # Sequence from the form field wins; fall back to the manifest.
            if sequence_number is None:
                sequence_number = manifest_sequence

            # Strict ordering check unless the client forces the upload.
            if not force and sequence_number is not None:
                last_sequence = get_last_sequence(workspace_path)
                if sequence_number != last_sequence + 1:
                    return UploadResponse(
                        success=False,
                        error={
                            "code": "SEQUENCE_MISMATCH",
                            "message": f"Expected sequence {last_sequence + 1}, got {sequence_number}",
                            "expected_sequence": last_sequence + 1,
                            "received_sequence": sequence_number,
                            "retry_after": 5000
                        }
                    )

            # Apply the bundle's file operations to the workspace.
            operations_count = await process_delta_bundle(workspace_path, bundle_path, manifest)

            # Reindex. NOTE(review): this reindexes the whole workspace via
            # index_repo, not just the changed files — potentially slow on
            # large repos.
            indexed_points = await index_changed_files(workspace_path, collection_name)

            # Record the accepted sequence number (in-memory only).
            if sequence_number is not None:
                key = get_workspace_key(workspace_path)
                _sequence_tracker[key] = sequence_number

            # Best-effort workspace activity log.
            if update_last_activity:
                activity = {
                    "timestamp": datetime.now().isoformat(),
                    "action": "indexed",
                    "file_path": bundle_id,
                    "details": {
                        "bundle_id": bundle_id,
                        "operations": operations_count,
                        "indexed_points": indexed_points,
                        "source": "delta_upload"
                    }
                }
                update_last_activity(workspace_path, activity)

            # Calculate processing time
            processing_time = (datetime.now() - start_time).total_seconds() * 1000

            return UploadResponse(
                success=True,
                bundle_id=bundle_id,
                sequence_number=sequence_number,
                processed_operations=operations_count,
                processing_time_ms=int(processing_time),
                indexed_points=indexed_points,
                collection_name=collection_name,
                # NOTE(review): truthiness check means sequence_number == 0
                # yields next_sequence=None; "is not None" was likely meant.
                next_sequence=sequence_number + 1 if sequence_number else None
            )

        finally:
            # Always remove the spooled temp file, success or failure.
            try:
                bundle_path.unlink()
            except Exception:
                pass

    except HTTPException:
        # Re-raise FastAPI errors (e.g. 413) untouched.
        raise
    except Exception as e:
        # All other failures become a structured error response (HTTP 200
        # with success=False), matching the client's error handling.
        logger.error(f"Error processing upload: {e}")
        return UploadResponse(
            success=False,
            error={
                "code": "PROCESSING_ERROR",
                "message": f"Error processing bundle: {str(e)}"
            }
        )

@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Last-resort handler: log and return a generic 500 JSON envelope."""
    logger.error(f"Unhandled exception: {exc}")
    return JSONResponse(
        status_code=500,
        content={
            "success": False,
            "error": {
                "code": "INTERNAL_ERROR",
                "message": "Internal server error"
            }
        }
    )

def main():
    """Entry point: start the uvicorn server with env-driven host/port."""
    host = os.environ.get("UPLOAD_SERVICE_HOST", "0.0.0.0")
    port = int(os.environ.get("UPLOAD_SERVICE_PORT", "8002"))

    logger.info(f"Starting upload service on {host}:{port}")
    logger.info(f"Qdrant URL: {QDRANT_URL}")
    logger.info(f"Work directory: {WORK_DIR}")
    logger.info(f"Max bundle size: {MAX_BUNDLE_SIZE_MB}MB")

    uvicorn.run(
        app,
        host=host,
        port=port,
        log_level="info",
        access_log=True
    )

if __name__ == "__main__":
    main()
+from typing import Set, Optional from qdrant_client import QdrantClient, models from fastembed import TextEmbedding @@ -33,6 +33,17 @@ import scripts.ingest_code as idx +# Import remote upload client +try: + from scripts.remote_upload_client import ( + RemoteUploadClient, + is_remote_mode_enabled, + get_remote_config + ) + _REMOTE_UPLOAD_AVAILABLE = True +except ImportError: + _REMOTE_UPLOAD_AVAILABLE = False + QDRANT_URL = os.environ.get("QDRANT_URL", "http://qdrant:6333") COLLECTION = os.environ.get("COLLECTION_NAME", "my-collection") MODEL = os.environ.get("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5") @@ -43,11 +54,12 @@ class ChangeQueue: - def __init__(self, process_cb): + def __init__(self, process_cb, remote_client: Optional[RemoteUploadClient] = None): self._lock = threading.Lock() self._paths: Set[Path] = set() self._timer: threading.Timer | None = None self._process_cb = process_cb + self._remote_client = remote_client def add(self, p: Path): with self._lock: @@ -63,11 +75,26 @@ def _flush(self): paths = list(self._paths) self._paths.clear() self._timer = None - self._process_cb(paths) + + # Handle remote upload if enabled + if self._remote_client and _REMOTE_UPLOAD_AVAILABLE: + try: + success = self._remote_client.process_and_upload_changes(paths) + if success: + print("[remote_upload] Delta upload completed successfully") + else: + print("[remote_upload] Delta upload failed, falling back to local processing") + self._process_cb(paths) + except Exception as e: + print(f"[remote_upload] Error during delta upload: {e}") + print("[remote_upload] Falling back to local processing") + self._process_cb(paths) + else: + self._process_cb(paths) class IndexHandler(FileSystemEventHandler): - def __init__(self, root: Path, queue: ChangeQueue, client: QdrantClient, collection: str): + def __init__(self, root: Path, queue: ChangeQueue, client: Optional[QdrantClient], collection: str): super().__init__() self.root = root self.queue = queue @@ -149,19 +176,26 @@ def 
on_deleted(self, event): # Only attempt deletion for code files we would have indexed if p.suffix.lower() not in idx.CODE_EXTS: return - try: - idx.delete_points_by_path(self.client, self.collection, str(p)) - print(f"[deleted] {p}") - # Drop local cache entry + # Only attempt deletion if we have a local client + if self.client is not None: try: - remove_cached_file(str(self.root), str(p)) + idx.delete_points_by_path(self.client, self.collection, str(p)) + print(f"[deleted] {p}") except Exception: pass + else: + print(f"[remote_mode] File deletion detected: {p}") + + # Drop local cache entry (always do this) + try: + remove_cached_file(str(self.root), str(p)) + except Exception: + pass - try: - _log_activity(str(self.root), "deleted", p) - except Exception: - pass + try: + _log_activity(str(self.root), "deleted", p) + except Exception: + pass except Exception as e: try: print(f"[delete_error] {p}: {e}") @@ -187,20 +221,24 @@ def on_moved(self, event): rel_dir = "/" if self.excl.exclude_dir(rel_dir): if src.suffix.lower() in idx.CODE_EXTS: - try: - idx.delete_points_by_path(self.client, self.collection, str(src)) - print(f"[moved:ignored_dest_deleted_src] {src} -> {dest}") - except Exception: - pass + if self.client is not None: + try: + idx.delete_points_by_path(self.client, self.collection, str(src)) + print(f"[moved:ignored_dest_deleted_src] {src} -> {dest}") + except Exception: + pass + else: + print(f"[remote_mode] Move to ignored destination: {src} -> {dest}") return except Exception: pass - # Try in-place rename (preserve vectors) + # Try in-place rename (preserve vectors) - only if we have a local client moved_count = -1 - try: - moved_count = _rename_in_store(self.client, self.collection, src, dest) - except Exception: - moved_count = -1 + if self.client is not None: + try: + moved_count = _rename_in_store(self.client, self.collection, src, dest) + except Exception: + moved_count = -1 if moved_count and moved_count > 0: try: print(f"[moved] {src} -> {dest} 
({moved_count} chunk(s) relinked)") @@ -227,12 +265,15 @@ def on_moved(self, event): pass return # Fallback: delete old then index new destination - try: - if src.suffix.lower() in idx.CODE_EXTS: - idx.delete_points_by_path(self.client, self.collection, str(src)) - print(f"[moved:deleted_src] {src}") - except Exception: - pass + if self.client is not None: + try: + if src.suffix.lower() in idx.CODE_EXTS: + idx.delete_points_by_path(self.client, self.collection, str(src)) + print(f"[moved:deleted_src] {src}") + except Exception: + pass + else: + print(f"[remote_mode] Move detected: {src} -> {dest}") try: self._maybe_enqueue(str(dest)) except Exception: @@ -388,6 +429,36 @@ def _rename_in_store(client: QdrantClient, collection: str, src: Path, dest: Pat def main(): + # Check if remote mode is enabled + remote_mode = False + remote_client = None + + if _REMOTE_UPLOAD_AVAILABLE and is_remote_mode_enabled(): + remote_mode = True + try: + remote_config = get_remote_config() + remote_client = RemoteUploadClient( + upload_endpoint=remote_config["upload_endpoint"], + workspace_path=remote_config["workspace_path"], + collection_name=remote_config["collection_name"], + max_retries=remote_config["max_retries"], + timeout=remote_config["timeout"] + ) + print(f"[remote_upload] Remote mode enabled: {remote_config['upload_endpoint']}") + + # Check server status + status = remote_client.get_server_status() + if status.get("success", False): + print(f"[remote_upload] Server status: {status.get('status', 'unknown')}") + else: + print(f"[remote_upload] Warning: Could not reach server - {status.get('error', {}).get('message', 'Unknown error')}") + + except Exception as e: + print(f"[remote_upload] Error initializing remote client: {e}") + print("[remote_upload] Falling back to local mode") + remote_mode = False + remote_client = None + # Resolve collection name from workspace state before any client/state ops try: from scripts.workspace_state import get_collection_name as _get_coll @@ 
-400,49 +471,56 @@ def main(): except Exception: pass + mode_str = "REMOTE" if remote_mode else "LOCAL" print( - f"Watch mode: root={ROOT} qdrant={QDRANT_URL} collection={COLLECTION} model={MODEL}" + f"Watch mode: {mode_str} root={ROOT} qdrant={QDRANT_URL} collection={COLLECTION} model={MODEL}" ) - client = QdrantClient( - url=QDRANT_URL, timeout=int(os.environ.get("QDRANT_TIMEOUT", "20") or 20) - ) + # Initialize Qdrant client for local mode (remote mode doesn't need it for basic operation) + client = None + model = None + vector_name = None + + if not remote_mode: + client = QdrantClient( + url=QDRANT_URL, timeout=int(os.environ.get("QDRANT_TIMEOUT", "20") or 20) + ) - # Compute embedding dimension first (for deterministic dense vector selection) - model = TextEmbedding(model_name=MODEL) - dim = len(next(model.embed(["dimension probe"]))) + # Compute embedding dimension first (for deterministic dense vector selection) + model = TextEmbedding(model_name=MODEL) + dim = len(next(model.embed(["dimension probe"]))) - # Determine dense vector name deterministically - try: - info = client.get_collection(COLLECTION) - cfg = info.config.params.vectors - if isinstance(cfg, dict) and cfg: - # Prefer vector whose size matches embedding dim - vector_name = None - for name, params in cfg.items(): - psize = getattr(params, "size", None) or getattr(params, "dim", None) - if psize and int(psize) == int(dim): - vector_name = name - break - # If LEX vector exists, pick a different name as dense - if vector_name is None and getattr(idx, "LEX_VECTOR_NAME", None) in cfg: - for name in cfg.keys(): - if name != idx.LEX_VECTOR_NAME: + # Determine dense vector name deterministically + try: + info = client.get_collection(COLLECTION) + cfg = info.config.params.vectors + if isinstance(cfg, dict) and cfg: + # Prefer vector whose size matches embedding dim + vector_name = None + for name, params in cfg.items(): + psize = getattr(params, "size", None) or getattr(params, "dim", None) + if psize 
and int(psize) == int(dim): vector_name = name break - if vector_name is None: + # If LEX vector exists, pick a different name as dense + if vector_name is None and getattr(idx, "LEX_VECTOR_NAME", None) in cfg: + for name in cfg.keys(): + if name != idx.LEX_VECTOR_NAME: + vector_name = name + break + if vector_name is None: + vector_name = idx._sanitize_vector_name(MODEL) + else: vector_name = idx._sanitize_vector_name(MODEL) - else: + except Exception: vector_name = idx._sanitize_vector_name(MODEL) - except Exception: - vector_name = idx._sanitize_vector_name(MODEL) - # Ensure collection + payload indexes exist - try: - idx.ensure_collection(client, COLLECTION, dim, vector_name) - except Exception: - pass - idx.ensure_payload_indexes(client, COLLECTION) + # Ensure collection + payload indexes exist + try: + idx.ensure_collection(client, COLLECTION, dim, vector_name) + except Exception: + pass + idx.ensure_payload_indexes(client, COLLECTION) # Ensure workspace state exists and set collection try: @@ -451,7 +529,12 @@ def main(): except Exception: pass - q = ChangeQueue(lambda paths: _process_paths(paths, client, model, vector_name, str(ROOT))) + # Create change queue with remote client if enabled + if remote_mode: + q = ChangeQueue(lambda paths: _process_paths(paths, client, model, vector_name, str(ROOT), remote_mode), remote_client) + else: + q = ChangeQueue(lambda paths: _process_paths(paths, client, model, vector_name, str(ROOT), remote_mode)) + handler = IndexHandler(ROOT, q, client, COLLECTION) obs = Observer() @@ -468,7 +551,12 @@ def main(): obs.join() -def _process_paths(paths, client, model, vector_name: str, workspace_path: str): +def _process_paths(paths, client, model, vector_name: str, workspace_path: str, remote_mode: bool = False): + # In remote mode, actual processing is handled by the remote client + # This function is called as a fallback when remote upload fails + if remote_mode: + print(f"[local_fallback] Processing {len(paths)} files locally 
due to remote upload failure") + # Prepare progress unique_paths = sorted(set(Path(x) for x in paths)) total = len(unique_paths) @@ -490,33 +578,42 @@ def _process_paths(paths, client, model, vector_name: str, workspace_path: str): current = p if not p.exists(): # File was removed; ensure its points are deleted - try: - idx.delete_points_by_path(client, COLLECTION, str(p)) - print(f"[deleted] {p}") - except Exception: - pass + if client is not None: # Only process if we have a local client + try: + idx.delete_points_by_path(client, COLLECTION, str(p)) + print(f"[deleted] {p}") + except Exception: + pass _log_activity(workspace_path, "deleted", p) processed += 1 _update_progress(workspace_path, started_at, processed, total, current) continue - # Lazily instantiate model if needed - if model is None: - from fastembed import TextEmbedding - mname = os.environ.get("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5") - model = TextEmbedding(model_name=mname) - ok = idx.index_single_file( - client, model, COLLECTION, vector_name, p, dedupe=True, skip_unchanged=False - ) - status = "indexed" if ok else "skipped" - print(f"[{status}] {p}") - if ok: - try: - size = int(p.stat().st_size) - except Exception: - size = None - _log_activity(workspace_path, "indexed", p, {"file_size": size}) + + # Only process files locally if we have a client and model + if client is not None and model is not None: + # Lazily instantiate model if needed + if model is None: + from fastembed import TextEmbedding + mname = os.environ.get("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5") + model = TextEmbedding(model_name=mname) + ok = idx.index_single_file( + client, model, COLLECTION, vector_name, p, dedupe=True, skip_unchanged=False + ) + status = "indexed" if ok else "skipped" + print(f"[{status}] {p}") + if ok: + try: + size = int(p.stat().st_size) + except Exception: + size = None + _log_activity(workspace_path, "indexed", p, {"file_size": size}) + else: + _log_activity(workspace_path, "skipped", p, 
{"reason": "no-change-or-error"}) else: - _log_activity(workspace_path, "skipped", p, {"reason": "no-change-or-error"}) + # In remote mode without fallback, just log activity + print(f"[remote_mode] Not processing locally: {p}") + _log_activity(workspace_path, "remote_processed", p) + processed += 1 _update_progress(workspace_path, started_at, processed, total, current) From 2de66b34635294cbf24ed4bdb480e922fdb39a7b Mon Sep 17 00:00:00 2001 From: Reese Date: Mon, 10 Nov 2025 09:26:10 +0000 Subject: [PATCH 09/16] feat(dev): Add docker-compose for remote stack simulation - Simulates Kubernetes-hosted environment locally - Enables per-collection repositories and search - Maintains backward compatibility via env var - Supports both single and multi-collection modes - Adds memory search capabilities per collection --- .env.example | 5 + Dockerfile.mcp | 7 +- Dockerfile.upload-service | 3 +- delta_upload_architecture.md | 249 ------- delta_upload_design.md | 651 ----------------- deploy/kubernetes/GIT_SYNC_SETUP.md | 356 ---------- deploy/kubernetes/README.md | 49 +- deploy/kubernetes/configmap.yaml | 12 +- deploy/kubernetes/deploy-with-source.sh | 310 --------- deploy/kubernetes/indexer-services.yaml | 45 +- deploy/kubernetes/mcp-indexer-git.yaml | 216 ------ deploy/kubernetes/mcp-memory-git.yaml | 209 ------ deploy/kubernetes/upload-codebase-pvc.yaml | 23 + deploy/kubernetes/upload-pvc.yaml | 12 +- deploy/kubernetes/upload-service.yaml | 4 + docker-compose.dev-remote.yml | 210 ++++-- docs/dev-remote-setup.md | 341 --------- docs/remote_upload.md | 219 ------ docs/upload_service.md | 261 ------- docs/usage_guide.md | 597 ---------------- scripts/create_indexes.py | 38 +- scripts/dev-setup.sh | 0 scripts/health_check.py | 164 +++-- scripts/hybrid_search.py | 51 +- scripts/ingest_code.py | 297 +++++--- scripts/mcp_indexer_server.py | 240 ++++++- scripts/mcp_memory_server.py | 410 +++++++++-- scripts/remote_upload_client.py | 655 +++++++++++------ scripts/upload_service.py 
| 218 +++--- scripts/wait-for-qdrant.sh | 14 +- scripts/warm_all_collections.py | 60 ++ scripts/watch_index.py | 772 +++++++++++++++++---- scripts/workspace_state.py | 597 +++++++++------- 33 files changed, 2684 insertions(+), 4611 deletions(-) delete mode 100644 delta_upload_architecture.md delete mode 100644 delta_upload_design.md delete mode 100644 deploy/kubernetes/GIT_SYNC_SETUP.md delete mode 100755 deploy/kubernetes/deploy-with-source.sh delete mode 100644 deploy/kubernetes/mcp-indexer-git.yaml delete mode 100644 deploy/kubernetes/mcp-memory-git.yaml create mode 100644 deploy/kubernetes/upload-codebase-pvc.yaml delete mode 100644 docs/dev-remote-setup.md delete mode 100644 docs/remote_upload.md delete mode 100644 docs/upload_service.md delete mode 100644 docs/usage_guide.md mode change 100644 => 100755 scripts/dev-setup.sh create mode 100644 scripts/warm_all_collections.py diff --git a/.env.example b/.env.example index d7757eea..14abbf9d 100644 --- a/.env.example +++ b/.env.example @@ -3,6 +3,11 @@ QDRANT_URL=http://localhost:6333 QDRANT_API_KEY= COLLECTION_NAME=my-collection +# Repository mode: 0=single-repo (default), 1=multi-repo +# Single-repo: All files go into one collection (COLLECTION_NAME) +# Multi-repo: Each subdirectory gets its own collection +MULTI_REPO_MODE=0 + # Embeddings EMBEDDING_MODEL=BAAI/bge-base-en-v1.5 EMBEDDING_PROVIDER=fastembed diff --git a/Dockerfile.mcp b/Dockerfile.mcp index ef40683b..22524111 100644 --- a/Dockerfile.mcp +++ b/Dockerfile.mcp @@ -3,11 +3,16 @@ FROM python:3.11-slim ENV PYTHONDONTWRITEBYTECODE=1 \ PYTHONUNBUFFERED=1 \ - WORK_ROOTS="/work,/app" + WORK_ROOTS="/work,/app" \ + HF_HOME=/tmp/cache \ + TRANSFORMERS_CACHE=/tmp/cache # Install latest FastMCP with Streamable HTTP (RMCP) support + deps RUN pip install --no-cache-dir --upgrade mcp fastmcp qdrant-client fastembed +# Create cache directory with proper permissions +RUN mkdir -p /tmp/cache && chmod 755 /tmp/cache + # Bake scripts into image so server can run even 
when /work points elsewhere COPY scripts /app/scripts diff --git a/Dockerfile.upload-service b/Dockerfile.upload-service index 359bdc04..ef6d4538 100644 --- a/Dockerfile.upload-service +++ b/Dockerfile.upload-service @@ -5,7 +5,8 @@ FROM python:3.11-slim ENV PYTHONUNBUFFERED=1 \ PYTHONDONTWRITEBYTECODE=1 \ PIP_NO_CACHE_DIR=1 \ - PIP_DISABLE_PIP_VERSION_CHECK=1 + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + PYTHONPATH=/app # Install system dependencies RUN apt-get update && apt-get install -y \ diff --git a/delta_upload_architecture.md b/delta_upload_architecture.md deleted file mode 100644 index 2b9a93ce..00000000 --- a/delta_upload_architecture.md +++ /dev/null @@ -1,249 +0,0 @@ -# Delta Upload System Architecture - -## System Overview - -```mermaid -graph TB - subgraph "Local Environment" - FS[File System] - W[watch_index.py] - CQ[ChangeQueue] - DC[Delta Creator] - LC[Local Cache] - end - - subgraph "Delta Upload Service" - API[HTTP API] - BP[Bundle Processor] - Q[Qdrant Client] - WS[Workspace State] - end - - subgraph "Storage" - S3[Bundle Storage] - QDR[Qdrant DB] - end - - FS --> W - W --> CQ - CQ --> DC - DC --> LC - DC --> API - API --> BP - BP --> Q - BP --> WS - Q --> QDR - BP --> S3 -``` - -## Delta Bundle Creation Flow - -```mermaid -sequenceDiagram - participant FS as File System - participant W as watch_index.py - participant CQ as ChangeQueue - participant DC as Delta Creator - participant LC as Local Cache - participant API as Upload API - - FS->>W: File change event - W->>CQ: Add path to queue - CQ->>CQ: Debounce changes - CQ->>DC: Flush batched changes - DC->>LC: Check cached hashes - LC-->>DC: Return cached hashes - DC->>FS: Read file contents - DC->>DC: Detect change types - DC->>DC: Create delta bundle - DC->>LC: Save bundle locally - DC->>API: Upload bundle - API-->>DC: Acknowledge receipt - DC->>LC: Mark as acknowledged -``` - -## Change Detection Algorithm - -```mermaid -flowchart TD - A[Start: File Changes Detected] --> B[Get Cached Hashes] - B --> 
C{File Exists?} - C -->|No| D[File Deleted] - C -->|Yes| E[Calculate Current Hash] - E --> F{Hash Changed?} - F -->|No| G[Unchanged] - F -->|Yes| H{Has Cached Hash?} - H -->|No| I[File Created] - H -->|Yes| J[File Updated] - D --> K[Add to Deleted List] - I --> L[Add to Created List] - J --> M[Add to Updated List] - G --> N[Skip] - K --> O[Detect Moves] - L --> O - M --> O - N --> O - O --> P[Create Delta Bundle] -``` - -## Error Recovery Flow - -```mermaid -stateDiagram-v2 - [*] --> UploadBundle - UploadBundle --> Success: Upload OK - UploadBundle --> RetryableError: Network/Temp Error - UploadBundle --> SequenceError: Sequence Mismatch - UploadBundle --> FatalError: Permanent Failure - - RetryableError --> WaitRetry - WaitRetry --> UploadBundle: Retry - - SequenceError --> RequestRecovery - RequestRecovery --> ApplyRecovered: Recovery OK - RequestRecovery --> FatalError: Recovery Failed - ApplyRecovered --> UploadBundle - - Success --> [*] - FatalError --> [*] -``` - -## Integration Points - -```mermaid -graph LR - subgraph "Existing Components" - WI[watch_index.py] - IC[ingest_code.py] - WS[workspace_state.py] - Q[Qdrant Client] - end - - subgraph "New Delta Components" - DQ[DeltaChangeQueue] - DC[DeltaCreator] - DS[DeltaService] - DP[DeltaProcessor] - end - - WI --> DQ - DQ --> DC - DC --> DS - DS --> DP - DP --> IC - DP --> WS - IC --> Q -``` - -## Data Flow Architecture - -```mermaid -graph TB - subgraph "Client Side" - A[File Changes] --> B[Change Detection] - B --> C[Delta Bundle Creation] - C --> D[Local Persistence] - D --> E[HTTP Upload] - end - - subgraph "Server Side" - E --> F[Bundle Reception] - F --> G[Validation] - G --> H[Processing Queue] - H --> I[File Operations] - I --> J[Qdrant Updates] - I --> K[State Updates] - end - - subgraph "Recovery Flow" - L[Sequence Mismatch] --> M[Recovery Request] - M --> N[Missing Bundles] - N --> O[Replay Operations] - end - - F -.-> L -``` - -## Component Interactions - -```mermaid -classDiagram - class 
ChangeQueue { - +add(Path) - +_flush() - -_lock: threading.Lock - -_paths: Set[Path] - -_timer: threading.Timer - } - - class DeltaChangeQueue { - +add(Path) - +_flush() - -detect_changes() - -create_bundle() - -upload_bundle() - } - - class DeltaCreator { - +create_bundle(changes) - +detect_file_changes() - +detect_moves() - -calculate_hashes() - } - - class DeltaService { - +upload_bundle() - +get_status() - +recover_bundles() - -validate_bundle() - -process_operations() - } - - class DeltaProcessor { - +process_bundle() - +process_created() - +process_updated() - +process_deleted() - +process_moved() - } - - ChangeQueue <|-- DeltaChangeQueue - DeltaChangeQueue --> DeltaCreator - DeltaCreator --> DeltaService - DeltaService --> DeltaProcessor -``` - -## Deployment Architecture - -```mermaid -graph TB - subgraph "Development Environment" - DEV_FS[Local File System] - DEV_WATCH[watch_index.py] - DEV_DELTA[Delta Client] - DEV_API[Local Delta API] - end - - subgraph "Production Environment" - PROD_FS[Shared File System] - PROD_WATCH[watch_index.py Cluster] - PROD_DELTA[Delta Client Cluster] - LB[Load Balancer] - PROD_API[Delta API Cluster] - REDIS[Redis Cache] - S3[Object Storage] - end - - DEV_FS --> DEV_WATCH - DEV_WATCH --> DEV_DELTA - DEV_DELTA --> DEV_API - - PROD_FS --> PROD_WATCH - PROD_WATCH --> PROD_DELTA - PROD_DELTA --> LB - LB --> PROD_API - PROD_API --> REDIS - PROD_API --> S3 -``` - -This architecture provides a comprehensive view of how the delta upload system integrates with the existing Context-Engine infrastructure while providing scalability, reliability, and efficient change detection. 
\ No newline at end of file diff --git a/delta_upload_design.md b/delta_upload_design.md deleted file mode 100644 index e1e0c12d..00000000 --- a/delta_upload_design.md +++ /dev/null @@ -1,651 +0,0 @@ -# Delta Upload Format and Protocol Design - -## Overview - -This document specifies a delta upload format and protocol for real-time code ingestion in Context-Engine, designed to efficiently transmit only changed files from a local watch client to a remote upload service. - -## 1. Delta Bundle Format Specification - -### 1.1 Bundle Structure - -A delta bundle is a tarball (`.tar.gz`) containing: - -``` -delta-bundle.tar.gz -├── manifest.json # Bundle metadata and file operations -├── files/ # Directory containing file content -│ ├── created/ # New files -│ ├── updated/ # Modified files -│ └── moved/ # Moved files (at destination) -└── metadata/ # File metadata and hashes - ├── hashes.json # Content hashes for all files - └── operations.json # Detailed operation metadata -``` - -### 1.2 Manifest Format (`manifest.json`) - -```json -{ - "version": "1.0", - "bundle_id": "uuid-v4", - "workspace_path": "/absolute/path/to/workspace", - "collection_name": "workspace-collection", - "created_at": "2025-01-26T01:55:00.000Z", - "sequence_number": 42, - "parent_sequence": 41, - "operations": { - "created": 5, - "updated": 3, - "deleted": 2, - "moved": 1 - }, - "total_files": 11, - "total_size_bytes": 1048576, - "compression": "gzip", - "encoding": "utf-8" -} -``` - -### 1.3 File Operations Format - -#### Created Files (`files/created/`) -- Path: `files/created/relative/path/to/file.ext` -- Content: Full file content -- Metadata: Stored in `metadata/operations.json` - -#### Updated Files (`files/updated/`) -- Path: `files/updated/relative/path/to/file.ext` -- Content: Full file content (simpler than diff-based approach) -- Metadata: Stored in `metadata/operations.json` - -#### Moved Files (`files/moved/`) -- Path: `files/moved/destination/path/to/file.ext` -- Content: Full file 
content at destination -- Metadata: Includes source path in `metadata/operations.json` - -#### Deleted Files -- No content in bundle -- Metadata only in `metadata/operations.json` - -### 1.4 Operations Metadata (`metadata/operations.json`) - -```json -{ - "operations": [ - { - "operation": "created", - "path": "src/new_file.py", - "relative_path": "src/new_file.py", - "absolute_path": "/workspace/src/new_file.py", - "size_bytes": 1024, - "content_hash": "sha1:da39a3ee5e6b4b0d3255bfef95601890afd80709", - "file_hash": "sha1:abc123...", - "modified_time": "2025-01-26T01:55:00.000Z", - "language": "python" - }, - { - "operation": "updated", - "path": "src/existing.py", - "relative_path": "src/existing.py", - "absolute_path": "/workspace/src/existing.py", - "size_bytes": 2048, - "content_hash": "sha1:new_hash_value", - "previous_hash": "sha1:old_hash_value", - "file_hash": "sha1:def456...", - "modified_time": "2025-01-26T01:55:00.000Z", - "language": "python" - }, - { - "operation": "moved", - "path": "src/new_location.py", - "relative_path": "src/new_location.py", - "absolute_path": "/workspace/src/new_location.py", - "source_path": "src/old_location.py", - "source_relative_path": "src/old_location.py", - "source_absolute_path": "/workspace/src/old_location.py", - "size_bytes": 1536, - "content_hash": "sha1:same_hash_as_source", - "file_hash": "sha1:ghi789...", - "modified_time": "2025-01-26T01:55:00.000Z", - "language": "python" - }, - { - "operation": "deleted", - "path": "src/removed.py", - "relative_path": "src/removed.py", - "absolute_path": "/workspace/src/removed.py", - "previous_hash": "sha1:deleted_file_hash", - "file_hash": null, - "modified_time": "2025-01-26T01:55:00.000Z", - "language": "python" - } - ] -} -``` - -### 1.5 Hash Storage (`metadata/hashes.json`) - -```json -{ - "workspace_path": "/workspace", - "updated_at": "2025-01-26T01:55:00.000Z", - "file_hashes": { - "src/new_file.py": "sha1:abc123...", - "src/existing.py": "sha1:def456...", - 
"src/new_location.py": "sha1:ghi789..." - } -} -``` - -## 2. HTTP API Contract - -### 2.1 Upload Endpoint - -``` -POST /api/v1/delta/upload -Content-Type: multipart/form-data -``` - -#### Request Parameters - -| Parameter | Type | Required | Description | -|-----------|------|----------|-------------| -| bundle | File | Yes | Delta bundle tarball | -| workspace_path | String | Yes | Absolute workspace path | -| collection_name | String | No | Override collection name | -| sequence_number | Integer | No | Expected sequence number | -| force | Boolean | No | Force upload even if sequence mismatch | - -#### Response Format - -```json -{ - "success": true, - "bundle_id": "uuid-v4", - "sequence_number": 42, - "processed_operations": { - "created": 5, - "updated": 3, - "deleted": 2, - "moved": 1, - "skipped": 0, - "failed": 0 - }, - "processing_time_ms": 1250, - "indexed_points": 156, - "collection_name": "workspace-collection", - "next_sequence": 43 -} -``` - -#### Error Response - -```json -{ - "success": false, - "error": { - "code": "SEQUENCE_MISMATCH", - "message": "Expected sequence 41, got 43", - "expected_sequence": 41, - "received_sequence": 43, - "retry_after": 5000 - } -} -``` - -### 2.2 Status Endpoint - -``` -GET /api/v1/delta/status?workspace_path=/workspace -``` - -#### Response - -```json -{ - "workspace_path": "/workspace", - "collection_name": "workspace-collection", - "last_sequence": 41, - "last_upload": "2025-01-26T01:50:00.000Z", - "pending_operations": 0, - "status": "ready", - "server_info": { - "version": "1.0.0", - "max_bundle_size_mb": 100, - "supported_formats": ["tar.gz"] - } -} -``` - -### 2.3 Recovery Endpoint - -``` -POST /api/v1/delta/recover -Content-Type: application/json -``` - -#### Request - -```json -{ - "workspace_path": "/workspace", - "from_sequence": 38, - "to_sequence": 42 -} -``` - -#### Response - -```json -{ - "success": true, - "recovered_bundles": [ - { - "sequence": 39, - "bundle_id": "uuid-39", - "operations": 
{"created": 2, "updated": 1} - } - ], - "next_sequence": 43 -} -``` - -## 3. Change Detection Algorithm - -### 3.1 Integration with Existing Hash Cache - -The delta system leverages the existing hash-based caching in [`workspace_state.py`](scripts/workspace_state.py:304-310): - -```python -def detect_file_changes(workspace_path: str, changed_paths: List[Path]) -> Dict[str, Any]: - """ - Detect what type of changes occurred for each file path. - - Returns: - { - "created": [Path], - "updated": [Path], - "deleted": [Path], - "moved": [(source: Path, dest: Path)], - "unchanged": [Path] - } - """ - changes = { - "created": [], - "updated": [], - "deleted": [], - "moved": [], - "unchanged": [] - } - - for path in changed_paths: - abs_path = str(path.resolve()) - cached_hash = get_cached_file_hash(workspace_path, abs_path) - - if not path.exists(): - # File was deleted - if cached_hash: - changes["deleted"].append(path) - else: - # File exists - calculate current hash - try: - with open(path, 'rb') as f: - content = f.read() - current_hash = hashlib.sha1(content).hexdigest() - - if not cached_hash: - # New file - changes["created"].append(path) - elif cached_hash != current_hash: - # Modified file - changes["updated"].append(path) - else: - # Unchanged (might be a move detection candidate) - changes["unchanged"].append(path) - - # Update cache - set_cached_file_hash(workspace_path, abs_path, current_hash) - except Exception: - # Skip files that can't be read - continue - - # Detect moves by looking for files with same content hash - # but different paths (requires additional tracking) - changes["moved"] = detect_moves(changes["created"], changes["deleted"]) - - return changes -``` - -### 3.2 Move Detection Algorithm - -```python -def detect_moves(created_files: List[Path], deleted_files: List[Path]) -> List[Tuple[Path, Path]]: - """ - Detect file moves by matching content hashes between created and deleted files. 
- """ - moves = [] - deleted_hashes = {} - - # Build hash map for deleted files - for deleted_path in deleted_files: - try: - with open(deleted_path, 'rb') as f: - content = f.read() - file_hash = hashlib.sha1(content).hexdigest() - deleted_hashes[file_hash] = deleted_path - except Exception: - continue - - # Match created files with deleted files by hash - for created_path in created_files: - try: - with open(created_path, 'rb') as f: - content = f.read() - file_hash = hashlib.sha1(content).hexdigest() - - if file_hash in deleted_hashes: - source_path = deleted_hashes[file_hash] - moves.append((source_path, created_path)) - # Remove from consideration - del deleted_hashes[file_hash] - except Exception: - continue - - return moves -``` - -### 3.3 Integration with ChangeQueue - -The delta system integrates with the existing [`ChangeQueue`](scripts/watch_index.py:45-66) debouncing pattern: - -```python -class DeltaChangeQueue(ChangeQueue): - """Extended ChangeQueue that creates delta bundles.""" - - def __init__(self, process_cb, workspace_path: str, upload_endpoint: str): - super().__init__(process_cb) - self.workspace_path = workspace_path - self.upload_endpoint = upload_endpoint - self.sequence_number = self._get_last_sequence() - - def _flush(self): - """Override to create delta bundle before processing.""" - with self._lock: - paths = list(self._paths) - self._paths.clear() - self._timer = None - - # Detect changes and create delta bundle - changes = detect_file_changes(self.workspace_path, paths) - if self._has_meaningful_changes(changes): - bundle = self._create_delta_bundle(changes) - self._upload_bundle(bundle) - - # Call original processing - self._process_cb(paths) - - def _has_meaningful_changes(self, changes: Dict[str, List]) -> bool: - """Check if changes warrant a delta upload.""" - total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged") - return total_changes > 0 -``` - -## 4. 
Error Handling and Recovery Strategy - -### 4.1 Retry Mechanism - -```python -class DeltaUploadClient: - def __init__(self, endpoint: str, max_retries: int = 3): - self.endpoint = endpoint - self.max_retries = max_retries - self.retry_delays = [1000, 2000, 5000] # ms - - def upload_bundle(self, bundle_path: str, metadata: Dict) -> bool: - for attempt in range(self.max_retries + 1): - try: - response = self._send_bundle(bundle_path, metadata) - if response["success"]: - return True - - # Handle specific error cases - if response["error"]["code"] == "SEQUENCE_MISMATCH": - return self._handle_sequence_mismatch(response, metadata) - - except Exception as e: - if attempt == self.max_retries: - self._log_failure(e, metadata) - return False - - # Wait before retry - if attempt < len(self.retry_delays): - time.sleep(self.retry_delays[attempt] / 1000) - - return False -``` - -### 4.2 Sequence Number Recovery - -```python -def _handle_sequence_mismatch(self, error_response: Dict, metadata: Dict) -> bool: - """Handle sequence number mismatch by recovering missing bundles.""" - expected_seq = error_response["error"]["expected_sequence"] - current_seq = metadata["sequence_number"] - - # Try to recover missing bundles - recovery_response = self._request_recovery( - metadata["workspace_path"], - from_sequence=expected_seq, - to_sequence=current_seq - 1 - ) - - if recovery_response["success"]: - # Apply recovered bundles locally - for bundle_info in recovery_response["recovered_bundles"]: - if not self._apply_recovered_bundle(bundle_info): - return False - - # Retry original upload - return self._send_bundle(metadata["bundle_path"], metadata)["success"] - - return False -``` - -### 4.3 Bundle Persistence - -```python -class BundlePersistence: - """Local persistence for delta bundles to enable recovery.""" - - def __init__(self, workspace_path: str): - self.workspace_path = workspace_path - self.bundle_dir = Path(workspace_path) / ".codebase" / "delta_bundles" - 
self.bundle_dir.mkdir(exist_ok=True) - - def save_bundle(self, bundle_path: str, metadata: Dict) -> str: - """Save bundle locally with metadata.""" - bundle_id = metadata["bundle_id"] - saved_path = self.bundle_dir / f"{bundle_id}.tar.gz" - metadata_path = self.bundle_dir / f"{bundle_id}.json" - - shutil.copy2(bundle_path, saved_path) - with open(metadata_path, 'w') as f: - json.dump(metadata, f, indent=2) - - return str(saved_path) - - def get_pending_bundles(self) -> List[Dict]: - """Get bundles that haven't been acknowledged by server.""" - pending = [] - for metadata_file in self.bundle_dir.glob("*.json"): - try: - with open(metadata_file) as f: - metadata = json.load(f) - if not metadata.get("acknowledged", False): - pending.append(metadata) - except Exception: - continue - return pending -``` - -## 5. Integration Points with Existing Code - -### 5.1 Integration with watch_index.py - -```python -# Modified IndexHandler to support delta uploads -class DeltaIndexHandler(IndexHandler): - def __init__(self, root: Path, queue: ChangeQueue, client: QdrantClient, - collection: str, delta_client: DeltaUploadClient): - super().__init__(root, queue, client, collection) - self.delta_client = delta_client - - def _maybe_enqueue(self, src_path: str): - """Override to add delta queue processing.""" - super()._maybe_enqueue(src_path) - # Delta processing happens in the extended ChangeQueue - - def on_deleted(self, event): - """Override to handle deletions in delta system.""" - super().on_deleted(event) - # Delta queue will handle the deletion processing -``` - -### 5.2 Integration with ingest_code.py - -```python -# Extend ingest_code.py to process delta bundles -def process_delta_bundle(bundle_path: str, workspace_path: str, - collection: str) -> Dict[str, Any]: - """Process a delta bundle and update Qdrant collection.""" - - # Extract bundle - with tempfile.TemporaryDirectory() as temp_dir: - extract_path = Path(temp_dir) - with tarfile.open(bundle_path, 'r:gz') as tar: - 
tar.extractall(extract_path) - - # Read manifest - with open(extract_path / "manifest.json") as f: - manifest = json.load(f) - - # Read operations - with open(extract_path / "metadata" / "operations.json") as f: - operations = json.load(f)["operations"] - - # Process each operation - results = {"created": 0, "updated": 0, "deleted": 0, "moved": 0, "failed": 0} - - for op in operations: - try: - if op["operation"] == "created": - _process_created_file(extract_path, op, collection, results) - elif op["operation"] == "updated": - _process_updated_file(extract_path, op, collection, results) - elif op["operation"] == "deleted": - _process_deleted_file(op, collection, results) - elif op["operation"] == "moved": - _process_moved_file(extract_path, op, collection, results) - except Exception as e: - results["failed"] += 1 - print(f"Failed to process {op['operation']} {op['path']}: {e}") - - # Update workspace state - _update_workspace_state_from_delta(workspace_path, manifest, results) - - return results -``` - -### 5.3 Integration with workspace_state.py - -```python -# Extend workspace_state.py for delta tracking -def get_last_delta_sequence(workspace_path: str) -> int: - """Get the last processed delta sequence number.""" - state = get_workspace_state(workspace_path) - return state.get("delta_state", {}).get("last_sequence", 0) - -def update_delta_state(workspace_path: str, sequence: int, bundle_id: str) -> None: - """Update delta processing state.""" - state = get_workspace_state(workspace_path) - delta_state = state.get("delta_state", {}) - delta_state.update({ - "last_sequence": sequence, - "last_bundle_id": bundle_id, - "last_processed": datetime.now().isoformat() - }) - update_workspace_state(workspace_path, {"delta_state": delta_state}) -``` - -## 6. Implementation Roadmap - -### Phase 1: Core Delta Format and API (Week 1) -1. Implement delta bundle creation and parsing -2. Create HTTP API endpoints for upload, status, and recovery -3. 
Implement basic error handling and response formats -4. Add unit tests for bundle format validation - -### Phase 2: Change Detection Integration (Week 2) -1. Integrate with existing hash cache system -2. Implement move detection algorithm -3. Extend ChangeQueue for delta bundle creation -4. Add integration tests with watch_index.py - -### Phase 3: Error Handling and Recovery (Week 3) -1. Implement retry mechanism with exponential backoff -2. Add sequence number recovery -3. Implement bundle persistence -4. Add comprehensive error logging and monitoring - -### Phase 4: Production Integration (Week 4) -1. Integrate with ingest_code.py for bundle processing -2. Extend workspace_state.py for delta tracking -3. Add performance optimization and batching -4. Implement monitoring and alerting -5. Add end-to-end integration tests - -### Phase 5: Performance and Scaling (Week 5) -1. Optimize bundle compression and size -2. Implement parallel processing for large bundles -3. Add bandwidth optimization for remote uploads -4. Performance testing and tuning - -## 7. Configuration and Environment Variables - -```bash -# Delta upload configuration -DELTA_UPLOAD_ENABLED=true -DELTA_UPLOAD_ENDPOINT=http://delta-server:8002/api/v1/delta -DELTA_MAX_BUNDLE_SIZE_MB=100 -DELTA_BATCH_SIZE_FILES=50 -DELTA_DEBOUNCE_SECS=2.0 - -# Retry and recovery -DELTA_MAX_RETRIES=3 -DELTA_RETRY_DELAYS_MS=1000,2000,5000 -DELTA_PERSIST_BUNDLES=true -DELTA_BUNDLE_RETENTION_DAYS=7 - -# Performance tuning -DELTA_COMPRESSION_LEVEL=6 -DELTA_PARALLEL_UPLOADS=2 -DELTA_CHUNK_SIZE_BYTES=1048576 -``` - -## 8. Security Considerations - -1. **Authentication**: Add API key or token-based authentication -2. **Authorization**: Validate workspace access permissions -3. **Input Validation**: Validate bundle format and file paths -4. **Rate Limiting**: Implement upload rate limits per workspace -5. **Audit Logging**: Log all delta operations for compliance - -## 9. Monitoring and Observability - -1. 
**Metrics**: Track bundle size, processing time, success rates -2. **Logging**: Structured logging for all delta operations -3. **Health Checks**: Endpoint health monitoring -4. **Alerting**: Alert on failed uploads or processing errors -5. **Dashboards**: Visual monitoring of delta system performance - -This design provides a comprehensive foundation for implementing delta uploads in Context-Engine while leveraging existing infrastructure and maintaining compatibility with current file processing workflows. \ No newline at end of file diff --git a/deploy/kubernetes/GIT_SYNC_SETUP.md b/deploy/kubernetes/GIT_SYNC_SETUP.md deleted file mode 100644 index 17ccb05f..00000000 --- a/deploy/kubernetes/GIT_SYNC_SETUP.md +++ /dev/null @@ -1,356 +0,0 @@ -# Git Sync Source Code Management for Context-Engine - -This guide explains how to set up and configure Git-based source code synchronization for Context-Engine in Kubernetes deployments. - -## Overview - -The Git sync solution uses **Git sync sidecar containers** that automatically pull source code from a Git repository into the application pods. This solves the critical issue of source code distribution in remote Kubernetes clusters. - -### Architecture - -``` -┌─────────────────────────────────────────────────────────────┐ -│ Kubernetes Pod │ -│ ┌─────────────┐ ┌───────────────┐ ┌───────────────────┐ │ -│ │ Main App │ │ Git Sync │ │ Shared Volume │ │ -│ │ Container │ │ Sidecar │ │ (emptyDir) │ │ -│ │ │ │ │ │ │ │ -│ │ /work │←─→│ /git → /work │←─→│ Source Code │ │ -│ └─────────────┘ └───────────────┘ └───────────────────┘ │ -│ ↕ │ -│ Git Repository (GitHub/GitLab/Bitbucket) │ -└─────────────────────────────────────────────────────────────┘ -``` - -## Quick Start - -### 1. 
Deploy with Git Sync - -```bash -# Public repository -./deploy-with-source.sh git https://github.com/your-org/your-repo.git main - -# Private repository with HTTPS -./deploy-with-source.sh git https://github.com/your-org/your-repo.git main - -# Private repository with SSH -./deploy-with-source.sh git git@github.com:your-org/your-repo.git main -``` - -### 2. Deploy with Local Mode (Alternative) - -```bash -# Use this if source code is already on cluster nodes -./deploy-with-source.sh local -``` - -## Configuration Options - -### ConfigMap Settings - -Update `deploy/kubernetes/configmap.yaml` with your Git configuration: - -```yaml -# Source Code Configuration -SOURCE_CODE_MODE: "git" # Options: "local" or "git" - -# Git repository configuration (only used when SOURCE_CODE_MODE=git) -GIT_REPO_URL: "https://github.com/your-org/your-repo.git" -GIT_BRANCH: "main" -GIT_SYNC_PERIOD: "60" # Sync every 60 seconds -GIT_USERNAME: "" # For private repos (optional) -GIT_PASSWORD: "" # For private repos (optional) -``` - -### Git Sync Sidecar Configuration - -The Git sync sidecar uses the following environment variables: - -| Variable | Description | Default | -|----------|-------------|---------| -| `GITSYNC_REPO` | Git repository URL | From ConfigMap | -| `GITSYNC_BRANCH` | Git branch to checkout | From ConfigMap | -| `GITSYNC_ROOT` | Directory to clone into | `/git` | -| `GITSYNC_SYNC_PERIOD` | Sync frequency in seconds | From ConfigMap | -| `GITSYNC_ONE_TIME` | Sync once and exit | `false` | -| `GITSYNC_LINK` | Create symlink to latest commit | `latest` | -| `GITSYNC_MAX_FAILURES` | Max sync failures before giving up | `5` | -| `GITSYNC_USERNAME` | Username for HTTP basic auth | From ConfigMap | -| `GITSYNC_PASSWORD` | Password/token for HTTP basic auth | From ConfigMap | - -## Authentication Setup - -### Public Repositories - -No additional setup required. The Git sync sidecar will clone public repositories automatically. - -### Private Repositories (HTTPS) - -1. 
**Using Personal Access Token:** - -```bash -# Create ConfigMap with credentials -kubectl patch configmap context-engine-config -n context-engine --patch '{"data":{"GIT_USERNAME":"your-username","GIT_PASSWORD":"your-personal-access-token"}}' -``` - -2. **Alternative: Create Secret:** - -```bash -kubectl create secret generic git-https-credentials \ - --from-literal=username=your-username \ - --from-literal=password=your-personal-access-token \ - -n context-engine -``` - -### Private Repositories (SSH) - -1. **Generate SSH Key:** - -```bash -ssh-keygen -t rsa -b 4096 -C "git-sync@context-engine" -f ~/.ssh/context_engine_git -``` - -2. **Add SSH Key to Git Repository:** - - - Copy the public key (`~/.ssh/context_engine_git.pub`) - - Add it as a deploy key in your Git repository settings - -3. **Create Kubernetes Secret:** - -```bash -kubectl create secret generic git-ssh-key \ - --from-file=ssh-private-key=~/.ssh/context_engine_git \ - -n context-engine -``` - -4. **Update Git Sync Configuration:** - -The manifests are already configured to use SSH when the `git-ssh-key` secret exists. - -## Deployment Options - -### Option 1: Automated Deployment Script - -```bash -# Deploy with automated script -cd deploy/kubernetes -./deploy-with-source.sh git https://github.com/your-org/your-repo.git main -``` - -### Option 2: Manual Deployment with Kustomize - -1. **Update ConfigMap:** - -```yaml -# kustomization.yaml patches -apiVersion: v1 -kind: ConfigMap -metadata: - name: context-engine-config -data: - SOURCE_CODE_MODE: "git" - GIT_REPO_URL: "https://github.com/your-org/your-repo.git" - GIT_BRANCH: "main" -``` - -2. 
**Apply Git-enabled manifests:** - -```bash -kubectl apply -f mcp-memory-git.yaml -kubectl apply -f mcp-indexer-git.yaml -``` - -### Option 3: Switching Between Modes - -To switch from local to Git mode (or vice versa): - -```bash -# Update ConfigMap -kubectl patch configmap context-engine-config -n context-engine --type merge --patch '{"data":{"SOURCE_CODE_MODE":"git"}}' - -# Redeploy affected services -kubectl rollout restart deployment/mcp-memory -n context-engine -kubectl rollout restart deployment/mcp-indexer -n context-engine -``` - -## Monitoring and Troubleshooting - -### Check Git Sync Status - -```bash -# Check Git sync logs for indexer -kubectl logs deployment/mcp-indexer -c git-sync -n context-engine - -# Check Git sync logs for memory server -kubectl logs deployment/mcp-memory -c git-sync -n context-engine -``` - -### Common Issues - -#### 1. Authentication Failures - -**Error:** `authentication failed` - -**Solution:** -- Verify SSH key is correctly configured -- Check that the deploy key has read access -- Ensure the SSH key format is correct - -#### 2. Network Connectivity - -**Error:** `unable to access '...'` - -**Solution:** -- Check cluster network policies -- Verify firewall rules allow Git access -- Test connectivity from a pod in the cluster - -#### 3. Repository Not Found - -**Error:** `repository not found` - -**Solution:** -- Verify the repository URL is correct -- Check that the repository exists -- Ensure the Git branch exists - -#### 4. 
Sync Loop Issues - -**Error:** Continuous sync failures - -**Solution:** -- Check `GITSYNC_MAX_FAILURES` setting -- Examine Git sync logs for specific errors -- Verify repository permissions - -### Health Checks - -The Git sync sidecar doesn't have built-in health endpoints, but you can monitor: - -```bash -# Check if source code is present -kubectl exec deployment/mcp-indexer -c mcp-indexer -n context-engine -- ls -la /work - -# Check Git sync status -kubectl exec deployment/mcp-indexer -c git-sync -n context-engine -- cat /git/.git-sync -``` - -## Best Practices - -### 1. Repository Management - -- **Use specific branches:** Pin to specific branches for production -- **Tag releases:** Use Git tags for release deployments -- **Clean repository:** Avoid including large binary files in the repository - -### 2. Security - -- **Use read-only deploy keys:** Don't use SSH keys with write access -- **Rotate credentials:** Regularly rotate personal access tokens -- **Network policies:** Restrict pod network access as needed - -### 3. Performance - -- **Optimize sync frequency:** Adjust `GIT_SYNC_PERIOD` based on update frequency -- **Repository size:** Keep repository size reasonable for faster clones -- **Shallow clones:** Consider using `--depth 1` for large repositories - -### 4. 
High Availability - -- **Multiple replicas:** Git sync works with multiple pod replicas -- **Regional repositories:** Use Git mirrors for global deployments -- **Fallback strategies:** Consider local mode as fallback - -## Advanced Configuration - -### Custom Git Sync Options - -You can customize the Git sync sidecar by editing the manifests: - -```yaml -env: -- name: GITSYNC_DEPTH - value: "1" # Shallow clone for faster sync -- name: GITSYNC_GARBAGE_COLLECTION - value: "true" # Clean up old commits -- name: GITSYNC_ADD_USER - value: "true" # Set .gitconfig user info -``` - -### Webhook Integration - -For instant updates, consider using webhooks with a custom controller: - -```yaml -# This would require a custom webhook receiver -apiVersion: v1 -kind: Service -metadata: - name: git-webhook-receiver -spec: - selector: - app: git-webhook-receiver - ports: - - port: 8080 - targetPort: 8080 -``` - -### Multi-Repository Setup - -For complex projects requiring multiple repositories: - -```yaml -# Add multiple Git sync sidecars -- name: git-sync-main - env: - - name: GITSYNC_REPO - value: "https://github.com/your-org/main-repo.git" - volumeMounts: - - name: main-volume - mountPath: /git-main - -- name: git-sync-config - env: - - name: GITSYNC_REPO - value: "https://github.com/your-org/config-repo.git" - volumeMounts: - - name: config-volume - mountPath: /git-config -``` - -## Migration Guide - -### From Local Mode to Git Mode - -1. **Backup current data:** -```bash -kubectl exec deployment/mcp-indexer -c mcp-indexer -n context-engine -- tar czf /tmp/backup.tar.gz -C /work . -``` - -2. **Update configuration:** -```bash -kubectl patch configmap context-engine-config -n context-engine --type merge --patch '{"data":{"SOURCE_CODE_MODE":"git","GIT_REPO_URL":"https://github.com/your-org/your-repo.git"}}' -``` - -3. 
**Restart deployments:** -```bash -kubectl rollout restart deployment/mcp-indexer -n context-engine -kubectl rollout restart deployment/mcp-memory -n context-engine -``` - -4. **Verify sync:** -```bash -kubectl logs deployment/mcp-indexer -c git-sync -n context-engine -f -``` - -## Support - -For issues with Git sync setup: - -1. Check the [Git sync documentation](https://github.com/kubernetes/git-sync) -2. Review Kubernetes pod logs -3. Verify network connectivity to Git repository -4. Check authentication configuration -5. Validate ConfigMap settings - -Remember that Git sync provides **automatic source code distribution** for remote Kubernetes deployments, eliminating the need for manual code synchronization across cluster nodes. \ No newline at end of file diff --git a/deploy/kubernetes/README.md b/deploy/kubernetes/README.md index e337207b..0bee9375 100644 --- a/deploy/kubernetes/README.md +++ b/deploy/kubernetes/README.md @@ -44,23 +44,11 @@ For local development or direct access, services are exposed via NodePort: docker build -f Dockerfile.indexer -t context-engine-indexer-service:latest . ``` -4. **Source Code Access** (choose one): - - **Local Mode**: Source code pre-distributed to all cluster nodes at `/tmp/context-engine-work` - - **Git Mode**: Git repository accessible from cluster with proper authentication configured +4. **Source Code Access**: Source code should be pre-distributed to all cluster nodes at `/tmp/context-engine-work` ## Quick Start -### Option 1: Automated Deployment with Source Code Management - -```bash -# Deploy with Git-based source code synchronization (recommended) -./deploy-with-source.sh git https://github.com/your-org/your-repo.git main - -# Or deploy with local source code (requires pre-distribution) -./deploy-with-source.sh local -``` - -### Option 2: Manual Deployment +### Manual Deployment ### 1. 
Deploy Core Services @@ -139,38 +127,7 @@ All configuration is managed through the `context-engine-config` ConfigMap in `c ## Source Code Management -### Local Mode vs Git Mode - -The deployment supports two source code access strategies: - -#### **Local Mode** (Default) -- Uses hostPath volumes to access source code on cluster nodes -- **Pros**: Simple, no external dependencies -- **Cons**: Requires manual source code distribution to all nodes -- **Use Case**: Single-node clusters, development environments - -#### **Git Mode** (Recommended for Production) -- Uses Git sync sidecars to automatically pull source code from repositories -- **Pros**: Automatic source code synchronization, CI/CD integration -- **Cons**: Requires Git repository access from cluster -- **Use Case**: Multi-node clusters, production deployments - -### Git Sync Setup - -For Git mode setup, see [GIT_SYNC_SETUP.md](./GIT_SYNC_SETUP.md) for detailed instructions. - -**Quick Git Mode Setup:** - -```bash -# Public repository -./deploy-with-source.sh git https://github.com/your-org/your-repo.git main - -# Private repository (requires SSH key setup) -kubectl create secret generic git-ssh-key \ - --from-file=ssh-private-key=~/.ssh/id_rsa \ - -n context-engine -./deploy-with-source.sh git git@github.com:your-org/your-repo.git main -``` +The deployment uses hostPath volumes to access source code on cluster nodes. Source code must be pre-distributed to all cluster nodes at the configured paths. 
## Development Workflow diff --git a/deploy/kubernetes/configmap.yaml b/deploy/kubernetes/configmap.yaml index ccc33c5a..5a58e87c 100644 --- a/deploy/kubernetes/configmap.yaml +++ b/deploy/kubernetes/configmap.yaml @@ -127,14 +127,4 @@ data: FASTMCP_SERVER_NAME: "qdrant-mcp" # MCP server identifier HOST_INDEX_PATH: "/work" # Work directory mounting path - # Source Code Configuration - # Set to "git" to use Git sync sidecar, or "local" to use hostPath (default) - SOURCE_CODE_MODE: "local" - - # Git repository configuration (only used when SOURCE_CODE_MODE=git) - GIT_REPO_URL: "" # e.g., "https://github.com/your-org/your-repo.git" - GIT_BRANCH: "main" - GIT_SYNC_PERIOD: "60" # Sync every 60 seconds - GIT_SSH_KEY: "" # SSH private key for private repos (optional) - GIT_USERNAME: "" # Username for private repos (optional) - GIT_PASSWORD: "" # Password/token for private repos (optional) + \ No newline at end of file diff --git a/deploy/kubernetes/deploy-with-source.sh b/deploy/kubernetes/deploy-with-source.sh deleted file mode 100755 index 3e38bfe6..00000000 --- a/deploy/kubernetes/deploy-with-source.sh +++ /dev/null @@ -1,310 +0,0 @@ -#!/bin/bash - -# Context-Engine Kubernetes Deployment with Source Code Management -# Supports both local (hostPath) and Git-based source code access - -set -e - -# Configuration -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -NAMESPACE="context-engine" -SOURCE_MODE="${1:-local}" # Options: local, git -GIT_REPO_URL="${2:-}" -GIT_BRANCH="${3:-main}" - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color - -# Helper functions -log() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -warn() { - echo -e "${YELLOW}[WARN]${NC} $1" -} - -error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -# Print usage -usage() { - cat << EOF -Usage: $0 [git-repo-url] [git-branch] - -Source Modes: - local - Use hostPath volumes (source 
code must be pre-distributed to nodes) - git - Use Git sync sidecars (automatic source code synchronization) - -Examples: - # Local deployment (requires source code on nodes) - $0 local - - # Git-based deployment - $0 git https://github.com/your-org/your-repo.git main - - # Git-based deployment with private repo (requires SSH key setup) - $0 git git@github.com:your-org/your-repo.git main - -Environment Variables: - REGISTRY - Docker registry prefix (default: context-engine) - TAG - Docker image tag (default: latest) - -Requirements for Git Mode: - - Git repository must be accessible from the cluster - - For private repos: create git-ssh-key secret or configure credentials - - Sufficient network access to clone the repository - -Requirements for Local Mode: - - Source code must exist at /tmp/context-engine-work on ALL nodes - - Node access required for code updates -EOF -} - -# Validate input -validate_input() { - if [[ "$SOURCE_MODE" != "local" && "$SOURCE_MODE" != "git" ]]; then - error "Invalid source mode: $SOURCE_MODE. Must be 'local' or 'git'." - usage - exit 1 - fi - - if [[ "$SOURCE_MODE" == "git" && -z "$GIT_REPO_URL" ]]; then - error "Git repository URL is required when using git mode." - usage - exit 1 - fi -} - -# Check prerequisites -check_prerequisites() { - log "Checking prerequisites..." - - # Check kubectl - if ! command -v kubectl &> /dev/null; then - error "kubectl is not installed or not in PATH" - exit 1 - fi - - # Check cluster access - if ! kubectl cluster-info &> /dev/null; then - error "Cannot connect to Kubernetes cluster" - exit 1 - fi - - # Check if namespace exists - if ! kubectl get namespace "$NAMESPACE" &> /dev/null; then - log "Creating namespace: $NAMESPACE" - kubectl create namespace "$NAMESPACE" - fi - - success "Prerequisites check passed" -} - -# Update ConfigMap with source code configuration -update_configmap() { - log "Updating ConfigMap with source code configuration..." 
- - # Create a temporary configmap with updated values - kubectl create configmap context-engine-config-temp \ - --from-env-file <(cat << EOF -SOURCE_CODE_MODE=$SOURCE_MODE -GIT_REPO_URL=$GIT_REPO_URL -GIT_BRANCH=$GIT_BRANCH -GIT_SYNC_PERIOD=60 -GIT_USERNAME="" -GIT_PASSWORD="" -EOF -) \ - --namespace "$NAMESPACE" \ - --dry-run=client -o yaml | kubectl apply -f - - - # Merge with existing configmap (preserving other settings) - log "Merging configuration with existing ConfigMap..." - # Note: This is a simplified approach. In production, you might want to use - # kustomize or a more sophisticated config management tool -} - -# Deploy based on source mode -deploy_services() { - log "Deploying Context-Engine services in $SOURCE_MODE mode..." - - # Deploy core infrastructure (always needed) - log "Deploying core infrastructure..." - kubectl apply -f "$SCRIPT_DIR/qdrant.yaml" - - # Wait for Qdrant to be ready - log "Waiting for Qdrant to be ready..." - kubectl wait --for=condition=ready pod -l app=qdrant -n "$NAMESPACE" --timeout=300s - - if [[ "$SOURCE_MODE" == "local" ]]; then - deploy_local_mode - else - deploy_git_mode - fi - - # Deploy remaining services - log "Deploying remaining services..." - kubectl apply -f "$SCRIPT_DIR/mcp-http.yaml" - kubectl apply -f "$SCRIPT_DIR/indexer-services.yaml" - - # Deploy optional services - if [[ -f "$SCRIPT_DIR/llamacpp.yaml" ]]; then - log "Deploying optional Llama.cpp service..." - kubectl apply -f "$SCRIPT_DIR/llamacpp.yaml" - fi -} - -# Deploy in local mode (using hostPath) -deploy_local_mode() { - log "Deploying in LOCAL mode (hostPath volumes)..." 
- - # Apply hostPath-based deployments - kubectl apply -f "$SCRIPT_DIR/mcp-memory.yaml" - kubectl apply -f "$SCRIPT_DIR/mcp-indexer.yaml" - - warn "⚠️ LOCAL MODE REQUIREMENTS:" - warn " - Source code must exist at /tmp/context-engine-work on ALL cluster nodes" - warn " - Code updates require manual synchronization across nodes" - warn " - This mode is suitable for single-node clusters or development" - - read -p "Do you want to continue? (y/N): " -n 1 -r - echo - if [[ ! $REPLY =~ ^[Yy]$ ]]; then - log "Deployment cancelled" - exit 1 - fi -} - -# Deploy in Git mode (using Git sync sidecars) -deploy_git_mode() { - log "Deploying in GIT mode (automatic source code synchronization)..." - - # Setup Git authentication if needed - setup_git_auth - - # Apply Git-enabled deployments - kubectl apply -f "$SCRIPT_DIR/mcp-memory-git.yaml" - kubectl apply -f "$SCRIPT_DIR/mcp-indexer-git.yaml" - - success "✅ Git sync enabled - source code will be automatically synchronized" - log " Repository: $GIT_REPO_URL" - log " Branch: $GIT_BRANCH" - log " Sync Period: 60 seconds" -} - -# Setup Git authentication -setup_git_auth() { - # Check if this is a private repository requiring authentication - if [[ "$GIT_REPO_URL" =~ ^git@ ]] || [[ "$GIT_REPO_URL" =~ \.git$ && ! "$GIT_REPO_URL" =~ ^https://github\.com/[^/]+/[^/]+\.git$ ]]; then - warn "Private repository detected. Please ensure authentication is configured:" - warn " 1. For SSH: Create git-ssh-key secret with your SSH private key" - warn " 2. For HTTPS: Set GIT_USERNAME and GIT_PASSWORD in ConfigMap" - - # Check if SSH secret exists - if ! 
kubectl get secret git-ssh-key -n "$NAMESPACE" &> /dev/null; then - log "Creating placeholder SSH secret (please update with your actual SSH key)" - kubectl create secret generic git-ssh-key \ - --from-literal=ssh-private-key="" \ - --namespace "$NAMESPACE" \ - --dry-run=client -o yaml | kubectl apply -f - - warn "⚠️ Please update the git-ssh-key secret with your actual SSH private key:" - warn " kubectl delete secret git-ssh-key -n $NAMESPACE" - warn " kubectl create secret generic git-ssh-key --from-file=ssh-private-key=~/.ssh/id_rsa -n $NAMESPACE" - fi - fi -} - -# Wait for deployment to be ready -wait_for_ready() { - log "Waiting for all deployments to be ready..." - - # List of deployments to wait for - local deployments=("mcp-memory" "mcp-indexer" "mcp-memory-http" "mcp-indexer-http") - - for deployment in "${deployments[@]}"; do - if kubectl get deployment "$deployment" -n "$NAMESPACE" &> /dev/null; then - log "Waiting for $deployment to be ready..." - kubectl wait --for=condition=available deployment/"$deployment" -n "$NAMESPACE" --timeout=300s - fi - done - - success "All deployments are ready" -} - -# Show deployment status and access information -show_status() { - log "Deployment completed successfully!" 
- echo - echo "=== Context-Engine Status ===" - echo "Namespace: $NAMESPACE" - echo "Source Mode: $SOURCE_MODE" - if [[ "$SOURCE_MODE" == "git" ]]; then - echo "Git Repository: $GIT_REPO_URL" - echo "Git Branch: $GIT_BRANCH" - fi - echo - echo "=== Services ===" - kubectl get services -n "$NAMESPACE" - echo - echo "=== Pods ===" - kubectl get pods -n "$NAMESPACE" - echo - echo "=== Access Information ===" - - # Get service access information - local cluster_ip=$(kubectl get svc qdrant -n "$NAMESPACE" -o jsonpath='{.spec.clusterIP}' 2>/dev/null || echo "N/A") - echo "Qdrant: $cluster_ip:6333" - - if kubectl get svc mcp-memory -n "$NAMESPACE" &> /dev/null; then - local memory_nodeport=$(kubectl get svc mcp-memory -n "$NAMESPACE" -o jsonpath='{.spec.ports[?(@.name=="sse")].nodePort}' 2>/dev/null || echo "N/A") - echo "MCP Memory (SSE): NodePort $memory_nodeport" - fi - - if kubectl get svc mcp-indexer -n "$NAMESPACE" &> /dev/null; then - local indexer_nodeport=$(kubectl get svc mcp-indexer -n "$NAMESPACE" -o jsonpath='{.spec.ports[?(@.name=="sse")].nodePort}' 2>/dev/null || echo "N/A") - echo "MCP Indexer (SSE): NodePort $indexer_nodeport" - fi - - echo - echo "=== Next Steps ===" - echo "1. Test the deployment:" - echo " curl http://:30800/sse # MCP Memory" - echo " curl http://:30802/sse # MCP Indexer" - echo "2. Call indexing tool:" - echo " curl -X POST http://:30802/sse -H 'Content-Type: application/json' \\" - echo " -d '{\"jsonrpc\": \"2.0\", \"id\": 1, \"method\": \"tools/call\", \"params\": {\"name\": \"qdrant_index_root\", \"arguments\": {}}}'" - - if [[ "$SOURCE_MODE" == "git" ]]; then - echo "3. 
Monitor Git sync:" - echo " kubectl logs deployment/mcp-indexer -c git-sync -n $NAMESPACE" - echo " kubectl logs deployment/mcp-memory -c git-sync -n $NAMESPACE" - fi -} - -# Main execution -main() { - if [[ "$1" == "-h" || "$1" == "--help" ]]; then - usage - exit 0 - fi - - validate_input - check_prerequisites - update_configmap - deploy_services - wait_for_ready - show_status -} - -# Run main function with all arguments -main "$@" \ No newline at end of file diff --git a/deploy/kubernetes/indexer-services.yaml b/deploy/kubernetes/indexer-services.yaml index e0db7616..283f43dd 100644 --- a/deploy/kubernetes/indexer-services.yaml +++ b/deploy/kubernetes/indexer-services.yaml @@ -38,6 +38,7 @@ spec: configMapKeyRef: name: context-engine-config key: EMBEDDING_MODEL + # CODEBASE_ROOT not needed when mounting at /work/.codebase resources: requests: memory: "1Gi" @@ -49,17 +50,15 @@ spec: - name: work-volume mountPath: /work readOnly: true - - name: codebase-volume + - name: metadata-volume mountPath: /work/.codebase volumes: - name: work-volume - hostPath: - path: /tmp/context-engine-work # Adjust for your environment - type: DirectoryOrCreate - - name: codebase-volume - hostPath: - path: /tmp/context-engine-work/.codebase # Adjust for your environment - type: DirectoryOrCreate + persistentVolumeClaim: + claimName: code-repos-pvc + - name: metadata-volume + persistentVolumeClaim: + claimName: code-metadata-pvc --- # Watcher Service (File change monitoring and reindexing) @@ -130,6 +129,7 @@ spec: configMapKeyRef: name: context-engine-config key: WATCH_DEBOUNCE_SECS + # CODEBASE_ROOT not needed when mounting at /work/.codebase resources: requests: memory: "512Mi" @@ -141,17 +141,15 @@ spec: - name: work-volume mountPath: /work readOnly: true - - name: codebase-volume + - name: metadata-volume mountPath: /work/.codebase volumes: - name: work-volume - hostPath: - path: /tmp/context-engine-work # Adjust for your environment - type: DirectoryOrCreate - - name: 
codebase-volume - hostPath: - path: /tmp/context-engine-work/.codebase # Adjust for your environment - type: DirectoryOrCreate + persistentVolumeClaim: + claimName: code-repos-pvc + - name: metadata-volume + persistentVolumeClaim: + claimName: code-metadata-pvc --- # Index Initialization Job @@ -188,6 +186,7 @@ spec: configMapKeyRef: name: context-engine-config key: COLLECTION_NAME + # CODEBASE_ROOT not needed when mounting at /work/.codebase resources: requests: memory: "512Mi" @@ -199,14 +198,12 @@ spec: - name: work-volume mountPath: /work readOnly: true - - name: codebase-volume + - name: metadata-volume mountPath: /work/.codebase volumes: - name: work-volume - hostPath: - path: /tmp/context-engine-work # Adjust for your environment - type: DirectoryOrCreate - - name: codebase-volume - hostPath: - path: /tmp/context-engine-work/.codebase # Adjust for your environment - type: DirectoryOrCreate \ No newline at end of file + persistentVolumeClaim: + claimName: code-repos-pvc + - name: metadata-volume + persistentVolumeClaim: + claimName: code-metadata-pvc \ No newline at end of file diff --git a/deploy/kubernetes/mcp-indexer-git.yaml b/deploy/kubernetes/mcp-indexer-git.yaml deleted file mode 100644 index a6c13b8c..00000000 --- a/deploy/kubernetes/mcp-indexer-git.yaml +++ /dev/null @@ -1,216 +0,0 @@ ---- -# MCP Indexer Server with Git Sync Sidecar -# This demonstrates both local (hostPath) and Git-based source code access -apiVersion: apps/v1 -kind: Deployment -metadata: - name: mcp-indexer - namespace: context-engine - labels: - app: context-engine - component: mcp-indexer -spec: - replicas: 1 - selector: - matchLabels: - app: context-engine - component: mcp-indexer - template: - metadata: - labels: - app: context-engine - component: mcp-indexer - spec: - # Use init container to determine source code mode - initContainers: - - name: source-mode-check - image: busybox:1.36 - command: ["sh", "-c", "echo 'Source Code Mode: $SOURCE_CODE_MODE'"] - env: - - name: 
SOURCE_CODE_MODE - valueFrom: - configMapKeyRef: - name: context-engine-config - key: SOURCE_CODE_MODE - - containers: - # Main MCP Indexer Server Container - - name: mcp-indexer - image: context-engine-indexer:latest - imagePullPolicy: IfNotPresent - command: ["python", "-m", "mcp.server.fastmcp"] - args: ["--server-name", "context-engine-indexer", "--host", "0.0.0.0", "--port", "8001", "--transport", "sse", "/app/scripts/indexer_server.py"] - ports: - - name: sse - containerPort: 8001 - protocol: TCP - - name: health - containerPort: 18001 - protocol: TCP - env: - - name: QDRANT_URL - valueFrom: - configMapKeyRef: - name: context-engine-config - key: QDRANT_URL - - name: FASTMCP_HOST - valueFrom: - configMapKeyRef: - name: context-engine-config - key: FASTMCP_HOST - - name: FASTMCP_INDEXER_PORT - valueFrom: - configMapKeyRef: - name: context-engine-config - key: FASTMCP_INDEXER_PORT - - name: FASTMCP_HEALTH_PORT - value: "18001" - - name: SOURCE_CODE_MODE - valueFrom: - configMapKeyRef: - name: context-engine-config - key: SOURCE_CODE_MODE - resources: - requests: - memory: "512Mi" - cpu: "250m" - limits: - memory: "2Gi" - cpu: "1000m" - volumeMounts: - - name: work-volume - mountPath: /work - - name: codebase-volume - mountPath: /work/.codebase - livenessProbe: - httpGet: - path: /readyz - port: health - initialDelaySeconds: 30 - periodSeconds: 10 - readinessProbe: - httpGet: - path: /readyz - port: health - initialDelaySeconds: 10 - periodSeconds: 5 - - # Git Sync Sidecar Container (only active when SOURCE_CODE_MODE=git) - - name: git-sync - image: k8s.gcr.io/git-sync:v4.1.0 - imagePullPolicy: IfNotPresent - env: - - name: GITSYNC_REPO - valueFrom: - configMapKeyRef: - name: context-engine-config - key: GIT_REPO_URL - - name: GITSYNC_BRANCH - valueFrom: - configMapKeyRef: - name: context-engine-config - key: GIT_BRANCH - - name: GITSYNC_ROOT - value: "/git" - - name: GITSYNC_SYNC_PERIOD - valueFrom: - configMapKeyRef: - name: context-engine-config - key: 
GIT_SYNC_PERIOD - - name: GITSYNC_ONE_TIME - value: "false" - - name: GITSYNC_LINK - value: "latest" - - name: GITSYNC_MAX_FAILURES - value: "5" - # For SSH-based authentication (optional) - - name: GITSYNC_SSH - value: "false" - # For HTTP basic auth (optional) - - name: GITSYNC_USERNAME - valueFrom: - configMapKeyRef: - name: context-engine-config - key: GIT_USERNAME - - name: GITSYNC_PASSWORD - valueFrom: - configMapKeyRef: - name: context-engine-config - key: GIT_PASSWORD - resources: - requests: - memory: "64Mi" - cpu: "50m" - limits: - memory: "128Mi" - cpu: "100m" - volumeMounts: - - name: work-volume - mountPath: /git - # Mount SSH key if using SSH authentication - - name: git-ssh-key - mountPath: /etc/git-secret/ssh - readOnly: true - securityContext: - runAsUser: 65533 # git-sync runs as non-root user - # This container will start but do nothing when SOURCE_CODE_MODE=local - # We could add a more sophisticated mechanism here if needed - - volumes: - # Shared work directory for both main container and git-sync - - name: work-volume - emptyDir: {} - - name: codebase-volume - emptyDir: {} - # SSH key volume for private repositories (optional) - - name: git-ssh-key - secret: - secretName: git-ssh-key - optional: true - items: - - key: ssh-private-key - path: id_rsa - ---- -# MCP Indexer Server Service -apiVersion: v1 -kind: Service -metadata: - name: mcp-indexer - namespace: context-engine - labels: - app: context-engine - component: mcp-indexer -spec: - type: NodePort # Change to LoadBalancer for external access - ports: - - name: sse - port: 8001 - targetPort: sse - nodePort: 30802 # Optional: specify node port - protocol: TCP - - name: health - port: 18001 - targetPort: health - nodePort: 30803 # Optional: specify node port - protocol: TCP - selector: - app: context-engine - component: mcp-indexer - ---- -# Example Secret for SSH Git Access (Optional) -# Create with: kubectl create secret generic git-ssh-key --from-file=ssh-private-key=~/.ssh/id_rsa 
-apiVersion: v1 -kind: Secret -metadata: - name: git-ssh-key - namespace: context-engine - labels: - app: context-engine - component: git-auth -type: Opaque -data: - # Base64 encoded SSH private key - # ssh-private-key: -type: Opaque \ No newline at end of file diff --git a/deploy/kubernetes/mcp-memory-git.yaml b/deploy/kubernetes/mcp-memory-git.yaml deleted file mode 100644 index dab93a9c..00000000 --- a/deploy/kubernetes/mcp-memory-git.yaml +++ /dev/null @@ -1,209 +0,0 @@ ---- -# MCP Memory Server with Git Sync Sidecar -apiVersion: apps/v1 -kind: Deployment -metadata: - name: mcp-memory - namespace: context-engine - labels: - app: context-engine - component: mcp-memory -spec: - replicas: 1 - selector: - matchLabels: - app: context-engine - component: mcp-memory - template: - metadata: - labels: - app: context-engine - component: mcp-memory - spec: - containers: - # Main MCP Memory Server Container - - name: mcp-memory - image: context-engine-memory:latest - imagePullPolicy: IfNotPresent - command: ["python", "-m", "mcp.server.fastmcp"] - args: ["--server-name", "context-engine-memory", "--host", "0.0.0.0", "--port", "8000", "--transport", "sse", "/app/scripts/memory_server.py"] - ports: - - name: sse - containerPort: 8000 - protocol: TCP - - name: health - containerPort: 18000 - protocol: TCP - env: - - name: QDRANT_URL - valueFrom: - configMapKeyRef: - name: context-engine-config - key: QDRANT_URL - - name: FASTMCP_HOST - valueFrom: - configMapKeyRef: - name: context-engine-config - key: FASTMCP_HOST - - name: FASTMCP_PORT - valueFrom: - configMapKeyRef: - name: context-engine-config - key: FASTMCP_PORT - - name: FASTMCP_HEALTH_PORT - value: "18000" - - name: COLLECTION_NAME - valueFrom: - configMapKeyRef: - name: context-engine-config - key: COLLECTION_NAME - - name: MEMORY_SSE_ENABLED - valueFrom: - configMapKeyRef: - name: context-engine-config - key: MEMORY_SSE_ENABLED - - name: MEMORY_MCP_URL - valueFrom: - configMapKeyRef: - name: context-engine-config - 
key: MEMORY_MCP_URL - - name: MEMORY_MCP_TIMEOUT - valueFrom: - configMapKeyRef: - name: context-engine-config - key: MEMORY_MCP_TIMEOUT - - name: MEMORY_AUTODETECT - valueFrom: - configMapKeyRef: - name: context-engine-config - key: MEMORY_AUTODETECT - - name: MEMORY_COLLECTION_TTL_SECS - valueFrom: - configMapKeyRef: - name: context-engine-config - key: MEMORY_COLLECTION_TTL_SECS - - name: TOOL_STORE_DESCRIPTION - valueFrom: - configMapKeyRef: - name: context-engine-config - key: TOOL_STORE_DESCRIPTION - - name: TOOL_FIND_DESCRIPTION - valueFrom: - configMapKeyRef: - name: context-engine-config - key: TOOL_FIND_DESCRIPTION - resources: - requests: - memory: "256Mi" - cpu: "200m" - limits: - memory: "1Gi" - cpu: "500m" - volumeMounts: - - name: work-volume - mountPath: /work - readOnly: true # Memory server only needs read access - livenessProbe: - httpGet: - path: /readyz - port: health - initialDelaySeconds: 30 - periodSeconds: 10 - readinessProbe: - httpGet: - path: /readyz - port: health - initialDelaySeconds: 10 - periodSeconds: 5 - - # Git Sync Sidecar Container - - name: git-sync - image: k8s.gcr.io/git-sync:v4.1.0 - imagePullPolicy: IfNotPresent - env: - - name: GITSYNC_REPO - valueFrom: - configMapKeyRef: - name: context-engine-config - key: GIT_REPO_URL - - name: GITSYNC_BRANCH - valueFrom: - configMapKeyRef: - name: context-engine-config - key: GIT_BRANCH - - name: GITSYNC_ROOT - value: "/git" - - name: GITSYNC_SYNC_PERIOD - valueFrom: - configMapKeyRef: - name: context-engine-config - key: GIT_SYNC_PERIOD - - name: GITSYNC_ONE_TIME - value: "false" - - name: GITSYNC_LINK - value: "latest" - - name: GITSYNC_MAX_FAILURES - value: "5" - - name: GITSYNC_USERNAME - valueFrom: - configMapKeyRef: - name: context-engine-config - key: GIT_USERNAME - - name: GITSYNC_PASSWORD - valueFrom: - configMapKeyRef: - name: context-engine-config - key: GIT_PASSWORD - resources: - requests: - memory: "64Mi" - cpu: "50m" - limits: - memory: "128Mi" - cpu: "100m" - 
volumeMounts: - - name: work-volume - mountPath: /git - - name: git-ssh-key - mountPath: /etc/git-secret/ssh - readOnly: true - securityContext: - runAsUser: 65533 - - volumes: - - name: work-volume - emptyDir: {} - - name: git-ssh-key - secret: - secretName: git-ssh-key - optional: true - items: - - key: ssh-private-key - path: id_rsa - ---- -# MCP Memory Server Service -apiVersion: v1 -kind: Service -metadata: - name: mcp-memory - namespace: context-engine - labels: - app: context-engine - component: mcp-memory -spec: - type: NodePort # Change to LoadBalancer for external access - ports: - - name: sse - port: 8000 - targetPort: sse - nodePort: 30800 # Optional: specify node port - protocol: TCP - - name: health - port: 18000 - targetPort: health - nodePort: 30801 # Optional: specify node port - protocol: TCP - selector: - app: context-engine - component: mcp-memory \ No newline at end of file diff --git a/deploy/kubernetes/upload-codebase-pvc.yaml b/deploy/kubernetes/upload-codebase-pvc.yaml new file mode 100644 index 00000000..cd6d07a9 --- /dev/null +++ b/deploy/kubernetes/upload-codebase-pvc.yaml @@ -0,0 +1,23 @@ +--- +# Persistent Volume Claim for codebase metadata storage (CephFS RWX) +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: upload-codebase-pvc + namespace: context-engine + labels: + app: context-engine + component: upload-service + type: storage +spec: + accessModes: + - ReadWriteMany # CephFS supports RWX for multiple pods + storageClassName: rook-cephfs # Adjust based on your CephFS storage class + resources: + requests: + storage: 5Gi # Smaller size for metadata/cache + # Optional: selector for specific PV + # selector: + # matchLabels: + # app: context-engine + # component: upload-codebase \ No newline at end of file diff --git a/deploy/kubernetes/upload-pvc.yaml b/deploy/kubernetes/upload-pvc.yaml index b149e8f3..8e4487dd 100644 --- a/deploy/kubernetes/upload-pvc.yaml +++ b/deploy/kubernetes/upload-pvc.yaml @@ -1,9 +1,9 @@ --- -# 
Persistent Volume Claim for workspace storage (CephFS RWX) +# Persistent Volume Claim for code repositories storage (CephFS RWX) apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: upload-work-pvc + name: code-repos-pvc namespace: context-engine labels: app: context-engine @@ -20,14 +20,14 @@ spec: # selector: # matchLabels: # app: context-engine - # component: upload-work + # component: code-repos --- -# Persistent Volume Claim for codebase metadata storage (CephFS RWX) +# Persistent Volume Claim for code metadata storage (CephFS RWX) apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: upload-codebase-pvc + name: code-metadata-pvc namespace: context-engine labels: app: context-engine @@ -44,4 +44,4 @@ spec: # selector: # matchLabels: # app: context-engine - # component: upload-codebase + # component: code-metadata diff --git a/deploy/kubernetes/upload-service.yaml b/deploy/kubernetes/upload-service.yaml index 0e457e4c..189a35b1 100644 --- a/deploy/kubernetes/upload-service.yaml +++ b/deploy/kubernetes/upload-service.yaml @@ -20,6 +20,10 @@ spec: app: context-engine component: upload-service spec: + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 containers: - name: upload-service image: context-engine-upload-service # Use service-specific image name diff --git a/docker-compose.dev-remote.yml b/docker-compose.dev-remote.yml index 9dcd9604..80880117 100644 --- a/docker-compose.dev-remote.yml +++ b/docker-compose.dev-remote.yml @@ -1,6 +1,7 @@ # Development Docker Compose for Remote Upload System Testing -# This file simulates the Kubernetes environment with shared volumes (CephFS RWX) -# for local testing of the complete remote upload workflow +# This file simulates Kubernetes environment with shared volumes that simulate the Kubernetes CephFS RWX PVC behavior. 
+# Repos stored in /work/ (which is project root - avoiding docker volumes) and metadata are stored in /work/.codebase/repos (project root/.codebase) +# Updated to use separate PVCs for workspace and codebase to eliminate circular dependencies version: '3.8' @@ -23,6 +24,7 @@ services: context: . dockerfile: Dockerfile.mcp container_name: mcp-search-dev-remote + user: "1000:1000" depends_on: - qdrant env_file: @@ -32,16 +34,23 @@ services: - FASTMCP_PORT=${FASTMCP_PORT} - QDRANT_URL=${QDRANT_URL} - COLLECTION_NAME=${COLLECTION_NAME} + - PATH_EMIT_MODE=container + - HF_HOME=/work/.cache/huggingface + - TRANSFORMERS_CACHE=/work/.cache/huggingface + - HUGGINGFACE_HUB_CACHE=/work/.cache/huggingface - EMBEDDING_MODEL=${EMBEDDING_MODEL} - EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER} - TOOL_STORE_DESCRIPTION=${TOOL_STORE_DESCRIPTION} - TOOL_FIND_DESCRIPTION=${TOOL_FIND_DESCRIPTION} - FASTMCP_HEALTH_PORT=18000 + - HF_HOME=/home/user/.cache + - TRANSFORMERS_CACHE=/home/user/.cache ports: - "18000:18000" - "8000:8000" volumes: - - shared_workspace:/work:ro + - workspace_pvc:/work:ro + - huggingface_cache:/home/user/.cache networks: - dev-remote-network @@ -51,6 +60,12 @@ services: context: . 
dockerfile: Dockerfile.mcp-indexer container_name: mcp-indexer-dev-remote + user: "1000:1000" + # In K8s, scripts would be accessed directly at /app/scripts/ or via proper initContainer + # For Docker Compose dev-remote simulation, create symlink so /work/scripts/ works + # Use /tmp/huggingface for cache to avoid permission issues (universally writable) + # Set CORRECT environment variables for HuggingFace and FastEmbed + command: ["sh", "-c", "mkdir -p /tmp/huggingface/hub /tmp/huggingface/transformers /tmp/huggingface/fastembed && exec python /app/scripts/mcp_indexer_server.py"] depends_on: - qdrant env_file: @@ -60,12 +75,26 @@ services: - FASTMCP_HOST=${FASTMCP_HOST} - FASTMCP_INDEXER_PORT=${FASTMCP_INDEXER_PORT} - QDRANT_URL=${QDRANT_URL} + - COLLECTION_NAME=${COLLECTION_NAME} + - PATH_EMIT_MODE=container + - HF_HOME=/tmp/huggingface + - HF_HUB_CACHE=/tmp/huggingface/hub + - TRANSFORMERS_CACHE=/tmp/huggingface/transformers + - FASTEMBED_CACHE_PATH=/tmp/huggingface/fastembed + - EMBEDDING_MODEL=${EMBEDDING_MODEL} + - EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER} + - QDRANT_TIMEOUT=${QDRANT_TIMEOUT:-60} + - INDEX_SEMANTIC_CHUNKS=${INDEX_SEMANTIC_CHUNKS:-0} + - INDEX_MICRO_CHUNKS=${INDEX_MICRO_CHUNKS:-0} + - INDEX_UPSERT_BATCH=${INDEX_UPSERT_BATCH:-512} + - INDEX_UPSERT_RETRIES=${INDEX_UPSERT_RETRIES:-5} + - MAX_MICRO_CHUNKS_PER_FILE=${MAX_MICRO_CHUNKS_PER_FILE:-200} ports: - "${FASTMCP_INDEXER_PORT:-8001}:8001" - "18001:18001" volumes: - - shared_workspace:/work - - shared_codebase:/work/.codebase + - workspace_pvc:/work:rw + - codebase_pvc:/work/.codebase:rw networks: - dev-remote-network @@ -75,6 +104,7 @@ services: context: . 
dockerfile: Dockerfile.mcp container_name: mcp-search-http-dev-remote + user: "1000:1000" depends_on: - qdrant env_file: @@ -85,16 +115,23 @@ services: - FASTMCP_TRANSPORT=${FASTMCP_HTTP_TRANSPORT} - QDRANT_URL=${QDRANT_URL} - COLLECTION_NAME=${COLLECTION_NAME} + - PATH_EMIT_MODE=container + - HF_HOME=/work/.cache/huggingface + - TRANSFORMERS_CACHE=/work/.cache/huggingface + - HUGGINGFACE_HUB_CACHE=/work/.cache/huggingface - EMBEDDING_MODEL=${EMBEDDING_MODEL} - EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER} - TOOL_STORE_DESCRIPTION=${TOOL_STORE_DESCRIPTION} - TOOL_FIND_DESCRIPTION=${TOOL_FIND_DESCRIPTION} - FASTMCP_HEALTH_PORT=18000 + - HF_HOME=/home/user/.cache + - TRANSFORMERS_CACHE=/home/user/.cache ports: - "${FASTMCP_HTTP_HEALTH_PORT:-18002}:18000" - "${FASTMCP_HTTP_PORT:-8002}:8000" volumes: - - shared_workspace:/work:ro + - workspace_pvc:/work:ro + - huggingface_cache:/home/user/.cache networks: - dev-remote-network @@ -104,6 +141,12 @@ services: context: . dockerfile: Dockerfile.mcp-indexer container_name: mcp-indexer-http-dev-remote + user: "1000:1000" + # In K8s, scripts would be accessed directly at /app/scripts/ or via proper initContainer + # For Docker Compose dev-remote simulation, create symlink so /work/scripts/ works + # Use /tmp/huggingface for cache to avoid permission issues (universally writable) + # Set CORRECT environment variables for HuggingFace and FastEmbed + command: ["sh", "-c", "mkdir -p /tmp/huggingface/hub /tmp/huggingface/transformers /tmp/huggingface/fastembed && exec python /app/scripts/mcp_indexer_server.py"] depends_on: - qdrant env_file: @@ -114,12 +157,26 @@ services: - FASTMCP_TRANSPORT=${FASTMCP_HTTP_TRANSPORT} - QDRANT_URL=${QDRANT_URL} - FASTMCP_HEALTH_PORT=18001 + - COLLECTION_NAME=${COLLECTION_NAME} + - PATH_EMIT_MODE=container + - HF_HOME=/tmp/huggingface + - HF_HUB_CACHE=/tmp/huggingface/hub + - TRANSFORMERS_CACHE=/tmp/huggingface/transformers + - FASTEMBED_CACHE_PATH=/tmp/huggingface/fastembed + - 
EMBEDDING_MODEL=${EMBEDDING_MODEL} + - EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER} + - QDRANT_TIMEOUT=${QDRANT_TIMEOUT:-60} + - INDEX_SEMANTIC_CHUNKS=${INDEX_SEMANTIC_CHUNKS:-0} + - INDEX_MICRO_CHUNKS=${INDEX_MICRO_CHUNKS:-0} + - INDEX_UPSERT_BATCH=${INDEX_UPSERT_BATCH:-512} + - INDEX_UPSERT_RETRIES=${INDEX_UPSERT_RETRIES:-5} + - MAX_MICRO_CHUNKS_PER_FILE=${MAX_MICRO_CHUNKS_PER_FILE:-200} ports: - "${FASTMCP_INDEXER_HTTP_PORT:-8003}:8001" - "${FASTMCP_INDEXER_HTTP_HEALTH_PORT:-18003}:18001" volumes: - - shared_workspace:/work - - shared_codebase:/work/.codebase + - workspace_pvc:/work:rw + - codebase_pvc:/work/.codebase:rw networks: - dev-remote-network @@ -140,7 +197,7 @@ services: networks: - dev-remote-network - # Indexer service - modified for shared volumes + # Indexer service - modified for PVC volumes indexer: build: context: . @@ -153,21 +210,35 @@ services: environment: - QDRANT_URL=${QDRANT_URL} - COLLECTION_NAME=${COLLECTION_NAME} + - HF_HOME=/work/.cache/huggingface + - TRANSFORMERS_CACHE=/work/.cache/huggingface + - HUGGINGFACE_HUB_CACHE=/work/.cache/huggingface - EMBEDDING_MODEL=${EMBEDDING_MODEL} - working_dir: /work + - HF_HOME=/home/user/.cache + - HOST_INDEX_PATH=/work + - TRANSFORMERS_CACHE=/home/user/.cache + - QDRANT_TIMEOUT=${QDRANT_TIMEOUT:-60} + - INDEX_SEMANTIC_CHUNKS=${INDEX_SEMANTIC_CHUNKS:-0} + - INDEX_MICRO_CHUNKS=${INDEX_MICRO_CHUNKS:-0} + - INDEX_UPSERT_BATCH=${INDEX_UPSERT_BATCH:-512} + - INDEX_UPSERT_RETRIES=${INDEX_UPSERT_RETRIES:-5} + - MAX_MICRO_CHUNKS_PER_FILE=${MAX_MICRO_CHUNKS_PER_FILE:-200} volumes: - - shared_workspace:/work:ro - - shared_codebase:/work/.codebase:rw - entrypoint: ["python", "/app/scripts/ingest_code.py"] + - workspace_pvc:/work:rw + - codebase_pvc:/work/.codebase:rw + - huggingface_cache:/home/user/.cache + entrypoint: ["sh", "-c", "/app/scripts/wait-for-qdrant.sh && cd /app && python /app/scripts/ingest_code.py --root /work"] + restart: "no" # Run once on startup, do not restart after completion networks: - 
dev-remote-network - # Watcher service - modified for shared volumes + # Watcher service - modified for PVC volumes watcher: build: context: . dockerfile: Dockerfile.indexer container_name: watcher-dev-remote + user: "1000:1000" depends_on: - qdrant env_file: @@ -175,27 +246,36 @@ services: environment: - QDRANT_URL=${QDRANT_URL} - COLLECTION_NAME=${COLLECTION_NAME} + - HF_HOME=/tmp/huggingface + - HF_HUB_CACHE=/tmp/huggingface/hub + - TRANSFORMERS_CACHE=/tmp/huggingface/transformers + - FASTEMBED_CACHE_PATH=/tmp/huggingface/fastembed - EMBEDDING_MODEL=${EMBEDDING_MODEL} - - WATCH_ROOT=/work - - QDRANT_TIMEOUT=60 - - MAX_MICRO_CHUNKS_PER_FILE=200 - - INDEX_UPSERT_BATCH=128 - - INDEX_UPSERT_RETRIES=5 - - WATCH_DEBOUNCE_SECS=1.5 - working_dir: /work + - EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER} + - WATCH_ROOT=${WATCH_ROOT:-/work} + - HOST_INDEX_PATH=/work + - QDRANT_TIMEOUT=${QDRANT_TIMEOUT:-60} + - INDEX_SEMANTIC_CHUNKS=${INDEX_SEMANTIC_CHUNKS:-0} + - INDEX_MICRO_CHUNKS=${INDEX_MICRO_CHUNKS:-0} + - INDEX_UPSERT_BATCH=${INDEX_UPSERT_BATCH:-512} + - INDEX_UPSERT_RETRIES=${INDEX_UPSERT_RETRIES:-5} + - MAX_MICRO_CHUNKS_PER_FILE=${MAX_MICRO_CHUNKS_PER_FILE:-200} + - WATCH_DEBOUNCE_SECS=${WATCH_DEBOUNCE_SECS:-1.5} + - REMOTE_UPLOAD_ENABLED=${REMOTE_UPLOAD_ENABLED:-0} volumes: - - shared_workspace:/work:ro - - shared_codebase:/work/.codebase:rw - entrypoint: ["python", "/app/scripts/watch_index.py"] + - workspace_pvc:/work:rw + - codebase_pvc:/work/.codebase:rw + command: ["sh", "-c", "mkdir -p /tmp/huggingface/hub /tmp/huggingface/transformers /tmp/huggingface/fastembed && exec python /app/scripts/watch_index.py"] networks: - dev-remote-network - # Init payload service - modified for shared volumes + # Init payload service - modified for PVC volumes with complete bootstrap init_payload: build: context: . 
dockerfile: Dockerfile.indexer container_name: init-payload-dev-remote + user: "1000:1000" depends_on: - qdrant env_file: @@ -203,11 +283,18 @@ services: environment: - QDRANT_URL=${QDRANT_URL} - COLLECTION_NAME=${COLLECTION_NAME} - working_dir: /work + - HF_HOME=/work/.cache/huggingface + - TRANSFORMERS_CACHE=/work/.cache/huggingface + - HUGGINGFACE_HUB_CACHE=/work/.cache/huggingface + - WORKDIR=/work + - TOKENIZER_URL=${TOKENIZER_URL:-https://huggingface.co/BAAI/bge-base-en-v1.5/resolve/main/tokenizer.json} + - TOKENIZER_PATH=${TOKENIZER_PATH:-/work/models/tokenizer.json} + - EMBEDDING_MODEL=${EMBEDDING_MODEL} volumes: - - shared_workspace:/work:ro - - shared_codebase:/work/.codebase:rw - entrypoint: ["python", "/app/scripts/create_indexes.py"] + - workspace_pvc:/work:rw + - codebase_pvc:/work/.codebase:rw + command: ["sh", "-c", "echo 'Starting initialization sequence...' && echo 'Waiting for Qdrant...' && sleep 30 && PYTHONPATH=/app python /app/scripts/create_indexes.py && echo 'Collections and metadata created' && sleep 15 && python /app/scripts/warm_all_collections.py && echo 'Search caches warmed for all collections' && python /app/scripts/health_check.py && echo 'Initialization completed successfully!'"] + restart: "no" # Run once on startup networks: - dev-remote-network @@ -217,6 +304,7 @@ services: context: . 
dockerfile: Dockerfile.upload-service container_name: upload-service-dev-remote + user: "1000:1000" depends_on: - qdrant env_file: @@ -226,12 +314,15 @@ services: - UPLOAD_SERVICE_HOST=0.0.0.0 - UPLOAD_SERVICE_PORT=8002 - QDRANT_URL=${QDRANT_URL} - - WORK_DIR=/work + - WORKDIR=/work - MAX_BUNDLE_SIZE_MB=100 - UPLOAD_TIMEOUT_SECS=300 # Indexing configuration - COLLECTION_NAME=${COLLECTION_NAME} + - HF_HOME=/work/.cache/huggingface + - TRANSFORMERS_CACHE=/work/.cache/huggingface + - HUGGINGFACE_HUB_CACHE=/work/.cache/huggingface - EMBEDDING_MODEL=${EMBEDDING_MODEL} - EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER} - USE_TREE_SITTER=${USE_TREE_SITTER} @@ -252,8 +343,8 @@ services: - "8004:8002" # Map to different host port to avoid conflicts - "18004:18000" # Health check port volumes: - - shared_workspace:/work - - shared_codebase:/work/.codebase + - workspace_pvc:/work:rw + - codebase_pvc:/work/.codebase:rw - upload_temp:/tmp/uploads healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8002/health"] @@ -265,48 +356,11 @@ services: networks: - dev-remote-network - # NEW: Remote Upload Client for Testing - remote_upload_client: - build: - context: . 
- dockerfile: Dockerfile.indexer # Reuse indexer image - container_name: remote-upload-client-dev-remote - depends_on: - - upload_service - env_file: - - .env - environment: - # Remote upload client configuration - - REMOTE_UPLOAD_ENABLED=1 - - REMOTE_UPLOAD_ENDPOINT=http://upload_service:8002 - - REMOTE_UPLOAD_MAX_RETRIES=3 - - REMOTE_UPLOAD_TIMEOUT=30 - - REMOTE_UPLOAD_DEBUG=1 - - # Watcher configuration for remote mode - - WATCH_ROOT=/work - - QDRANT_URL=${QDRANT_URL} - - COLLECTION_NAME=${COLLECTION_NAME} - - EMBEDDING_MODEL=${EMBEDDING_MODEL} - - QDRANT_TIMEOUT=60 - - MAX_MICRO_CHUNKS_PER_FILE=200 - - INDEX_UPSERT_BATCH=128 - - INDEX_UPSERT_RETRIES=5 - - WATCH_DEBOUNCE_SECS=1.5 - working_dir: /work - volumes: - - shared_workspace:/work:ro - - shared_codebase:/work/.codebase:rw - entrypoint: ["python", "/app/scripts/remote_upload_client.py"] - profiles: - - client # Only start when explicitly requested - networks: - - dev-remote-network - -# Shared volumes to simulate CephFS RWX PVC behavior + +# PVCs to simulate CephFS RWX behavior (production-like) volumes: # Main workspace volume - simulates CephFS RWX for repository storage - shared_workspace: + workspace_pvc: driver: local driver_opts: type: none @@ -314,17 +368,25 @@ volumes: device: ${HOST_INDEX_PATH:-./dev-workspace} # Codebase metadata volume - simulates CephFS RWX for indexing metadata - shared_codebase: + codebase_pvc: driver: local driver_opts: type: none o: bind - device: ${HOST_INDEX_PATH:-./dev-workspace}/.codebase + device: ./.codebase # Temporary upload storage upload_temp: driver: local + # HuggingFace cache for model downloads + huggingface_cache: + driver: local + + # Indexer cache for model downloads + indexer_cache: + driver: local + # Qdrant storage - separate from base compose to avoid conflicts qdrant_storage_dev_remote: driver: local diff --git a/docs/dev-remote-setup.md b/docs/dev-remote-setup.md deleted file mode 100644 index ac2090d9..00000000 --- a/docs/dev-remote-setup.md +++ 
/dev/null @@ -1,341 +0,0 @@ -# Development Remote Upload System Setup - -This guide covers setting up and using the development environment for testing the Context-Engine remote upload system with shared volumes that simulate the Kubernetes CephFS RWX PVC behavior. - -## Overview - -The `docker-compose.dev-remote.yml` file provides a complete local development environment that simulates the Kubernetes deployment with: - -- **Shared Volumes**: Simulates CephFS ReadWriteMany (RWX) PVC behavior -- **Upload Service**: HTTP service for receiving delta bundles -- **All Existing Services**: Qdrant, MCP servers, indexer, watcher, etc. -- **Service Discovery**: Proper networking between all services -- **Development Tools**: Easy testing and debugging capabilities - -## Quick Start - -### 1. Initial Setup - -```bash -# Run the development setup script -./scripts/dev-setup.sh - -# Or manually: -mkdir -p dev-workspace/.codebase -cp .env.example .env # if not exists -``` - -### 2. Start the System - -```bash -# Bootstrap the complete system (recommended) -make dev-remote-bootstrap - -# Or start services step by step: -make dev-remote-up -``` - -### 3. Test Your Repository - -```bash -# 1. Copy your repository to the workspace -cp -r /path/to/your/repo dev-workspace/your-repo - -# 2. Test the upload service -make dev-remote-test - -# 3. Check service health -curl http://localhost:8004/health -``` - -## Architecture - -### Shared Volume Structure - -The development environment uses shared volumes to simulate Kubernetes CephFS behavior: - -``` -dev-workspace/ # Main workspace (simulates CephFS RWX) -├── your-repo/ # Your repository code -├── .codebase/ # Indexing metadata and cache -└── ... 
# Other repositories -``` - -### Service Configuration - -| Service | Port | Purpose | Volumes | -|---------|------|---------|---------| -| upload_service | 8004 | Delta upload HTTP API | shared_workspace, shared_codebase | -| qdrant | 6333/6334 | Vector database | qdrant_storage_dev_remote | -| mcp | 8000 | MCP search server (SSE) | shared_workspace (ro) | -| mcp_indexer | 8001 | MCP indexer server (SSE) | shared_workspace, shared_codebase | -| mcp_http | 8002 | MCP search server (HTTP) | shared_workspace (ro) | -| mcp_indexer_http | 8003 | MCP indexer server (HTTP) | shared_workspace, shared_codebase | -| llamacpp | 8080 | LLM decoder service | ./models (ro) | - -### Network Configuration - -All services communicate via the `dev-remote-network` bridge network (172.20.0.0/16), ensuring proper service discovery and isolation. - -## Available Commands - -### Development Environment Commands - -```bash -# Environment setup -make dev-remote-up # Start all services -make dev-remote-down # Stop all services -make dev-remote-restart # Restart with rebuild -make dev-remote-logs # Follow service logs -make dev-remote-clean # Clean up volumes and containers - -# Bootstrap and testing -make dev-remote-bootstrap # Complete system setup -make dev-remote-test # Test upload workflow -make dev-remote-client # Start remote upload client - -# Individual service management -docker compose -f docker-compose.dev-remote.yml ps -docker compose -f docker-compose.dev-remote.yml logs upload_service -``` - -### Remote Upload Testing - -```bash -# Test upload service health -curl http://localhost:8004/health - -# Check workspace status -curl 'http://localhost:8004/api/v1/delta/status?workspace_path=/work/your-repo' - -# Test file upload (requires delta bundle) -curl -X POST \ - -F 'bundle=@test-bundle.tar.gz' \ - -F 'workspace_path=/work/your-repo' \ - http://localhost:8004/api/v1/delta/upload -``` - -## Workflow Examples - -### 1. Local Development Workflow - -```bash -# 1. 
Setup environment -./scripts/dev-setup.sh - -# 2. Add your repository -cp -r ~/my-project dev-workspace/my-project - -# 3. Start the system -make dev-remote-bootstrap - -# 4. Test indexing -docker compose -f docker-compose.dev-remote.yml run --rm indexer --root /work/my-project - -# 5. Start watcher for live updates -docker compose -f docker-compose.dev-remote.yml run --rm watcher -``` - -### 2. Remote Upload Testing Workflow - -```bash -# 1. Start upload service -make dev-remote-up - -# 2. Test remote upload from another directory -cd ~/my-project -make watch-remote REMOTE_UPLOAD_ENDPOINT=http://localhost:8004 - -# 3. Make changes to your code -# Files will be automatically uploaded and indexed -``` - -### 3. Multiple Repository Testing - -```bash -# 1. Setup multiple repositories -mkdir -p dev-workspace/{repo1,repo2,repo3} -cp -r ~/project1/* dev-workspace/repo1/ -cp -r ~/project2/* dev-workspace/repo2/ -cp -r ~/project3/* dev-workspace/repo3/ - -# 2. Start system -make dev-remote-bootstrap - -# 3. 
Index each repository -docker compose -f docker-compose.dev-remote.yml run --rm indexer --root /work/repo1 --collection repo1 -docker compose -f docker-compose.dev-remote.yml run --rm indexer --root /work/repo2 --collection repo2 -docker compose -f docker-compose.dev-remote.yml run --rm indexer --root /work/repo3 --collection repo3 -``` - -## Environment Variables - -### Development-Specific Variables - -```bash -# Workspace configuration -HOST_INDEX_PATH=./dev-workspace # Local workspace path -DEV_REMOTE_MODE=1 # Enable dev-remote mode -DEV_REMOTE_DEBUG=1 # Enable debug logging - -# Upload service configuration -UPLOAD_SERVICE_HOST=0.0.0.0 # Service bind address -UPLOAD_SERVICE_PORT=8002 # Service port (internal) -UPLOAD_SERVICE_DEBUG=1 # Enable debug mode - -# Remote upload client configuration -REMOTE_UPLOAD_ENABLED=1 # Enable remote upload -REMOTE_UPLOAD_ENDPOINT=http://upload_service:8002 # Upload endpoint -REMOTE_UPLOAD_MAX_RETRIES=3 # Max retry attempts -REMOTE_UPLOAD_TIMEOUT=30 # Request timeout (seconds) -REMOTE_UPLOAD_DEBUG=1 # Enable debug logging -``` - -### Standard Variables (from .env.example) - -All standard Context-Engine variables are supported and can be overridden for development: - -```bash -QDRANT_URL=http://qdrant:6333 -COLLECTION_NAME=my-collection -EMBEDDING_MODEL=BAAI/bge-base-en-v1.5 -EMBEDDING_PROVIDER=fastembed -# ... other standard variables -``` - -## Troubleshooting - -### Common Issues - -1. **Port Conflicts** - ```bash - # Check what's using ports - netstat -tulpn | grep :8004 - # Stop conflicting services - make dev-remote-down - ``` - -2. **Volume Permission Issues** - ```bash - # Fix workspace permissions - sudo chown -R $USER:$USER dev-workspace - chmod -R 755 dev-workspace - ``` - -3. 
**Service Not Ready** - ```bash - # Check service status - make dev-remote-logs - docker compose -f docker-compose.dev-remote.yml ps - - # Restart specific service - docker compose -f docker-compose.dev-remote.yml restart upload_service - ``` - -4. **Upload Failures** - ```bash - # Check upload service logs - docker compose -f docker-compose.dev-remote.yml logs upload_service - - # Test upload service directly - curl -v http://localhost:8004/health - ``` - -### Debug Mode - -Enable debug logging for detailed troubleshooting: - -```bash -# Add to .env -DEV_REMOTE_DEBUG=1 -UPLOAD_SERVICE_DEBUG=1 -REMOTE_UPLOAD_DEBUG=1 - -# Restart services -make dev-remote-restart -``` - -### Clean Reset - -For a complete reset: - -```bash -# Clean everything -make dev-remote-clean - -# Remove workspace -rm -rf dev-workspace - -# Start fresh -./scripts/dev-setup.sh -make dev-remote-bootstrap -``` - -## Integration with Existing Workflows - -### Using with Existing Make Targets - -The dev-remote environment integrates with existing Make targets: - -```bash -# Use dev-remote environment with existing targets -HOST_INDEX_PATH=./dev-workspace docker compose -f docker-compose.dev-remote.yml run --rm indexer --root /work/my-repo - -# Test with dev-remote stack -make health # Uses dev-remote stack if running -make hybrid # Uses dev-remote Qdrant instance -``` - -### MCP Client Configuration - -Configure your MCP clients (Cursor, Windsurf, etc.): - -```json -{ - "mcpServers": { - "qdrant": { - "type": "sse", - "url": "http://localhost:8000/sse", - "disabled": false - }, - "qdrant-indexer": { - "type": "sse", - "url": "http://localhost:8001/sse", - "disabled": false - } - } -} -``` - -## Performance Considerations - -### Resource Allocation - -The dev-remote environment is configured for development: - -- **Memory**: Moderate allocation suitable for development -- **CPU**: Shared allocation with reasonable limits -- **Storage**: Local volumes for fast I/O - -### Optimization Tips - -1. 
**Use SSD Storage**: Place `dev-workspace` on SSD for better performance -2. **Limit Repository Size**: Test with smaller repositories first -3. **Adjust Batch Sizes**: Tune `INDEX_UPSERT_BATCH` for your hardware -4. **Monitor Resources**: Use `docker stats` to monitor resource usage - -## Next Steps - -1. **Test Your Repository**: Add your code to `dev-workspace` and test the workflow -2. **Experiment with Remote Upload**: Try the remote upload client with your changes -3. **Integrate with IDE**: Configure your MCP client for the development environment -4. **Contribute**: Report issues and contribute improvements to the dev-remote setup - -## Support - -For issues with the dev-remote environment: - -1. Check the troubleshooting section above -2. Review service logs: `make dev-remote-logs` -3. Check the main documentation: `docs/remote_upload.md` -4. Open an issue with details about your setup and the problem \ No newline at end of file diff --git a/docs/remote_upload.md b/docs/remote_upload.md deleted file mode 100644 index 95d31497..00000000 --- a/docs/remote_upload.md +++ /dev/null @@ -1,219 +0,0 @@ -# Remote Upload Client for Context-Engine - -This document describes the remote upload client functionality that extends the existing watch_index.py for remote delta uploads. - -## Overview - -The remote upload client enables real-time code synchronization by uploading delta bundles to a remote server instead of processing files locally. This is useful for distributed development environments where multiple instances need to stay synchronized. - -## Architecture - -The system consists of: - -1. **RemoteUploadClient** - Handles delta bundle creation and HTTP uploads -2. **Extended ChangeQueue** - Integrates with remote client for delta processing -3. **Enhanced watch_index.py** - Supports both local and remote modes -4. 
**Delta Bundle Format** - Standardized tarball format with metadata - -## Configuration - -The remote upload client is configured via environment variables: - -| Variable | Description | Default | Example | -|----------|-------------|---------|---------| -| `REMOTE_UPLOAD_ENABLED` | Enable remote mode | `false` | `1` | -| `REMOTE_UPLOAD_ENDPOINT` | Upload server URL | `http://localhost:8080` | `https://api.example.com` | -| `REMOTE_UPLOAD_MAX_RETRIES` | Max upload retries | `3` | `5` | -| `REMOTE_UPLOAD_TIMEOUT` | Request timeout (seconds) | `30` | `60` | - -## Usage - -### Local Mode (Default) -```bash -make watch -``` - -### Remote Mode -```bash -# Set environment variables -export REMOTE_UPLOAD_ENABLED=1 -export REMOTE_UPLOAD_ENDPOINT=https://your-server.com:8080 -export REMOTE_UPLOAD_MAX_RETRIES=5 -export REMOTE_UPLOAD_TIMEOUT=60 - -# Or use the convenience target -make watch-remote REMOTE_UPLOAD_ENDPOINT=https://your-server.com:8080 -``` - -## Delta Bundle Format - -Delta bundles are tarballs (`.tar.gz`) containing: - -``` -delta-bundle.tar.gz -├── manifest.json # Bundle metadata and file operations -├── files/ # Directory containing file content -│ ├── created/ # New files -│ ├── updated/ # Modified files -│ └── moved/ # Moved files (at destination) -└── metadata/ # File metadata and hashes - ├── hashes.json # Content hashes for all files - └── operations.json # Detailed operation metadata -``` - -### Manifest Format -```json -{ - "version": "1.0", - "bundle_id": "uuid-v4", - "workspace_path": "/absolute/path/to/workspace", - "collection_name": "workspace-collection", - "created_at": "2025-01-26T01:55:00.000Z", - "sequence_number": 42, - "parent_sequence": 41, - "operations": { - "created": 5, - "updated": 3, - "deleted": 2, - "moved": 1 - }, - "total_files": 11, - "total_size_bytes": 1048576, - "compression": "gzip", - "encoding": "utf-8" -} -``` - -## Features - -### Change Detection -- **Hash-based detection** - Uses SHA1 hashes to detect file changes -- 
**Move detection** - Identifies file moves by matching content hashes -- **Efficient caching** - Leverages existing workspace state cache -- **Debouncing** - Integrates with existing ChangeQueue debouncing - -### Error Handling -- **Automatic retry** - Exponential backoff for network failures -- **Sequence recovery** - Handles sequence number mismatches -- **Fallback mode** - Falls back to local processing on upload failures -- **Bundle persistence** - Stores bundles locally for recovery - -### Integration -- **Backward compatible** - Existing local mode unchanged -- **Same logging** - Uses existing logging patterns -- **Same filtering** - Leverages existing file exclusion logic -- **Same debouncing** - Integrates with existing ChangeQueue - -## API Endpoints - -### Upload Endpoint -``` -POST /api/v1/delta/upload -Content-Type: multipart/form-data - -Parameters: -- bundle: Delta bundle tarball -- workspace_path: Absolute workspace path -- collection_name: Override collection name -- sequence_number: Expected sequence number -- force: Force upload even if sequence mismatch -``` - -### Status Endpoint -``` -GET /api/v1/delta/status?workspace_path=/workspace - -Response: -{ - "workspace_path": "/workspace", - "collection_name": "workspace-collection", - "last_sequence": 41, - "last_upload": "2025-01-26T01:50:00.000Z", - "pending_operations": 0, - "status": "ready", - "server_info": { - "version": "1.0.0", - "max_bundle_size_mb": 100, - "supported_formats": ["tar.gz"] - } -} -``` - -## Testing - -Run the basic tests to verify functionality: - -```bash -python scripts/test_remote_basic.py -``` - -This tests: -- Remote configuration detection -- Delta bundle structure creation -- Sequence number tracking - -## Implementation Notes - -### File Structure -- `scripts/remote_upload_client.py` - Main remote upload client -- `scripts/watch_index.py` - Extended with remote mode support -- `Makefile` - Added `watch-remote` target - -### Key Classes -- `RemoteUploadClient` - Core 
upload functionality -- `ChangeQueue` - Extended with remote client support -- `IndexHandler` - Updated for optional client (remote mode) - -### Integration Points -- Uses existing `get_cached_file_hash()` for change detection -- Leverages existing file filtering from `IndexHandler._maybe_enqueue()` -- Integrates with existing debouncing in `ChangeQueue` -- Maintains same logging and progress reporting patterns - -## Troubleshooting - -### Common Issues - -1. **"No module named 'qdrant_client'"** - - Install dependencies: `pip install qdrant-client fastembed watchdog requests` - -2. **"Remote mode not enabled"** - - Set `REMOTE_UPLOAD_ENABLED=1` in environment - -3. **"Upload failed"** - - Check `REMOTE_UPLOAD_ENDPOINT` is accessible - - Verify server supports delta upload API - - Check network connectivity - -4. **"Sequence mismatch"** - - Server will attempt automatic recovery - - Can force upload with `force=true` parameter - -### Debug Mode - -Enable debug logging: -```bash -export PYTHONPATH=. -python -c " -import logging -logging.basicConfig(level=logging.DEBUG) -from scripts.remote_upload_client import RemoteUploadClient -# ... your debug code -" -``` - -## Security Considerations - -For this PoC implementation: -- No authentication is required (development mode) -- No encryption is applied to bundles -- Server endpoint validation is basic -- Production deployment should add proper authentication - -## Future Enhancements - -1. **Authentication** - Add API key or token-based auth -2. **Compression** - Add support for different compression algorithms -3. **Incremental uploads** - Support for large file incremental sync -4. **Conflict resolution** - Handle concurrent modifications -5. 
**Batch optimization** - Bundle multiple changes together \ No newline at end of file diff --git a/docs/upload_service.md b/docs/upload_service.md deleted file mode 100644 index 4edb4dfb..00000000 --- a/docs/upload_service.md +++ /dev/null @@ -1,261 +0,0 @@ -# Delta Upload Service - -This document describes the HTTP upload service for receiving and processing delta bundles in Context-Engine. - -## Overview - -The delta upload service is a FastAPI-based HTTP service that: -- Receives delta bundles from remote upload clients -- Extracts and processes file operations (create, update, delete, move) -- Integrates with existing indexing pipeline via `ingest_code.py` -- Provides health checks and status monitoring -- Supports CephFS persistent storage for Kubernetes deployment - -## API Endpoints - -### Health Check -``` -GET /health -``` - -Returns service health status and configuration. - -### Status -``` -GET /api/v1/delta/status?workspace_path=/path/to/workspace -``` - -Returns upload status for a specific workspace. 
- -### Upload Delta Bundle -``` -POST /api/v1/delta/upload -Content-Type: multipart/form-data -``` - -Parameters: -- `bundle`: Delta bundle tarball file -- `workspace_path`: Target workspace path -- `collection_name`: Override collection name (optional) -- `sequence_number`: Expected sequence number (optional) -- `force`: Force upload even if sequence mismatch (optional) - -## Delta Bundle Format - -Delta bundles are tar.gz archives with the following structure: - -``` -delta-bundle.tar.gz -├── manifest.json # Bundle metadata -├── files/ # File content -│ ├── created/ # New files -│ ├── updated/ # Modified files -│ └── moved/ # Moved files (at destination) -└── metadata/ # File metadata - ├── hashes.json # Content hashes - └── operations.json # Detailed operation metadata -``` - -### Manifest Format - -```json -{ - "version": "1.0", - "bundle_id": "uuid-v4", - "workspace_path": "/absolute/path/to/workspace", - "collection_name": "workspace-collection", - "created_at": "2025-01-26T02:00:00.000Z", - "sequence_number": 42, - "parent_sequence": 41, - "operations": { - "created": 5, - "updated": 3, - "deleted": 2, - "moved": 1 - }, - "total_files": 11, - "total_size_bytes": 1048576, - "compression": "gzip", - "encoding": "utf-8" -} -``` - -## Deployment - -### Local Development - -1. Install dependencies: -```bash -pip install -r requirements.txt -``` - -2. Run the service: -```bash -python scripts/upload_service.py -``` - -The service will start on `http://localhost:8002` by default. - -### Docker - -Build the image: -```bash -docker build -f Dockerfile.upload-service -t context-engine-upload-service . -``` - -Run the container: -```bash -docker run -p 8002:8002 \ - -e QDRANT_URL=http://qdrant:6333 \ - -e WORK_DIR=/work \ - -v /path/to/work:/work \ - context-engine-upload-service -``` - -### Kubernetes - -1. Apply the namespace and config: -```bash -kubectl apply -f deploy/kubernetes/namespace.yaml -kubectl apply -f deploy/kubernetes/configmap.yaml -``` - -2. 
Create persistent volumes (adjust storage class as needed): -```bash -kubectl apply -f deploy/kubernetes/upload-pvc.yaml -``` - -3. Deploy the service: -```bash -kubectl apply -f deploy/kubernetes/upload-service.yaml -``` - -The service will be available at: -- Internal: `http://upload-service.context-engine.svc.cluster.local:8002` -- External: `http://:30804` (NodePort) - -## Configuration - -Environment variables: - -| Variable | Default | Description | -|----------|----------|-------------| -| `UPLOAD_SERVICE_HOST` | `0.0.0.0` | Service bind address | -| `UPLOAD_SERVICE_PORT` | `8002` | Service port | -| `QDRANT_URL` | `http://qdrant:6333` | Qdrant server URL | -| `COLLECTION_NAME` | `my-collection` | Default collection name | -| `WORK_DIR` | `/work` | Workspace directory | -| `MAX_BUNDLE_SIZE_MB` | `100` | Maximum bundle size | -| `UPLOAD_TIMEOUT_SECS` | `300` | Upload timeout | - -## Integration - -### With Remote Upload Client - -The upload service integrates with the remote upload client in `scripts/remote_upload_client.py`: - -```python -from scripts.remote_upload_client import RemoteUploadClient - -client = RemoteUploadClient( - upload_endpoint="http://upload-service:8002", - workspace_path="/path/to/workspace", - collection_name="my-collection" -) - -# Upload changes -success = client.process_and_upload_changes(changed_files) -``` - -### With Existing Indexing Pipeline - -The service reuses the existing indexing pipeline: - -- Calls `ingest_code.index_repo()` for changed files -- Uses `workspace_state.py` for state management -- Integrates with existing Qdrant connection patterns -- Supports hash-based caching and change detection - -## Testing - -Run the test suite: - -```bash -python scripts/test_upload_service.py --url http://localhost:8002 -``` - -This will test: -- Health check endpoint -- Status endpoint -- Upload endpoint with sample delta bundle - -## Monitoring - -### Health Checks - -The service provides liveness and readiness probes: -- 
Liveness: `/health` every 10 seconds after 30s delay -- Readiness: `/health` every 5 seconds after 10s delay - -### Logging - -Logs include: -- Request/response details -- Bundle processing status -- Error details and stack traces -- Integration with existing logging patterns - -### Metrics - -The service tracks: -- Upload success/failure rates -- Processing times -- Operation counts (create, update, delete, move) -- Indexed points count - -## Security Considerations - -For production deployment: - -1. **Authentication**: Add API key or JWT authentication -2. **Authorization**: Implement workspace-based access control -3. **Input Validation**: Enhanced bundle validation and sanitization -4. **Rate Limiting**: Add request rate limiting -5. **TLS**: Enable HTTPS for production - -## Troubleshooting - -### Common Issues - -1. **Bundle Too Large**: Increase `MAX_BUNDLE_SIZE_MB` or optimize bundles -2. **Sequence Mismatch**: Check client sequence tracking or use `force=true` -3. **Indexing Failures**: Verify Qdrant connectivity and collection exists -4. 
**Storage Issues**: Check PVC status and CephFS connectivity - -### Debug Mode - -Enable debug logging: -```bash -export UPLOAD_SERVICE_LOG_LEVEL=debug -python scripts/upload_service.py -``` - -### Health Check - -Verify service status: -```bash -curl http://localhost:8002/health -``` - -## Architecture - -The upload service follows the delta upload architecture defined in: -- `delta_upload_design.md` - Format specification -- `delta_upload_architecture.md` - System design - -Key components: -- **FastAPI HTTP Server**: Handles incoming requests -- **Bundle Processor**: Extracts and validates delta bundles -- **File Operations**: Applies create/update/delete/move operations -- **Indexing Integration**: Calls existing indexing pipeline -- **State Management**: Tracks sequences and workspace state \ No newline at end of file diff --git a/docs/usage_guide.md b/docs/usage_guide.md deleted file mode 100644 index 35037c4b..00000000 --- a/docs/usage_guide.md +++ /dev/null @@ -1,597 +0,0 @@ -# Context-Engine Real-Time Code Ingestion: Usage Guide - -This guide provides comprehensive instructions for using the Context-Engine real-time code ingestion system with both local and remote upload capabilities. - -## Table of Contents - -1. [Quick Start](#quick-start) -2. [Local vs Remote Mode](#local-vs-remote-mode) -3. [Configuration](#configuration) -4. [Usage Examples](#usage-examples) -5. [Deployment](#deployment) -6. [Troubleshooting](#troubleshooting) -7. [Advanced Configuration](#advanced-configuration) - -## Quick Start - -### Prerequisites - -- Docker and Docker Compose installed -- Python 3.8+ with required dependencies -- Access to a Qdrant instance (local or remote) - -### Basic Local Mode Setup - -1. **Clone and setup the repository:** -```bash -git clone -cd Context-Engine -cp .env.example .env -``` - -2. **Start the services:** -```bash -make up -``` - -3. **Index your codebase:** -```bash -make index -``` - -4. 
**Start watching for changes:** -```bash -make watch -``` - -### Basic Remote Mode Setup - -1. **Deploy the upload service:** -```bash -# Deploy to Kubernetes -kubectl apply -f deploy/kubernetes/upload-pvc.yaml -kubectl apply -f deploy/kubernetes/upload-service.yaml -``` - -2. **Start remote watching:** -```bash -make watch-remote REMOTE_UPLOAD_ENDPOINT=http://your-upload-service:8002 -``` - -## Local vs Remote Mode - -### Local Mode - -**Use Case:** Single developer, local development environment - -**How it works:** -- Files are processed directly on the local machine -- Changes are indexed directly into local Qdrant instance -- No network dependencies for indexing - -**Pros:** -- ✅ Fast response time (no network latency) -- ✅ Works offline -- ✅ Simple setup -- ✅ No additional infrastructure needed - -**Cons:** -- ❌ Limited to single machine -- ❌ No collaboration features -- ❌ Each developer maintains separate index - -**Command:** -```bash -make watch -``` - -### Remote Mode - -**Use Case:** Team collaboration, distributed development, centralized indexing - -**How it works:** -- Files are packaged into delta bundles -- Bundles are uploaded to remote upload service -- Remote service processes and indexes changes -- All clients sync from the same central index - -**Pros:** -- ✅ Centralized index for team collaboration -- ✅ Consistent search results across team -- ✅ Reduced local resource usage -- ✅ Better for large codebases -- ✅ Supports distributed teams - -**Cons:** -- ❌ Requires network connectivity -- ❌ Additional infrastructure -- ❌ Network latency -- ❌ More complex setup - -**Command:** -```bash -make watch-remote REMOTE_UPLOAD_ENDPOINT=http://your-server:8002 -``` - -## Configuration - -### Environment Variables - -#### Core Configuration -```bash -# Qdrant connection -QDRANT_URL=http://qdrant:6333 -COLLECTION_NAME=my-collection - -# Workspace configuration -WATCH_ROOT=/work -WORKSPACE_PATH=/work - -# Embedding model -EMBEDDING_MODEL=BAAI/bge-base-en-v1.5 
-``` - -#### Remote Upload Configuration -```bash -# Enable remote mode -REMOTE_UPLOAD_ENABLED=1 - -# Upload service endpoint -REMOTE_UPLOAD_ENDPOINT=http://your-server:8002 - -# Upload behavior -REMOTE_UPLOAD_MAX_RETRIES=3 -REMOTE_UPLOAD_TIMEOUT=30 - -# Watch behavior -WATCH_DEBOUNCE_SECS=1.0 -``` - -#### File Filtering -```bash -# Ignore file location -QDRANT_IGNORE_FILE=.qdrantignore -``` - -### Example .env Files - -#### Local Development (.env.local) -```bash -# Local development configuration -QDRANT_URL=http://localhost:6333 -COLLECTION_NAME=my-dev-collection -WATCH_ROOT=/Users/developer/my-project -EMBEDDING_MODEL=BAAI/bge-base-en-v1.5 -WATCH_DEBOUNCE_SECS=0.5 -``` - -#### Team Collaboration (.env.remote) -```bash -# Remote team configuration -QDRANT_URL=http://qdrant.team.svc.cluster.local:6333 -COLLECTION_NAME=team-shared-collection -WATCH_ROOT=/workspace -EMBEDDING_MODEL=BAAI/bge-base-en-v1.5 - -# Remote upload settings -REMOTE_UPLOAD_ENABLED=1 -REMOTE_UPLOAD_ENDPOINT=http://upload-service.team.svc.cluster.local:8002 -REMOTE_UPLOAD_MAX_RETRIES=5 -REMOTE_UPLOAD_TIMEOUT=60 -WATCH_DEBOUNCE_SECS=2.0 -``` - -#### Production (.env.prod) -```bash -# Production configuration -QDRANT_URL=https://qdrant.production.com -COLLECTION_NAME=prod-codebase -WATCH_ROOT=/app/workspace -EMBEDDING_MODEL=BAAI/bge-base-en-v1.5 - -# Remote upload with high reliability -REMOTE_UPLOAD_ENABLED=1 -REMOTE_UPLOAD_ENDPOINT=https://upload-api.production.com -REMOTE_UPLOAD_MAX_RETRIES=10 -REMOTE_UPLOAD_TIMEOUT=120 -WATCH_DEBOUNCE_SECS=3.0 -``` - -## Usage Examples - -### Basic Development Workflow - -```bash -# 1. Start services -make up - -# 2. Initial indexing -make reindex - -# 3. Start watching (local mode) -make watch - -# In another terminal, make changes to your code... -# Changes will be automatically indexed -``` - -### Team Collaboration Workflow - -```bash -# 1. Deploy infrastructure (once) -kubectl apply -f deploy/kubernetes/ - -# 2. 
Each developer starts remote watching -make watch-remote REMOTE_UPLOAD_ENDPOINT=https://upload.team.com:8002 - -# 3. Developers make changes... -# All changes are synchronized across the team -``` - -### Hybrid Workflow (Local + Remote) - -```bash -# Use local mode for fast iteration -make watch - -# Switch to remote mode when ready to share -make watch-remote REMOTE_UPLOAD_ENDPOINT=https://upload.team.com:8002 -``` - -### Advanced Indexing - -```bash -# Index specific path -make index-path REPO_PATH=/path/to/repo RECREATE=1 - -# Index current directory with custom collection -make index-here REPO_NAME=my-project COLLECTION=my-project-collection - -# Warm up search caches -make warm - -# Run health checks -make health -``` - -## Deployment - -### Local Development - -1. **Using Docker Compose:** -```bash -# Start all services -make up - -# View logs -make logs - -# Check status -make ps -``` - -2. **Manual Setup:** -```bash -# Install dependencies -pip install -r requirements.txt - -# Start Qdrant -docker run -p 6333:6333 qdrant/qdrant - -# Start watching -python scripts/watch_index.py -``` - -### Kubernetes Deployment - -1. **Prerequisites:** -```bash -# Kubernetes cluster with storage support -kubectl cluster-info - -# Install required manifests -kubectl apply -f deploy/kubernetes/namespace.yaml -kubectl apply -f deploy/kubernetes/configmap.yaml -``` - -2. **Deploy Core Services:** -```bash -# Deploy Qdrant -kubectl apply -f deploy/kubernetes/qdrant.yaml - -# Deploy upload service with persistent storage -kubectl apply -f deploy/kubernetes/upload-pvc.yaml -kubectl apply -f deploy/kubernetes/upload-service.yaml - -# Deploy indexer services -kubectl apply -f deploy/kubernetes/indexer-services.yaml -``` - -3. 
**Configure Access:** -```bash -# Check service status -kubectl get pods -n context-engine - -# Get upload service endpoint -kubectl get svc upload-service -n context-engine - -# Port forward for local testing -kubectl port-forward svc/upload-service 8002:8002 -n context-engine -``` - -### Production Considerations - -1. **High Availability:** -```yaml -# Example: Multiple replicas for upload service -spec: - replicas: 3 - selector: - matchLabels: - app: upload-service -``` - -2. **Resource Limits:** -```yaml -# Example: Resource constraints -resources: - requests: - memory: "512Mi" - cpu: "250m" - limits: - memory: "2Gi" - cpu: "1000m" -``` - -3. **Monitoring:** -```yaml -# Example: Health checks -livenessProbe: - httpGet: - path: /health - port: 8002 - initialDelaySeconds: 30 - periodSeconds: 10 -readinessProbe: - httpGet: - path: /health - port: 8002 - initialDelaySeconds: 10 - periodSeconds: 5 -``` - -## Troubleshooting - -### Common Issues - -#### 1. "No module named 'qdrant_client'" -**Solution:** -```bash -pip install qdrant-client fastembed watchdog requests -``` - -#### 2. "Remote mode not enabled" -**Solution:** -```bash -export REMOTE_UPLOAD_ENABLED=1 -# Or add to .env file -echo "REMOTE_UPLOAD_ENABLED=1" >> .env -``` - -#### 3. "Upload failed: Connection refused" -**Solutions:** -- Check upload service is running: `kubectl get pods` -- Verify endpoint URL: `curl http://your-endpoint:8002/health` -- Check network connectivity: `telnet your-endpoint 8002` - -#### 4. "Sequence mismatch" errors -**Solutions:** -- Client will attempt automatic recovery -- Force upload if needed: Set `force=true` in upload request -- Reset sequence: Delete `.codebase/delta_bundles/last_sequence.txt` - -#### 5. "Bundle too large" errors -**Solutions:** -- Increase `MAX_BUNDLE_SIZE_MB` on upload service -- Reduce number of changes before upload (adjust debounce) -- Split large changes into smaller commits - -#### 6. 
"Indexing is slow" -**Solutions:** -- Use faster embedding model -- Increase `WATCH_DEBOUNCE_SECS` to reduce frequency -- Upgrade hardware (more CPU/RAM) -- Use remote mode to offload processing - -### Debug Mode - -#### Enable Debug Logging -```bash -# Set log level -export PYTHONPATH=. -export UPLOAD_SERVICE_LOG_LEVEL=debug - -# Run with debug output -python -c " -import logging -logging.basicConfig(level=logging.DEBUG) -from scripts.remote_upload_client import RemoteUploadClient -# Your debug code here -" -``` - -#### Check System Status -```bash -# Check Qdrant -curl http://localhost:6333/collections - -# Check upload service -curl http://localhost:8002/health - -# Check workspace state -ls -la .codebase/ -cat .codebase/workspace_state.json -``` - -#### Monitor File Changes -```bash -# Watch file system events (Linux) -inotifywait -m -r -e modify,create,delete,move /path/to/watch - -# Watch file system events (macOS) -fswatch -r /path/to/watch -``` - -### Performance Tuning - -#### Optimize for Large Codebases -```bash -# Increase debounce to reduce processing frequency -WATCH_DEBOUNCE_SECS=5.0 - -# Use larger batch sizes -BATCH_SIZE=1000 - -# Increase timeouts -REMOTE_UPLOAD_TIMEOUT=120 -QDRANT_TIMEOUT=60 -``` - -#### Optimize for Real-time Response -```bash -# Reduce debounce for faster response -WATCH_DEBOUNCE_SECS=0.1 - -# Use smaller batches for faster processing -BATCH_SIZE=100 - -# Reduce timeouts -REMOTE_UPLOAD_TIMEOUT=30 -QDRANT_TIMEOUT=20 -``` - -## Advanced Configuration - -### Custom File Filtering - -Create `.qdrantignore` in your workspace root: - -``` -# Ignore patterns -*.log -*.tmp -node_modules/ -.git/ -build/ -dist/ -*.min.js -*.min.css - -# Ignore specific directories -tests/fixtures/ -docs/generated/ -``` - -### Custom Embedding Models - -```bash -# Use different model -EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2 - -# Custom model (local path) -EMBEDDING_MODEL=/path/to/custom/model - -# Model-specific settings 
-EMBEDDING_DEVICE=cuda -EMBEDDING_BATCH_SIZE=32 -``` - -### Multi-Collection Setup - -```bash -# Different collections for different projects -COLLECTION_NAME=project-alpha - -# Or use environment-specific collections -COLLECTION_NAME=${PROJECT_NAME}-${ENVIRONMENT} -``` - -### Integration with CI/CD - -#### GitHub Actions Example -```yaml -name: Index Code Changes -on: [push] - -jobs: - index: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: '3.9' - - name: Install dependencies - run: pip install -r requirements.txt - - name: Index changes - env: - REMOTE_UPLOAD_ENABLED: 1 - REMOTE_UPLOAD_ENDPOINT: ${{ secrets.UPLOAD_ENDPOINT }} - run: | - python scripts/watch_index.py --once -``` - -#### Jenkins Pipeline Example -```groovy -pipeline { - agent any - environment { - REMOTE_UPLOAD_ENABLED = '1' - REMOTE_UPLOAD_ENDPOINT = credentials('upload-endpoint') - } - stages { - stage('Index') { - steps { - sh 'python scripts/watch_index.py --once' - } - } - } -} -``` - -### Monitoring and Alerting - -#### Prometheus Metrics -```yaml -# Example Prometheus configuration -scrape_configs: - - job_name: 'context-engine' - static_configs: - - targets: ['upload-service:8002'] - metrics_path: '/metrics' -``` - -#### Grafana Dashboard -- Upload success rate -- Processing time -- Queue depth -- Error rates -- Resource usage - -#### Alerting Rules -```yaml -# Example alerting rules -groups: - - name: context-engine - rules: - - alert: HighErrorRate - expr: upload_error_rate > 0.1 - for: 5m - labels: - severity: warning - annotations: - summary: "High upload error rate detected" -``` - -This comprehensive guide should help you get the most out of the Context-Engine real-time code ingestion system. For more specific issues or advanced use cases, refer to the individual component documentation or reach out to the development team. 
\ No newline at end of file diff --git a/scripts/create_indexes.py b/scripts/create_indexes.py index 74793a90..b7c93eee 100644 --- a/scripts/create_indexes.py +++ b/scripts/create_indexes.py @@ -4,21 +4,32 @@ QDRANT_URL = os.environ.get("QDRANT_URL", "http://qdrant:6333") from datetime import datetime +# Import critical functions first try: - from scripts.workspace_state import update_workspace_state, update_last_activity, get_collection_name + from scripts.workspace_state import get_collection_name, is_multi_repo_mode except Exception: - update_workspace_state = None # type: ignore - update_last_activity = None # type: ignore get_collection_name = None # type: ignore + is_multi_repo_mode = None # type: ignore + +# Import other optional functions +try: + from scripts.workspace_state import log_activity +except Exception: + log_activity = None # type: ignore COLLECTION = os.environ.get("COLLECTION_NAME", "my-collection") # Discover workspace path for state updates (allows subdir indexing) WS_PATH = os.environ.get("INDEX_ROOT") or os.environ.get("WORKSPACE_PATH") or "/work" +# Skip creating root collection in multi-repo mode +if is_multi_repo_mode and is_multi_repo_mode() and WS_PATH == "/work": + print("Multi-repo mode enabled - skipping root collection creation for /work") + exit(0) + # Prefer per-workspace unique collection if none provided if (COLLECTION == "my-collection") and ('get_collection_name' in globals()) and get_collection_name: try: - COLLECTION = get_collection_name(WS_PATH) + COLLECTION = get_collection_name(None) # Use global state in single-repo mode except Exception: pass @@ -37,19 +48,14 @@ field_schema=models.PayloadSchemaType.KEYWORD, ) -# Update workspace state to record collection and activity +# Log activity using cleaned workspace_state function try: - if update_workspace_state: - update_workspace_state(WS_PATH, {"qdrant_collection": COLLECTION}) - if update_last_activity: - update_last_activity( - WS_PATH, - { - "timestamp": 
datetime.now().isoformat(), - "action": "initialized", - "file_path": "", - "details": {"created_indexes": ["metadata.language", "metadata.path_prefix"]}, - }, + if log_activity: + log_activity( + repo_name=None, + action="initialized", + file_path="", + details={"created_indexes": ["metadata.language", "metadata.path_prefix"]}, ) except Exception: pass diff --git a/scripts/dev-setup.sh b/scripts/dev-setup.sh old mode 100644 new mode 100755 diff --git a/scripts/health_check.py b/scripts/health_check.py index b856a1d0..eec4d57c 100644 --- a/scripts/health_check.py +++ b/scripts/health_check.py @@ -39,83 +39,101 @@ def main(): client = QdrantClient(url=qdrant_url, api_key=api_key or None) - # 1) Collection exists and has expected named vector/dimension - info = client.get_collection(collection) - cfg = info.config.params.vectors - if isinstance(cfg, dict): - present_names = list(cfg.keys()) - assert_true(len(present_names) >= 1, "Collection has at least one named vector") - assert_true( - vec_name_expect in present_names, - f"Expected vector name present: {vec_name_expect} in {present_names}", - ) - got_dim = cfg[vec_name_expect].size - else: - present_names = [""] - got_dim = cfg.size - assert_true( - got_dim == dim, f"Vector dimension matches embedding ({got_dim} == {dim})" - ) - - # 2) HNSW tuned params (best effort; allow >= thresholds) - hcfg = info.config.hnsw_config + # Get all collections and check each one try: - m = getattr(hcfg, "m", None) - efc = getattr(hcfg, "ef_construct", None) - assert_true(m is None or m >= 16, f"HNSW m>=16 (got {m})") - assert_true(efc is None or efc >= 256, f"HNSW ef_construct>=256 (got {efc})") - except Exception: - print("[WARN] Could not read HNSW config; continuing") - - # 3) Payload indexes created (language, path_prefix, repo, kind, symbol) - # Not all clients expose schema listing; we validate by running filtered queries - probe_text = "split code into overlapping line chunks" - probe_vec = 
next(model.embed([probe_text])).tolist() - - # Unfiltered query - qp = client.query_points( - collection_name=collection, - query=probe_vec, - using=vec_name_expect, - limit=3, - with_payload=True, - search_params=models.SearchParams(hnsw_ef=128), - ) - res_points = getattr(qp, "points", qp) - assert_true(isinstance(res_points, list), "query_points returns a list of points") - - # Filtered by language + kind (should not error; may return 0 results if dataset sparse) - flt = models.Filter( - must=[ - models.FieldCondition( - key="metadata.language", match=models.MatchValue(value="python") - ), - models.FieldCondition( - key="metadata.kind", match=models.MatchValue(value="function") - ), - ] - ) - qp2 = client.query_points( - collection_name=collection, - query=probe_vec, - using=vec_name_expect, - query_filter=flt, - limit=3, - with_payload=True, - ) - res2 = getattr(qp2, "points", qp2) or [] - # If results exist, ensure payload has kind/symbol keys - if res2: - md: Dict[str, Any] = (res2[0].payload or {}).get("metadata") or {} + collections_response = client.get_collections() + collections = [c.name for c in collections_response.collections] + print(f"Found collections: {collections}") + except Exception as e: + print(f"Error getting collections: {e}") + sys.exit(1) + + if not collections: + print("No collections found - nothing to health check") + return + + # Check each collection + for collection_name in collections: + print(f"Checking collection: {collection_name}") + + # 1) Collection exists and has expected named vector/dimension + info = client.get_collection(collection_name) + cfg = info.config.params.vectors + if isinstance(cfg, dict): + present_names = list(cfg.keys()) + assert_true(len(present_names) >= 1, "Collection has at least one named vector") + assert_true( + vec_name_expect in present_names, + f"Expected vector name present: {vec_name_expect} in {present_names}", + ) + got_dim = cfg[vec_name_expect].size + else: + present_names = [""] + got_dim = 
cfg.size assert_true( - "kind" in md and "symbol" in md, - "payload includes metadata.kind and metadata.symbol", + got_dim == dim, f"Vector dimension matches embedding ({got_dim} == {dim})" ) - else: - print("[OK] Filtered query ran (no results is acceptable depending on data)") - print("All checks passed.") + # 2) HNSW tuned params (best effort; allow >= thresholds) + hcfg = info.config.hnsw_config + try: + m = getattr(hcfg, "m", None) + efc = getattr(hcfg, "ef_construct", None) + assert_true(m is None or m >= 16, f"HNSW m>=16 (got {m})") + assert_true(efc is None or efc >= 256, f"HNSW ef_construct>=256 (got {efc})") + except Exception: + print("[WARN] Could not read HNSW config; continuing") + + # 3) Test queries on this collection + probe_text = "split code into overlapping line chunks" + probe_vec = next(model.embed([probe_text])).tolist() + + # Unfiltered query + qp = client.query_points( + collection_name=collection_name, + query=probe_vec, + using=vec_name_expect, + limit=3, + with_payload=True, + search_params=models.SearchParams(hnsw_ef=128), + ) + res_points = getattr(qp, "points", qp) + assert_true(isinstance(res_points, list), "query_points returns a list of points") + + # Filtered by language + kind (should not error; may return 0 results if dataset sparse) + flt = models.Filter( + must=[ + models.FieldCondition( + key="metadata.language", match=models.MatchValue(value="python") + ), + models.FieldCondition( + key="metadata.kind", match=models.MatchValue(value="function") + ), + ] + ) + qp2 = client.query_points( + collection_name=collection_name, + query=probe_vec, + using=vec_name_expect, + query_filter=flt, + limit=3, + with_payload=True, + ) + res2 = getattr(qp2, "points", qp2) or [] + # If results exist, ensure payload has kind/symbol keys + if res2: + md: Dict[str, Any] = (res2[0].payload or {}).get("metadata") or {} + assert_true( + "kind" in md and "symbol" in md, + "payload includes metadata.kind and metadata.symbol", + ) + else: + print("[OK] 
Filtered query ran (no results is acceptable depending on data)") + + print(f"[OK] Collection {collection_name} health check passed") + + print(f"[OK] All {len(collections)} collections passed health check") if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/scripts/hybrid_search.py b/scripts/hybrid_search.py index eaa32014..bd4657bf 100644 --- a/scripts/hybrid_search.py +++ b/scripts/hybrid_search.py @@ -9,7 +9,10 @@ import json -def _collection() -> str: +def _collection(collection_name: str | None = None) -> str: + """Get collection name with priority: CLI arg > ENV > default""" + if collection_name and collection_name.strip(): + return collection_name.strip() return os.environ.get("COLLECTION_NAME", "my-collection") @@ -693,14 +696,15 @@ def _sanitize_filter_obj(flt): return None -def lex_query(client: QdrantClient, v: List[float], flt, per_query: int) -> List[Any]: +def lex_query(client: QdrantClient, v: List[float], flt, per_query: int, collection_name: str | None = None) -> List[Any]: ef = max(EF_SEARCH, 32 + 4 * int(per_query)) flt = _sanitize_filter_obj(flt) + collection = _collection(collection_name) # Prefer modern API; handle kwarg rename between client versions (query_filter -> filter) try: qp = client.query_points( - collection_name=_collection(), + collection_name=collection, query=v, using=LEX_VECTOR_NAME, query_filter=flt, @@ -712,7 +716,7 @@ def lex_query(client: QdrantClient, v: List[float], flt, per_query: int) -> List except TypeError: # Older/newer client may expect 'filter' kw qp = client.query_points( - collection_name=_collection(), + collection_name=collection, query=v, using=LEX_VECTOR_NAME, filter=flt, @@ -724,7 +728,7 @@ def lex_query(client: QdrantClient, v: List[float], flt, per_query: int) -> List except AttributeError: # Very old client without query_points: last-resort deprecated path return client.search( - collection_name=_collection(), + collection_name=collection, query_vector={"name": 
LEX_VECTOR_NAME, "vector": v}, limit=per_query, with_payload=True, @@ -733,14 +737,15 @@ def lex_query(client: QdrantClient, v: List[float], flt, per_query: int) -> List def dense_query( - client: QdrantClient, vec_name: str, v: List[float], flt, per_query: int + client: QdrantClient, vec_name: str, v: List[float], flt, per_query: int, collection_name: str | None = None ) -> List[Any]: ef = max(EF_SEARCH, 32 + 4 * int(per_query)) flt = _sanitize_filter_obj(flt) + collection = _collection(collection_name) try: qp = client.query_points( - collection_name=_collection(), + collection_name=collection, query=v, using=vec_name, query_filter=flt, @@ -751,7 +756,7 @@ def dense_query( return getattr(qp, "points", qp) except TypeError: qp = client.query_points( - collection_name=_collection(), + collection_name=collection, query=v, using=vec_name, filter=flt, @@ -766,7 +771,7 @@ def dense_query( if "expected some form of condition" in _msg or "format error in json body" in _msg: try: qp = client.query_points( - collection_name=_collection(), + collection_name=collection, query=v, using=vec_name, query_filter=None, @@ -780,7 +785,7 @@ def dense_query( # Fallback to legacy search API try: return client.search( - collection_name=_collection(), + collection_name=collection, query_vector={"name": vec_name, "vector": v}, limit=per_query, with_payload=True, @@ -810,6 +815,7 @@ def run_hybrid_search( not_glob: str | list[str] | None = None, expand: bool = True, model: TextEmbedding | None = None, + collection: str | None = None, ) -> List[Dict[str, Any]]: client = QdrantClient(url=os.environ.get("QDRANT_URL", QDRANT_URL), api_key=API_KEY) model_name = os.environ.get("EMBEDDING_MODEL", MODEL_NAME) @@ -943,7 +949,7 @@ def _norm_under(u: str | None) -> str | None: score_map: Dict[str, Dict[str, Any]] = {} try: lex_vec = lex_hash_vector(qlist) - lex_results = lex_query(client, lex_vec, flt, max(24, limit)) + lex_results = lex_query(client, lex_vec, flt, max(24, limit), collection) except 
Exception: lex_results = [] for rank, p in enumerate(lex_results, 1): @@ -974,7 +980,7 @@ def _norm_under(u: str | None) -> str | None: try: if embedded: dim = len(embedded[0]) - _ensure_collection(client, _collection(), dim, vec_name) + _ensure_collection(client, _collection(collection), dim, vec_name) except Exception: pass # Optional gate-first using mini vectors to restrict dense search to candidates @@ -1003,7 +1009,7 @@ def _norm_under(u: str | None) -> str | None: # Get top candidates using MINI vectors (fast prefilter) candidate_ids = set() for mv in mini_queries: - mini_results = dense_query(client, MINI_VECTOR_NAME, mv, flt, cand_n) + mini_results = dense_query(client, MINI_VECTOR_NAME, mv, flt, cand_n, collection) for result in mini_results: if hasattr(result, 'id'): candidate_ids.add(result.id) @@ -1057,7 +1063,7 @@ def _norm_under(u: str | None) -> str | None: flt_gated = _sanitize_filter_obj(flt_gated) result_sets: List[List[Any]] = [ - dense_query(client, vec_name, v, flt_gated, max(24, limit)) for v in embedded + dense_query(client, vec_name, v, flt_gated, max(24, limit), collection) for v in embedded ] if os.environ.get("DEBUG_HYBRID_SEARCH"): total_dense_results = sum(len(rs) for rs in result_sets) @@ -1074,7 +1080,7 @@ def _norm_under(u: str | None) -> str | None: try: mini_queries = [_project_mini(list(v), MINI_VEC_DIM) for v in embedded] mini_sets: List[List[Any]] = [ - dense_query(client, MINI_VECTOR_NAME, mv, flt, max(24, limit)) + dense_query(client, MINI_VECTOR_NAME, mv, flt, max(24, limit), collection) for mv in mini_queries ] for res in mini_sets: @@ -1146,7 +1152,7 @@ def _norm_under(u: str | None) -> str | None: try: lex_vec2 = lex_hash_vector(prf_qs) lex_results2 = lex_query( - client, lex_vec2, flt, max(12, limit // 2 or 6) + client, lex_vec2, flt, max(12, limit // 2 or 6), collection ) except Exception: lex_results2 = [] @@ -1175,7 +1181,7 @@ def _norm_under(u: str | None) -> str | None: try: embedded2 = _embed_queries_cached(_model, 
prf_qs) result_sets2: List[List[Any]] = [ - dense_query(client, vec_name, v, flt, max(12, limit // 2 or 6)) + dense_query(client, vec_name, v, flt, max(12, limit // 2 or 6), collection) for v in embedded2 ] for res2 in result_sets2: @@ -1688,6 +1694,8 @@ def main(): # Structured filters to mirror MCP tool fields ap.add_argument("--ext", type=str, default=None) ap.add_argument("--not", dest="not_filter", type=str, default=None) + ap.add_argument("--collection", type=str, default=None, + help="Target collection name (overrides COLLECTION_NAME env var)") ap.add_argument( "--case", type=str, @@ -1700,6 +1708,9 @@ def main(): args = ap.parse_args() + # Resolve effective collection early to avoid variable usage errors + eff_collection = args.collection or os.environ.get("COLLECTION_NAME", "my-collection") + model = TextEmbedding(model_name=MODEL_NAME) vec_name = _sanitize_vector_name(MODEL_NAME) client = QdrantClient(url=QDRANT_URL, api_key=API_KEY or None) @@ -1708,7 +1719,7 @@ def main(): try: first_vec = next(model.embed(["__dim__warmup__"])) dim = len(first_vec.tolist()) - _ensure_collection(client, _collection(), dim, vec_name) + _ensure_collection(client, _collection(eff_collection), dim, vec_name) except Exception: pass @@ -1814,7 +1825,7 @@ def _norm_under(u: str | None) -> str | None: # Server-side lexical vector search (hashing) as an additional ranked list try: lex_vec = lex_hash_vector(queries) - lex_results = lex_query(client, lex_vec, flt, args.per_query) + lex_results = lex_query(client, lex_vec, flt, args.per_query, eff_collection) except Exception: lex_results = [] @@ -1846,7 +1857,7 @@ def _norm_under(u: str | None) -> str | None: embedded = _embed_queries_cached(model, queries) result_sets: List[List[Any]] = [ - dense_query(client, vec_name, v, flt, args.per_query) for v in embedded + dense_query(client, vec_name, v, flt, args.per_query, eff_collection) for v in embedded ] # RRF fusion (weighted) diff --git a/scripts/ingest_code.py 
b/scripts/ingest_code.py index 60ef1929..1b47951f 100644 --- a/scripts/ingest_code.py +++ b/scripts/ingest_code.py @@ -1,38 +1,15 @@ from __future__ import annotations -# Helper: detect repository name automatically (no REPO_NAME env needed) +# Import repository detection from workspace_state to avoid duplication def _detect_repo_name_from_path(path: Path) -> str: + """Wrapper function to use workspace_state repository detection.""" try: - import subprocess, os as _os - - base = path if path.is_dir() else path.parent - r = subprocess.run( - ["git", "-C", str(base), "rev-parse", "--show-toplevel"], - capture_output=True, - text=True, - ) - top = r.stdout.strip() - if r.returncode == 0 and top: - return Path(top).name or "workspace" - except Exception: - pass - # Fallback: walk up to find a .git folder - try: - cur = path if path.is_dir() else path.parent - for p in [cur] + list(cur.parents): - try: - if (p / ".git").exists(): - return p.name or "workspace" - except Exception: - continue - except Exception: - pass - # Last resort: directory name - try: - return (path if path.is_dir() else path.parent).name or "workspace" - except Exception: - return "workspace" + from scripts.workspace_state import _extract_repo_name_from_path as _ws_detect + return _ws_detect(str(path)) + except ImportError: + # Fallback for when workspace_state is not available + return path.name if path.is_dir() else path.parent.name #!/usr/bin/env python3 @@ -56,27 +33,43 @@ def _detect_repo_name_from_path(path: Path) -> str: from fastembed import TextEmbedding - from datetime import datetime + +# Import critical multi-repo functions first try: from scripts.workspace_state import ( - update_indexing_status, - update_last_activity, - update_workspace_state, + is_multi_repo_mode, get_collection_name, + ) +except ImportError: + is_multi_repo_mode = None # type: ignore + get_collection_name = None # type: ignore + +# Import watcher's repo detection for surgical fix +try: + from scripts.watch_index 
import _detect_repo_for_file, _get_collection_for_file +except ImportError: + _detect_repo_for_file = None # type: ignore + _get_collection_for_file = None # type: ignore + +# Import other workspace state functions (optional) +try: + from scripts.workspace_state import ( + log_activity, get_cached_file_hash, set_cached_file_hash, remove_cached_file, + update_indexing_status, + update_workspace_state, ) -except Exception: +except ImportError: # State integration is optional; continue if not available - update_indexing_status = None # type: ignore - update_last_activity = None # type: ignore - update_workspace_state = None # type: ignore - get_collection_name = None # type: ignore + log_activity = None # type: ignore get_cached_file_hash = None # type: ignore set_cached_file_hash = None # type: ignore remove_cached_file = None # type: ignore + update_indexing_status = None # type: ignore + update_workspace_state = None # type: ignore # Optional Tree-sitter import (graceful fallback) try: @@ -461,7 +454,6 @@ def chunk_semantic( n = len(lines) - # Extract symbols with line ranges symbols = _extract_symbols(language, text) if not symbols: @@ -522,7 +514,6 @@ def chunk_by_tokens( Tokenizer = None # type: ignore - try: k = int(os.environ.get("MICRO_CHUNK_TOKENS", str(k_tokens or 16)) or 16) except Exception: @@ -686,23 +677,21 @@ def ensure_collection(client: QdrantClient, name: str, dim: int, vector_name: st """ try: info = client.get_collection(name) - # Ensure HNSW tuned params even if the collection already existed - try: - client.update_collection( - collection_name=name, - hnsw_config=models.HnswConfigDiff(m=16, ef_construct=256), - ) - except Exception: - pass - # Schema repair: add missing named vectors on existing collections + # Prevent I/O storm - only update vectors if they actually don't exist try: cfg = getattr(info.config.params, "vectors", None) if isinstance(cfg, dict): + # Check if collection already has required vectors before updating + has_lex = 
LEX_VECTOR_NAME in cfg + has_mini = MINI_VECTOR_NAME in cfg + + # Only add to missing if vector doesn't already exist missing = {} - if LEX_VECTOR_NAME not in cfg: + if not has_lex: missing[LEX_VECTOR_NAME] = models.VectorParams( size=LEX_VECTOR_DIM, distance=models.Distance.COSINE ) + try: refrag_on = os.environ.get("REFRAG_MODE", "").strip().lower() in { "1", @@ -712,13 +701,17 @@ def ensure_collection(client: QdrantClient, name: str, dim: int, vector_name: st } except Exception: refrag_on = False - if refrag_on and MINI_VECTOR_NAME not in cfg: + + if refrag_on and not has_mini: missing[MINI_VECTOR_NAME] = models.VectorParams( size=int( os.environ.get("MINI_VEC_DIM", MINI_VEC_DIM) or MINI_VEC_DIM ), distance=models.Distance.COSINE, ) + + # Only update collection if vectors are actually missing + # Previous behavior: always called update_collection() causing I/O storms if missing: try: client.update_collection( @@ -727,10 +720,13 @@ def ensure_collection(client: QdrantClient, name: str, dim: int, vector_name: st except Exception: # Best-effort; if server doesn't support adding vectors, leave to recreate path pass - except Exception: + except Exception as e: + print(f"[COLLECTION_ERROR] Failed to update collection {name}: {e}") pass return - except Exception: + except Exception as e: + # Collection doesn't exist - proceed to create it + print(f"[COLLECTION_INFO] Creating new collection {name}: {type(e).__name__}") pass vectors_cfg = { vector_name: models.VectorParams(size=dim, distance=models.Distance.COSINE), @@ -1197,7 +1193,6 @@ def _extract_symbols_java(text: str) -> List[_Sym]: return syms - def _extract_symbols_csharp(text: str) -> List[_Sym]: lines = text.splitlines() syms: List[_Sym] = [] @@ -1261,7 +1256,6 @@ def _extract_symbols_php(text: str) -> List[_Sym]: return syms - def _extract_symbols_shell(text: str) -> List[_Sym]: lines = text.splitlines() syms: List[_Sym] = [] @@ -1665,8 +1659,8 @@ def index_single_file( ws_path = os.environ.get("WATCH_ROOT") 
or os.environ.get("WORKSPACE_PATH") or "/work" try: if get_cached_file_hash: - prev_local = get_cached_file_hash(ws_path, str(file_path)) - if prev_local and prev_local == file_hash: + prev_local = get_cached_file_hash(str(file_path), repo_tag) + if prev_local and file_hash and prev_local == file_hash: print(f"Skipping unchanged file (cache): {file_path}") return False except Exception: @@ -1837,13 +1831,13 @@ def make_point(pid, dense_vec, lex_vec, payload): try: ws = os.environ.get("WATCH_ROOT") or os.environ.get("WORKSPACE_PATH") or "/work" if set_cached_file_hash: - set_cached_file_hash(ws, str(file_path), file_hash) + file_repo_tag = _detect_repo_name_from_path(file_path) + set_cached_file_hash(str(file_path), file_hash, file_repo_tag) except Exception: pass return True return False - def index_repo( root: Path, qdrant_url: str, @@ -1896,36 +1890,70 @@ def index_repo( # Workspace state: ensure unique per-workspace collection and announce start try: ws_path = str(root) - # If collection is unset or default placeholder, generate a per-workspace one - if 'get_collection_name' in globals() and get_collection_name: - default_marker = os.environ.get("COLLECTION_NAME", "my-collection") - if (not collection) or (collection == "my-collection") or (default_marker == "my-collection"): - collection = get_collection_name(ws_path) + repo_tag = _detect_repo_name_from_path(root) if _detect_repo_name_from_path else None + + # KNOWN LIMITATION: In multi-repo mode, when called with /work as root (e.g., --root /work), + # this indexer treats the entire /work directory as one repository instead of detecting + # individual subdirectories. The watcher service handles individual repos correctly. + # TODO: Add a shared helper function in workspace_state.py for repository detection + # that both watcher and indexer can use to avoid duplication. Then either skip + # /work indexing in multi-repo mode or detect subdirectories properly. 
+ # This creates collections like "work-342f2f" instead of "test-repo-xxxxx". + + # Use per-repo collections for /work root in multi-repo mode + is_multi_repo = is_multi_repo_mode() if is_multi_repo_mode else False + is_work_root = Path(ws_path).resolve() == Path("/work").resolve() + use_per_repo_collections = is_multi_repo_mode() and _get_collection_for_file + + if use_per_repo_collections: + collection = None # Will be determined per-file during indexing + print("[multi_repo] Using per-repo collections for /work root (surgical fix)") + else: + # Original logic: single collection for entire root + # If collection is unset or default placeholder, generate a per-workspace one + if 'get_collection_name' in globals() and get_collection_name: + placeholders = {"", "default-collection"} + if (not collection) or (collection in placeholders): + collection = get_collection_name(ws_path) if update_workspace_state: - update_workspace_state(ws_path, {"qdrant_collection": collection}) + # For multi-repo mode, don't set a single collection since we use per-repo collections + if not use_per_repo_collections: + update_workspace_state( + workspace_path=ws_path, + updates={"qdrant_collection": collection}, + repo_name=repo_tag, + ) if update_indexing_status: update_indexing_status( - ws_path, - { + workspace_path=ws_path, + status={ "state": "indexing", "started_at": datetime.now().isoformat(), "progress": {"files_processed": 0, "total_files": None}, }, + repo_name=repo_tag, ) - except Exception: - pass + except Exception as e: + # Log state update errors instead of silent failure + import traceback + print(f"[ERROR] Failed to update workspace state during indexing: {e}") + print(f"[ERROR] Traceback: {traceback.format_exc()}") print( f"Indexing root={root} -> {qdrant_url} collection={collection} model={model_name} recreate={recreate}" ) - if recreate: - recreate_collection(client, collection, dim, vector_name) - else: - ensure_collection(client, collection, dim, vector_name) - # 
Ensure useful payload indexes exist (idempotent) - ensure_payload_indexes(client, collection) + # Skip single collection setup in multi-repo mode + if not use_per_repo_collections: + if recreate: + recreate_collection(client, collection, dim, vector_name) + else: + ensure_collection(client, collection, dim, vector_name) + # Ensure useful payload indexes exist (idempotent) + ensure_payload_indexes(client, collection) + else: + print("[multi_repo] Skipping single collection setup - will create per-repo collections during indexing") # Repo tag for filtering: auto-detect from git or folder name repo_tag = _detect_repo_name_from_path(root) @@ -1978,6 +2006,18 @@ def make_point(pid, dense_vec, lex_vec, payload): for file_path in iter_files(root): files_seen += 1 + + # Determine collection per-file in multi-repo mode (use watcher's exact logic) + current_collection = collection + if use_per_repo_collections: + if _get_collection_for_file: + current_collection = _get_collection_for_file(file_path) + # Ensure collection exists on first use + ensure_collection(client, current_collection, dim, vector_name) + ensure_payload_indexes(client, current_collection) + else: + current_collection = get_collection_name(ws_path) if get_collection_name else "default-collection" + try: text = file_path.read_text(encoding="utf-8", errors="ignore") except Exception as e: @@ -1991,15 +2031,16 @@ def make_point(pid, dense_vec, lex_vec, payload): # Prefer local workspace cache to avoid Qdrant lookups try: if get_cached_file_hash: - prev_local = get_cached_file_hash(ws_path, str(file_path)) - if prev_local and prev_local == file_hash: + prev_local = get_cached_file_hash(str(file_path), repo_tag) + if prev_local and file_hash and prev_local == file_hash: if PROGRESS_EVERY <= 0 and files_seen % 50 == 0: print(f"... 
processed {files_seen} files (skipping unchanged, cache)") try: if update_indexing_status: + per_file_repo = _detect_repo_name_from_path(file_path) if _detect_repo_name_from_path else repo_tag update_indexing_status( - ws_path, - { + workspace_path=str(file_path.parent), + status={ "state": "indexing", "progress": { "files_processed": files_seen, @@ -2007,6 +2048,7 @@ def make_point(pid, dense_vec, lex_vec, payload): "current_file": str(file_path), }, }, + repo_name=per_file_repo, ) except Exception: pass @@ -2015,16 +2057,24 @@ def make_point(pid, dense_vec, lex_vec, payload): continue except Exception: pass - prev = get_indexed_file_hash(client, collection, str(file_path)) - if prev and prev == file_hash: + prev = get_indexed_file_hash(client, current_collection, str(file_path)) + if prev and file_hash and prev == file_hash: + # File exists in Qdrant with same hash - cache it locally for next time + try: + if set_cached_file_hash: + file_repo_tag = _detect_repo_name_from_path(file_path) + set_cached_file_hash(str(file_path), file_hash, file_repo_tag) + except Exception: + pass if PROGRESS_EVERY <= 0 and files_seen % 50 == 0: # minor heartbeat when no progress cadence configured print(f"... 
processed {files_seen} files (skipping unchanged)") try: if update_indexing_status: + per_file_repo = _detect_repo_name_from_path(file_path) if _detect_repo_name_from_path else repo_tag update_indexing_status( - ws_path, - { + workspace_path=str(file_path.parent), + status={ "state": "indexing", "progress": { "files_processed": files_seen, @@ -2032,6 +2082,7 @@ def make_point(pid, dense_vec, lex_vec, payload): "current_file": str(file_path), }, }, + repo_name=per_file_repo, ) except Exception: pass @@ -2041,7 +2092,7 @@ def make_point(pid, dense_vec, lex_vec, payload): # Dedupe per-file by deleting previous points for this path (default) if dedupe: - delete_points_by_path(client, collection, str(file_path)) + delete_points_by_path(client, current_collection, str(file_path)) files_indexed += 1 symbols = _extract_symbols(language, text) @@ -2173,14 +2224,15 @@ def make_point(pid, dense_vec, lex_vec, payload): make_point(i, v, lx, m) for i, v, lx, m in zip(batch_ids, vectors, batch_lex, batch_meta) ] - upsert_points(client, collection, points) + upsert_points(client, current_collection, points) # Update local file-hash cache for any files that had chunks in this flush try: if set_cached_file_hash: for _p, _h in list(batch_file_hashes.items()): try: if _p and _h: - set_cached_file_hash(ws_path, _p, _h) + file_repo_tag = _detect_repo_name_from_path(Path(_p)) + set_cached_file_hash(_p, _h, file_repo_tag) except Exception: continue except Exception: @@ -2194,19 +2246,24 @@ def make_point(pid, dense_vec, lex_vec, payload): ) try: if update_indexing_status: + per_file_repo = _detect_repo_name_from_path(file_path) if _detect_repo_name_from_path else repo_tag update_indexing_status( - ws_path, - { + workspace_path=str(file_path.parent), + status={ "state": "indexing", "progress": { - "files_processed": files_seen, + "files_processed": files_indexed, "total_files": None, "current_file": str(file_path), }, }, + repo_name=per_file_repo, ) - except Exception: - pass + except 
Exception as e: + # Log progress update errors instead of silent failure + import traceback + print(f"[ERROR] Failed to update indexing progress: {e}") + print(f"[ERROR] Traceback: {traceback.format_exc()}") if batch_texts: vectors = embed_batch(model, batch_texts) @@ -2220,14 +2277,15 @@ def make_point(pid, dense_vec, lex_vec, payload): make_point(i, v, lx, m) for i, v, lx, m in zip(batch_ids, vectors, batch_lex, batch_meta) ] - upsert_points(client, collection, points) + upsert_points(client, current_collection, points) # Update local file-hash cache for any files that had chunks during this run (final flush) try: if set_cached_file_hash: for _p, _h in list(batch_file_hashes.items()): try: if _p and _h: - set_cached_file_hash(ws_path, _p, _h) + file_repo_tag = _detect_repo_name_from_path(Path(_p)) + set_cached_file_hash(_p, _h, file_repo_tag) except Exception: continue except Exception: @@ -2239,30 +2297,38 @@ def make_point(pid, dense_vec, lex_vec, payload): # Workspace state: mark completion try: - if update_last_activity: - update_last_activity( - ws_path, - { - "timestamp": datetime.now().isoformat(), - "action": "scan-completed", - "file_path": "", - "details": { - "files_seen": files_seen, - "files_indexed": files_indexed, - "chunks_indexed": points_indexed, - }, + if log_activity: + # Extract repo name from workspace path for log_activity + repo_name = None + if use_per_repo_collections: + # In multi-repo mode, we need to determine which repo this activity belongs to + # For scan completion, we use the workspace path as the repo identifier + repo_name = _detect_repo_name_from_path(Path(ws_path)) + + log_activity( + repo_name=repo_name, + action="scan-completed", + file_path="", + details={ + "files_seen": files_seen, + "files_indexed": files_indexed, + "chunks_indexed": points_indexed, }, ) if update_indexing_status: update_indexing_status( - ws_path, - { + workspace_path=ws_path, + status={ "state": "idle", "progress": {"files_processed": files_indexed, 
"total_files": None}, }, + repo_name=repo_tag, ) - except Exception: - pass + except Exception as e: + # Log the error instead of silently swallowing it + import traceback + print(f"[ERROR] Failed to update workspace state after indexing completion: {e}") + print(f"[ERROR] Traceback: {traceback.format_exc()}") def main(): @@ -2354,9 +2420,18 @@ def main(): qdrant_url = os.environ.get("QDRANT_URL", "http://localhost:6333") api_key = os.environ.get("QDRANT_API_KEY") - collection = os.environ.get("COLLECTION_NAME", "my-collection") model_name = os.environ.get("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5") + # Resolve collection name based on multi-repo mode + if is_multi_repo_mode and is_multi_repo_mode(): + # Multi-repo mode: pass collection=None to trigger per-repo collection resolution + collection = None + print("[multi_repo] Multi-repo mode enabled - will create separate collections per repository") + else: + # Single-repo mode: use environment variable + collection = os.environ.get("COLLECTION_NAME", "my-collection") + print(f"[single_repo] Single-repo mode enabled - using collection: {collection}") + index_repo( Path(args.root).resolve(), qdrant_url, diff --git a/scripts/mcp_indexer_server.py b/scripts/mcp_indexer_server.py index 9180bf2a..e70c56ea 100644 --- a/scripts/mcp_indexer_server.py +++ b/scripts/mcp_indexer_server.py @@ -42,6 +42,14 @@ # Cache for memory collection autodetection (name + timestamp) _MEM_COLL_CACHE = {"name": None, "ts": 0.0} +# Session defaults map (token -> defaults). Guarded for concurrency. 
+_SESSION_LOCK = threading.Lock() +SESSION_DEFAULTS: Dict[str, Dict[str, Any]] = {} +# Per-connection defaults keyed by ctx.session (no token required) +from weakref import WeakKeyDictionary +_SESSION_CTX_LOCK = threading.Lock() +SESSION_DEFAULTS_BY_SESSION: "WeakKeyDictionary[Any, Dict[str, Any]]" = WeakKeyDictionary() + _roots = [p.strip() for p in _roots_env.split(",") if p.strip()] or ["/work", "/app"] try: @@ -70,8 +78,9 @@ def _highlight_snippet(snippet, tokens): # type: ignore try: # Official MCP Python SDK (FastMCP convenience server) - from mcp.server.fastmcp import FastMCP + from mcp.server.fastmcp import FastMCP, Context # type: ignore except Exception as e: # pragma: no cover + # Keep FastMCP import error loud; Context is for type hints only raise SystemExit("mcp package is required inside the container: pip install mcp") APP_NAME = os.environ.get("FASTMCP_SERVER_NAME", "qdrant-indexer-mcp") @@ -79,7 +88,21 @@ def _highlight_snippet(snippet, tokens): # type: ignore PORT = int(os.environ.get("FASTMCP_INDEXER_PORT", "8001")) QDRANT_URL = os.environ.get("QDRANT_URL", "http://qdrant:6333") -DEFAULT_COLLECTION = os.environ.get("COLLECTION_NAME", "my-collection") +DEFAULT_COLLECTION = ( + os.environ.get("DEFAULT_COLLECTION") + or os.environ.get("COLLECTION_NAME") + or "my-collection" +) + +# Use auto-generated collection name for workspace only if no env default is set +try: + from scripts.workspace_state import get_collection_name as _ws_get_collection_name # type: ignore + workspace_path = os.environ.get("WATCH_ROOT", "/work") + if not (os.environ.get("DEFAULT_COLLECTION") or os.environ.get("COLLECTION_NAME")): + DEFAULT_COLLECTION = _ws_get_collection_name(workspace_path) +except Exception: + # Fallback to environment variable if workspace_state import fails + pass MAX_LOG_TAIL = int(os.environ.get("MCP_MAX_LOG_TAIL", "4000")) SNIPPET_MAX_BYTES = int(os.environ.get("MCP_SNIPPET_MAX_BYTES", "8192") or 8192) @@ -87,10 +110,33 @@ def 
_highlight_snippet(snippet, tokens): # type: ignore # --- Workspace state integration helpers --- def _state_file_path(ws_path: str = "/work") -> str: + """ + Get the appropriate state file path using the centralized metadata system. + + This function now uses the centralized metadata system from workspace_state.py + instead of creating workspace-specific .codebase directories. + """ try: - return os.path.join(ws_path, ".codebase", "state.json") + from scripts.workspace_state import _extract_repo_name_from_path, _state_file_path as _ws_state_file_path + + # Extract repository name from workspace path for the centralized system + repo_name = _extract_repo_name_from_path(ws_path) + + # Use the centralized metadata system with repo_name to create files at: + # /work/.codebase/repos/{repo_name}/state.json + # This avoids creating internal .codebase directories in workspaces + state_path = _ws_state_file_path(workspace_path=None, repo_name=repo_name) + + return str(state_path) except Exception: - return "/work/.codebase/state.json" + # Fallback to centralized system using workspace path for backward compatibility + try: + from scripts.workspace_state import _state_file_path as _ws_state_file_path + state_path = _ws_state_file_path(workspace_path=ws_path, repo_name=None) + return str(state_path) + except Exception: + # Ultimate fallback + return "/work/.codebase/state.json" def _read_ws_state(ws_path: str = "/work") -> Optional[Dict[str, Any]]: @@ -106,25 +152,41 @@ def _read_ws_state(ws_path: str = "/work") -> Optional[Dict[str, Any]]: def _default_collection() -> str: + # Prefer explicit environment default if provided + env_coll = (os.environ.get("DEFAULT_COLLECTION") or os.environ.get("COLLECTION_NAME") or "").strip() + if env_coll: + return env_coll + # Else, fall back to workspace state if present st = _read_ws_state("/work") if st: coll = st.get("qdrant_collection") if isinstance(coll, str) and coll.strip(): return coll.strip() + # Finally, fall back to module 
default (which may have been initialized from env or workspace) return DEFAULT_COLLECTION - def _work_script(name: str) -> str: - """Return path to a script under /work if present, else local ./scripts. + """Return path to a script under /work if present, else /app, else local ./scripts. Keeps Docker/default behavior but works in local dev without /work mount. """ + # Try bind mount location first (local dev with bind mounts) try: w = os.path.join("/work", "scripts", name) if os.path.exists(w): return w except Exception: pass + + # Try baked-in scripts (production/K8s with scripts copied to /app) + try: + app_path = os.path.join("/app", "scripts", name) + if os.path.exists(app_path): + return app_path + except Exception: + pass + + # Original fallback for local development return os.path.join(os.getcwd(), "scripts", name) @@ -162,7 +224,6 @@ def do_GET(self): pass - def log_message(self, *args, **kwargs): # Quiet health server logs return @@ -492,7 +553,13 @@ async def qdrant_index_root( else: try: from scripts.workspace_state import get_collection_name as _ws_get_collection_name # type: ignore - coll = _ws_get_collection_name("/work") + from scripts.workspace_state import is_multi_repo_mode as _ws_is_multi_repo_mode # type: ignore + # In single-repo mode, use None to get env var "my-collection" + # In multi-repo mode, "/work" should be skipped anyway, but handle it consistently + if _ws_is_multi_repo_mode(): + coll = _ws_get_collection_name("/work") # Multi-repo: generates from "/work" (should be skipped) + else: + coll = _ws_get_collection_name(None) # Single-repo: uses env var "my-collection" except Exception: coll = _default_collection() @@ -527,7 +594,6 @@ async def qdrant_list(**kwargs) -> Dict[str, Any]: return {"error": str(e)} - @mcp.tool() async def workspace_info(workspace_path: Optional[str] = None, **kwargs) -> Dict[str, Any]: """Return the current workspace state from .codebase/state.json, if present. 
@@ -538,8 +604,10 @@ async def workspace_info(workspace_path: Optional[str] = None, **kwargs) -> Dict - state: raw state.json contents (or {}) """ ws_path = (workspace_path or "/work").strip() or "/work" + + st = _read_ws_state(ws_path) or {} - coll = (st.get("qdrant_collection") if isinstance(st, dict) else None) or os.environ.get("COLLECTION_NAME") or DEFAULT_COLLECTION + coll = (st.get("qdrant_collection") if isinstance(st, dict) else None) or os.environ.get("DEFAULT_COLLECTION") or os.environ.get("COLLECTION_NAME") or DEFAULT_COLLECTION return { "workspace_path": ws_path, "default_collection": coll, @@ -569,7 +637,7 @@ async def memory_store( """Store a memory-like entry directly into Qdrant using the default collection. - information: free-form text to remember - metadata: optional tags (e.g., {"kind":"preference","source":"memory"}) - - collection: override target collection (defaults to env COLLECTION_NAME) + - collection: override target collection (env DEFAULT_COLLECTION or COLLECTION_NAME if not provided) """ try: from qdrant_client import QdrantClient, models # type: ignore @@ -577,6 +645,8 @@ async def memory_store( import time, hashlib, re, math from scripts.utils import sanitize_vector_name from scripts.ingest_code import ensure_collection as _ensure_collection # type: ignore + + from scripts.ingest_code import project_mini as _project_mini # type: ignore except Exception as e: # pragma: no cover @@ -838,7 +908,13 @@ async def qdrant_index( else: try: from scripts.workspace_state import get_collection_name as _ws_get_collection_name # type: ignore - coll = _ws_get_collection_name("/work") + from scripts.workspace_state import is_multi_repo_mode as _ws_is_multi_repo_mode # type: ignore + # In single-repo mode, use None to get env var "my-collection" + # In multi-repo mode, "/work" should be skipped anyway, but handle it consistently + if _ws_is_multi_repo_mode(): + coll = _ws_get_collection_name("/work") # Multi-repo: generates from "/work" (should be 
skipped) + else: + coll = _ws_get_collection_name(None) # Single-repo: uses env var "my-collection" except Exception: coll = _default_collection() @@ -859,6 +935,59 @@ async def qdrant_index( return {"args": {"root": root, "collection": coll, "recreate": recreate}, **res} +@mcp.tool() +async def set_session_defaults(collection: Any = None, session: Any = None, ctx: Context = None, **kwargs) -> Dict[str, Any]: + """Set defaults (e.g., collection) for subsequent calls. + + Behavior: + - If request Context is available, persist defaults per-connection so later calls on + the same MCP session automatically use them (no token required). + - Optionally also stores token-scoped defaults for cross-connection reuse. + """ + try: + _extra = _extract_kwargs_payload(kwargs) + if _extra: + if (collection is None or (isinstance(collection, str) and collection.strip() == "")) and _extra.get("collection") is not None: + collection = _extra.get("collection") + if (session is None or (isinstance(session, str) and str(session).strip() == "")) and _extra.get("session") is not None: + session = _extra.get("session") + except Exception: + pass + + defaults: Dict[str, Any] = {} + if isinstance(collection, str) and collection.strip(): + defaults["collection"] = str(collection).strip() + + # Per-connection storage (preferred) + try: + if ctx is not None and getattr(ctx, "session", None) is not None and defaults: + with _SESSION_CTX_LOCK: + existing2 = SESSION_DEFAULTS_BY_SESSION.get(ctx.session) or {} + existing2.update(defaults) + SESSION_DEFAULTS_BY_SESSION[ctx.session] = existing2 + except Exception: + pass + + # Optional token storage + sid = str(session).strip() if session is not None else "" + if not sid: + sid = uuid.uuid4().hex[:12] + try: + if defaults: + with _SESSION_LOCK: + existing = SESSION_DEFAULTS.get(sid) or {} + existing.update(defaults) + SESSION_DEFAULTS[sid] = existing + except Exception: + pass + + return { + "ok": True, + "session": sid, + "defaults": 
SESSION_DEFAULTS.get(sid, {}), + "applied": ("connection" if (ctx is not None and getattr(ctx, "session", None) is not None) else "token"), + } + @mcp.tool() async def qdrant_prune(**kwargs) -> Dict[str, Any]: """Prune stale points for the mounted path (/work). Extra params are ignored.""" @@ -885,6 +1014,11 @@ async def repo_search( highlight_snippet: Any = None, collection: Any = None, workspace_path: Any = None, + + + session: Any = None, + ctx: Context = None, + # Structured filters (optional; mirrors hybrid_search flags) language: Any = None, under: Any = None, @@ -909,7 +1043,7 @@ async def repo_search( - per_path: max results per file (default 2) - include_snippet/context_lines: include snippet near hit lines - rerank_*: optional ONNX reranker; timeouts fall back to hybrid - - collection: override target collection (default env COLLECTION_NAME) + - collection: override target collection (env DEFAULT_COLLECTION or COLLECTION_NAME if not provided) - language/under/kind/symbol/path_regex/path_glob/not_glob/ext/not_/case: optional filters - compact: if true, return only path and line range @@ -973,11 +1107,18 @@ async def repo_search( or (isinstance(collection, str) and collection.strip() == "") ) and _extra.get("collection"): collection = _extra.get("collection") + # Optional session token for session-scoped defaults + if ( + (session is None) or (isinstance(session, str) and str(session).strip() == "") + ) and _extra.get("session") is not None: + session = _extra.get("session") + # Optional workspace_path routing if ( (workspace_path is None) or (isinstance(workspace_path, str) and str(workspace_path).strip() == "") ) and _extra.get("workspace_path") is not None: workspace_path = _extra.get("workspace_path") + if ( language is None or (isinstance(language, str) and language.strip() == "") @@ -1042,6 +1183,10 @@ def _to_bool(x, default): return False return default + # Session token (top-level or parsed from nested kwargs above) + sid = (str(session).strip() if 
session is not None else "") + + def _to_str(x, default=""): if x is None: return default @@ -1068,17 +1213,39 @@ def _to_str(x, default=""): ) highlight_snippet = _to_bool(highlight_snippet, True) - # Resolve collection: explicit > workspace_path state > default - ws_hint = _to_str(workspace_path, "").strip() + # Resolve collection precedence: explicit > per-connection defaults > token defaults > env default coll_hint = _to_str(collection, "").strip() - if not coll_hint and ws_hint: + + # 1) Per-connection defaults via ctx (no token required) + if (not coll_hint) and ctx is not None and getattr(ctx, "session", None) is not None: + try: + with _SESSION_CTX_LOCK: + _d2 = SESSION_DEFAULTS_BY_SESSION.get(ctx.session) or {} + _sc2 = str((_d2.get("collection") or "")).strip() + if _sc2: + coll_hint = _sc2 + except Exception: + pass + + # 2) Legacy token-based defaults + if (not coll_hint) and sid: try: - st = _read_ws_state(ws_hint) - if st and isinstance(st.get("qdrant_collection"), str): - coll_hint = st.get("qdrant_collection").strip() + with _SESSION_LOCK: + _d = SESSION_DEFAULTS.get(sid) or {} + _sc = str((_d.get("collection") or "")).strip() + if _sc: + coll_hint = _sc except Exception: pass - collection = coll_hint or _default_collection() + + # 3) Environment default + env_coll = (os.environ.get("DEFAULT_COLLECTION") or os.environ.get("COLLECTION_NAME") or "").strip() + if (not coll_hint) and env_coll: + coll_hint = env_coll + + # Final fallback + env_fallback = (os.environ.get("DEFAULT_COLLECTION") or os.environ.get("COLLECTION_NAME") or "my-collection").strip() + collection = coll_hint or env_fallback language = _to_str(language, "").strip() under = _to_str(under, "").strip() @@ -1172,6 +1339,7 @@ def _to_str_list(x): expand=str(os.environ.get("HYBRID_EXPAND", "1")).strip().lower() in {"1", "true", "yes", "on"}, model=model, + collection=collection, ) # items are already in structured dict form json_lines = items # reuse downstream shaping @@ -1212,6 +1380,8 
@@ def _to_str_list(x): cmd += ["--not-glob", g] for q in queries: cmd += ["--query", q] + if collection: + cmd += ["--collection", str(collection)] res = await _run_async(cmd, env=env) for line in (res.get("stdout") or "").splitlines(): @@ -1611,6 +1781,8 @@ async def repo_search_compat(**arguments) -> Dict[str, Any]: "rerank_timeout_ms": args.get("rerank_timeout_ms"), "highlight_snippet": args.get("highlight_snippet"), "collection": args.get("collection"), + "session": args.get("session"), + "workspace_path": args.get("workspace_path"), "language": args.get("language"), "under": args.get("under"), "kind": args.get("kind"), @@ -1635,7 +1807,6 @@ async def repo_search_compat(**arguments) -> Dict[str, Any]: return {"error": f"repo_search_compat failed: {e}"} - @mcp.tool() async def search_tests_for( query: Any = None, @@ -1645,6 +1816,8 @@ async def search_tests_for( under: Any = None, language: Any = None, compact: Any = None, + session: Any = None, + ctx: Context = None, **kwargs, ) -> Dict[str, Any]: """Intent-specific wrapper to search for tests related to a query. @@ -1673,6 +1846,8 @@ async def search_tests_for( language=language, path_glob=globs, compact=compact, + session=session, + ctx=ctx, **{k: v for k, v in kwargs.items() if k not in {"path_glob"}} ) @@ -1685,6 +1860,8 @@ async def search_config_for( context_lines: Any = None, under: Any = None, compact: Any = None, + session: Any = None, + ctx: Context = None, **kwargs, ) -> Dict[str, Any]: """Intent-specific wrapper to search likely configuration files for a service/query.""" @@ -1718,6 +1895,8 @@ async def search_config_for( under=under, path_glob=globs, compact=compact, + session=session, + ctx=ctx, **{k: v for k, v in kwargs.items() if k not in {"path_glob"}} ) @@ -1727,6 +1906,8 @@ async def search_callers_for( query: Any = None, limit: Any = None, language: Any = None, + session: Any = None, + ctx: Context = None, **kwargs, ) -> Dict[str, Any]: """Heuristic: find likely callers/usages of a symbol. 
@@ -1737,6 +1918,8 @@ async def search_callers_for( query=query, limit=limit, language=language, + session=session, + ctx=ctx, **kwargs, ) @@ -1746,6 +1929,8 @@ async def search_importers_for( query: Any = None, limit: Any = None, language: Any = None, + session: Any = None, + ctx: Context = None, **kwargs, ) -> Dict[str, Any]: """Intent: find files likely importing/referencing a given module/symbol. @@ -1768,11 +1953,12 @@ async def search_importers_for( limit=limit, language=language, path_glob=globs, + session=session, + ctx=ctx, **{k: v for k, v in kwargs.items() if k not in {"path_glob"}} ) - @mcp.tool() async def change_history_for_path( path: Any, @@ -2214,9 +2400,9 @@ def _poll_ready(): if tool_name: qtext = " ".join([q for q in queries if q]).strip() or queries[0] arg_variants: List[Dict[str, Any]] = [ - {"query": qtext, "limit": mem_limit}, - {"q": qtext, "limit": mem_limit}, - {"text": qtext, "limit": mem_limit}, + {"query": qtext, "limit": mem_limit, "collection": mcoll}, + {"q": qtext, "limit": mem_limit, "collection": mcoll}, + {"text": qtext, "limit": mem_limit, "collection": mcoll}, ] res_obj = None for args in arg_variants: @@ -2591,7 +2777,6 @@ async def expand_query(query: Any = None, max_new: Any = None) -> Dict[str, Any] return {"alternates": [], "error": str(e)} - # Lightweight cleanup to reduce repetition from small models def _cleanup_answer(text: str, max_chars: int | None = None) -> str: try: @@ -2649,7 +2834,6 @@ def _cleanup_answer(text: str, max_chars: int | None = None) -> str: return text - @mcp.tool() async def context_answer( query: Any = None, @@ -2987,7 +3171,6 @@ def _ok_lang(it: Dict[str, Any]) -> bool: context_blocks.append(block) - # Debug: log span details if os.environ.get("DEBUG_CONTEXT_ANSWER"): print(f"DEBUG: spans={len(spans)}, context_blocks={len(context_blocks)}") @@ -2997,7 +3180,6 @@ def _ok_lang(it: Dict[str, Any]) -> bool: print("DEBUG: no context blocks!") - # Optional stop sequences via env (comma-separated) 
stop_env = os.environ.get("DECODER_STOP", "") stops = [s for s in (stop_env.split(",") if stop_env else []) if s] diff --git a/scripts/mcp_memory_server.py b/scripts/mcp_memory_server.py index d452e042..6f8c8aa9 100644 --- a/scripts/mcp_memory_server.py +++ b/scripts/mcp_memory_server.py @@ -2,14 +2,24 @@ from typing import Any, Dict, Optional, List import json import threading +from weakref import WeakKeyDictionary -from mcp.server.fastmcp import FastMCP +# FastMCP server and request Context (ctx) for per-connection state +try: + from mcp.server.fastmcp import FastMCP, Context # type: ignore +except Exception: + # Fallback: keep FastMCP import; treat Context as Any for type hints + from mcp.server.fastmcp import FastMCP # type: ignore + Context = Any # type: ignore + from qdrant_client import QdrantClient, models # Env QDRANT_URL = os.environ.get("QDRANT_URL", "http://qdrant:6333") -DEFAULT_COLLECTION = os.environ.get("COLLECTION_NAME", "my-collection") +DEFAULT_COLLECTION = (os.environ.get("DEFAULT_COLLECTION") + or os.environ.get("COLLECTION_NAME") + or "my-collection") LEX_VECTOR_NAME = os.environ.get("LEX_VECTOR_NAME", "lex") LEX_VECTOR_DIM = int(os.environ.get("LEX_VECTOR_DIM", "4096") or 4096) EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5") @@ -21,8 +31,6 @@ import hashlib - - # Ensure repo roots are importable so 'scripts' resolves inside container import sys as _sys _roots_env = os.environ.get("WORK_ROOTS", "") @@ -43,6 +51,51 @@ VECTOR_NAME = _sanitize_vector_name(EMBEDDING_MODEL) +# I/O-safety knobs for memory server behavior +# These env vars allow tuning startup latency vs. first-call latency, especially important +# on slow storage backends (e.g., Ceph + HDD). See comments below for rationale. 
+MEMORY_ENSURE_ON_START = str(os.environ.get("MEMORY_ENSURE_ON_START", "1")).strip().lower() in {"1", "true", "yes", "on"} +MEMORY_COLD_SKIP_DENSE = str(os.environ.get("MEMORY_COLD_SKIP_DENSE", "0")).strip().lower() in {"1", "true", "yes", "on"} +MEMORY_PROBE_EMBED_DIM = str(os.environ.get("MEMORY_PROBE_EMBED_DIM", "1")).strip().lower() in {"1", "true", "yes", "on"} +try: + MEMORY_VECTOR_DIM = int(os.environ.get("MEMORY_VECTOR_DIM") or os.environ.get("EMBED_DIM") or "768") +except Exception: + MEMORY_VECTOR_DIM = 768 + +# Lazy embedding model cache with double-checked locking. +# RATIONALE: Avoid loading the embedding model (100–500 MB) on module import. +# On slow storage (Ceph + HDD), eager loading can cause 30–60s startup delays. +# Instead, load on first tool call (store/find). Subsequent calls reuse cached instance. +_EMBED_MODEL_CACHE: Dict[str, Any] = {} +_EMBED_MODEL_LOCK = threading.Lock() + +def _get_embedding_model(): + """Lazily load and cache the embedding model to avoid startup I/O.""" + from fastembed import TextEmbedding + m = _EMBED_MODEL_CACHE.get(EMBEDDING_MODEL) + if m is None: + with _EMBED_MODEL_LOCK: + m = _EMBED_MODEL_CACHE.get(EMBEDDING_MODEL) + if m is None: + m = TextEmbedding(model_name=EMBEDDING_MODEL) + _EMBED_MODEL_CACHE[EMBEDDING_MODEL] = m + return m + +# Track ensured collections to reduce redundant ensure calls. +# RATIONALE: Avoid repeated Qdrant network calls for the same collection. 
+_ENSURED = set() + +def _ensure_once(name: str) -> bool: + """Ensure collection exists, but only once per process (cached result).""" + if name in _ENSURED: + return True + try: + _ensure_collection(name) + _ENSURED.add(name) + return True + except Exception: + return False + mcp = FastMCP(name="memory-server") HOST = os.environ.get("FASTMCP_HOST", "0.0.0.0") PORT = int(os.environ.get("FASTMCP_PORT", "8000") or 8000) @@ -53,6 +106,13 @@ except Exception: HEALTH_PORT = 18000 +# In-memory session defaults (legacy token-based) +_SESSION_LOCK = threading.Lock() +SESSION_DEFAULTS: Dict[str, Dict[str, Any]] = {} +# In-memory per-connection defaults keyed by ctx.session (no token required) +_SESSION_CTX_LOCK = threading.Lock() +SESSION_DEFAULTS_BY_SESSION: "WeakKeyDictionary[Any, Dict[str, Any]]" = WeakKeyDictionary() + def _start_readyz_server(): try: @@ -94,35 +154,129 @@ def log_message(self, *args, **kwargs): def _ensure_collection(name: str): + """Create collection if missing. + + Default behavior mirrors the original implementation for PR compatibility: + - Probe the embedding model to detect the dense vector dimension (MEMORY_PROBE_EMBED_DIM=1) + - Eager ensure on startup (MEMORY_ENSURE_ON_START=1) + + For slow storage backends (e.g., Ceph + HDD), set the following in your env: + - MEMORY_PROBE_EMBED_DIM=0 -> skip model probing; use MEMORY_VECTOR_DIM/EMBED_DIM + - MEMORY_ENSURE_ON_START=0 -> ensure lazily on first tool call + """ try: - info = client.get_collection(name) + client.get_collection(name) return True except Exception: pass - # Derive dense vector dimension from embedding model to avoid mismatch - # Derive dense vector dimension from embedding model to avoid mismatch - try: - _model_probe = TextEmbedding(model_name=EMBEDDING_MODEL) - _dense_vec = next(_model_probe.embed(["probe"])) - _dense_dim = len(getattr(_dense_vec, "tolist", lambda: _dense_vec)()) if hasattr(_dense_vec, "tolist") else len(_dense_vec) - except Exception: + + # Choose dense dimension 
based on config: probe (default) vs env-configured + if MEMORY_PROBE_EMBED_DIM: try: - _dense_dim = int(os.environ.get("EMBED_DIM", "768") or 768) + from fastembed import TextEmbedding + _model_probe = TextEmbedding(model_name=EMBEDDING_MODEL) + _dense_vec = next(_model_probe.embed(["probe"])) + if hasattr(_dense_vec, "tolist"): + dense_dim = len(_dense_vec.tolist()) + else: + try: + dense_dim = len(_dense_vec) + except Exception: + dense_dim = int(os.environ.get("MEMORY_VECTOR_DIM") or os.environ.get("EMBED_DIM") or "768") except Exception: - _dense_dim = 768 + # Fallback to env-configured dimension if probing fails + try: + dense_dim = int(os.environ.get("MEMORY_VECTOR_DIM") or os.environ.get("EMBED_DIM") or "768") + except Exception: + dense_dim = 768 + else: + dense_dim = int(MEMORY_VECTOR_DIM or 768) + vectors_cfg = { - VECTOR_NAME: models.VectorParams( - size=int(_dense_dim or 768), distance=models.Distance.COSINE - ), - LEX_VECTOR_NAME: models.VectorParams( - size=LEX_VECTOR_DIM, distance=models.Distance.COSINE - ), + VECTOR_NAME: models.VectorParams(size=int(dense_dim or 768), distance=models.Distance.COSINE), + LEX_VECTOR_NAME: models.VectorParams(size=LEX_VECTOR_DIM, distance=models.Distance.COSINE), } client.create_collection(collection_name=name, vectors_config=vectors_cfg) return True -_ensure_collection(DEFAULT_COLLECTION) +# Optional eager collection ensure on startup (enabled by default for backward compatibility). +# Set MEMORY_ENSURE_ON_START=0 to defer ensure to first tool call (recommended on slow storage). +if MEMORY_ENSURE_ON_START: + try: + _ensure_collection(DEFAULT_COLLECTION) + except Exception: + pass + +@mcp.tool() +def set_session_defaults( + collection: Optional[str] = None, + session: Optional[str] = None, + ctx: Context = None, + **kwargs: Any, +) -> Dict[str, Any]: + """Set defaults (e.g., collection) for subsequent calls. 
+ + Behavior: + - If a request Context is provided (normal with FastMCP), store defaults per-connection + so subsequent calls on the same MCP session automatically use them (no token needed). + - Optionally, also supports a lightweight token for clients that prefer cross-connection reuse. + + Precedence everywhere: explicit collection > per-connection defaults > token defaults > env default. + """ + try: + _extra = kwargs or {} + if isinstance(_extra, dict) and "kwargs" in _extra: + inner = _extra.get("kwargs") + if isinstance(inner, dict): + _extra = inner + elif isinstance(inner, str): + try: + _extra = json.loads(inner) + except Exception: + _extra = {} + if (not collection) and isinstance(_extra, dict) and _extra.get("collection") is not None: + collection = _extra.get("collection") + if (not session) and isinstance(_extra, dict) and _extra.get("session") is not None: + session = _extra.get("session") + except Exception: + pass + + # Prepare defaults payload + defaults: Dict[str, Any] = {} + if isinstance(collection, str) and collection.strip(): + defaults["collection"] = collection.strip() + + # Store per-connection (preferred, no token required) + try: + if ctx is not None and getattr(ctx, "session", None) is not None and defaults: + with _SESSION_CTX_LOCK: + existing = SESSION_DEFAULTS_BY_SESSION.get(ctx.session) or {} + existing.update(defaults) + SESSION_DEFAULTS_BY_SESSION[ctx.session] = existing + except Exception: + pass + + # Optional: also support legacy token + sid = (str(session).strip() if session is not None else "") or None + if not sid: + import uuid as _uuid + sid = _uuid.uuid4().hex[:12] + try: + if defaults: + with _SESSION_LOCK: + existing = SESSION_DEFAULTS.get(sid) or {} + existing.update(defaults) + SESSION_DEFAULTS[sid] = existing + except Exception: + pass + + return { + "ok": True, + "session": sid, + "defaults": (SESSION_DEFAULTS.get(sid, {}) if sid else {}), + "applied": ("connection" if (ctx is not None and getattr(ctx, "session", 
None) is not None) else "token"), + } @mcp.tool() @@ -130,10 +284,87 @@ def store( information: str, metadata: Optional[Dict[str, Any]] = None, collection: Optional[str] = None, + session: Optional[str] = None, + ctx: Context = None, + **kwargs: Any, ) -> Dict[str, Any]: - """Store a memory entry into Qdrant (dual vectors consistent with indexer).""" - coll = collection or DEFAULT_COLLECTION - model = TextEmbedding(model_name=EMBEDDING_MODEL) + """Store a memory entry into Qdrant (dual vectors consistent with indexer). + + Note: First call may be slow (model loads on first use). Subsequent calls are fast. + """ + # Leniency: absorb nested 'kwargs' JSON some MCP clients send (so callers can pass + # collection inside a single kwargs payload) + try: + _extra = kwargs or {} + if isinstance(_extra, dict) and "kwargs" in _extra: + inner = _extra.get("kwargs") + if isinstance(inner, dict): + _extra = inner + elif isinstance(inner, str): + try: + _extra = json.loads(inner) + except Exception: + _extra = {} + if (not collection) and isinstance(_extra, dict) and _extra.get("collection") is not None: + collection = _extra.get("collection") + except Exception: + pass + + # Apply session default if provided via kwargs + sid = None + try: + _sx = kwargs or {} + if isinstance(_sx, dict) and "kwargs" in _sx: + inner = _sx.get("kwargs") + if isinstance(inner, dict): + _sx = inner + elif isinstance(inner, str): + try: + _sx = json.loads(inner) + except Exception: + _sx = {} + if isinstance(_sx, dict) and _sx.get("session") is not None: + sid = str(_sx.get("session")).strip() + except Exception: + pass + + # Prefer explicit session param if provided + try: + if session is not None and str(session).strip(): + sid = str(session).strip() + except Exception: + pass + + + coll = (collection or "").strip() + + # 1) Per-connection defaults via ctx (no token required) + if (not coll) and ctx is not None and getattr(ctx, "session", None) is not None: + try: + with _SESSION_CTX_LOCK: + _d2 = 
SESSION_DEFAULTS_BY_SESSION.get(ctx.session) or {} + _sc2 = str((_d2.get("collection") or "")).strip() + if _sc2: + coll = _sc2 + except Exception: + pass + + # 2) Legacy token-based session defaults + if (not coll) and sid: + try: + with _SESSION_LOCK: + _d = SESSION_DEFAULTS.get(sid) or {} + _sc = str((_d.get("collection") or "")).strip() + if _sc: + coll = _sc + except Exception: + pass + + # 3) Environment fallback + coll = coll or DEFAULT_COLLECTION + + _ensure_once(coll) # Lazy: only ensures collection once per process + model = _get_embedding_model() # Lazy: loads model on first call, cached thereafter dense = next(model.embed([str(information)])).tolist() lex = _lex_hash_vector_text(str(information), LEX_VECTOR_DIM) # Use UUID to avoid point ID collisions under concurrent load @@ -156,33 +387,124 @@ def find( limit: int = 5, collection: Optional[str] = None, top_k: Optional[int] = None, + session: Optional[str] = None, + ctx: Context = None, + **kwargs: Any, ) -> Dict[str, Any]: - """Find memory-like entries by vector similarity (dense + lexical fusion).""" - coll = collection or DEFAULT_COLLECTION - model = TextEmbedding(model_name=EMBEDDING_MODEL) - dense = next(model.embed([str(query)])).tolist() + """Find memory-like entries by vector similarity (dense + lexical fusion). + + Note: First call may be slow if dense embedding is used (model loads on first use). + Set MEMORY_COLD_SKIP_DENSE=1 to skip dense on the very first query (falls back to + lexical-only search). Default is 0 (always use dense) for backward compatibility. 
+ """ + # Leniency: absorb nested 'kwargs' JSON some MCP clients send + try: + _extra = kwargs or {} + if isinstance(_extra, dict) and "kwargs" in _extra: + inner = _extra.get("kwargs") + if isinstance(inner, dict): + _extra = inner + elif isinstance(inner, str): + try: + _extra = json.loads(inner) + except Exception: + _extra = {} + if (not collection) and isinstance(_extra, dict) and _extra.get("collection") is not None: + collection = _extra.get("collection") + except Exception: + pass + + # Apply session default if provided via kwargs + sid = None + try: + _sx = kwargs or {} + if isinstance(_sx, dict) and "kwargs" in _sx: + inner = _sx.get("kwargs") + if isinstance(inner, dict): + _sx = inner + elif isinstance(inner, str): + try: + _sx = json.loads(inner) + except Exception: + _sx = {} + if isinstance(_sx, dict) and _sx.get("session") is not None: + sid = str(_sx.get("session")).strip() + except Exception: + pass + + # Prefer explicit session param if provided + try: + if session is not None and str(session).strip(): + sid = str(session).strip() + except Exception: + pass + + coll = (collection or "").strip() + + # 1) Per-connection defaults via ctx (no token required) + if (not coll) and ctx is not None and getattr(ctx, "session", None) is not None: + try: + with _SESSION_CTX_LOCK: + _d2 = SESSION_DEFAULTS_BY_SESSION.get(ctx.session) or {} + _sc2 = str((_d2.get("collection") or "")).strip() + if _sc2: + coll = _sc2 + except Exception: + pass + + # 2) Legacy token-based session defaults + if (not coll) and sid: + try: + with _SESSION_LOCK: + _d = SESSION_DEFAULTS.get(sid) or {} + _sc = str((_d.get("collection") or "")).strip() + if _sc: + coll = _sc + except Exception: + pass + + # 3) Environment fallback + coll = coll or DEFAULT_COLLECTION + _ensure_once(coll) + + # Cold-start optimization: skip dense embedding on first query if model not yet loaded. 
+ # RATIONALE: On slow storage, loading the embedding model (100–500 MB) can delay + # the first response by 5–10s. By skipping dense on cold start, we return results + # faster using lexical search alone. Subsequent queries use dense (model is cached). + # Set MEMORY_COLD_SKIP_DENSE=0 to disable this optimization. + use_dense = True + if MEMORY_COLD_SKIP_DENSE and EMBEDDING_MODEL not in _EMBED_MODEL_CACHE: + use_dense = False + if use_dense: + model = _get_embedding_model() + dense = next(model.embed([str(query)])).tolist() + else: + dense = None lex = _lex_hash_vector_text(str(query), LEX_VECTOR_DIM) # Harmonize alias: top_k -> limit lim = int(limit or top_k or 5) # Two searches (prefer query_points) then simple RRF-like merge - try: - qp_dense = client.query_points( - collection_name=coll, - query=dense, - using=VECTOR_NAME, - limit=max(10, lim), - with_payload=True, - ) - res_dense = getattr(qp_dense, "points", qp_dense) - except AttributeError: - res_dense = client.search( - collection_name=coll, - query_vector=(VECTOR_NAME, dense), - limit=max(10, lim), - with_payload=True, - ) + if use_dense: + try: + qp_dense = client.query_points( + collection_name=coll, + query=dense, + using=VECTOR_NAME, + limit=max(10, lim), + with_payload=True, + ) + res_dense = getattr(qp_dense, "points", qp_dense) + except AttributeError: + res_dense = client.search( + collection_name=coll, + query_vector=(VECTOR_NAME, dense), + limit=max(10, lim), + with_payload=True, + ) + else: + res_dense = [] try: qp_lex = client.query_points( diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 5623d371..93caf87e 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -15,6 +15,7 @@ import tempfile import threading import logging +import argparse from pathlib import Path from typing import Dict, List, Any, Optional, Tuple from datetime import datetime @@ -30,7 +31,8 @@ from scripts.workspace_state import ( get_cached_file_hash, 
set_cached_file_hash, - remove_cached_file, + get_collection_name, + _extract_repo_name_from_path, ) # Import existing hash function @@ -39,37 +41,58 @@ class RemoteUploadClient: """Client for uploading delta bundles to remote server.""" - + + def _translate_to_container_path(self, host_path: str) -> str: + """Translate host path to container path for API communication.""" + # Use environment variable for path mapping if available + host_root = os.environ.get("HOST_ROOT", "/home/coder/project/Context-Engine/dev-workspace") + container_root = os.environ.get("CONTAINER_ROOT", "/work") + + if host_path.startswith(host_root): + return host_path.replace(host_root, container_root) + else: + # Fallback: if path doesn't match expected pattern, use as-is + return host_path + def __init__(self, upload_endpoint: str, workspace_path: str, collection_name: str, max_retries: int = 3, - timeout: int = 30): + timeout: int = 30, + metadata_path: Optional[str] = None): """ Initialize remote upload client. 
- + Args: upload_endpoint: HTTP endpoint for delta uploads - workspace_path: Absolute path to workspace + workspace_path: Absolute path to workspace (where files are located) collection_name: Target collection name max_retries: Maximum number of upload retries timeout: Request timeout in seconds + metadata_path: Absolute path to metadata directory (for delta bundles) + If None, uses workspace_path/.codebase/delta_bundles """ self.upload_endpoint = upload_endpoint.rstrip('/') self.workspace_path = workspace_path self.collection_name = collection_name self.max_retries = max_retries self.timeout = timeout - - # Bundle persistence directory (initialize before sequence tracking) - self.bundle_dir = Path(workspace_path) / ".codebase" / "delta_bundles" - self.bundle_dir.mkdir(parents=True, exist_ok=True) - - # Sequence number tracking - self._sequence_lock = threading.Lock() - self._sequence_number = self._get_last_sequence() - + + # Use temporary directory for bundle creation - CLI should be stateless + # Temporary bundles are cleaned up after upload + self.temp_dir = None + self.bundle_dir = None # No persistent bundle directory in CLI mode + + # Store repo name for cache operations + # Import here to avoid circular imports + try: + from scripts.workspace_state import _extract_repo_name_from_path + self.repo_name = _extract_repo_name_from_path(workspace_path) + except ImportError: + # Fallback: use directory name as repo name + self.repo_name = Path(workspace_path).name + # Setup HTTP session with retry strategy self.session = requests.Session() retry_strategy = Retry( @@ -80,39 +103,41 @@ def __init__(self, adapter = HTTPAdapter(max_retries=retry_strategy) self.session.mount("http://", adapter) self.session.mount("https://", adapter) - - def _get_last_sequence(self) -> int: - """Get the last sequence number from local state.""" - seq_file = self.bundle_dir / "last_sequence.txt" - try: - if seq_file.exists(): - return int(seq_file.read_text().strip()) - except 
(ValueError, IOError): - pass - return 0 - - def _set_last_sequence(self, sequence: int) -> None: - """Persist the last sequence number.""" - seq_file = self.bundle_dir / "last_sequence.txt" - try: - seq_file.write_text(str(sequence)) - except IOError: - pass - - def _get_next_sequence(self) -> int: - """Get the next sequence number atomically.""" - with self._sequence_lock: - self._sequence_number += 1 - self._set_last_sequence(self._sequence_number) - return self._sequence_number - + + def __enter__(self): + """Context manager entry.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit with cleanup.""" + self.cleanup() + + def cleanup(self): + """Clean up temporary directories.""" + if self.temp_dir and os.path.exists(self.temp_dir): + try: + import shutil + shutil.rmtree(self.temp_dir) + logger.debug(f"[remote_upload] Cleaned up temporary directory: {self.temp_dir}") + except Exception as e: + logger.warning(f"[remote_upload] Failed to cleanup temp directory {self.temp_dir}: {e}") + finally: + self.temp_dir = None + + def _get_temp_bundle_dir(self) -> Path: + """Get or create temporary directory for bundle creation.""" + if not self.temp_dir: + self.temp_dir = tempfile.mkdtemp(prefix="delta_bundle_") + return Path(self.temp_dir) + # CLI is stateless - sequence tracking is handled by server + def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: """ Detect what type of changes occurred for each file path. 
- + Args: changed_paths: List of changed file paths - + Returns: Dictionary with change types: created, updated, deleted, moved, unchanged """ @@ -123,11 +148,11 @@ def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: "moved": [], "unchanged": [] } - + for path in changed_paths: abs_path = str(path.resolve()) - cached_hash = get_cached_file_hash(self.workspace_path, abs_path) - + cached_hash = get_cached_file_hash(abs_path, self.repo_name) + if not path.exists(): # File was deleted if cached_hash: @@ -138,7 +163,7 @@ def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: with open(path, 'rb') as f: content = f.read() current_hash = hashlib.sha1(content).hexdigest() - + if not cached_hash: # New file changes["created"].append(path) @@ -148,42 +173,42 @@ def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: else: # Unchanged (might be a move detection candidate) changes["unchanged"].append(path) - + # Update cache - set_cached_file_hash(self.workspace_path, abs_path, current_hash) + set_cached_file_hash(abs_path, current_hash, self.repo_name) except Exception: # Skip files that can't be read continue - + # Detect moves by looking for files with same content hash # but different paths (requires additional tracking) changes["moved"] = self._detect_moves(changes["created"], changes["deleted"]) - + return changes - + def _detect_moves(self, created_files: List[Path], deleted_files: List[Path]) -> List[Tuple[Path, Path]]: """ Detect file moves by matching content hashes between created and deleted files. 
- + Args: created_files: List of newly created files deleted_files: List of deleted files - + Returns: List of (source, destination) path tuples for detected moves """ moves = [] deleted_hashes = {} - + # Build hash map for deleted files for deleted_path in deleted_files: try: # Try to get cached hash first, fallback to file content - cached_hash = get_cached_file_hash(self.workspace_path, str(deleted_path)) + cached_hash = get_cached_file_hash(str(deleted_path), self.repo_name) if cached_hash: deleted_hashes[cached_hash] = deleted_path continue - + # If no cached hash, try to read from file if it still exists if deleted_path.exists(): with open(deleted_path, 'rb') as f: @@ -192,14 +217,14 @@ def _detect_moves(self, created_files: List[Path], deleted_files: List[Path]) -> deleted_hashes[file_hash] = deleted_path except Exception: continue - + # Match created files with deleted files by hash for created_path in created_files: try: with open(created_path, 'rb') as f: content = f.read() file_hash = hashlib.sha1(content).hexdigest() - + if file_hash in deleted_hashes: source_path = deleted_hashes[file_hash] moves.append((source_path, created_path)) @@ -207,42 +232,42 @@ def _detect_moves(self, created_files: List[Path], deleted_files: List[Path]) -> del deleted_hashes[file_hash] except Exception: continue - + return moves - + def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, Any]]: """ Create a delta bundle from detected changes. 
- + Args: changes: Dictionary of file changes by type - + Returns: Tuple of (bundle_path, manifest_metadata) """ bundle_id = str(uuid.uuid4()) - sequence_number = self._get_next_sequence() + # CLI is stateless - server handles sequence numbers created_at = datetime.now().isoformat() - + # Create temporary directory for bundle with tempfile.TemporaryDirectory() as temp_dir: temp_path = Path(temp_dir) - + # Create directory structure files_dir = temp_path / "files" metadata_dir = temp_path / "metadata" files_dir.mkdir() metadata_dir.mkdir() - + # Create subdirectories (files_dir / "created").mkdir() (files_dir / "updated").mkdir() (files_dir / "moved").mkdir() - + operations = [] total_size = 0 file_hashes = {} - + # Process created files for path in changes["created"]: rel_path = str(path.relative_to(Path(self.workspace_path))) @@ -251,16 +276,16 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, content = f.read() file_hash = hashlib.sha1(content).hexdigest() content_hash = f"sha1:{file_hash}" - + # Write file to bundle bundle_file_path = files_dir / "created" / rel_path bundle_file_path.parent.mkdir(parents=True, exist_ok=True) bundle_file_path.write_bytes(content) - + # Get file info stat = path.stat() language = idx.CODE_EXTS.get(path.suffix.lower(), "unknown") - + operation = { "operation": "created", "path": rel_path, @@ -275,11 +300,11 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, operations.append(operation) file_hashes[rel_path] = f"sha1:{file_hash}" total_size += stat.st_size - + except Exception as e: print(f"[bundle_create] Error processing created file {path}: {e}") continue - + # Process updated files for path in changes["updated"]: rel_path = str(path.relative_to(Path(self.workspace_path))) @@ -288,17 +313,17 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, content = f.read() file_hash = hashlib.sha1(content).hexdigest() content_hash = 
f"sha1:{file_hash}" - previous_hash = get_cached_file_hash(self.workspace_path, str(path.resolve())) - + previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name) + # Write file to bundle bundle_file_path = files_dir / "updated" / rel_path bundle_file_path.parent.mkdir(parents=True, exist_ok=True) bundle_file_path.write_bytes(content) - + # Get file info stat = path.stat() language = idx.CODE_EXTS.get(path.suffix.lower(), "unknown") - + operation = { "operation": "updated", "path": rel_path, @@ -314,11 +339,11 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, operations.append(operation) file_hashes[rel_path] = f"sha1:{file_hash}" total_size += stat.st_size - + except Exception as e: print(f"[bundle_create] Error processing updated file {path}: {e}") continue - + # Process moved files for source_path, dest_path in changes["moved"]: dest_rel_path = str(dest_path.relative_to(Path(self.workspace_path))) @@ -328,16 +353,16 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, content = f.read() file_hash = hashlib.sha1(content).hexdigest() content_hash = f"sha1:{file_hash}" - + # Write file to bundle bundle_file_path = files_dir / "moved" / dest_rel_path bundle_file_path.parent.mkdir(parents=True, exist_ok=True) bundle_file_path.write_bytes(content) - + # Get file info stat = dest_path.stat() language = idx.CODE_EXTS.get(dest_path.suffix.lower(), "unknown") - + operation = { "operation": "moved", "path": dest_rel_path, @@ -355,17 +380,17 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, operations.append(operation) file_hashes[dest_rel_path] = f"sha1:{file_hash}" total_size += stat.st_size - + except Exception as e: print(f"[bundle_create] Error processing moved file {source_path} -> {dest_path}: {e}") continue - + # Process deleted files for path in changes["deleted"]: rel_path = str(path.relative_to(Path(self.workspace_path))) try: - previous_hash = 
get_cached_file_hash(self.workspace_path, str(path.resolve())) - + previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name) + operation = { "operation": "deleted", "path": rel_path, @@ -377,11 +402,11 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, "language": idx.CODE_EXTS.get(path.suffix.lower(), "unknown") } operations.append(operation) - + except Exception as e: print(f"[bundle_create] Error processing deleted file {path}: {e}") continue - + # Create manifest manifest = { "version": "1.0", @@ -389,8 +414,9 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, "workspace_path": self.workspace_path, "collection_name": self.collection_name, "created_at": created_at, - "sequence_number": sequence_number, - "parent_sequence": sequence_number - 1, + # CLI is stateless - server will assign sequence numbers + "sequence_number": None, # Server will assign + "parent_sequence": None, # Server will determine "operations": { "created": len(changes["created"]), "updated": len(changes["updated"]), @@ -402,16 +428,16 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, "compression": "gzip", "encoding": "utf-8" } - + # Write manifest (temp_path / "manifest.json").write_text(json.dumps(manifest, indent=2)) - + # Write operations metadata operations_metadata = { "operations": operations } (metadata_dir / "operations.json").write_text(json.dumps(operations_metadata, indent=2)) - + # Write hashes hashes_metadata = { "workspace_path": self.workspace_path, @@ -419,27 +445,28 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, "file_hashes": file_hashes } (metadata_dir / "hashes.json").write_text(json.dumps(hashes_metadata, indent=2)) - - # Create tarball - bundle_path = self.bundle_dir / f"{bundle_id}.tar.gz" + + # Create tarball in temporary directory + temp_bundle_dir = self._get_temp_bundle_dir() + bundle_path = temp_bundle_dir / 
f"{bundle_id}.tar.gz" with tarfile.open(bundle_path, "w:gz") as tar: tar.add(temp_path, arcname=f"{bundle_id}") - + return str(bundle_path), manifest - + def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, Any]: """ Upload delta bundle to remote server with exponential backoff retry. - + Args: bundle_path: Path to the bundle tarball manifest: Bundle manifest metadata - + Returns: Server response dictionary """ last_error = None - + for attempt in range(self.max_retries + 1): try: # Calculate backoff delay (exponential with jitter) @@ -449,7 +476,7 @@ def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, delay = min(base_delay + jitter, 30) # Cap at 30 seconds logger.info(f"[remote_upload] Retry attempt {attempt + 1}/{self.max_retries + 1} after {delay:.2f}s delay") time.sleep(delay) - + # Verify bundle exists before attempting upload if not os.path.exists(bundle_path): return { @@ -459,12 +486,12 @@ def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, "message": f"Bundle file not found: {bundle_path}" } } - + # Check bundle size bundle_size = os.path.getsize(bundle_path) max_size_mb = 100 # Default max size max_size_bytes = max_size_mb * 1024 * 1024 - + if bundle_size > max_size_bytes: return { "success": False, @@ -473,28 +500,28 @@ def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, "message": f"Bundle size {bundle_size} bytes exceeds maximum {max_size_bytes} bytes" } } - + with open(bundle_path, 'rb') as bundle_file: files = { 'bundle': (f"{manifest['bundle_id']}.tar.gz", bundle_file, 'application/gzip') } - + data = { - 'workspace_path': self.workspace_path, + 'workspace_path': self._translate_to_container_path(self.workspace_path), 'collection_name': self.collection_name, - 'sequence_number': str(manifest['sequence_number']), + # CLI is stateless - server handles sequence numbers 'force': 'false' } - + logger.info(f"[remote_upload] Uploading bundle 
{manifest['bundle_id']} (size: {bundle_size} bytes)") - + response = self.session.post( f"{self.upload_endpoint}/api/v1/delta/upload", files=files, data=data, timeout=self.timeout ) - + if response.status_code == 200: result = response.json() logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") @@ -509,7 +536,7 @@ def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, except: error_msg += f": {response.text[:200]}" # Truncate long responses error_code = "HTTP_ERROR" - + last_error = { "success": False, "error": { @@ -518,14 +545,14 @@ def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, "status_code": response.status_code } } - + # Don't retry on client errors (4xx) if 400 <= response.status_code < 500 and response.status_code != 429: logger.warning(f"[remote_upload] Client error {response.status_code}, not retrying: {error_msg}") return last_error - + logger.warning(f"[remote_upload] Upload attempt {attempt + 1} failed: {error_msg}") - + except requests.exceptions.Timeout as e: last_error = { "success": False, @@ -535,7 +562,7 @@ def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, } } logger.warning(f"[remote_upload] Upload timeout on attempt {attempt + 1}: {e}") - + except requests.exceptions.ConnectionError as e: last_error = { "success": False, @@ -545,7 +572,7 @@ def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, } } logger.warning(f"[remote_upload] Connection error on attempt {attempt + 1}: {e}") - + except requests.exceptions.RequestException as e: last_error = { "success": False, @@ -555,7 +582,7 @@ def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, } } logger.warning(f"[remote_upload] Network error on attempt {attempt + 1}: {e}") - + except Exception as e: last_error = { "success": False, @@ -565,7 +592,7 @@ def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> 
Dict[str, } } logger.error(f"[remote_upload] Unexpected error on attempt {attempt + 1}: {e}") - + # All retries exhausted logger.error(f"[remote_upload] All {self.max_retries + 1} upload attempts failed for bundle {manifest.get('bundle_id', 'unknown')}") return last_error or { @@ -575,18 +602,21 @@ def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, "message": f"Upload failed after {self.max_retries + 1} attempts" } } - + def get_server_status(self) -> Dict[str, Any]: """Get server status and last sequence number with enhanced error handling.""" try: logger.debug(f"[remote_upload] Checking server status at {self.upload_endpoint}") - + + # Translate host path to container path for API communication + container_workspace_path = self._translate_to_container_path(self.workspace_path) + response = self.session.get( f"{self.upload_endpoint}/api/v1/delta/status", - params={'workspace_path': self.workspace_path}, + params={'workspace_path': container_workspace_path}, timeout=min(self.timeout, 10) # Use shorter timeout for status checks ) - + if response.status_code == 200: status_data = response.json() logger.debug(f"[remote_upload] Server status: {status_data}") @@ -599,7 +629,7 @@ def get_server_status(self) -> Dict[str, Any]: error_msg += f": {error_detail_msg}" except: error_msg += f": {response.text[:100]}" - + logger.warning(f"[remote_upload] {error_msg}") return { "success": False, @@ -609,7 +639,7 @@ def get_server_status(self) -> Dict[str, Any]: "status_code": response.status_code } } - + except requests.exceptions.Timeout as e: error_msg = f"Status check timeout after {min(self.timeout, 10)}s" logger.warning(f"[remote_upload] {error_msg}: {e}") @@ -650,93 +680,174 @@ def get_server_status(self) -> Dict[str, Any]: "message": error_msg } } - + def has_meaningful_changes(self, changes: Dict[str, List]) -> bool: """Check if changes warrant a delta upload.""" total_changes = sum(len(files) for op, files in changes.items() if op != 
"unchanged") return total_changes > 0 - + + def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: + """ + Process pre-computed changes and upload delta bundle. + Includes comprehensive error handling and graceful fallback. + + Args: + changes: Dictionary of file changes by type + + Returns: + True if upload was successful, False otherwise + """ + try: + logger.info(f"[remote_upload] Processing pre-computed changes") + + # Validate input + if not changes: + logger.info("[remote_upload] No changes provided") + return True + + if not self.has_meaningful_changes(changes): + logger.info("[remote_upload] No meaningful changes detected, skipping upload") + return True + + # Log change summary + total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged") + logger.info(f"[remote_upload] Detected {total_changes} meaningful changes: " + f"{len(changes['created'])} created, {len(changes['updated'])} updated, " + f"{len(changes['deleted'])} deleted, {len(changes['moved'])} moved") + + # Create delta bundle + bundle_path = None + try: + bundle_path, manifest = self.create_delta_bundle(changes) + logger.info(f"[remote_upload] Created delta bundle: {manifest['bundle_id']} " + f"(size: {manifest['total_size_bytes']} bytes)") + + # Validate bundle was created successfully + if not bundle_path or not os.path.exists(bundle_path): + raise RuntimeError(f"Failed to create bundle at {bundle_path}") + + except Exception as e: + logger.error(f"[remote_upload] Error creating delta bundle: {e}") + # Clean up any temporary files on failure + self.cleanup() + return False + + # Upload bundle with retry logic + try: + response = self.upload_bundle(bundle_path, manifest) + + if response.get("success", False): + processed_ops = response.get('processed_operations', {}) + logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") + logger.info(f"[remote_upload] Processed operations: {processed_ops}") + + # Clean up temporary 
bundle after successful upload + try: + if os.path.exists(bundle_path): + os.remove(bundle_path) + logger.debug(f"[remote_upload] Cleaned up temporary bundle: {bundle_path}") + # Also clean up the entire temp directory if this is the last bundle + self.cleanup() + except Exception as cleanup_error: + logger.warning(f"[remote_upload] Failed to cleanup bundle {bundle_path}: {cleanup_error}") + + return True + else: + error_msg = response.get('error', {}).get('message', 'Unknown upload error') + logger.error(f"[remote_upload] Upload failed: {error_msg}") + return False + + except Exception as e: + logger.error(f"[remote_upload] Error uploading bundle: {e}") + return False + + except Exception as e: + logger.error(f"[remote_upload] Unexpected error in process_changes_and_upload: {e}") + return False + def process_and_upload_changes(self, changed_paths: List[Path]) -> bool: """ Process changed paths and upload delta bundle if meaningful changes exist. Includes comprehensive error handling and graceful fallback. 
- + Args: changed_paths: List of changed file paths - + Returns: True if upload was successful, False otherwise """ try: logger.info(f"[remote_upload] Processing {len(changed_paths)} changed paths") - + # Validate input if not changed_paths: logger.info("[remote_upload] No changed paths provided") return True - + # Detect changes try: changes = self.detect_file_changes(changed_paths) except Exception as e: logger.error(f"[remote_upload] Error detecting file changes: {e}") return False - + if not self.has_meaningful_changes(changes): logger.info("[remote_upload] No meaningful changes detected, skipping upload") return True - + # Log change summary total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged") logger.info(f"[remote_upload] Detected {total_changes} meaningful changes: " f"{len(changes['created'])} created, {len(changes['updated'])} updated, " f"{len(changes['deleted'])} deleted, {len(changes['moved'])} moved") - + # Create delta bundle bundle_path = None try: bundle_path, manifest = self.create_delta_bundle(changes) logger.info(f"[remote_upload] Created delta bundle: {manifest['bundle_id']} " - f"(seq: {manifest['sequence_number']}, size: {manifest['total_size_bytes']} bytes)") - + f"(size: {manifest['total_size_bytes']} bytes)") + # Validate bundle was created successfully if not bundle_path or not os.path.exists(bundle_path): raise RuntimeError(f"Failed to create bundle at {bundle_path}") - + except Exception as e: logger.error(f"[remote_upload] Error creating delta bundle: {e}") + # Clean up any temporary files on failure + self.cleanup() return False - + # Upload bundle with retry logic try: response = self.upload_bundle(bundle_path, manifest) - + if response.get("success", False): processed_ops = response.get('processed_operations', {}) logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") logger.info(f"[remote_upload] Processed operations: {processed_ops}") - - # Clean up local bundle after 
successful upload + + # Clean up temporary bundle after successful upload try: if os.path.exists(bundle_path): os.remove(bundle_path) - logger.debug(f"[remote_upload] Cleaned up local bundle: {bundle_path}") + logger.debug(f"[remote_upload] Cleaned up temporary bundle: {bundle_path}") + # Also clean up the entire temp directory if this is the last bundle + self.cleanup() except Exception as cleanup_error: logger.warning(f"[remote_upload] Failed to cleanup bundle {bundle_path}: {cleanup_error}") - + return True else: error = response.get("error", {}) error_code = error.get("code", "UNKNOWN") error_msg = error.get("message", "Unknown error") - + logger.error(f"[remote_upload] Upload failed: {error_msg}") - + # Handle specific error types - if error_code == "SEQUENCE_MISMATCH": - logger.info("[remote_upload] Attempting to handle sequence mismatch") - return self._handle_sequence_mismatch(response, manifest) - elif error_code in ["BUNDLE_TOO_LARGE", "BUNDLE_NOT_FOUND"]: + # CLI is stateless - server handles sequence management + if error_code in ["BUNDLE_TOO_LARGE", "BUNDLE_NOT_FOUND"]: # These are unrecoverable errors logger.error(f"[remote_upload] Unrecoverable error ({error_code}): {error_msg}") return False @@ -749,67 +860,17 @@ def process_and_upload_changes(self, changed_paths: List[Path]) -> bool: # Other errors logger.error(f"[remote_upload] Upload error ({error_code}): {error_msg}") return False - + except Exception as e: logger.error(f"[remote_upload] Unexpected error during upload: {e}") return False - + except Exception as e: logger.error(f"[remote_upload] Critical error in process_and_upload_changes: {e}") logger.exception("[remote_upload] Full traceback:") return False - - def _handle_sequence_mismatch(self, error_response: Dict[str, Any], manifest: Dict[str, Any]) -> bool: - """Handle sequence number mismatch by recovering missing bundles.""" - try: - expected_seq = error_response["error"]["expected_sequence"] - current_seq = manifest["sequence_number"] 
- - print(f"[remote_upload] Sequence mismatch: expected {expected_seq}, got {current_seq}") - - # For PoC, we'll just force upload with the expected sequence - # In a production system, we would implement proper recovery - print(f"[remote_upload] Forcing upload with sequence {expected_seq}") - - # Update our sequence number - with self._sequence_lock: - self._sequence_number = expected_seq - self._set_last_sequence(expected_seq) - - # Retry upload with force=true - bundle_path = self.bundle_dir / f"{manifest['bundle_id']}.tar.gz" - if bundle_path.exists(): - data = { - 'workspace_path': self.workspace_path, - 'collection_name': self.collection_name, - 'sequence_number': str(expected_seq), - 'force': 'true' - } - - with open(bundle_path, 'rb') as bundle_file: - files = { - 'bundle': (f"{manifest['bundle_id']}.tar.gz", bundle_file, 'application/gzip') - } - - response = self.session.post( - f"{self.upload_endpoint}/api/v1/delta/upload", - files=files, - data=data, - timeout=self.timeout - ) - - if response.status_code == 200: - result = response.json() - if result.get("success", False): - print(f"[remote_upload] Force upload successful for bundle {manifest['bundle_id']}") - return True - - print(f"[remote_upload] Force upload failed for bundle {manifest['bundle_id']}") - return False - - except Exception as e: - print(f"[remote_upload] Error handling sequence mismatch: {e}") - return False + + # CLI is stateless - sequence mismatch handling is done by server def is_remote_mode_enabled() -> bool: @@ -817,12 +878,190 @@ def is_remote_mode_enabled() -> bool: return os.environ.get("REMOTE_UPLOAD_ENABLED", "").lower() in {"1", "true", "yes", "on"} -def get_remote_config() -> Dict[str, str]: - """Get remote upload configuration from environment variables.""" +def get_remote_config(cli_path: Optional[str] = None) -> Dict[str, str]: + """Get remote upload configuration from environment variables and command-line arguments.""" + # Use command-line path if provided, otherwise 
fall back to environment variables + if cli_path: + workspace_path = cli_path + else: + workspace_path = os.environ.get("WATCH_ROOT", os.environ.get("WORKSPACE_PATH", "/work")) + + # Use auto-generated collection name based on repo name + repo_name = _extract_repo_name_from_path(workspace_path) + collection_name = get_collection_name(repo_name) + return { "upload_endpoint": os.environ.get("REMOTE_UPLOAD_ENDPOINT", "http://localhost:8080"), - "workspace_path": os.environ.get("WATCH_ROOT", os.environ.get("WORKSPACE_PATH", "/work")), - "collection_name": os.environ.get("COLLECTION_NAME", "my-collection"), + "workspace_path": workspace_path, + "collection_name": collection_name, "max_retries": int(os.environ.get("REMOTE_UPLOAD_MAX_RETRIES", "3")), "timeout": int(os.environ.get("REMOTE_UPLOAD_TIMEOUT", "30")) - } \ No newline at end of file + } + + +def main(): + """Main entry point for the remote upload client.""" + parser = argparse.ArgumentParser( + description="Remote upload client for delta bundles in Context-Engine", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Upload from current directory or environment variables + python remote_upload_client.py + + # Upload from specific directory + python remote_upload_client.py --path /path/to/repo + + # Upload from specific directory with custom endpoint + python remote_upload_client.py --path /path/to/repo --endpoint http://remote-server:8080 + """ + ) + + parser.add_argument( + "--path", + type=str, + help="Path to the directory to upload (overrides WATCH_ROOT/WORKSPACE_PATH environment variables)" + ) + + parser.add_argument( + "--endpoint", + type=str, + help="Remote upload endpoint (overrides REMOTE_UPLOAD_ENDPOINT environment variable)" + ) + + parser.add_argument( + "--max-retries", + type=int, + help="Maximum number of upload retries (overrides REMOTE_UPLOAD_MAX_RETRIES environment variable)" + ) + + parser.add_argument( + "--timeout", + type=int, + help="Request timeout in 
seconds (overrides REMOTE_UPLOAD_TIMEOUT environment variable)" + ) + + parser.add_argument( + "--force", + action="store_true", + help="Force upload of all files (ignore cached state and treat all files as new)" + ) + + args = parser.parse_args() + + # Validate path if provided + if args.path: + if not os.path.exists(args.path): + logger.error(f"Path does not exist: {args.path}") + return 1 + + if not os.path.isdir(args.path): + logger.error(f"Path is not a directory: {args.path}") + return 1 + + args.path = os.path.abspath(args.path) + logger.info(f"Using specified path: {args.path}") + + # Get configuration + config = get_remote_config(args.path) + + # Override config with command-line arguments if provided + if args.endpoint: + config["upload_endpoint"] = args.endpoint + if args.max_retries is not None: + config["max_retries"] = args.max_retries + if args.timeout is not None: + config["timeout"] = args.timeout + + logger.info(f"Workspace path: {config['workspace_path']}") + logger.info(f"Collection name: {config['collection_name']}") + logger.info(f"Upload endpoint: {config['upload_endpoint']}") + + # Check if remote mode is enabled + if not is_remote_mode_enabled(): + logger.error("Remote upload mode is not enabled. 
Set REMOTE_UPLOAD_ENABLED=1 in environment variables.") + return 1 + + # Initialize client with context manager for cleanup + try: + with RemoteUploadClient( + upload_endpoint=config["upload_endpoint"], + workspace_path=config["workspace_path"], + collection_name=config["collection_name"], + max_retries=config["max_retries"], + timeout=config["timeout"] + ) as client: + + logger.info("Remote upload client initialized successfully") + + # Test server connection + logger.info("Checking server status...") + status = client.get_server_status() + # For delta endpoint, success is indicated by having expected fields (not a "success" boolean) + is_success = ( + isinstance(status, dict) and + 'workspace_path' in status and + 'collection_name' in status and + status.get('status') == 'ready' + ) + if not is_success: + error = status.get("error", {}) + logger.error(f"Cannot connect to server: {error.get('message', 'Unknown error')}") + return 1 + + logger.info("Server connection successful") + + # Scan repository and upload files + logger.info("Scanning repository for files...") + workspace_path = Path(config['workspace_path']) + + # Find all files in the repository + all_files = [] + for file_path in workspace_path.rglob('*'): + if file_path.is_file() and not file_path.name.startswith('.'): + rel_path = file_path.relative_to(workspace_path) + # Skip .codebase directory and other metadata + if not str(rel_path).startswith('.codebase'): + all_files.append(file_path) + + logger.info(f"Found {len(all_files)} files to upload") + + if not all_files: + logger.warning("No files found to upload") + return 0 + + # Detect changes (treat all files as changes for initial upload) + if args.force: + # Force mode: treat all files as created + changes = {"created": all_files, "updated": [], "deleted": [], "moved": [], "unchanged": []} + else: + changes = client.detect_file_changes(all_files) + + if not client.has_meaningful_changes(changes): + logger.info("No meaningful changes to upload") + 
return 0 + + logger.info(f"Changes detected: {len(changes.get('created', []))} created, {len(changes.get('updated', []))} updated, {len(changes.get('deleted', []))} deleted") + + # Process and upload changes + logger.info("Uploading files to remote server...") + success = client.process_changes_and_upload(changes) + + if success: + logger.info("Repository upload completed successfully!") + logger.info(f"Collection name: {config['collection_name']}") + logger.info(f"Files uploaded: {len(all_files)}") + else: + logger.error("Repository upload failed!") + return 1 + + return 0 + + except Exception as e: + logger.error(f"Failed to initialize remote upload client: {e}") + return 1 + + +if __name__ == "__main__": + import sys + sys.exit(main()) diff --git a/scripts/upload_service.py b/scripts/upload_service.py index 243fb44d..a1c2e605 100644 --- a/scripts/upload_service.py +++ b/scripts/upload_service.py @@ -26,30 +26,20 @@ # Import existing workspace state and indexing functions try: from scripts.workspace_state import ( - get_workspace_state, - update_workspace_state, - update_last_activity, + log_activity, get_collection_name, get_cached_file_hash, set_cached_file_hash, - remove_cached_file, + _extract_repo_name_from_path, ) except ImportError: # Fallback for testing without full environment - get_workspace_state = None - update_workspace_state = None - update_last_activity = None + log_activity = None get_collection_name = None get_cached_file_hash = None set_cached_file_hash = None - remove_cached_file = None + _extract_repo_name_from_path = None -try: - from scripts.ingest_code import index_repo, delete_points_by_path -except ImportError: - # Fallback for testing - index_repo = None - delete_points_by_path = None # Configure logging logging.basicConfig( @@ -90,8 +80,6 @@ class UploadResponse(BaseModel): sequence_number: Optional[int] = None processed_operations: Optional[Dict[str, int]] = None processing_time_ms: Optional[int] = None - indexed_points: Optional[int] 
= None - collection_name: Optional[str] = None next_sequence: Optional[int] = None error: Optional[Dict[str, Any]] = None @@ -112,8 +100,12 @@ class HealthResponse(BaseModel): work_dir: str def get_workspace_key(workspace_path: str) -> str: - """Generate a unique key for workspace tracking.""" - return hashlib.sha256(workspace_path.encode('utf-8')).hexdigest()[:16] + """Generate a unique key for workspace tracking using repository name.""" + # Extract repository name from path for consistent identification + # Both host paths (/home/user/project/repo) and container paths (/work/repo) + # should generate the same key for the same repository + repo_name = Path(workspace_path).name + return hashlib.sha256(repo_name.encode('utf-8')).hexdigest()[:16] def get_next_sequence(workspace_path: str) -> int: """Get next sequence number for workspace.""" @@ -135,35 +127,35 @@ def validate_bundle_format(bundle_path: Path) -> Dict[str, Any]: # Check for required files required_files = ["manifest.json", "metadata/operations.json", "metadata/hashes.json"] members = tar.getnames() - + for req_file in required_files: if not any(req_file in member for member in members): raise ValueError(f"Missing required file: {req_file}") - + # Extract and validate manifest manifest_member = None for member in members: if member.endswith("manifest.json"): manifest_member = member break - + if not manifest_member: raise ValueError("manifest.json not found in bundle") - + manifest_file = tar.extractfile(manifest_member) if not manifest_file: raise ValueError("Cannot extract manifest.json") - + manifest = json.loads(manifest_file.read().decode('utf-8')) - + # Validate manifest structure required_fields = ["version", "bundle_id", "workspace_path", "created_at", "sequence_number"] for field in required_fields: if field not in manifest: raise ValueError(f"Missing required field in manifest: {field}") - + return manifest - + except Exception as e: raise ValueError(f"Invalid bundle format: {str(e)}") @@ 
-177,12 +169,22 @@ async def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: "skipped": 0, "failed": 0 } - + try: - # Ensure workspace directory exists - workspace = Path(workspace_path) + # CRITICAL FIX: Extract repo name and create workspace under WORK_DIR + # Previous bug: used source workspace_path directly, extracting files outside /work + # This caused watcher service to never see uploaded files + if _extract_repo_name_from_path: + repo_name = _extract_repo_name_from_path(workspace_path) + else: + # Fallback: use directory name + repo_name = Path(workspace_path).name + + # Generate workspace under WORK_DIR using repo name hash + workspace_key = get_workspace_key(workspace_path) + workspace = Path(WORK_DIR) / f"{repo_name}-{workspace_key}" workspace.mkdir(parents=True, exist_ok=True) - + with tarfile.open(bundle_path, "r:gz") as tar: # Extract operations metadata ops_member = None @@ -190,28 +192,28 @@ async def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: if member.endswith("metadata/operations.json"): ops_member = member break - + if not ops_member: raise ValueError("operations.json not found in bundle") - + ops_file = tar.extractfile(ops_member) if not ops_file: raise ValueError("Cannot extract operations.json") - + operations_data = json.loads(ops_file.read().decode('utf-8')) operations = operations_data.get("operations", []) - + # Process each operation for operation in operations: op_type = operation.get("operation") rel_path = operation.get("path") - + if not rel_path: operations_count["skipped"] += 1 continue - + target_path = workspace / rel_path - + try: if op_type == "created": # Extract file from bundle @@ -220,7 +222,7 @@ async def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: if member.endswith(f"files/created/{rel_path}"): file_member = member break - + if file_member: file_content = tar.extractfile(file_member) if file_content: @@ -231,7 +233,7 @@ async def 
process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: operations_count["failed"] += 1 else: operations_count["failed"] += 1 - + elif op_type == "updated": # Extract updated file file_member = None @@ -239,7 +241,7 @@ async def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: if member.endswith(f"files/updated/{rel_path}"): file_member = member break - + if file_member: file_content = tar.extractfile(file_member) if file_content: @@ -250,7 +252,7 @@ async def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: operations_count["failed"] += 1 else: operations_count["failed"] += 1 - + elif op_type == "moved": # Extract moved file to destination file_member = None @@ -258,7 +260,7 @@ async def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: if member.endswith(f"files/moved/{rel_path}"): file_member = member break - + if file_member: file_content = tar.extractfile(file_member) if file_content: @@ -269,7 +271,7 @@ async def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: operations_count["failed"] += 1 else: operations_count["failed"] += 1 - + elif op_type == "deleted": # Delete file if target_path.exists(): @@ -277,57 +279,20 @@ async def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: operations_count["deleted"] += 1 else: operations_count["skipped"] += 1 - + else: operations_count["skipped"] += 1 - + except Exception as e: logger.error(f"Error processing operation {op_type} for {rel_path}: {e}") operations_count["failed"] += 1 - + return operations_count - + except Exception as e: logger.error(f"Error processing delta bundle: {e}") raise -async def index_changed_files(workspace_path: str, collection_name: str) -> int: - """Index changed files using existing ingest_code pipeline.""" - if not index_repo: - logger.warning("index_repo function not available, skipping indexing") - return 0 - - try: - # Get workspace state to determine what needs 
indexing - if get_workspace_state: - state = get_workspace_state(workspace_path) - # Update last activity - if update_last_activity: - activity = { - "timestamp": datetime.now().isoformat(), - "action": "indexed", - "details": { - "files_processed": "unknown", - "source": "delta_upload" - } - } - update_last_activity(workspace_path, activity) - - # Call existing indexing function - logger.info(f"Indexing workspace: {workspace_path}") - result = index_repo( - workspace_path, - qdrant_url=QDRANT_URL, - collection_name=collection_name, - recreate=False - ) - - # Return estimated number of points (this is approximate) - return result.get("points_created", 0) if isinstance(result, dict) else 0 - - except Exception as e: - logger.error(f"Error indexing files: {e}") - return 0 @app.get("/health", response_model=HealthResponse) async def health_check(): @@ -346,24 +311,16 @@ async def get_status(workspace_path: str): try: # Get collection name if get_collection_name: - collection_name = get_collection_name(workspace_path) + repo_name = _extract_repo_name_from_path(workspace_path) if _extract_repo_name_from_path else None + collection_name = get_collection_name(repo_name) else: collection_name = DEFAULT_COLLECTION - + # Get last sequence last_sequence = get_last_sequence(workspace_path) - - # Get workspace state if available + last_upload = None - if get_workspace_state: - try: - state = get_workspace_state(workspace_path) - last_activity = state.get("last_activity") - if last_activity: - last_upload = last_activity.get("timestamp") - except Exception: - pass - + return StatusResponse( workspace_path=workspace_path, collection_name=collection_name, @@ -377,7 +334,7 @@ async def get_status(workspace_path: str): "supported_formats": ["tar.gz"] } ) - + except Exception as e: logger.error(f"Error getting status: {e}") raise HTTPException(status_code=500, detail=str(e)) @@ -393,47 +350,48 @@ async def upload_delta_bundle( ): """Upload and process delta bundle.""" start_time = 
datetime.now() - + try: # Validate workspace path workspace = Path(workspace_path) if not workspace.is_absolute(): workspace = Path(WORK_DIR) / workspace - + workspace_path = str(workspace.resolve()) - + # Get collection name if not collection_name: if get_collection_name: - collection_name = get_collection_name(workspace_path) + repo_name = _extract_repo_name_from_path(workspace_path) if _extract_repo_name_from_path else None + collection_name = get_collection_name(repo_name) else: collection_name = DEFAULT_COLLECTION - + # Validate bundle size if bundle.size and bundle.size > MAX_BUNDLE_SIZE_MB * 1024 * 1024: raise HTTPException( status_code=413, detail=f"Bundle too large. Max size: {MAX_BUNDLE_SIZE_MB}MB" ) - + # Save bundle to temporary file with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as temp_file: bundle_path = Path(temp_file.name) - + # Stream upload to file content = await bundle.read() bundle_path.write_bytes(content) - + try: # Validate bundle format manifest = validate_bundle_format(bundle_path) bundle_id = manifest.get("bundle_id") manifest_sequence = manifest.get("sequence_number") - + # Check sequence number if sequence_number is None: sequence_number = manifest_sequence - + if not force and sequence_number is not None: last_sequence = get_last_sequence(workspace_path) if sequence_number != last_sequence + 1: @@ -447,54 +405,48 @@ async def upload_delta_bundle( "retry_after": 5000 } ) - + # Process delta bundle operations_count = await process_delta_bundle(workspace_path, bundle_path, manifest) - - # Index changed files - indexed_points = await index_changed_files(workspace_path, collection_name) - + + # Update sequence tracking if sequence_number is not None: key = get_workspace_key(workspace_path) _sequence_tracker[key] = sequence_number - - # Update workspace state - if update_last_activity: - activity = { - "timestamp": datetime.now().isoformat(), - "action": "indexed", - "file_path": bundle_id, - "details": { + + # Log 
activity using cleaned workspace_state function + if log_activity: + log_activity( + repo_name=_extract_repo_name_from_path(workspace_path) if _extract_repo_name_from_path else None, + action="uploaded", + file_path=bundle_id, + details={ "bundle_id": bundle_id, "operations": operations_count, - "indexed_points": indexed_points, "source": "delta_upload" } - } - update_last_activity(workspace_path, activity) - + ) + # Calculate processing time processing_time = (datetime.now() - start_time).total_seconds() * 1000 - + return UploadResponse( success=True, bundle_id=bundle_id, sequence_number=sequence_number, processed_operations=operations_count, processing_time_ms=int(processing_time), - indexed_points=indexed_points, - collection_name=collection_name, next_sequence=sequence_number + 1 if sequence_number else None ) - + finally: # Clean up temporary file try: bundle_path.unlink() except Exception: pass - + except HTTPException: raise except Exception as e: @@ -526,12 +478,12 @@ def main(): """Main entry point for the upload service.""" host = os.environ.get("UPLOAD_SERVICE_HOST", "0.0.0.0") port = int(os.environ.get("UPLOAD_SERVICE_PORT", "8002")) - + logger.info(f"Starting upload service on {host}:{port}") logger.info(f"Qdrant URL: {QDRANT_URL}") logger.info(f"Work directory: {WORK_DIR}") logger.info(f"Max bundle size: {MAX_BUNDLE_SIZE_MB}MB") - + uvicorn.run( app, host=host, @@ -541,4 +493,4 @@ def main(): ) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/scripts/wait-for-qdrant.sh b/scripts/wait-for-qdrant.sh index 98f9e859..e26c73cf 100755 --- a/scripts/wait-for-qdrant.sh +++ b/scripts/wait-for-qdrant.sh @@ -1,6 +1,18 @@ #!/usr/bin/env bash set -euo pipefail -until curl -fsS "${QDRANT_URL:-http://localhost:6333}/" >/dev/null; do +# Use Python stdlib to avoid curl dependency in the container +until python - <<'PY' +import os, sys, urllib.request +url = os.environ.get("QDRANT_URL", "http://localhost:6333") +if not 
url.endswith("/"): + url += "/" +try: + with urllib.request.urlopen(url, timeout=2) as r: + sys.exit(0 if getattr(r, "status", 200) < 500 else 1) +except Exception: + sys.exit(1) +PY +do echo "Waiting for Qdrant at ${QDRANT_URL:-http://localhost:6333} ..." sleep 1 done diff --git a/scripts/warm_all_collections.py b/scripts/warm_all_collections.py new file mode 100644 index 00000000..0344da82 --- /dev/null +++ b/scripts/warm_all_collections.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +""" +Script to warm all collections in Qdrant +""" +import os +import sys +import subprocess +from qdrant_client import QdrantClient + +def main(): + # Get configuration from environment + qdrant_url = os.environ.get("QDRANT_URL", "http://qdrant:6333") + ef = os.environ.get("EF", "256") + limit = os.environ.get("LIMIT", "3") + + print(f"Connecting to Qdrant at {qdrant_url}") + + # Connect to Qdrant + client = QdrantClient(url=qdrant_url) + + # Get all collections + try: + collections_response = client.get_collections() + collections = [c.name for c in collections_response.collections] + print(f"Found collections: {collections}") + except Exception as e: + print(f"Error getting collections: {e}") + sys.exit(1) + + # Warm each collection + for collection_name in collections: + print(f"Warming collection: {collection_name}") + try: + # Set environment variable for the collection name + env = os.environ.copy() + env["COLLECTION_NAME"] = collection_name + + result = subprocess.run( + [ + "python", + "/app/scripts/warm_start.py", + "--ef", ef, + "--limit", limit + ], + capture_output=True, + text=True, + check=True, + env=env + ) + print(f"Successfully warmed {collection_name}") + except subprocess.CalledProcessError as e: + print(f"Error warming {collection_name}: {e}") + print(f"stdout: {e.stdout}") + print(f"stderr: {e.stderr}") + sys.exit(1) + + print("All collections warmed successfully") + +if __name__ == "__main__": + main() diff --git a/scripts/watch_index.py b/scripts/watch_index.py 
index eaa50a71..bd1bc823 100644 --- a/scripts/watch_index.py +++ b/scripts/watch_index.py @@ -4,6 +4,7 @@ import threading from pathlib import Path from typing import Set, Optional +from collections import OrderedDict from qdrant_client import QdrantClient, models from fastembed import TextEmbedding @@ -19,15 +20,41 @@ if str(ROOT_DIR) not in sys.path: sys.path.insert(0, str(ROOT_DIR)) -from scripts.workspace_state import ( - get_workspace_state, - update_indexing_status, - update_last_activity, - update_workspace_state, - get_cached_file_hash, - set_cached_file_hash, - remove_cached_file, -) +# Import critical functions first to prevent cascading failures +try: + from scripts.workspace_state import ( + _extract_repo_name_from_path, + get_collection_name, + _get_global_state_dir, + _get_repo_state_dir, + is_multi_repo_mode, + get_cached_file_hash, + set_cached_file_hash, + ) +except ImportError: + # If critical imports fail, set None to prevent crashes + _extract_repo_name_from_path = None # type: ignore + get_collection_name = None # type: ignore + _get_global_state_dir = None # type: ignore + _get_repo_state_dir = None # type: ignore + is_multi_repo_mode = None # type: ignore + get_cached_file_hash = None # type: ignore + set_cached_file_hash = None # type: ignore + +# Import optional functions that may not exist +try: + from scripts.workspace_state import ( + get_workspace_state, + update_indexing_status, + update_workspace_state, + remove_cached_file, + ) +except ImportError: + # Optional functions - set to None if not available + get_workspace_state = None # type: ignore + update_indexing_status = None # type: ignore + update_workspace_state = None # type: ignore + remove_cached_file = None # type: ignore import hashlib from datetime import datetime @@ -45,21 +72,251 @@ _REMOTE_UPLOAD_AVAILABLE = False QDRANT_URL = os.environ.get("QDRANT_URL", "http://qdrant:6333") -COLLECTION = os.environ.get("COLLECTION_NAME", "my-collection") MODEL = 
os.environ.get("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5") -ROOT = Path(os.environ.get("WATCH_ROOT", "/work")).resolve() +ROOT = Path(os.environ.get("WATCH_ROOT", "/work")) # Debounce interval DELAY_SECS = float(os.environ.get("WATCH_DEBOUNCE_SECS", "1.0")) +# Simple LRU cache implementation to prevent memory growth +class LRUCache: + """Simple LRU cache with size limits.""" + + def __init__(self, max_size: int = 1000): + self.max_size = max_size + self.cache = OrderedDict() + self._hits = 0 + self._misses = 0 + + def get(self, key): + if key in self.cache: + # Move to end (most recently used) + self.cache.move_to_end(key) + self._hits += 1 + return self.cache[key] + self._misses += 1 + return None + + def put(self, key, value): + if key in self.cache: + # Update existing entry + self.cache[key] = value + self.cache.move_to_end(key) + else: + # Add new entry, evict if necessary + if len(self.cache) >= self.max_size: + # Remove least recently used item + self.cache.popitem(last=False) + self.cache[key] = value + + def clear(self): + self.cache.clear() + self._hits = 0 + self._misses = 0 + + def get_hit_rate(self): + total = self._hits + self._misses + return self._hits / total if total > 0 else 0.0 + + def size(self): + return len(self.cache) + +# Multi-repo collection management with size-limited caches +_collection_cache = LRUCache(max_size=500) # Cache for repo path -> collection name mapping +_repo_cache = LRUCache(max_size=2000) # Cache for file path -> repo path mapping + +# Optional cache statistics logging (disabled by default) +_ENABLE_CACHE_STATS = os.environ.get("ENABLE_CACHE_STATS", "false").lower() == "true" + +def _log_cache_stats(): + """Log cache statistics for monitoring.""" + if _ENABLE_CACHE_STATS: + print(f"[cache_stats] Collection cache: {_collection_cache.size()} items, " + f"hit rate: {_collection_cache.get_hit_rate():.2%}") + print(f"[cache_stats] Repo cache: {_repo_cache.size()} items, " + f"hit rate: {_repo_cache.get_hit_rate():.2%}") + + 
+def _detect_repo_for_file(file_path: Path) -> Optional[Path]: + """ + Detect which repository a file belongs to using the new workspace_state functions. + Returns the repository root path or None if not under WATCH_ROOT. + """ + try: + # Normalize paths - get current WATCH_ROOT to handle env changes + abs_file = file_path.resolve() + watch_root = Path(os.environ.get("WATCH_ROOT", "/work")).resolve() + abs_root = watch_root + + # File must be under WATCH_ROOT + try: + abs_file.relative_to(abs_root) + except ValueError: + return None + + # Check cache first + file_key = str(abs_file) + cached_result = _repo_cache.get(file_key) + if cached_result is not None: + return cached_result + + # Use new workspace_state function to extract repo name from file path + repo_name = _extract_repo_name_from_path(str(abs_file)) + + # Construct repo path from the detected repo name + # Look for the repo directory under WATCH_ROOT + repo_path = None + rel_path = abs_file.relative_to(abs_root) + path_parts = rel_path.parts + + if not path_parts: + return None + + # Strategy 1: Look for repo with matching name in common locations + # Check immediate directories under WATCH_ROOT + if len(path_parts) >= 1: + potential_repo_name = path_parts[0] + if potential_repo_name and repo_name and (potential_repo_name == repo_name or potential_repo_name.lower() == repo_name.lower()): + repo_path = abs_root / potential_repo_name + if repo_path.exists(): + _repo_cache.put(file_key, repo_path) + return repo_path + + # Strategy 2: Walk up the path hierarchy to find repo root + current_path = abs_file.parent + abs_root_resolved = abs_root.resolve() + + while True: + # Check if current path name matches our detected repo name + if current_path.name == repo_name or current_path.name.lower() == repo_name.lower(): + repo_path = current_path + break + + # Check if current_path has .git + if (current_path / ".git").exists(): + repo_path = current_path + break + + # Stop if we've reached WATCH_ROOT or above it + 
current_resolved = current_path.resolve() + if current_resolved == abs_root_resolved or current_resolved == current_path.parent.resolve(): + break + + current_path = current_path.parent + + # Strategy 3: Fallback to first-level directory under WATCH_ROOT + if repo_path is None: + repo_path = abs_root / path_parts[0] + if not repo_path.exists(): + # If the assumed repo path doesn't exist, fall back to WATCH_ROOT itself + repo_path = abs_root + + # Cache the result + _repo_cache.put(file_key, repo_path) + return repo_path + + except (OSError, ValueError, RuntimeError) as e: + # Log the specific error for debugging if needed + print(f"[repo_detection] Error detecting repo for {file_path}: {e}") + return None + + +def _get_collection_for_repo(repo_path: Path) -> str: + """ + Get the collection name for a repository using new workspace_state functions. + Uses caching to avoid repeated calls. + """ + try: + repo_key = str(repo_path) # repo_path is already resolved + + # Check cache first + cached_collection = _collection_cache.get(repo_key) + if cached_collection is not None: + return cached_collection + + # Extract repo name using new workspace_state function + repo_name = _extract_repo_name_from_path(repo_key) + + # Use new workspace_state function to get collection name + collection_name = get_collection_name(repo_name) + + # Cache the result + _collection_cache.put(repo_key, collection_name) + return collection_name + + except (OSError, ImportError, ValueError) as e: + # Fallback to default collection name with logging + print(f"[collection_detection] Error getting collection for {repo_path}: {e}") + fallback = os.environ.get("COLLECTION_NAME", "my-collection") + return fallback + + +def _get_collection_for_file(file_path: Path) -> str: + """ + Get the collection name for a file by detecting its repository. 
+ """ + # In single-repo mode, always use the global collection + if not is_multi_repo_mode(): + return os.environ.get("COLLECTION_NAME", "my-collection") + + # Multi-repo mode: detect repository for file + repo_path = _detect_repo_for_file(file_path) + + if repo_path: + collection = _get_collection_for_repo(repo_path) + return collection + + # Fallback to default collection + return os.environ.get("COLLECTION_NAME", "my-collection") + + +def _get_remote_client_for_repo(repo_path: Path, remote_clients: dict, remote_config: dict) -> Optional[RemoteUploadClient]: + """ + Get or create a remote upload client for a specific repository. + Uses the new repo-specific metadata structure for delta bundles. + """ + repo_key = str(repo_path) # repo_path is already resolved + + if repo_key in remote_clients: + return remote_clients[repo_key] + + # Create new client for this repository + try: + collection_name = _get_collection_for_repo(repo_path) + + # Extract repo name and get the repo-specific metadata directory + repo_name = _extract_repo_name_from_path(repo_key) + repo_state_dir = _get_repo_state_dir(repo_name) + + # Use the actual repository path as workspace_path for file resolution + # But use the repo-specific metadata directory for delta bundle storage + workspace_path = repo_key # This is the actual repo path where files are located + metadata_path = str(repo_state_dir) # This is where delta bundles are stored + + client = RemoteUploadClient( + upload_endpoint=remote_config["upload_endpoint"], + workspace_path=workspace_path, + collection_name=collection_name, + max_retries=remote_config["max_retries"], + timeout=remote_config["timeout"], + metadata_path=metadata_path + ) + remote_clients[repo_key] = client + print(f"[remote_upload] Created client for repo: {repo_path} -> {collection_name} (workspace: {workspace_path}, metadata: {metadata_path})") + return client + except (OSError, ValueError, ConnectionError, KeyError) as e: + print(f"[remote_upload] Error creating 
client for {repo_path}: {e}") + return None + class ChangeQueue: - def __init__(self, process_cb, remote_client: Optional[RemoteUploadClient] = None): + def __init__(self, process_cb, remote_clients: Optional[dict] = None, remote_config: Optional[dict] = None): self._lock = threading.Lock() self._paths: Set[Path] = set() self._timer: threading.Timer | None = None self._process_cb = process_cb - self._remote_client = remote_client + self._remote_clients = remote_clients or {} + self._remote_config = remote_config def add(self, p: Path): with self._lock: @@ -75,18 +332,60 @@ def _flush(self): paths = list(self._paths) self._paths.clear() self._timer = None - + # Handle remote upload if enabled - if self._remote_client and _REMOTE_UPLOAD_AVAILABLE: + if self._remote_clients and _REMOTE_UPLOAD_AVAILABLE and self._remote_config: try: - success = self._remote_client.process_and_upload_changes(paths) - if success: - print("[remote_upload] Delta upload completed successfully") - else: - print("[remote_upload] Delta upload failed, falling back to local processing") - self._process_cb(paths) + # Group paths by repository for remote upload + repo_groups = {} + for path in paths: + repo_path = _detect_repo_for_file(path) + if repo_path: + repo_key = str(repo_path) # repo_path is already resolved + if repo_key not in repo_groups: + repo_groups[repo_key] = [] + repo_groups[repo_key].append(path) + else: + # Use default client for files not under any repo + if "default" not in repo_groups: + repo_groups["default"] = [] + repo_groups["default"].append(path) + + # Process each repository with its own remote client + all_successful = True + for repo_key, repo_paths in repo_groups.items(): + try: + # Get or create remote client for this repository + if repo_key == "default": + remote_client = self._remote_clients.get("default") + else: + remote_client = _get_remote_client_for_repo( + Path(repo_key), self._remote_clients, self._remote_config + ) + + if remote_client: + success = 
remote_client.process_and_upload_changes(repo_paths) + if not success: + all_successful = False + print(f"[remote_upload] Upload failed for repo {repo_key}, falling back to local processing") + self._process_cb(repo_paths) + else: + print(f"[remote_upload] Upload successful for repo {repo_key}") + else: + all_successful = False + print(f"[remote_upload] No remote client available for repo {repo_key}, falling back to local processing") + self._process_cb(repo_paths) + except Exception as e: + all_successful = False + print(f"[remote_upload] Error during delta upload for repo {repo_key}: {e}") + print("[remote_upload] Falling back to local processing") + self._process_cb(repo_paths) + + if all_successful: + print("[remote_upload] All repository uploads completed successfully") + except Exception as e: - print(f"[remote_upload] Error during delta upload: {e}") + print(f"[remote_upload] Error during multi-repo delta upload: {e}") print("[remote_upload] Falling back to local processing") self._process_cb(paths) else: @@ -94,18 +393,19 @@ def _flush(self): class IndexHandler(FileSystemEventHandler): - def __init__(self, root: Path, queue: ChangeQueue, client: Optional[QdrantClient], collection: str): + def __init__(self, root: Path, queue: ChangeQueue, client: Optional[QdrantClient], default_collection: str = None): super().__init__() self.root = root self.queue = queue self.client = client - self.collection = collection + self.default_collection = default_collection self.excl = idx._Excluder(root) # Track ignore file for live reloads try: ig_name = os.environ.get("QDRANT_IGNORE_FILE", ".qdrantignore") self._ignore_path = (self.root / ig_name).resolve() - except Exception: + except (OSError, ValueError) as e: + print(f"[ignore_file] Could not resolve ignore file path: {e}") self._ignore_path = None self._ignore_mtime = ( (self._ignore_path.stat().st_mtime if self._ignore_path and self._ignore_path.exists() else 0.0) @@ -121,9 +421,11 @@ def _maybe_reload_excluder(self): 
self._ignore_mtime = cur try: print(f"[ignore_reload] reloaded patterns from {self._ignore_path}") - except Exception: + except (OSError, RuntimeError) as e: + print(f"[ignore_reload] Error printing reload message: {e}") pass - except Exception: + except (OSError, IOError) as e: + print(f"[ignore_reload] Error reloading ignore patterns: {e}") pass def _maybe_enqueue(self, src_path: str): @@ -133,16 +435,31 @@ def _maybe_enqueue(self, src_path: str): try: # normalize to absolute within root p = p.resolve() - except Exception: + except (OSError, ValueError): return # skip directories if p.is_dir(): return # ensure file is under root try: - rel = p.resolve().relative_to(self.root.resolve()) + rel = p.relative_to(self.root.resolve()) except ValueError: return + + # NEW: Exclude root-level metadata directory and its contents + try: + # Get the global state directory path and exclude it + if _get_global_state_dir is not None: + global_state_dir = _get_global_state_dir() + if p.is_relative_to(global_state_dir): + return # Skip files in /work/.codebase/ + except (OSError, ValueError): + pass # If we can't determine global state dir, continue processing + + # Skip all .codebase directories (including per-repo ones in multi-repo mode) + if any(part == ".codebase" for part in p.parts): + return + # directory-level excludes (parent dir) rel_dir = "/" + str(rel.parent).replace(os.sep, "/") if rel_dir == "/.": @@ -179,23 +496,32 @@ def on_deleted(self, event): # Only attempt deletion if we have a local client if self.client is not None: try: - idx.delete_points_by_path(self.client, self.collection, str(p)) - print(f"[deleted] {p}") + # Get the correct collection for this file + collection = _get_collection_for_file(p) + idx.delete_points_by_path(self.client, collection, str(p)) + print(f"[deleted] {p} -> {collection}") except Exception: pass else: print(f"[remote_mode] File deletion detected: {p}") - + # Drop local cache entry (always do this) try: - 
remove_cached_file(str(self.root), str(p)) + repo_path = _detect_repo_for_file(p) + if repo_path: + # Use new repo-based cache structure + repo_name = _extract_repo_name_from_path(str(repo_path)) + remove_cached_file(str(p), repo_name) + else: + # Use root as fallback + root_repo_name = _extract_repo_name_from_path(str(self.root)) + remove_cached_file(str(p), root_repo_name) except Exception: pass try: - _log_activity(str(self.root), "deleted", p) - except Exception: - pass + repo_path = _detect_repo_for_file(p) or self.root + _log_activity(str(repo_path), "deleted", p) except Exception as e: try: print(f"[delete_error] {p}: {e}") @@ -216,15 +542,16 @@ def on_moved(self, event): return # If destination directory is ignored, treat as simple deletion try: - rel_dir = "/" + str(dest.parent.resolve().relative_to(self.root.resolve())).replace(os.sep, "/") + rel_dir = "/" + str(dest.parent.relative_to(self.root.resolve())).replace(os.sep, "/") if rel_dir == "/.": rel_dir = "/" if self.excl.exclude_dir(rel_dir): if src.suffix.lower() in idx.CODE_EXTS: if self.client is not None: try: - idx.delete_points_by_path(self.client, self.collection, str(src)) - print(f"[moved:ignored_dest_deleted_src] {src} -> {dest}") + src_collection = _get_collection_for_file(src) + idx.delete_points_by_path(self.client, src_collection, str(src)) + print(f"[moved:ignored_dest_deleted_src] {src} -> {dest} (from {src_collection})") except Exception: pass else: @@ -236,31 +563,42 @@ def on_moved(self, event): moved_count = -1 if self.client is not None: try: - moved_count = _rename_in_store(self.client, self.collection, src, dest) + # Get collections for source and destination + src_collection = _get_collection_for_file(src) + dest_collection = _get_collection_for_file(dest) + moved_count = _rename_in_store(self.client, src_collection, src, dest, dest_collection) except Exception: moved_count = -1 if moved_count and moved_count > 0: try: - print(f"[moved] {src} -> {dest} ({moved_count} chunk(s) 
relinked)") + src_collection = _get_collection_for_file(src) + print(f"[moved] {src} -> {dest} ({moved_count} chunk(s) relinked from {src_collection})") # Update local cache: carry hash from src to dest if present prev_hash = None + src_repo = _detect_repo_for_file(src) + dest_repo = _detect_repo_for_file(dest) try: - prev_hash = get_cached_file_hash(str(self.root), str(src)) + # Use new repo-based cache structure + src_repo_name = _extract_repo_name_from_path(str(src_repo or self.root)) + prev_hash = get_cached_file_hash(str(src), src_repo_name) except Exception: prev_hash = None if prev_hash: try: - set_cached_file_hash(str(self.root), str(dest), prev_hash) + # Use new repo-based cache structure + dest_repo_name = _extract_repo_name_from_path(str(dest_repo or self.root)) + set_cached_file_hash(str(dest), prev_hash, dest_repo_name) except Exception: pass try: - remove_cached_file(str(self.root), str(src)) + remove_cached_file(str(src), src_repo_name) except Exception: pass except Exception: pass try: - _log_activity(str(self.root), "moved", dest, {"from": str(src), "chunks": int(moved_count)}) + repo_path = _detect_repo_for_file(dest) or self.root + _log_activity(str(repo_path), "moved", dest, {"from": str(src), "chunks": int(moved_count)}) except Exception: pass return @@ -268,8 +606,9 @@ def on_moved(self, event): if self.client is not None: try: if src.suffix.lower() in idx.CODE_EXTS: - idx.delete_points_by_path(self.client, self.collection, str(src)) - print(f"[moved:deleted_src] {src}") + src_collection = _get_collection_for_file(src) + idx.delete_points_by_path(self.client, src_collection, str(src)) + print(f"[moved:deleted_src] {src} from {src_collection}") except Exception: pass else: @@ -282,14 +621,17 @@ def on_moved(self, event): # --- Workspace state helpers --- def _set_status_indexing(workspace_path: str, total_files: int) -> None: try: - update_indexing_status( - workspace_path, - { - "state": "indexing", - "started_at": datetime.now().isoformat(), 
- "progress": {"files_processed": 0, "total_files": int(total_files)}, - }, - ) + # Extract repo name to use new structure + repo_name = _extract_repo_name_from_path(workspace_path) + if update_indexing_status is not None: + update_indexing_status( + repo_name=repo_name, + status={ + "state": "indexing", + "started_at": datetime.now().isoformat(), + "progress": {"files_processed": 0, "total_files": int(total_files)}, + }, + ) except Exception: pass @@ -298,44 +640,66 @@ def _update_progress( workspace_path: str, started_at: str, processed: int, total: int, current_file: Path | None ) -> None: try: - update_indexing_status( - workspace_path, - { - "state": "indexing", - "started_at": started_at, - "progress": { - "files_processed": int(processed), - "total_files": int(total), - "current_file": str(current_file) if current_file else None, + # Extract repo name to use new structure + repo_name = _extract_repo_name_from_path(workspace_path) + if update_indexing_status is not None: + update_indexing_status( + repo_name=repo_name, + status={ + "state": "indexing", + "started_at": started_at, + "progress": { + "files_processed": int(processed), + "total_files": int(total), + "current_file": str(current_file) if current_file else None, + }, }, - }, - ) + ) except Exception: pass def _log_activity(workspace_path: str, action: str, file_path: Path, details: dict | None = None) -> None: try: - update_last_activity( - workspace_path, - { - "timestamp": datetime.now().isoformat(), - "action": action, - "file_path": str(file_path), - "details": details or {}, - }, + # Extract repo name from workspace path to use new structure + repo_name = _extract_repo_name_from_path(workspace_path) + + # Import log_activity from workspace_state + from scripts.workspace_state import log_activity + + # Convert action to match expected ActivityAction type + valid_actions = {'indexed', 'deleted', 'skipped', 'scan-completed', 'initialized', 'moved'} + if action not in valid_actions: + action = 
'indexed' # Default fallback + + # Import ActivityAction for type checking + from scripts.workspace_state import ActivityAction + if isinstance(action, str): + # Convert string to proper ActivityAction format + action = action # type: ignore # The function will validate the action + + # Use new log_activity function with repo-based structure + log_activity( + repo_name=repo_name, + action=action, # type: ignore + file_path=str(file_path), + details=details ) except Exception: pass # --- Move/Rename optimization: reuse vectors when file content unchanged --- -def _rename_in_store(client: QdrantClient, collection: str, src: Path, dest: Path) -> int: +def _rename_in_store(client: QdrantClient, src_collection: str, src: Path, dest: Path, dest_collection: str = None) -> int: """Best-effort: if dest content hash matches previously indexed src hash, update points in-place to the new path without re-embedding. + Supports cross-collection moves when dest_collection is different from src_collection. + Returns number of points moved, or -1 if not applicable/failure. 
""" + if dest_collection is None: + dest_collection = src_collection try: if not dest.exists() or dest.is_dir(): return -1 @@ -344,7 +708,7 @@ def _rename_in_store(client: QdrantClient, collection: str, src: Path, dest: Pat except Exception: return -1 dest_hash = hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest() - prev = idx.get_indexed_file_hash(client, collection, str(src)) + prev = idx.get_indexed_file_hash(client, src_collection, str(src)) if not prev or prev != dest_hash: return -1 @@ -359,7 +723,7 @@ def _rename_in_store(client: QdrantClient, collection: str, src: Path, dest: Pat ] ) points, next_offset = client.scroll( - collection_name=collection, + collection_name=src_collection, scroll_filter=filt, with_payload=True, with_vectors=True, @@ -416,11 +780,11 @@ def _rename_in_store(client: QdrantClient, collection: str, src: Path, dest: Pat except Exception: continue if new_points: - idx.upsert_points(client, collection, new_points) + idx.upsert_points(client, dest_collection, new_points) moved += len(new_points) try: - idx.delete_points_by_path(client, collection, str(src)) + idx.delete_points_by_path(client, src_collection, str(src)) except Exception: pass return moved @@ -431,56 +795,89 @@ def _rename_in_store(client: QdrantClient, collection: str, src: Path, dest: Pat def main(): # Check if remote mode is enabled remote_mode = False - remote_client = None - + remote_clients = {} # Map repo paths to remote clients + if _REMOTE_UPLOAD_AVAILABLE and is_remote_mode_enabled(): remote_mode = True try: remote_config = get_remote_config() - remote_client = RemoteUploadClient( - upload_endpoint=remote_config["upload_endpoint"], - workspace_path=remote_config["workspace_path"], - collection_name=remote_config["collection_name"], - max_retries=remote_config["max_retries"], - timeout=remote_config["timeout"] - ) + + # For multi-repo support, we'll create remote clients on-demand for each repository + # The base configuration will be used, but collection 
names will be determined per-repo print(f"[remote_upload] Remote mode enabled: {remote_config['upload_endpoint']}") - - # Check server status - status = remote_client.get_server_status() - if status.get("success", False): - print(f"[remote_upload] Server status: {status.get('status', 'unknown')}") - else: - print(f"[remote_upload] Warning: Could not reach server - {status.get('error', {}).get('message', 'Unknown error')}") - + print("[remote_upload] Multi-repo remote support - will create clients per repository") + + # Create a default client for backward compatibility + try: + # For the default client, use the global metadata directory to avoid permission issues + if _get_global_state_dir is not None: + global_state_dir = _get_global_state_dir() + default_workspace_path = str(global_state_dir) + else: + # Fallback if function is not available + default_workspace_path = "/work" + + default_remote_client = RemoteUploadClient( + upload_endpoint=remote_config["upload_endpoint"], + workspace_path=default_workspace_path, + collection_name=remote_config["collection_name"], + max_retries=remote_config["max_retries"], + timeout=remote_config["timeout"] + ) + + # Check server status + status = default_remote_client.get_server_status() + if status.get("success", False): + print(f"[remote_upload] Server status: {status.get('status', 'unknown')}") + else: + print(f"[remote_upload] Warning: Could not reach server - {status.get('error', {}).get('message', 'Unknown error')}") + + # Store as default client (will be used for single-repo scenarios) + remote_clients["default"] = default_remote_client + print(f"[remote_upload] Default client initialized with workspace: {default_workspace_path}") + + except Exception as e: + print(f"[remote_upload] Error initializing default remote client: {e}") + print("[remote_upload] Will create clients per-repository as needed") + except Exception as e: - print(f"[remote_upload] Error initializing remote client: {e}") + print(f"[remote_upload] 
Error initializing remote mode: {e}") print("[remote_upload] Falling back to local mode") remote_mode = False - remote_client = None - - # Resolve collection name from workspace state before any client/state ops + remote_clients = {} + + # Determine collection and mode based on MULTI_REPO_MODE setting try: from scripts.workspace_state import get_collection_name as _get_coll except Exception: _get_coll = None - global COLLECTION - try: - if _get_coll: - COLLECTION = _get_coll(str(ROOT)) - except Exception: - pass + + multi_repo_enabled = is_multi_repo_mode() if is_multi_repo_mode else False + + if multi_repo_enabled: + # Multi-repo mode: use per-repo collections + default_collection = os.environ.get("COLLECTION_NAME", "my-collection") + try: + if _get_coll: + default_collection = _get_coll(str(ROOT)) + except Exception: + pass + print("[multi_repo] Multi-repo mode enabled - files will be routed to per-repo collections") + else: + # Single-repo mode: use one collection for everything + default_collection = os.environ.get("COLLECTION_NAME", "my-collection") + print("[single_repo] Single-repo mode enabled - using single collection for all files") mode_str = "REMOTE" if remote_mode else "LOCAL" print( - f"Watch mode: {mode_str} root={ROOT} qdrant={QDRANT_URL} collection={COLLECTION} model={MODEL}" + f"Watch mode: {mode_str} root={ROOT} qdrant={QDRANT_URL} collection={default_collection} model={MODEL}" ) # Initialize Qdrant client for local mode (remote mode doesn't need it for basic operation) client = None model = None vector_name = None - + if not remote_mode: client = QdrantClient( url=QDRANT_URL, timeout=int(os.environ.get("QDRANT_TIMEOUT", "20") or 20) @@ -490,9 +887,9 @@ def main(): model = TextEmbedding(model_name=MODEL) dim = len(next(model.embed(["dimension probe"]))) - # Determine dense vector name deterministically + # Determine dense vector name deterministically (use default collection as reference) try: - info = client.get_collection(COLLECTION) + info = 
client.get_collection(default_collection) cfg = info.config.params.vectors if isinstance(cfg, dict) and cfg: # Prefer vector whose size matches embedding dim @@ -515,27 +912,52 @@ def main(): except Exception: vector_name = idx._sanitize_vector_name(MODEL) - # Ensure collection + payload indexes exist + # Ensure default collection + payload indexes exist try: - idx.ensure_collection(client, COLLECTION, dim, vector_name) + idx.ensure_collection(client, default_collection, dim, vector_name) except Exception: pass - idx.ensure_payload_indexes(client, COLLECTION) + idx.ensure_payload_indexes(client, default_collection) - # Ensure workspace state exists and set collection + # Ensure workspace state exists and set collection based on mode try: - update_workspace_state(str(ROOT), {"qdrant_collection": COLLECTION}) - update_indexing_status(str(ROOT), {"state": "watching"}) - except Exception: + if multi_repo_enabled: + # Multi-repo mode: use per-repo state structure + root_repo_name = _extract_repo_name_from_path(str(ROOT)) + if not root_repo_name: + print("[workspace_state] Multi-repo: Root path is not a repo; skipping root state initialization") + else: + root_collection = get_collection_name(root_repo_name) + update_indexing_status( + repo_name=root_repo_name, + status={"state": "watching"}, + ) + print( + f"[workspace_state] Multi-repo: Initialized state for repo: {root_repo_name} -> {root_collection}" + ) + else: + # Single-repo mode: use original workspace state structure + update_workspace_state( + workspace_path=str(ROOT), + updates={"qdrant_collection": default_collection}, + ) + update_indexing_status(status={"state": "watching"}) + print(f"[workspace_state] Single-repo: Initialized state for workspace: {str(ROOT)} -> {default_collection}") + except Exception as e: + print(f"[workspace_state] Error initializing workspace state: {e}") pass - # Create change queue with remote client if enabled + # Create change queue with remote clients if enabled if remote_mode: - 
q = ChangeQueue(lambda paths: _process_paths(paths, client, model, vector_name, str(ROOT), remote_mode), remote_client) + q = ChangeQueue( + lambda paths: _process_paths(paths, client, model, vector_name, str(ROOT), remote_mode), + remote_clients=remote_clients, + remote_config=get_remote_config() if _REMOTE_UPLOAD_AVAILABLE else None + ) else: q = ChangeQueue(lambda paths: _process_paths(paths, client, model, vector_name, str(ROOT), remote_mode)) - - handler = IndexHandler(ROOT, q, client, COLLECTION) + + handler = IndexHandler(ROOT, q, client, default_collection) obs = Observer() obs.schedule(handler, str(ROOT), recursive=True) @@ -556,39 +978,64 @@ def _process_paths(paths, client, model, vector_name: str, workspace_path: str, # This function is called as a fallback when remote upload fails if remote_mode: print(f"[local_fallback] Processing {len(paths)} files locally due to remote upload failure") - + # Prepare progress unique_paths = sorted(set(Path(x) for x in paths)) total = len(unique_paths) started_at = datetime.now().isoformat() - try: - update_indexing_status( - workspace_path, - { - "state": "indexing", - "started_at": started_at, - "progress": {"files_processed": 0, "total_files": total}, - }, - ) - except Exception: - pass + + # Group files by repository for progress tracking + repo_groups = {} + for p in unique_paths: + repo_path = _detect_repo_for_file(p) or Path(workspace_path) + if str(repo_path) not in repo_groups: + repo_groups[str(repo_path)] = [] + repo_groups[str(repo_path)].append(p) + + # Initialize progress for all repositories + for repo_path, repo_files in repo_groups.items(): + try: + # Extract repo name to use new structure + repo_name = _extract_repo_name_from_path(repo_path) + update_indexing_status( + repo_name=repo_name, + status={ + "state": "indexing", + "started_at": started_at, + "progress": {"files_processed": 0, "total_files": len(repo_files)}, + }, + ) + except Exception: + pass processed = 0 for p in unique_paths: current = 
p + + # Get collection for this file + collection = _get_collection_for_file(p) + repo_path = _detect_repo_for_file(p) or Path(workspace_path) + + if not p.exists(): # File was removed; ensure its points are deleted if client is not None: # Only process if we have a local client try: - idx.delete_points_by_path(client, COLLECTION, str(p)) - print(f"[deleted] {p}") + idx.delete_points_by_path(client, collection, str(p)) + print(f"[deleted] {p} -> {collection}") except Exception: pass - _log_activity(workspace_path, "deleted", p) + _log_activity(str(repo_path), "deleted", p) processed += 1 - _update_progress(workspace_path, started_at, processed, total, current) + # Update progress for the specific repository + try: + repo_files = repo_groups[str(repo_path)] + repo_processed = len([f for f in repo_files[:processed] if not f.exists()]) + _update_progress(str(repo_path), started_at, repo_processed, len(repo_files), current) + except Exception: + pass continue - + # Only process files locally if we have a client and model if client is not None and model is not None: # Lazily instantiate model if needed @@ -596,32 +1043,57 @@ def _process_paths(paths, client, model, vector_name: str, workspace_path: str, from fastembed import TextEmbedding mname = os.environ.get("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5") model = TextEmbedding(model_name=mname) + + # Ensure collection exists for this repo + try: + idx.ensure_collection(client, collection, len(next(model.embed(["dimension probe"]))), vector_name) + idx.ensure_payload_indexes(client, collection) + except Exception: + pass + + print(f"[DEBUG] Indexing file with path: {p}") ok = idx.index_single_file( - client, model, COLLECTION, vector_name, p, dedupe=True, skip_unchanged=False + client, model, collection, vector_name, p, dedupe=True, skip_unchanged=False ) status = "indexed" if ok else "skipped" - print(f"[{status}] {p}") + print(f"[{status}] {p} -> {collection}") if ok: try: size = int(p.stat().st_size) except Exception: 
size = None - _log_activity(workspace_path, "indexed", p, {"file_size": size}) + _log_activity(str(repo_path), "indexed", p, {"file_size": size}) else: - _log_activity(workspace_path, "skipped", p, {"reason": "no-change-or-error"}) + _log_activity(str(repo_path), "skipped", p, {"reason": "no-change-or-error"}) else: # In remote mode without fallback, just log activity print(f"[remote_mode] Not processing locally: {p}") - _log_activity(workspace_path, "remote_processed", p) - + _log_activity(str(repo_path), "indexed", p, {"reason": "remote_processed"}) + processed += 1 - _update_progress(workspace_path, started_at, processed, total, current) + # Update progress for the specific repository + try: + repo_files = repo_groups[str(repo_path)] + repo_processed = len([f for f in repo_files if f in unique_paths[:processed]]) + _update_progress(str(repo_path), started_at, repo_processed, len(repo_files), current) - # Return to watching state - try: - update_indexing_status(workspace_path, {"state": "watching"}) - except Exception: - pass + # Log cache stats periodically (every 50 files processed) + if processed % 50 == 0: + _log_cache_stats() + except Exception: + pass + + # Return to watching state for all repositories + for repo_path in repo_groups.keys(): + try: + # Extract repo name to use new structure + repo_name = _extract_repo_name_from_path(repo_path) + update_indexing_status( + repo_name=repo_name, + status={"state": "watching"}, + ) + except Exception: + pass if __name__ == "__main__": diff --git a/scripts/workspace_state.py b/scripts/workspace_state.py index dbffb45f..51f8d5c3 100644 --- a/scripts/workspace_state.py +++ b/scripts/workspace_state.py @@ -6,25 +6,28 @@ - Collection information and indexing status - Progress tracking during indexing operations - Activity logging with structured metadata -- Multi-project support with per-workspace state files - -Based on the codebase-index-cli workspace state pattern but adapted for our Python ecosystem. 
+- Multi-repo support with per-repo state files """ import json import os import uuid -import re -import hashlib import subprocess +import hashlib from datetime import datetime from pathlib import Path from typing import Dict, Any, Optional, List, Literal, TypedDict import threading -# Type definitions matching codebase-index-cli patterns +# Type definitions IndexingState = Literal['idle', 'initializing', 'scanning', 'indexing', 'watching', 'error'] ActivityAction = Literal['indexed', 'deleted', 'skipped', 'scan-completed', 'initialized', 'moved'] +# Constants +STATE_DIRNAME = ".codebase" +STATE_FILENAME = "state.json" +CACHE_FILENAME = "cache.json" +PLACEHOLDER_COLLECTION_NAMES = {"", "default-collection", "my-collection"} + class IndexingProgress(TypedDict, total=False): files_processed: int total_files: Optional[int] @@ -52,112 +55,91 @@ class LastActivity(TypedDict, total=False): file_path: Optional[str] details: Optional[ActivityDetails] -class QdrantStats(TypedDict, total=False): - total_vectors: int - unique_files: int - vector_dimension: int - last_updated: str - collection_name: str - class WorkspaceState(TypedDict, total=False): - workspace_path: str created_at: str updated_at: str qdrant_collection: str indexing_status: Optional[IndexingStatus] last_activity: Optional[LastActivity] - qdrant_stats: Optional[QdrantStats] + qdrant_stats: Optional[Dict[str, Any]] -# Constants -STATE_DIRNAME = ".codebase" -STATE_FILENAME = "state.json" +def is_multi_repo_mode() -> bool: + """Check if multi-repo mode is enabled.""" + return os.environ.get("MULTI_REPO_MODE", "0").strip().lower() in { + "1", "true", "yes", "on" + } -# Thread-safe state management -# Use re-entrant locks to avoid deadlocks when helper functions call each other +# Simple locking for concurrent access _state_locks: Dict[str, threading.RLock] = {} -_state_lock = threading.Lock() -def _get_state_lock(workspace_path: str) -> threading.RLock: - """Get or create a thread-safe lock for a specific 
workspace.""" - with _state_lock: - if workspace_path not in _state_locks: - _state_locks[workspace_path] = threading.RLock() - return _state_locks[workspace_path] +def _resolve_workspace_root() -> str: + """Determine the default workspace root path.""" + return os.environ.get("WORKSPACE_PATH") or os.environ.get("WATCH_ROOT") or "/work" -def _get_state_path(workspace_path: str) -> Path: - """Get the path to the state.json file for a workspace.""" - workspace = Path(workspace_path).resolve() - state_dir = workspace / STATE_DIRNAME - return state_dir / STATE_FILENAME +def _resolve_repo_context( + workspace_path: Optional[str] = None, + repo_name: Optional[str] = None, +) -> tuple[str, Optional[str]]: + """Normalize workspace/repo context, ensuring multi-repo callers map to repo state.""" + resolved_workspace = workspace_path or _resolve_workspace_root() -def _ensure_state_dir(workspace_path: str) -> Path: - """Ensure the .codebase directory exists and return the state file path.""" - workspace = Path(workspace_path).resolve() - state_dir = workspace / STATE_DIRNAME - state_dir.mkdir(exist_ok=True) - return state_dir / STATE_FILENAME + if is_multi_repo_mode(): + if repo_name: + return resolved_workspace, repo_name -def _sanitize_name(s: str, max_len: int = 64) -> str: - s = s.lower().strip() - s = re.sub(r"[^a-z0-9_.-]+", "-", s) - s = re.sub(r"-+", "-", s).strip("-") - if not s: - s = "workspace" - return s[:max_len] + if workspace_path: + detected = _detect_repo_name_from_path(Path(workspace_path)) + if detected: + return resolved_workspace, detected + return resolved_workspace, None -def _detect_repo_name_from_path(path: Path) -> str: - try: - base = path if path.is_dir() else path.parent - r = subprocess.run(["git", "-C", str(base), "rev-parse", "--show-toplevel"], - capture_output=True, text=True) - top = (r.stdout or "").strip() - if r.returncode == 0 and top: - return Path(top).name - except Exception: - pass - try: - # Walk up to find .git - cur = path if 
path.is_dir() else path.parent - for p in [cur] + list(cur.parents): - try: - if (p / ".git").exists(): - return p.name - except Exception: - continue - except Exception: - pass - return (path if path.is_dir() else path.parent).name or "workspace" + return resolved_workspace, repo_name +def _get_state_lock(workspace_path: Optional[str] = None, repo_name: Optional[str] = None) -> threading.RLock: + """Get or create a lock for the workspace or repo state.""" + if workspace_path: + key = str(Path(workspace_path).resolve()) + elif repo_name: + key = f"repo::{repo_name}" + else: + key = str(Path(_resolve_workspace_root()).resolve()) -def _generate_collection_name(workspace_path: str) -> str: - ws = Path(workspace_path).resolve() - repo = _sanitize_name(_detect_repo_name_from_path(ws)) - # stable suffix from absolute path - h = hashlib.sha1(str(ws).encode("utf-8", errors="ignore")).hexdigest()[:6] - return _sanitize_name(f"{repo}-{h}") + if key not in _state_locks: + _state_locks[key] = threading.RLock() + return _state_locks[key] -def _atomic_write_state(state_path: Path, state: WorkspaceState) -> None: - """Atomically write state to prevent corruption during concurrent access.""" - # Write to temp file first, then rename (atomic on most filesystems) - temp_path = state_path.with_suffix(f".tmp.{uuid.uuid4().hex[:8]}") - try: - with open(temp_path, 'w', encoding='utf-8') as f: - json.dump(state, f, indent=2, ensure_ascii=False) - temp_path.replace(state_path) - except Exception: - # Clean up temp file if something went wrong - try: - temp_path.unlink(missing_ok=True) - except Exception: - pass - raise +def _get_repo_state_dir(repo_name: str) -> Path: + """Get the state directory for a repository.""" + # Use workspace root (typically /work in containers) not script directory + base_dir = Path(os.environ.get("WORKSPACE_PATH") or os.environ.get("WATCH_ROOT") or "/work") + if is_multi_repo_mode(): + return base_dir / STATE_DIRNAME / "repos" / repo_name + return base_dir / 
STATE_DIRNAME -def get_workspace_state(workspace_path: str) -> WorkspaceState: +def _get_state_path(workspace_path: str) -> Path: + """Get the path to the state.json file for a workspace.""" + workspace = Path(workspace_path).resolve() + state_dir = workspace / STATE_DIRNAME + return state_dir / STATE_FILENAME + +def get_workspace_state(workspace_path: Optional[str] = None, repo_name: Optional[str] = None) -> WorkspaceState: """Get the current workspace state, creating it if it doesn't exist.""" - lock = _get_state_lock(workspace_path) + workspace_path, repo_name = _resolve_repo_context(workspace_path, repo_name) + + if is_multi_repo_mode() and repo_name is None: + print( + f"[workspace_state] Multi-repo: Skipping state read for workspace={workspace_path} without repo_name" + ) + return {} + + lock = _get_state_lock(workspace_path, repo_name) with lock: - state_path = _get_state_path(workspace_path) + # In multi-repo mode, use repo-based state path + if is_multi_repo_mode() and repo_name: + state_path = _get_repo_state_dir(repo_name) / STATE_FILENAME + else: + state_path = _get_state_path(workspace_path) if state_path.exists(): try: @@ -172,108 +154,189 @@ def get_workspace_state(workspace_path: str) -> WorkspaceState: pass # Create new state - now = datetime.now().isoformat() - env_coll = os.environ.get("COLLECTION_NAME") - if isinstance(env_coll, str) and env_coll.strip() and env_coll.strip() != "my-collection": - collection_name = env_coll.strip() - else: - collection_name = _generate_collection_name(workspace_path) - - state: WorkspaceState = { - "workspace_path": str(Path(workspace_path).resolve()), - "created_at": now, - "updated_at": now, - "qdrant_collection": collection_name, - "indexing_status": { - "state": "idle" - } + state = { + "created_at": datetime.now().isoformat(), + "updated_at": datetime.now().isoformat(), + "qdrant_collection": get_collection_name(repo_name), + "indexing_status": {"state": "idle"}, } - # Ensure directory exists and write state 
- state_path = _ensure_state_dir(workspace_path) - _atomic_write_state(state_path, state) + # Write state + state_path.parent.mkdir(parents=True, exist_ok=True) + with open(state_path, 'w', encoding='utf-8') as f: + json.dump(state, f, ensure_ascii=False, indent=2) + return state -def update_workspace_state(workspace_path: str, updates: Dict[str, Any]) -> WorkspaceState: - """Update workspace state with the given changes.""" - lock = _get_state_lock(workspace_path) +def update_workspace_state( + workspace_path: Optional[str] = None, + updates: Optional[Dict[str, Any]] = None, + repo_name: Optional[str] = None, +) -> WorkspaceState: + """Update workspace state with new values.""" + workspace_path, repo_name = _resolve_repo_context(workspace_path, repo_name) + updates = updates or {} + + if is_multi_repo_mode() and repo_name is None: + print( + f"[workspace_state] Multi-repo: Skipping state update for workspace={workspace_path} without repo_name" + ) + return {} + + lock = _get_state_lock(workspace_path, repo_name) with lock: - state = get_workspace_state(workspace_path) + state = get_workspace_state(workspace_path, repo_name) + state.update(updates) + state["updated_at"] = datetime.now().isoformat() - # Apply updates - for key, value in updates.items(): - if key in state or key in WorkspaceState.__annotations__: - state[key] = value + # Write updated state using same path logic as get_workspace_state + if is_multi_repo_mode() and repo_name: + state_path = _get_repo_state_dir(repo_name) / STATE_FILENAME + else: + state_path = _get_state_path(workspace_path) - # Always update timestamp - state["updated_at"] = datetime.now().isoformat() + with open(state_path, 'w', encoding='utf-8') as f: + json.dump(state, f, ensure_ascii=False, indent=2) - # Write back to file - state_path = _ensure_state_dir(workspace_path) - _atomic_write_state(state_path, state) return state -def update_indexing_status(workspace_path: str, status: IndexingStatus) -> WorkspaceState: - """Update the 
indexing status in workspace state.""" - return update_workspace_state(workspace_path, {"indexing_status": status}) - -def update_last_activity(workspace_path: str, activity: LastActivity) -> WorkspaceState: - """Update the last activity in workspace state.""" - return update_workspace_state(workspace_path, {"last_activity": activity}) - -def update_qdrant_stats(workspace_path: str, stats: QdrantStats) -> WorkspaceState: - """Update Qdrant statistics in workspace state.""" - stats["last_updated"] = datetime.now().isoformat() - return update_workspace_state(workspace_path, {"qdrant_stats": stats}) - -def get_collection_name(workspace_path: str) -> str: - """Get the Qdrant collection name for a workspace. - If none is present in state, persist either COLLECTION_NAME from env or a generated - repoName- based on the workspace path, and return it. - - Fix: treat placeholders as not-real so we don't collide across repos. - Placeholders include: empty string, "my-collection", and the env default if it equals "my-collection". - Only short-circuit when the stored name is already real. 
- """ - state = get_workspace_state(workspace_path) - coll = state.get("qdrant_collection") if isinstance(state, dict) else None - env_coll = os.environ.get("COLLECTION_NAME") - env_coll = env_coll.strip() if isinstance(env_coll, str) else "" - placeholders = {"", "my-collection"} - # If env is explicitly the default placeholder, consider it a placeholder too - if env_coll == "my-collection": - placeholders.add(env_coll) - - # If state has a real (non-placeholder) collection, keep it - if isinstance(coll, str): - c = coll.strip() - if c and c not in placeholders: - return c - - # Otherwise, prefer a non-placeholder explicit env override; else generate - if env_coll and env_coll not in placeholders: - coll = env_coll.strip() +def update_indexing_status( + workspace_path: Optional[str] = None, + status: Optional[IndexingStatus] = None, + repo_name: Optional[str] = None, +) -> WorkspaceState: + """Update indexing status in workspace state.""" + workspace_path, repo_name = _resolve_repo_context(workspace_path, repo_name) + + if is_multi_repo_mode() and repo_name is None: + print( + f"[workspace_state] Multi-repo: Skipping indexing status update for workspace={workspace_path} without repo_name" + ) + return {} + + if status is None: + status = {"state": "idle"} + + return update_workspace_state( + workspace_path=workspace_path, + updates={"indexing_status": status}, + repo_name=repo_name, + ) + +def log_activity(repo_name: Optional[str] = None, action: Optional[ActivityAction] = None, + file_path: Optional[str] = None, details: Optional[ActivityDetails] = None) -> None: + """Log activity to workspace state.""" + if not action: + return + + activity = { + "timestamp": datetime.now().isoformat(), + "action": action, + "file_path": file_path, + "details": details or {} + } + + if is_multi_repo_mode() and repo_name: + # Multi-repo mode: use repo-based state + state_dir = _get_repo_state_dir(repo_name) + state_path = state_dir / STATE_FILENAME + + 
state_path.parent.mkdir(parents=True, exist_ok=True) + + if state_path.exists(): + try: + with open(state_path, 'r', encoding='utf-8') as f: + state = json.load(f) + except (json.JSONDecodeError, OSError): + state = {"created_at": datetime.now().isoformat()} + else: + state = {"created_at": datetime.now().isoformat()} + + state["last_activity"] = activity + state["updated_at"] = datetime.now().isoformat() + + with open(state_path, 'w', encoding='utf-8') as f: + json.dump(state, f, ensure_ascii=False, indent=2) else: - coll = _generate_collection_name(workspace_path) - update_workspace_state(workspace_path, {"qdrant_collection": coll}) - return coll + # Single-repo mode: use workspace-based state (not implemented here) + pass -# --- Persistent file-hash cache (.codebase/cache.json) --- -CACHE_FILENAME = "cache.json" +def _generate_collection_name_from_repo(repo_name: str) -> str: + """Generate a collection name from repository name with hash suffix.""" + # Create a short hash from repo name to ensure uniqueness + hash_obj = hashlib.sha256(repo_name.encode()) + short_hash = hash_obj.hexdigest()[:8] + return f"{repo_name}-{short_hash}" +def get_collection_name(repo_name: Optional[str] = None) -> str: + """Get collection name for repository or workspace.""" + # In multi-repo mode, prioritize repo-specific collection names + if is_multi_repo_mode() and repo_name: + return _generate_collection_name_from_repo(repo_name) -def _get_cache_path(workspace_path: str) -> Path: - ws = Path(workspace_path).resolve() - return ws / STATE_DIRNAME / CACHE_FILENAME + # Check environment for single-repo mode or fallback + env_coll = os.environ.get("COLLECTION_NAME", "").strip() + if env_coll and env_coll not in PLACEHOLDER_COLLECTION_NAMES: + return env_coll + + # Use repo name if provided (for single-repo mode with repo name) + if repo_name: + return _generate_collection_name_from_repo(repo_name) + + # Default fallback + return "global-collection" + +def 
_detect_repo_name_from_path(path: Path) -> str: + """Detect repository name from path. Clean, robust implementation.""" + try: + # Normalize path + resolved_path = path.resolve() + + # Get workspace root + workspace_root = Path(os.environ.get("WATCH_ROOT") or os.environ.get("WORKSPACE_PATH") or "/work") + + # Path must be under workspace root + try: + rel_path = resolved_path.relative_to(workspace_root) + except ValueError: + return None # Path is outside workspace root + + # Get first path component as repo name + if rel_path.parts: + repo_name = rel_path.parts[0] + + # Exclude system directories + if repo_name in (".codebase", ".git", "__pycache__"): + return None + + # Verify the path exists (handles bindmounts) + repo_path = workspace_root / repo_name + if repo_path.exists() or str(resolved_path).startswith(str(repo_path) + "/"): + return repo_name + + return None # Not a valid repo path + + except Exception: + return None +def _extract_repo_name_from_path(workspace_path: str) -> str: + """Extract repository name from workspace path.""" + return _detect_repo_name_from_path(Path(workspace_path)) + +# Cache functions for file hash tracking +def _get_cache_path(workspace_path: str) -> Path: + """Get the path to the cache.json file.""" + workspace = Path(workspace_path).resolve() + return workspace / STATE_DIRNAME / CACHE_FILENAME def _read_cache(workspace_path: str) -> Dict[str, Any]: - """Best-effort load of the workspace cache (file hashes keyed by absolute path).""" + """Read cache file, return empty dict if doesn't exist.""" + cache_path = _get_cache_path(workspace_path) + if not cache_path.exists(): + return {"file_hashes": {}, "updated_at": datetime.now().isoformat()} + try: - p = _get_cache_path(workspace_path) - if not p.exists(): - return {"file_hashes": {}, "updated_at": datetime.now().isoformat()} - with open(p, "r", encoding="utf-8") as f: + with open(cache_path, 'r', encoding='utf-8') as f: obj = json.load(f) if isinstance(obj, dict) and 
isinstance(obj.get("file_hashes"), dict): return obj @@ -281,103 +344,129 @@ def _read_cache(workspace_path: str) -> Dict[str, Any]: except Exception: return {"file_hashes": {}, "updated_at": datetime.now().isoformat()} - def _write_cache(workspace_path: str, cache: Dict[str, Any]) -> None: - """Atomic write of cache file to avoid corruption under concurrency.""" - lock = _get_state_lock(workspace_path) - with lock: - state_dir = Path(workspace_path).resolve() / STATE_DIRNAME - state_dir.mkdir(exist_ok=True) - cache_path = _get_cache_path(workspace_path) - tmp = cache_path.with_suffix(f".tmp.{uuid.uuid4().hex[:8]}") + """Write cache file atomically.""" + cache_path = _get_cache_path(workspace_path) + cache_path.parent.mkdir(parents=True, exist_ok=True) + + tmp = cache_path.with_suffix(f".tmp.{uuid.uuid4().hex[:8]}") + try: + with open(tmp, "w", encoding="utf-8") as f: + json.dump(cache, f, ensure_ascii=False, indent=2) + tmp.replace(cache_path) + finally: try: - with open(tmp, "w", encoding="utf-8") as f: - json.dump(cache, f, ensure_ascii=False, indent=2) - tmp.replace(cache_path) - finally: + tmp.unlink(missing_ok=True) + except Exception: + pass + +def get_cached_file_hash(file_path: str, repo_name: Optional[str] = None) -> str: + """Get cached file hash for tracking changes.""" + if is_multi_repo_mode() and repo_name: + state_dir = _get_repo_state_dir(repo_name) + cache_path = state_dir / CACHE_FILENAME + + if cache_path.exists(): try: - tmp.unlink(missing_ok=True) + with open(cache_path, 'r', encoding='utf-8') as f: + cache = json.load(f) + file_hashes = cache.get("file_hashes", {}) + return file_hashes.get(str(Path(file_path).resolve()), "") except Exception: pass + return "" -def get_cached_file_hash(workspace_path: str, file_path: str) -> str: - """Return cached content hash for an absolute file path, or empty string.""" - cache = _read_cache(workspace_path) - try: - return str((cache.get("file_hashes") or {}).get(str(Path(file_path).resolve()), "")) - 
except Exception: - return "" +def set_cached_file_hash(file_path: str, file_hash: str, repo_name: Optional[str] = None) -> None: + """Set cached file hash for tracking changes.""" + if is_multi_repo_mode() and repo_name: + state_dir = _get_repo_state_dir(repo_name) + cache_path = state_dir / CACHE_FILENAME + cache_path.parent.mkdir(parents=True, exist_ok=True) -def set_cached_file_hash(workspace_path: str, file_path: str, file_hash: str) -> None: - """Set cached content hash for an absolute file path and persist immediately.""" - lock = _get_state_lock(workspace_path) - with lock: - cache = _read_cache(workspace_path) - fh = cache.setdefault("file_hashes", {}) - fh[str(Path(file_path).resolve())] = str(file_hash) - cache["updated_at"] = datetime.now().isoformat() - _write_cache(workspace_path, cache) + try: + if cache_path.exists(): + with open(cache_path, 'r', encoding='utf-8') as f: + cache = json.load(f) + else: + cache = {"file_hashes": {}, "created_at": datetime.now().isoformat()} + cache.setdefault("file_hashes", {})[str(Path(file_path).resolve())] = file_hash + cache["updated_at"] = datetime.now().isoformat() -def remove_cached_file(workspace_path: str, file_path: str) -> None: - """Remove a file entry from the cache and persist.""" - lock = _get_state_lock(workspace_path) - with lock: - cache = _read_cache(workspace_path) - fh = cache.setdefault("file_hashes", {}) - try: - fp = str(Path(file_path).resolve()) + # Atomic write + tmp = cache_path.with_suffix(f".tmp.{uuid.uuid4().hex[:8]}") + with open(tmp, "w", encoding="utf-8") as f: + json.dump(cache, f, ensure_ascii=False, indent=2) + tmp.replace(cache_path) except Exception: - fp = str(file_path) - if fp in fh: - fh.pop(fp, None) - cache["updated_at"] = datetime.now().isoformat() - _write_cache(workspace_path, cache) + pass + +def remove_cached_file(file_path: str, repo_name: Optional[str] = None) -> None: + """Remove file entry from cache.""" + if is_multi_repo_mode() and repo_name: + state_dir = 
_get_repo_state_dir(repo_name) + cache_path = state_dir / CACHE_FILENAME + + if cache_path.exists(): + try: + with open(cache_path, 'r', encoding='utf-8') as f: + cache = json.load(f) + file_hashes = cache.get("file_hashes", {}) + + fp = str(Path(file_path).resolve()) + if fp in file_hashes: + file_hashes.pop(fp, None) + cache["updated_at"] = datetime.now().isoformat() + + tmp = cache_path.with_suffix(f".tmp.{uuid.uuid4().hex[:8]}") + with open(tmp, "w", encoding="utf-8") as f: + json.dump(cache, f, ensure_ascii=False, indent=2) + tmp.replace(cache_path) + except Exception: + pass + +# Additional functions needed by callers +def _state_file_path(workspace_path: Optional[str] = None, repo_name: Optional[str] = None) -> Path: + """Get state file path for workspace or repo.""" + if repo_name and is_multi_repo_mode(): + state_dir = _get_repo_state_dir(repo_name) + return state_dir / STATE_FILENAME + + if workspace_path: + return _get_state_path(workspace_path) + + # Default to current directory + return Path.cwd() / STATE_DIRNAME / STATE_FILENAME + +def _get_global_state_dir() -> Path: + """Get the global .codebase directory.""" + base_dir = Path.cwd() + return base_dir / ".codebase" def list_workspaces(search_root: Optional[str] = None) -> List[Dict[str, Any]]: """Find all workspaces with .codebase/state.json files.""" if search_root is None: - search_root = os.getcwd() + # Use workspace root instead of current directory + search_root = os.environ.get("WORKSPACE_PATH") or "/work" workspaces = [] - search_path = Path(search_root).resolve() + root_path = Path(search_root) - # Search for .codebase directories - for state_dir in search_path.rglob(STATE_DIRNAME): - state_file = state_dir / STATE_FILENAME - if state_file.exists(): - try: - workspace_path = str(state_dir.parent) - state = get_workspace_state(workspace_path) - workspaces.append({ - "workspace_path": workspace_path, - "collection_name": state.get("qdrant_collection"), - "last_updated": state.get("updated_at"), 
- "indexing_state": state.get("indexing_status", {}).get("state", "unknown") - }) - except Exception: - # Skip corrupted state files - continue - - return sorted(workspaces, key=lambda x: x.get("last_updated", ""), reverse=True) + # Look for state files + for state_file in root_path.rglob(STATE_FILENAME): + try: + rel_path = state_file.relative_to(root_path) + workspace_info = { + "path": str(rel_path.parent), + "state_file": str(state_file), + "relative_path": str(rel_path.parent) + } + workspaces.append(workspace_info) + except (ValueError, OSError): + continue -def cleanup_old_state_locks(): - """Clean up unused state locks to prevent memory leaks.""" - with _state_lock: - # Keep only locks for recently accessed workspaces - # In practice, this would need more sophisticated cleanup logic - pass + return workspaces -if __name__ == "__main__": - # Simple CLI for testing - import sys - if len(sys.argv) > 1: - workspace = sys.argv[1] - state = get_workspace_state(workspace) - print(json.dumps(state, indent=2)) - else: - workspaces = list_workspaces() - for ws in workspaces: - print(f"{ws['workspace_path']}: {ws['collection_name']} ({ws['indexing_state']})") +# Add missing functions that callers expect (already defined above) \ No newline at end of file From 2a713a5466c5675d76d0e6f446b3b6448de0c52c Mon Sep 17 00:00:00 2001 From: Reese Date: Tue, 11 Nov 2025 23:04:08 +0000 Subject: [PATCH 10/16] feat(remote): add collection mapping and origin tracking for remote workspaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add collection_map MCP tool to enumerate collection↔repo mappings with optional Qdrant payload samples - Implement origin metadata persistence in workspace_state.py for remote source tracking - Enhance remote upload client with mapping summary and --show-mapping option - Add source_path parameter to upload service for complete origin tracking - Simplify watch_index.py by removing remote mode complexity and 
focusing on local indexing - Update workspace state functions to support collection mappings enumeration These changes provide comprehensive visibility into collection mappings across local and remote workspaces, enabling better tracking and management of distributed indexing operations. --- scripts/hybrid_search.py | 2 +- scripts/mcp_indexer_server.py | 217 +++++++++++++ scripts/remote_upload_client.py | 51 ++- scripts/upload_service.py | 40 ++- scripts/watch_index.py | 551 ++++++-------------------------- scripts/workspace_state.py | 170 ++++++++-- 6 files changed, 540 insertions(+), 491 deletions(-) diff --git a/scripts/hybrid_search.py b/scripts/hybrid_search.py index bd4657bf..b0fa4f50 100644 --- a/scripts/hybrid_search.py +++ b/scripts/hybrid_search.py @@ -1695,7 +1695,7 @@ def main(): ap.add_argument("--ext", type=str, default=None) ap.add_argument("--not", dest="not_filter", type=str, default=None) ap.add_argument("--collection", type=str, default=None, - help="Target collection name (overrides COLLECTION_NAME env var)") + help="Target collection name") ap.add_argument( "--case", type=str, diff --git a/scripts/mcp_indexer_server.py b/scripts/mcp_indexer_server.py index e70c56ea..90940ef9 100644 --- a/scripts/mcp_indexer_server.py +++ b/scripts/mcp_indexer_server.py @@ -628,6 +628,223 @@ async def list_workspaces(search_root: Optional[str] = None) -> Dict[str, Any]: return {"error": str(e)} +@mcp.tool() +async def collection_map( + search_root: Optional[str] = None, + collection: Optional[str] = None, + repo_name: Optional[str] = None, + include_samples: Optional[bool] = None, + limit: Optional[int] = None, +) -> Dict[str, Any]: + """Return collection↔repo mappings with optional Qdrant payload samples.""" + + def _norm_str(val: Any) -> Optional[str]: + if val is None: + return None + try: + s = str(val).strip() + except Exception: + return None + return s or None + + collection_filter = _norm_str(collection) + repo_filter = _norm_str(repo_name) + sample_flag 
= _coerce_bool(include_samples, False) + + max_entries: Optional[int] = None + if limit is not None: + try: + max_entries = max(1, int(limit)) + except Exception: + max_entries = None + + state_entries: List[Dict[str, Any]] = [] + state_error: Optional[str] = None + + try: + from scripts.workspace_state import get_collection_mappings as _get_collection_mappings # type: ignore + + try: + state_entries = await asyncio.to_thread( + lambda: _get_collection_mappings(search_root) + ) + except Exception as exc: + state_error = str(exc) + state_entries = [] + except Exception as exc: # pragma: no cover + state_error = f"workspace_state unavailable: {exc}" + state_entries = [] + + if repo_filter: + state_entries = [ + entry for entry in state_entries if _norm_str(entry.get("repo_name")) == repo_filter + ] + if collection_filter: + state_entries = [ + entry + for entry in state_entries + if _norm_str(entry.get("collection_name")) == collection_filter + ] + + results: List[Dict[str, Any]] = [] + seen_collections: set[str] = set() + + for entry in state_entries: + item = dict(entry) + item["source"] = "state" + results.append(item) + coll = _norm_str(entry.get("collection_name")) + if coll: + seen_collections.add(coll) + + # Qdrant helpers ----------------------------------------------------- + sample_cache: Dict[str, Tuple[Optional[Dict[str, Any]], Optional[str]]] = {} + qdrant_error: Optional[str] = None + qdrant_used = False + client = None + + def _ensure_qdrant_client(): + nonlocal client, qdrant_error, qdrant_used + if client is not None or qdrant_error: + return client + try: + from qdrant_client import QdrantClient # type: ignore + except Exception as exc: # pragma: no cover + qdrant_error = f"qdrant_client unavailable: {exc}" + return None + + try: + qdrant_used = True + return QdrantClient( + url=QDRANT_URL, + api_key=os.environ.get("QDRANT_API_KEY"), + timeout=float(os.environ.get("QDRANT_TIMEOUT", "20") or 20), + ) + except Exception as exc: # pragma: no cover + 
qdrant_error = str(exc) + return None + + async def _sample_payload(coll_name: Optional[str]) -> Tuple[Optional[Dict[str, Any]], Optional[str]]: + key = _norm_str(coll_name) or "" + if not key: + return None, "missing_collection" + if key in sample_cache: + return sample_cache[key] + + cli = _ensure_qdrant_client() + if cli is None: + sample_cache[key] = (None, qdrant_error) + return sample_cache[key] + + def _scroll_one(): + try: + points, _ = cli.scroll( + collection_name=key, + limit=1, + with_payload=True, + with_vectors=False, + ) + return points + except Exception as exc: # pragma: no cover + raise exc + + try: + points = await asyncio.to_thread(_scroll_one) + except Exception as exc: # pragma: no cover + err = str(exc) + sample_cache[key] = (None, err) + return sample_cache[key] + + if not points: + sample_cache[key] = (None, None) + return sample_cache[key] + + payload = points[0].payload or {} + metadata = payload.get("metadata") or {} + sample = { + "host_path": metadata.get("host_path"), + "container_path": metadata.get("container_path"), + "path": metadata.get("path") or payload.get("path"), + "start_line": metadata.get("start_line"), + "end_line": metadata.get("end_line"), + } + sample_cache[key] = (sample, None) + return sample_cache[key] + + # Attach samples to state-backed entries when requested + if sample_flag and results: + for entry in results: + coll_name = entry.get("collection_name") + sample, err = await _sample_payload(coll_name) + if sample: + entry["sample"] = sample + if err: + entry.setdefault("warnings", []).append(err) + + # If no state entries (or explicit collection filtered out), fall back to Qdrant listings + fallback_entries: List[Dict[str, Any]] = [] + need_qdrant_listing = not results + + if need_qdrant_listing: + cli = _ensure_qdrant_client() + if cli is not None: + def _list_collections(): + info = cli.get_collections() + return [c.name for c in info.collections] + + try: + collection_names = await 
asyncio.to_thread(_list_collections) + except Exception as exc: # pragma: no cover + qdrant_error = str(exc) + collection_names = [] + + if collection_filter: + collection_names = [ + name for name in collection_names if _norm_str(name) == collection_filter + ] + + count = 0 + for name in collection_names: + if name in seen_collections: + continue + entry: Dict[str, Any] = { + "collection_name": name, + "source": "qdrant", + } + sample, err = await _sample_payload(name) if sample_flag else (None, None) + if sample: + entry["sample"] = sample + if err: + entry.setdefault("warnings", []).append(err) + fallback_entries.append(entry) + count += 1 + if max_entries is not None and count >= max_entries: + break + + entries = results + fallback_entries + + return { + "results": entries, + "counts": { + "state": len(state_entries), + "returned": len(entries), + "fallback": len(fallback_entries), + }, + "errors": { + "state": state_error, + "qdrant": qdrant_error, + }, + "qdrant_used": qdrant_used, + "filters": { + "collection": collection_filter, + "repo_name": repo_filter, + "search_root": search_root, + "include_samples": sample_flag, + "limit": max_entries, + }, + } + + @mcp.tool() async def memory_store( information: str, diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 93caf87e..6b006fd6 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -4,6 +4,11 @@ This module provides functionality to create and upload delta bundles to a remote server, enabling real-time code synchronization across distributed environments. 
+ +Example usage: + export HOST_ROOT="/tmp/testupload" && export CONTAINER_ROOT="/work" && export + PYTHONPATH="/home/coder/project/Context-Engine:$PYTHONPATH" && python3 + scripts/remote_upload_client.py --path /tmp/testupload) """ import os @@ -124,6 +129,27 @@ def cleanup(self): finally: self.temp_dir = None + def get_mapping_summary(self) -> Dict[str, Any]: + """Return derived collection mapping details.""" + container_path = self._translate_to_container_path(self.workspace_path) + return { + "repo_name": self.repo_name, + "collection_name": self.collection_name, + "source_path": self.workspace_path, + "container_path": container_path, + "upload_endpoint": self.upload_endpoint, + } + + def log_mapping_summary(self) -> None: + """Log mapping summary for user visibility.""" + info = self.get_mapping_summary() + logger.info("[remote_upload] Collection mapping:") + logger.info(f" repo_name : {info['repo_name']}") + logger.info(f" collection_name : {info['collection_name']}") + logger.info(f" source_path : {info['source_path']}") + logger.info(f" container_path : {info['container_path']}") + logger.info("[remote_upload] To query remote state later, call the MCP `collection_map` tool.") + def _get_temp_bundle_dir(self) -> Path: """Get or create temporary directory for bundle creation.""" if not self.temp_dir: @@ -510,7 +536,8 @@ def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, 'workspace_path': self._translate_to_container_path(self.workspace_path), 'collection_name': self.collection_name, # CLI is stateless - server handles sequence numbers - 'force': 'false' + 'force': 'false', + 'source_path': self.workspace_path, } logger.info(f"[remote_upload] Uploading bundle {manifest['bundle_id']} (size: {bundle_size} bytes)") @@ -888,6 +915,9 @@ def get_remote_config(cli_path: Optional[str] = None) -> Dict[str, str]: # Use auto-generated collection name based on repo name repo_name = _extract_repo_name_from_path(workspace_path) + # Fallback to 
directory name if repo detection fails + if not repo_name: + repo_name = Path(workspace_path).name collection_name = get_collection_name(repo_name) return { @@ -947,6 +977,12 @@ def main(): help="Force upload of all files (ignore cached state and treat all files as new)" ) + parser.add_argument( + "--show-mapping", + action="store_true", + help="Print collection↔workspace mapping information and exit" + ) + args = parser.parse_args() # Validate path if provided @@ -977,6 +1013,17 @@ def main(): logger.info(f"Collection name: {config['collection_name']}") logger.info(f"Upload endpoint: {config['upload_endpoint']}") + if args.show_mapping: + with RemoteUploadClient( + upload_endpoint=config["upload_endpoint"], + workspace_path=config["workspace_path"], + collection_name=config["collection_name"], + max_retries=config["max_retries"], + timeout=config["timeout"], + ) as client: + client.log_mapping_summary() + return 0 + # Check if remote mode is enabled if not is_remote_mode_enabled(): logger.error("Remote upload mode is not enabled. 
Set REMOTE_UPLOAD_ENABLED=1 in environment variables.") @@ -994,6 +1041,8 @@ def main(): logger.info("Remote upload client initialized successfully") + client.log_mapping_summary() + # Test server connection logger.info("Checking server status...") status = client.get_server_status() diff --git a/scripts/upload_service.py b/scripts/upload_service.py index a1c2e605..0b5c1589 100644 --- a/scripts/upload_service.py +++ b/scripts/upload_service.py @@ -31,6 +31,8 @@ get_cached_file_hash, set_cached_file_hash, _extract_repo_name_from_path, + update_repo_origin, + get_collection_mappings, ) except ImportError: # Fallback for testing without full environment @@ -39,6 +41,8 @@ get_cached_file_hash = None set_cached_file_hash = None _extract_repo_name_from_path = None + update_repo_origin = None + get_collection_mappings = None # Configure logging @@ -100,10 +104,14 @@ class HealthResponse(BaseModel): work_dir: str def get_workspace_key(workspace_path: str) -> str: - """Generate a unique key for workspace tracking using repository name.""" - # Extract repository name from path for consistent identification - # Both host paths (/home/user/project/repo) and container paths (/work/repo) - # should generate the same key for the same repository + """Generate 16-char hash for collision avoidance in remote uploads. + + Remote uploads may have identical folder names from different users, + so uses longer hash than local indexing (8-chars) to ensure uniqueness. + + Both host paths (/home/user/project/repo) and container paths (/work/repo) + should generate the same key for the same repository. 
+ """ repo_name = Path(workspace_path).name return hashlib.sha256(repo_name.encode('utf-8')).hexdigest()[:16] @@ -176,6 +184,9 @@ async def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: # This caused watcher service to never see uploaded files if _extract_repo_name_from_path: repo_name = _extract_repo_name_from_path(workspace_path) + # Fallback to directory name if repo detection fails + if not repo_name: + repo_name = Path(workspace_path).name else: # Fallback: use directory name repo_name = Path(workspace_path).name @@ -346,7 +357,8 @@ async def upload_delta_bundle( workspace_path: str = Form(...), collection_name: Optional[str] = Form(None), sequence_number: Optional[int] = Form(None), - force: Optional[bool] = Form(False) + force: Optional[bool] = Form(False), + source_path: Optional[str] = Form(None), ): """Upload and process delta bundle.""" start_time = datetime.now() @@ -363,10 +375,28 @@ async def upload_delta_bundle( if not collection_name: if get_collection_name: repo_name = _extract_repo_name_from_path(workspace_path) if _extract_repo_name_from_path else None + # Fallback to directory name if repo detection fails + if not repo_name: + repo_name = Path(workspace_path).name collection_name = get_collection_name(repo_name) else: collection_name = DEFAULT_COLLECTION + # Persist origin metadata for remote lookups + try: + if update_repo_origin and repo_name: + workspace_key = get_workspace_key(workspace_path) + container_workspace = str(Path(WORK_DIR) / f"{repo_name}-{workspace_key}") + update_repo_origin( + workspace_path=container_workspace, + repo_name=repo_name, + container_path=container_workspace, + source_path=source_path or workspace_path, + collection_name=collection_name, + ) + except Exception as origin_err: + logger.debug(f"[upload_service] Failed to persist origin info: {origin_err}") + # Validate bundle size if bundle.size and bundle.size > MAX_BUNDLE_SIZE_MB * 1024 * 1024: raise HTTPException( diff --git 
a/scripts/watch_index.py b/scripts/watch_index.py index bd1bc823..27f2c996 100644 --- a/scripts/watch_index.py +++ b/scripts/watch_index.py @@ -4,7 +4,6 @@ import threading from pathlib import Path from typing import Set, Optional -from collections import OrderedDict from qdrant_client import QdrantClient, models from fastembed import TextEmbedding @@ -20,56 +19,21 @@ if str(ROOT_DIR) not in sys.path: sys.path.insert(0, str(ROOT_DIR)) -# Import critical functions first to prevent cascading failures -try: - from scripts.workspace_state import ( - _extract_repo_name_from_path, - get_collection_name, - _get_global_state_dir, - _get_repo_state_dir, - is_multi_repo_mode, - get_cached_file_hash, - set_cached_file_hash, - ) -except ImportError: - # If critical imports fail, set None to prevent crashes - _extract_repo_name_from_path = None # type: ignore - get_collection_name = None # type: ignore - _get_global_state_dir = None # type: ignore - _get_repo_state_dir = None # type: ignore - is_multi_repo_mode = None # type: ignore - get_cached_file_hash = None # type: ignore - set_cached_file_hash = None # type: ignore - -# Import optional functions that may not exist -try: - from scripts.workspace_state import ( - get_workspace_state, - update_indexing_status, - update_workspace_state, - remove_cached_file, - ) -except ImportError: - # Optional functions - set to None if not available - get_workspace_state = None # type: ignore - update_indexing_status = None # type: ignore - update_workspace_state = None # type: ignore - remove_cached_file = None # type: ignore +from scripts.workspace_state import ( + _extract_repo_name_from_path, + get_collection_name, + _get_global_state_dir, + is_multi_repo_mode, + get_cached_file_hash, + set_cached_file_hash, + remove_cached_file, + update_indexing_status, +) import hashlib from datetime import datetime import scripts.ingest_code as idx -# Import remote upload client -try: - from scripts.remote_upload_client import ( - 
RemoteUploadClient, - is_remote_mode_enabled, - get_remote_config - ) - _REMOTE_UPLOAD_AVAILABLE = True -except ImportError: - _REMOTE_UPLOAD_AVAILABLE = False QDRANT_URL = os.environ.get("QDRANT_URL", "http://qdrant:6333") MODEL = os.environ.get("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5") @@ -78,177 +42,30 @@ # Debounce interval DELAY_SECS = float(os.environ.get("WATCH_DEBOUNCE_SECS", "1.0")) -# Simple LRU cache implementation to prevent memory growth -class LRUCache: - """Simple LRU cache with size limits.""" - - def __init__(self, max_size: int = 1000): - self.max_size = max_size - self.cache = OrderedDict() - self._hits = 0 - self._misses = 0 - - def get(self, key): - if key in self.cache: - # Move to end (most recently used) - self.cache.move_to_end(key) - self._hits += 1 - return self.cache[key] - self._misses += 1 - return None - - def put(self, key, value): - if key in self.cache: - # Update existing entry - self.cache[key] = value - self.cache.move_to_end(key) - else: - # Add new entry, evict if necessary - if len(self.cache) >= self.max_size: - # Remove least recently used item - self.cache.popitem(last=False) - self.cache[key] = value - - def clear(self): - self.cache.clear() - self._hits = 0 - self._misses = 0 - - def get_hit_rate(self): - total = self._hits + self._misses - return self._hits / total if total > 0 else 0.0 - - def size(self): - return len(self.cache) - -# Multi-repo collection management with size-limited caches -_collection_cache = LRUCache(max_size=500) # Cache for repo path -> collection name mapping -_repo_cache = LRUCache(max_size=2000) # Cache for file path -> repo path mapping - -# Optional cache statistics logging (disabled by default) -_ENABLE_CACHE_STATS = os.environ.get("ENABLE_CACHE_STATS", "false").lower() == "true" - -def _log_cache_stats(): - """Log cache statistics for monitoring.""" - if _ENABLE_CACHE_STATS: - print(f"[cache_stats] Collection cache: {_collection_cache.size()} items, " - f"hit rate: 
{_collection_cache.get_hit_rate():.2%}") - print(f"[cache_stats] Repo cache: {_repo_cache.size()} items, " - f"hit rate: {_repo_cache.get_hit_rate():.2%}") def _detect_repo_for_file(file_path: Path) -> Optional[Path]: """ - Detect which repository a file belongs to using the new workspace_state functions. + Detect which repository a file belongs to. Returns the repository root path or None if not under WATCH_ROOT. """ try: - # Normalize paths - get current WATCH_ROOT to handle env changes - abs_file = file_path.resolve() - watch_root = Path(os.environ.get("WATCH_ROOT", "/work")).resolve() - abs_root = watch_root - - # File must be under WATCH_ROOT - try: - abs_file.relative_to(abs_root) - except ValueError: - return None - - # Check cache first - file_key = str(abs_file) - cached_result = _repo_cache.get(file_key) - if cached_result is not None: - return cached_result - - # Use new workspace_state function to extract repo name from file path - repo_name = _extract_repo_name_from_path(str(abs_file)) - - # Construct repo path from the detected repo name - # Look for the repo directory under WATCH_ROOT - repo_path = None - rel_path = abs_file.relative_to(abs_root) - path_parts = rel_path.parts - - if not path_parts: - return None - - # Strategy 1: Look for repo with matching name in common locations - # Check immediate directories under WATCH_ROOT - if len(path_parts) >= 1: - potential_repo_name = path_parts[0] - if potential_repo_name and repo_name and (potential_repo_name == repo_name or potential_repo_name.lower() == repo_name.lower()): - repo_path = abs_root / potential_repo_name - if repo_path.exists(): - _repo_cache.put(file_key, repo_path) - return repo_path - - # Strategy 2: Walk up the path hierarchy to find repo root - current_path = abs_file.parent - abs_root_resolved = abs_root.resolve() - - while True: - # Check if current path name matches our detected repo name - if current_path.name == repo_name or current_path.name.lower() == repo_name.lower(): - 
repo_path = current_path - break - - # Check if current_path has .git - if (current_path / ".git").exists(): - repo_path = current_path - break - - # Stop if we've reached WATCH_ROOT or above it - current_resolved = current_path.resolve() - if current_resolved == abs_root_resolved or current_resolved == current_path.parent.resolve(): - break - - current_path = current_path.parent - - # Strategy 3: Fallback to first-level directory under WATCH_ROOT - if repo_path is None: - repo_path = abs_root / path_parts[0] - if not repo_path.exists(): - # If the assumed repo path doesn't exist, fall back to WATCH_ROOT itself - repo_path = abs_root - - # Cache the result - _repo_cache.put(file_key, repo_path) - return repo_path - - except (OSError, ValueError, RuntimeError) as e: - # Log the specific error for debugging if needed - print(f"[repo_detection] Error detecting repo for {file_path}: {e}") + rel_path = file_path.relative_to(ROOT) + if rel_path.parts: + return ROOT / rel_path.parts[0] + except ValueError: return None def _get_collection_for_repo(repo_path: Path) -> str: """ - Get the collection name for a repository using new workspace_state functions. - Uses caching to avoid repeated calls. + Get the collection name for a repository. 
""" try: - repo_key = str(repo_path) # repo_path is already resolved - - # Check cache first - cached_collection = _collection_cache.get(repo_key) - if cached_collection is not None: - return cached_collection - - # Extract repo name using new workspace_state function - repo_name = _extract_repo_name_from_path(repo_key) - - # Use new workspace_state function to get collection name - collection_name = get_collection_name(repo_name) - - # Cache the result - _collection_cache.put(repo_key, collection_name) - return collection_name - - except (OSError, ImportError, ValueError) as e: - # Fallback to default collection name with logging - print(f"[collection_detection] Error getting collection for {repo_path}: {e}") - fallback = os.environ.get("COLLECTION_NAME", "my-collection") - return fallback + repo_name = _extract_repo_name_from_path(str(repo_path)) + return get_collection_name(repo_name) + except Exception: + return os.environ.get("COLLECTION_NAME", "my-collection") def _get_collection_for_file(file_path: Path) -> str: @@ -270,53 +87,14 @@ def _get_collection_for_file(file_path: Path) -> str: return os.environ.get("COLLECTION_NAME", "my-collection") -def _get_remote_client_for_repo(repo_path: Path, remote_clients: dict, remote_config: dict) -> Optional[RemoteUploadClient]: - """ - Get or create a remote upload client for a specific repository. - Uses the new repo-specific metadata structure for delta bundles. 
- """ - repo_key = str(repo_path) # repo_path is already resolved - - if repo_key in remote_clients: - return remote_clients[repo_key] - - # Create new client for this repository - try: - collection_name = _get_collection_for_repo(repo_path) - - # Extract repo name and get the repo-specific metadata directory - repo_name = _extract_repo_name_from_path(repo_key) - repo_state_dir = _get_repo_state_dir(repo_name) - - # Use the actual repository path as workspace_path for file resolution - # But use the repo-specific metadata directory for delta bundle storage - workspace_path = repo_key # This is the actual repo path where files are located - metadata_path = str(repo_state_dir) # This is where delta bundles are stored - - client = RemoteUploadClient( - upload_endpoint=remote_config["upload_endpoint"], - workspace_path=workspace_path, - collection_name=collection_name, - max_retries=remote_config["max_retries"], - timeout=remote_config["timeout"], - metadata_path=metadata_path - ) - remote_clients[repo_key] = client - print(f"[remote_upload] Created client for repo: {repo_path} -> {collection_name} (workspace: {workspace_path}, metadata: {metadata_path})") - return client - except (OSError, ValueError, ConnectionError, KeyError) as e: - print(f"[remote_upload] Error creating client for {repo_path}: {e}") - return None class ChangeQueue: - def __init__(self, process_cb, remote_clients: Optional[dict] = None, remote_config: Optional[dict] = None): + def __init__(self, process_cb): self._lock = threading.Lock() self._paths: Set[Path] = set() self._timer: threading.Timer | None = None self._process_cb = process_cb - self._remote_clients = remote_clients or {} - self._remote_config = remote_config def add(self, p: Path): with self._lock: @@ -333,63 +111,7 @@ def _flush(self): self._paths.clear() self._timer = None - # Handle remote upload if enabled - if self._remote_clients and _REMOTE_UPLOAD_AVAILABLE and self._remote_config: - try: - # Group paths by repository for 
remote upload - repo_groups = {} - for path in paths: - repo_path = _detect_repo_for_file(path) - if repo_path: - repo_key = str(repo_path) # repo_path is already resolved - if repo_key not in repo_groups: - repo_groups[repo_key] = [] - repo_groups[repo_key].append(path) - else: - # Use default client for files not under any repo - if "default" not in repo_groups: - repo_groups["default"] = [] - repo_groups["default"].append(path) - - # Process each repository with its own remote client - all_successful = True - for repo_key, repo_paths in repo_groups.items(): - try: - # Get or create remote client for this repository - if repo_key == "default": - remote_client = self._remote_clients.get("default") - else: - remote_client = _get_remote_client_for_repo( - Path(repo_key), self._remote_clients, self._remote_config - ) - - if remote_client: - success = remote_client.process_and_upload_changes(repo_paths) - if not success: - all_successful = False - print(f"[remote_upload] Upload failed for repo {repo_key}, falling back to local processing") - self._process_cb(repo_paths) - else: - print(f"[remote_upload] Upload successful for repo {repo_key}") - else: - all_successful = False - print(f"[remote_upload] No remote client available for repo {repo_key}, falling back to local processing") - self._process_cb(repo_paths) - except Exception as e: - all_successful = False - print(f"[remote_upload] Error during delta upload for repo {repo_key}: {e}") - print("[remote_upload] Falling back to local processing") - self._process_cb(repo_paths) - - if all_successful: - print("[remote_upload] All repository uploads completed successfully") - - except Exception as e: - print(f"[remote_upload] Error during multi-repo delta upload: {e}") - print("[remote_upload] Falling back to local processing") - self._process_cb(paths) - else: - self._process_cb(paths) + self._process_cb(paths) class IndexHandler(FileSystemEventHandler): @@ -399,6 +121,7 @@ def __init__(self, root: Path, queue: 
ChangeQueue, client: Optional[QdrantClient self.queue = queue self.client = client self.default_collection = default_collection + self.collection = default_collection self.excl = idx._Excluder(root) # Track ignore file for live reloads try: @@ -503,7 +226,7 @@ def on_deleted(self, event): except Exception: pass else: - print(f"[remote_mode] File deletion detected: {p}") + print(f"File deletion detected: {p}") # Drop local cache entry (always do this) try: @@ -535,23 +258,30 @@ def on_moved(self, event): try: src = Path(event.src_path).resolve() dest = Path(event.dest_path).resolve() - except Exception: + # Move detected - proceed with rename logic + except Exception as e: + print(f"[move_error] {e}") return # Only react to code files if dest.suffix.lower() not in idx.CODE_EXTS and src.suffix.lower() not in idx.CODE_EXTS: return # If destination directory is ignored, treat as simple deletion try: - rel_dir = "/" + str(dest.parent.relative_to(self.root.resolve())).replace(os.sep, "/") + rel_dir = "/" + str(dest.parent.resolve().relative_to(self.root.resolve())).replace(os.sep, "/") if rel_dir == "/.": rel_dir = "/" if self.excl.exclude_dir(rel_dir): if src.suffix.lower() in idx.CODE_EXTS: if self.client is not None: try: + # Try to delete from the file's current collection first src_collection = _get_collection_for_file(src) - idx.delete_points_by_path(self.client, src_collection, str(src)) - print(f"[moved:ignored_dest_deleted_src] {src} -> {dest} (from {src_collection})") + try: + idx.delete_points_by_path(self.client, src_collection, str(src)) + except Exception: + # Fallback to original behavior if source collection doesn't exist + idx.delete_points_by_path(self.client, self.collection, str(src)) + print(f"[moved:ignored_dest_deleted_src] {src} -> {dest}") except Exception: pass else: @@ -559,56 +289,27 @@ def on_moved(self, event): return except Exception: pass - # Try in-place rename (preserve vectors) - only if we have a local client - moved_count = -1 - if 
self.client is not None: - try: - # Get collections for source and destination - src_collection = _get_collection_for_file(src) - dest_collection = _get_collection_for_file(dest) - moved_count = _rename_in_store(self.client, src_collection, src, dest, dest_collection) - except Exception: - moved_count = -1 - if moved_count and moved_count > 0: - try: - src_collection = _get_collection_for_file(src) - print(f"[moved] {src} -> {dest} ({moved_count} chunk(s) relinked from {src_collection})") - # Update local cache: carry hash from src to dest if present - prev_hash = None - src_repo = _detect_repo_for_file(src) - dest_repo = _detect_repo_for_file(dest) - try: - # Use new repo-based cache structure - src_repo_name = _extract_repo_name_from_path(str(src_repo or self.root)) - prev_hash = get_cached_file_hash(str(src), src_repo_name) - except Exception: - prev_hash = None - if prev_hash: - try: - # Use new repo-based cache structure - dest_repo_name = _extract_repo_name_from_path(str(dest_repo or self.root)) - set_cached_file_hash(str(dest), prev_hash, dest_repo_name) - except Exception: - pass - try: - remove_cached_file(str(src), src_repo_name) - except Exception: - pass - except Exception: - pass - try: - repo_path = _detect_repo_for_file(dest) or self.root - _log_activity(str(repo_path), "moved", dest, {"from": str(src), "chunks": int(moved_count)}) - except Exception: - pass - return + # Determine source and destination collections + src_collection = _get_collection_for_file(src) + dest_collection = _get_collection_for_file(dest) + is_cross_collection = src_collection != dest_collection + + # For cross-collection moves, log the operation since it's a significant event + if is_cross_collection: + print(f"[cross_collection_move] {src} -> {dest}") # Fallback: delete old then index new destination + # This handles all moves using reliable delete+reindex approach if self.client is not None: try: if src.suffix.lower() in idx.CODE_EXTS: + # Try to delete from the file's 
current collection first src_collection = _get_collection_for_file(src) - idx.delete_points_by_path(self.client, src_collection, str(src)) - print(f"[moved:deleted_src] {src} from {src_collection}") + try: + idx.delete_points_by_path(self.client, src_collection, str(src)) + except Exception: + # Final fallback to original behavior if source collection doesn't exist + idx.delete_points_by_path(self.client, self.collection, str(src)) + print(f"[moved:deleted_src] {src}") except Exception: pass else: @@ -793,60 +494,7 @@ def _rename_in_store(client: QdrantClient, src_collection: str, src: Path, dest: def main(): - # Check if remote mode is enabled - remote_mode = False - remote_clients = {} # Map repo paths to remote clients - - if _REMOTE_UPLOAD_AVAILABLE and is_remote_mode_enabled(): - remote_mode = True - try: - remote_config = get_remote_config() - - # For multi-repo support, we'll create remote clients on-demand for each repository - # The base configuration will be used, but collection names will be determined per-repo - print(f"[remote_upload] Remote mode enabled: {remote_config['upload_endpoint']}") - print("[remote_upload] Multi-repo remote support - will create clients per repository") - - # Create a default client for backward compatibility - try: - # For the default client, use the global metadata directory to avoid permission issues - if _get_global_state_dir is not None: - global_state_dir = _get_global_state_dir() - default_workspace_path = str(global_state_dir) - else: - # Fallback if function is not available - default_workspace_path = "/work" - - default_remote_client = RemoteUploadClient( - upload_endpoint=remote_config["upload_endpoint"], - workspace_path=default_workspace_path, - collection_name=remote_config["collection_name"], - max_retries=remote_config["max_retries"], - timeout=remote_config["timeout"] - ) - - # Check server status - status = default_remote_client.get_server_status() - if status.get("success", False): - print(f"[remote_upload] 
Server status: {status.get('status', 'unknown')}") - else: - print(f"[remote_upload] Warning: Could not reach server - {status.get('error', {}).get('message', 'Unknown error')}") - - # Store as default client (will be used for single-repo scenarios) - remote_clients["default"] = default_remote_client - print(f"[remote_upload] Default client initialized with workspace: {default_workspace_path}") - - except Exception as e: - print(f"[remote_upload] Error initializing default remote client: {e}") - print("[remote_upload] Will create clients per-repository as needed") - - except Exception as e: - print(f"[remote_upload] Error initializing remote mode: {e}") - print("[remote_upload] Falling back to local mode") - remote_mode = False - remote_clients = {} - - # Determine collection and mode based on MULTI_REPO_MODE setting + # Determine collection and mode based on MULTI_REPO_MODE setting try: from scripts.workspace_state import get_collection_name as _get_coll except Exception: @@ -868,56 +516,50 @@ def main(): default_collection = os.environ.get("COLLECTION_NAME", "my-collection") print("[single_repo] Single-repo mode enabled - using single collection for all files") - mode_str = "REMOTE" if remote_mode else "LOCAL" print( - f"Watch mode: {mode_str} root={ROOT} qdrant={QDRANT_URL} collection={default_collection} model={MODEL}" + f"Watch mode: LOCAL root={ROOT} qdrant={QDRANT_URL} collection={default_collection} model={MODEL}" ) - # Initialize Qdrant client for local mode (remote mode doesn't need it for basic operation) - client = None - model = None - vector_name = None - - if not remote_mode: - client = QdrantClient( - url=QDRANT_URL, timeout=int(os.environ.get("QDRANT_TIMEOUT", "20") or 20) - ) + # Initialize Qdrant client + client = QdrantClient( + url=QDRANT_URL, timeout=int(os.environ.get("QDRANT_TIMEOUT", "20") or 20) + ) # Compute embedding dimension first (for deterministic dense vector selection) - model = TextEmbedding(model_name=MODEL) - dim = 
len(next(model.embed(["dimension probe"]))) + model = TextEmbedding(model_name=MODEL) + dim = len(next(model.embed(["dimension probe"]))) - # Determine dense vector name deterministically (use default collection as reference) - try: - info = client.get_collection(default_collection) - cfg = info.config.params.vectors - if isinstance(cfg, dict) and cfg: - # Prefer vector whose size matches embedding dim - vector_name = None - for name, params in cfg.items(): - psize = getattr(params, "size", None) or getattr(params, "dim", None) - if psize and int(psize) == int(dim): + # Determine dense vector name deterministically (use default collection as reference) + try: + info = client.get_collection(default_collection) + cfg = info.config.params.vectors + if isinstance(cfg, dict) and cfg: + # Prefer vector whose size matches embedding dim + vector_name = None + for name, params in cfg.items(): + psize = getattr(params, "size", None) or getattr(params, "dim", None) + if psize and int(psize) == int(dim): + vector_name = name + break + # If LEX vector exists, pick a different name as dense + if vector_name is None and getattr(idx, "LEX_VECTOR_NAME", None) in cfg: + for name in cfg.keys(): + if name != idx.LEX_VECTOR_NAME: vector_name = name break - # If LEX vector exists, pick a different name as dense - if vector_name is None and getattr(idx, "LEX_VECTOR_NAME", None) in cfg: - for name in cfg.keys(): - if name != idx.LEX_VECTOR_NAME: - vector_name = name - break - if vector_name is None: - vector_name = idx._sanitize_vector_name(MODEL) - else: + if vector_name is None: vector_name = idx._sanitize_vector_name(MODEL) - except Exception: + else: vector_name = idx._sanitize_vector_name(MODEL) + except Exception: + vector_name = idx._sanitize_vector_name(MODEL) # Ensure default collection + payload indexes exist - try: - idx.ensure_collection(client, default_collection, dim, vector_name) - except Exception: - pass - idx.ensure_payload_indexes(client, default_collection) + try: + 
idx.ensure_collection(client, default_collection, dim, vector_name) + except Exception: + pass + idx.ensure_payload_indexes(client, default_collection) # Ensure workspace state exists and set collection based on mode try: @@ -947,15 +589,8 @@ def main(): print(f"[workspace_state] Error initializing workspace state: {e}") pass - # Create change queue with remote clients if enabled - if remote_mode: - q = ChangeQueue( - lambda paths: _process_paths(paths, client, model, vector_name, str(ROOT), remote_mode), - remote_clients=remote_clients, - remote_config=get_remote_config() if _REMOTE_UPLOAD_AVAILABLE else None - ) - else: - q = ChangeQueue(lambda paths: _process_paths(paths, client, model, vector_name, str(ROOT), remote_mode)) + # Create change queue + q = ChangeQueue(lambda paths: _process_paths(paths, client, model, vector_name, str(ROOT))) handler = IndexHandler(ROOT, q, client, default_collection) @@ -973,11 +608,7 @@ def main(): obs.join() -def _process_paths(paths, client, model, vector_name: str, workspace_path: str, remote_mode: bool = False): - # In remote mode, actual processing is handled by the remote client - # This function is called as a fallback when remote upload fails - if remote_mode: - print(f"[local_fallback] Processing {len(paths)} files locally due to remote upload failure") +def _process_paths(paths, client, model, vector_name: str, workspace_path: str): # Prepare progress unique_paths = sorted(set(Path(x) for x in paths)) @@ -1067,8 +698,8 @@ def _process_paths(paths, client, model, vector_name: str, workspace_path: str, _log_activity(str(repo_path), "skipped", p, {"reason": "no-change-or-error"}) else: # In remote mode without fallback, just log activity - print(f"[remote_mode] Not processing locally: {p}") - _log_activity(str(repo_path), "indexed", p, {"reason": "remote_processed"}) + print(f"Not processing locally: {p}") + _log_activity(str(repo_path), "indexed", p, {"reason": "skipped"}) processed += 1 # Update progress for the specific 
repository @@ -1076,10 +707,6 @@ def _process_paths(paths, client, model, vector_name: str, workspace_path: str, repo_files = repo_groups[str(repo_path)] repo_processed = len([f for f in repo_files if f in unique_paths[:processed]]) _update_progress(str(repo_path), started_at, repo_processed, len(repo_files), current) - - # Log cache stats periodically (every 50 files processed) - if processed % 50 == 0: - _log_cache_stats() except Exception: pass diff --git a/scripts/workspace_state.py b/scripts/workspace_state.py index 51f8d5c3..bb762b17 100644 --- a/scripts/workspace_state.py +++ b/scripts/workspace_state.py @@ -55,6 +55,14 @@ class LastActivity(TypedDict, total=False): file_path: Optional[str] details: Optional[ActivityDetails] +class OriginInfo(TypedDict, total=False): + repo_name: Optional[str] + container_path: Optional[str] + source_path: Optional[str] + collection_name: Optional[str] + updated_at: Optional[str] + + class WorkspaceState(TypedDict, total=False): created_at: str updated_at: str @@ -62,6 +70,7 @@ class WorkspaceState(TypedDict, total=False): indexing_status: Optional[IndexingStatus] last_activity: Optional[LastActivity] qdrant_stats: Optional[Dict[str, Any]] + origin: Optional[OriginInfo] def is_multi_repo_mode() -> bool: """Check if multi-repo mode is enabled.""" @@ -223,6 +232,47 @@ def update_indexing_status( repo_name=repo_name, ) + +def update_repo_origin( + workspace_path: Optional[str] = None, + repo_name: Optional[str] = None, + *, + container_path: Optional[str] = None, + source_path: Optional[str] = None, + collection_name: Optional[str] = None, +) -> WorkspaceState: + """Update origin metadata for a repository/workspace.""" + + resolved_workspace, resolved_repo = _resolve_repo_context(workspace_path, repo_name) + + if is_multi_repo_mode() and resolved_repo is None: + return {} + + state = get_workspace_state(resolved_workspace, resolved_repo) + if not state: + state = {} + + origin: OriginInfo = dict(state.get("origin", {})) # type: 
ignore[arg-type] + if resolved_repo: + origin["repo_name"] = resolved_repo + if container_path or workspace_path: + origin["container_path"] = container_path or workspace_path + if source_path: + origin["source_path"] = source_path + if collection_name: + origin["collection_name"] = collection_name + origin["updated_at"] = datetime.now().isoformat() + + updates: Dict[str, Any] = {"origin": origin} + if collection_name: + updates.setdefault("qdrant_collection", collection_name) + + return update_workspace_state( + workspace_path=resolved_workspace, + updates=updates, + repo_name=resolved_repo, + ) + def log_activity(repo_name: Optional[str] = None, action: Optional[ActivityAction] = None, file_path: Optional[str] = None, details: Optional[ActivityDetails] = None) -> None: """Log activity to workspace state.""" @@ -262,8 +312,11 @@ def log_activity(repo_name: Optional[str] = None, action: Optional[ActivityActio pass def _generate_collection_name_from_repo(repo_name: str) -> str: - """Generate a collection name from repository name with hash suffix.""" - # Create a short hash from repo name to ensure uniqueness + """Generate collection name with 8-char hash for local workspaces. + + Used by local indexer/watcher. Remote uploads use 16+8 char pattern + for collision avoidance when folder names may be identical. + """ hash_obj = hashlib.sha256(repo_name.encode()) short_hash = hash_obj.hexdigest()[:8] return f"{repo_name}-{short_hash}" @@ -289,35 +342,45 @@ def get_collection_name(repo_name: Optional[str] = None) -> str: def _detect_repo_name_from_path(path: Path) -> str: """Detect repository name from path. 
Clean, robust implementation.""" try: - # Normalize path resolved_path = path.resolve() + except Exception: + return None - # Get workspace root - workspace_root = Path(os.environ.get("WATCH_ROOT") or os.environ.get("WORKSPACE_PATH") or "/work") + candidate_roots: List[Path] = [] + for root_str in ( + os.environ.get("WATCH_ROOT"), + os.environ.get("WORKSPACE_PATH"), + "/work", + os.environ.get("HOST_ROOT"), + "/home/coder/project/Context-Engine/dev-workspace", + ): + if not root_str: + continue + try: + root_path = Path(root_str).resolve() + except Exception: + continue + if root_path not in candidate_roots: + candidate_roots.append(root_path) - # Path must be under workspace root + for base in candidate_roots: try: - rel_path = resolved_path.relative_to(workspace_root) + rel_path = resolved_path.relative_to(base) except ValueError: - return None # Path is outside workspace root - - # Get first path component as repo name - if rel_path.parts: - repo_name = rel_path.parts[0] + continue - # Exclude system directories - if repo_name in (".codebase", ".git", "__pycache__"): - return None + if not rel_path.parts: + continue - # Verify the path exists (handles bindmounts) - repo_path = workspace_root / repo_name - if repo_path.exists() or str(resolved_path).startswith(str(repo_path) + "/"): - return repo_name + repo_name = rel_path.parts[0] + if repo_name in (".codebase", ".git", "__pycache__"): + continue - return None # Not a valid repo path + repo_path = base / repo_name + if repo_path.exists() or str(resolved_path).startswith(str(repo_path) + os.sep): + return repo_name - except Exception: - return None + return None def _extract_repo_name_from_path(workspace_path: str) -> str: """Extract repository name from workspace path.""" @@ -469,4 +532,67 @@ def list_workspaces(search_root: Optional[str] = None) -> List[Dict[str, Any]]: return workspaces + +def get_collection_mappings(search_root: Optional[str] = None) -> List[Dict[str, Any]]: + """Enumerate collection 
mappings with origin metadata.""" + + root_path = Path(search_root or _resolve_workspace_root()).resolve() + mappings: List[Dict[str, Any]] = [] + + try: + if is_multi_repo_mode(): + repos_root = root_path / STATE_DIRNAME / "repos" + if repos_root.exists(): + for repo_dir in sorted(p for p in repos_root.iterdir() if p.is_dir()): + repo_name = repo_dir.name + state_path = repo_dir / STATE_FILENAME + if not state_path.exists(): + continue + try: + with open(state_path, "r", encoding="utf-8") as f: + state = json.load(f) or {} + except Exception: + continue + + origin = state.get("origin", {}) or {} + mappings.append( + { + "repo_name": repo_name, + "collection_name": state.get("qdrant_collection") + or get_collection_name(repo_name), + "container_path": origin.get("container_path") + or str((Path(_resolve_workspace_root()) / repo_name).resolve()), + "source_path": origin.get("source_path"), + "state_file": str(state_path), + "updated_at": state.get("updated_at"), + } + ) + else: + state_path = root_path / STATE_DIRNAME / STATE_FILENAME + if state_path.exists(): + try: + with open(state_path, "r", encoding="utf-8") as f: + state = json.load(f) or {} + except Exception: + state = {} + + origin = state.get("origin", {}) or {} + repo_name = origin.get("repo_name") or Path(root_path).name + mappings.append( + { + "repo_name": repo_name, + "collection_name": state.get("qdrant_collection") + or get_collection_name(repo_name), + "container_path": origin.get("container_path") + or str(root_path), + "source_path": origin.get("source_path"), + "state_file": str(state_path), + "updated_at": state.get("updated_at"), + } + ) + except Exception: + return mappings + + return mappings + # Add missing functions that callers expect (already defined above) \ No newline at end of file From 12ff1ff4fc696295863ca9a44d1b8a280ba7aad0 Mon Sep 17 00:00:00 2001 From: Reese Date: Wed, 12 Nov 2025 11:06:25 +0000 Subject: [PATCH 11/16] feat(remote): add watch mode and standalone upload client Add 
continuous file monitoring capability with --watch flag that automatically detects changes and uploads delta bundles at configurable intervals. Also introduce standalone_upload_client.py as a self-contained version that includes embedded dependencies, allowing delta uploads without requiring the full repository. --- scripts/remote_upload_client.py | 125 +++ scripts/standalone_upload_client.py | 1389 +++++++++++++++++++++++++++ 2 files changed, 1514 insertions(+) create mode 100644 scripts/standalone_upload_client.py diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 6b006fd6..ec433715 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -792,6 +792,68 @@ def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: logger.error(f"[remote_upload] Unexpected error in process_changes_and_upload: {e}") return False + def get_all_code_files(self) -> List[Path]: + """Get all code files in the workspace.""" + all_files = [] + try: + workspace_path = Path(self.workspace_path) + for ext in idx.CODE_EXTS: + all_files.extend(workspace_path.rglob(f"*{ext}")) + + # Filter out directories and hidden files + all_files = [ + f for f in all_files + if f.is_file() + and not any(part.startswith('.') for part in f.parts) + and '.codebase' not in str(f) + ] + except Exception as e: + logger.error(f"[watch] Error scanning files: {e}") + + return all_files + + def watch_loop(self, interval: int = 5): + """Main file watching loop using existing detection and upload methods.""" + logger.info(f"[watch] Starting file monitoring (interval: {interval}s)") + logger.info(f"[watch] Monitoring: {self.workspace_path}") + logger.info(f"[watch] Press Ctrl+C to stop") + + try: + while True: + try: + # Use existing change detection (get all files in workspace) + all_files = self.get_all_code_files() + changes = self.detect_file_changes(all_files) + + # Count only meaningful changes (exclude unchanged) + meaningful_changes = 
len(changes.get("created", [])) + len(changes.get("updated", [])) + len(changes.get("deleted", [])) + len(changes.get("moved", [])) + + if meaningful_changes > 0: + logger.info(f"[watch] Detected {meaningful_changes} changes: { {k: len(v) for k, v in changes.items() if k != 'unchanged'} }") + + # Use existing upload method + success = self.process_changes_and_upload(changes) + + if success: + logger.info(f"[watch] Successfully uploaded changes") + else: + logger.error(f"[watch] Failed to upload changes") + else: + logger.debug(f"[watch] No changes detected") # Debug level to avoid spam + + # Sleep until next check + time.sleep(interval) + + except KeyboardInterrupt: + logger.info(f"[watch] Received interrupt signal, stopping...") + break + except Exception as e: + logger.error(f"[watch] Error in watch loop: {e}") + time.sleep(interval) # Continue even after errors + + except KeyboardInterrupt: + logger.info(f"[watch] File monitoring stopped by user") + def process_and_upload_changes(self, changed_paths: List[Path]) -> bool: """ Process changed paths and upload delta bundle if meaningful changes exist. 
@@ -944,6 +1006,12 @@ def main(): # Upload from specific directory with custom endpoint python remote_upload_client.py --path /path/to/repo --endpoint http://remote-server:8080 + + # Watch for file changes and upload automatically + python remote_upload_client.py --path /path/to/repo --watch + + # Watch with custom interval (check every 3 seconds) + python remote_upload_client.py --path /path/to/repo --watch --interval 3 """ ) @@ -983,6 +1051,19 @@ def main(): help="Print collection↔workspace mapping information and exit" ) + parser.add_argument( + "--watch", "-w", + action="store_true", + help="Watch for file changes and upload automatically (continuous mode)" + ) + + parser.add_argument( + "--interval", "-i", + type=int, + default=5, + help="Watch interval in seconds (default: 5)" + ) + args = parser.parse_args() # Validate path if provided @@ -1029,6 +1110,50 @@ def main(): logger.error("Remote upload mode is not enabled. Set REMOTE_UPLOAD_ENABLED=1 in environment variables.") return 1 + # Handle watch mode + if args.watch: + logger.info("Starting watch mode for continuous file monitoring") + try: + with RemoteUploadClient( + upload_endpoint=config["upload_endpoint"], + workspace_path=config["workspace_path"], + collection_name=config["collection_name"], + max_retries=config["max_retries"], + timeout=config["timeout"] + ) as client: + + logger.info("Remote upload client initialized successfully") + client.log_mapping_summary() + + # Test server connection first + logger.info("Checking server status...") + status = client.get_server_status() + is_success = ( + isinstance(status, dict) and + 'workspace_path' in status and + 'collection_name' in status and + status.get('status') == 'ready' + ) + if not is_success: + error = status.get("error", {}) + logger.error(f"Cannot connect to server: {error.get('message', 'Unknown error')}") + return 1 + + logger.info("Server connection successful") + logger.info(f"Starting file monitoring with {args.interval}s interval") + + 
# Start the watch loop + client.watch_loop(interval=args.interval) + + return 0 + + except KeyboardInterrupt: + logger.info("Watch mode stopped by user") + return 0 + except Exception as e: + logger.error(f"Watch mode failed: {e}") + return 1 + # Initialize client with context manager for cleanup try: with RemoteUploadClient( diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py new file mode 100644 index 00000000..c697b425 --- /dev/null +++ b/scripts/standalone_upload_client.py @@ -0,0 +1,1389 @@ +#!/usr/bin/env python3 +""" +Standalone Remote Upload Client for Context-Engine. + +This is a self-contained version of the remote upload client that doesn't require +the full Context-Engine repository. It includes only the essential functions +needed for delta bundle creation and upload. + +Example usage: + python3 standalone_upload_client.py --path /path/to/your/project --server https://your-server.com +""" + +import os +import json +import time +import uuid +import hashlib +import tarfile +import tempfile +import logging +import argparse +from pathlib import Path +from typing import Dict, List, Any, Optional, Tuple +from datetime import datetime +import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# ============================================================================= +# EMBEDDED DEPENDENCIES (Extracted from Context-Engine) +# ============================================================================= + +# Language detection mapping (from ingest_code.py) +CODE_EXTS = { + ".py": "python", + ".js": "javascript", + ".ts": "typescript", + ".tsx": "typescript", + ".jsx": "javascript", + ".java": "java", + ".go": "go", + ".rs": "rust", + ".rb": "ruby", + ".php": "php", + ".c": "c", + ".h": "c", + ".cpp": "cpp", + ".cc": "cpp", + ".hpp": "cpp", + ".cs": "csharp", + ".kt": "kotlin", + 
".swift": "swift", + ".scala": "scala", + ".sh": "shell", + ".ps1": "powershell", + ".psm1": "powershell", + ".psd1": "powershell", + ".sql": "sql", + ".md": "markdown", + ".yml": "yaml", + ".yaml": "yaml", + ".toml": "toml", + ".ini": "ini", + ".cfg": "ini", + ".conf": "ini", + ".xml": "xml", + ".html": "html", + ".htm": "html", + ".css": "css", + ".scss": "scss", + ".sass": "sass", + ".less": "less", + ".json": "json", + "Dockerfile": "dockerfile", + "Makefile": "makefile", + ".tf": "terraform", + ".tfvars": "terraform", + ".hcl": "terraform", + ".vue": "vue", + ".svelte": "svelte", + ".elm": "elm", + ".dart": "dart", + ".lua": "lua", + ".r": "r", + ".R": "r", + ".m": "matlab", + ".pl": "perl", + ".swift": "swift", + ".kt": "kotlin", + ".cljs": "clojure", + ".clj": "clojure", + ".hs": "haskell", + ".ml": "ocaml", + ".zig": "zig", + ".nim": "nim", + ".v": "verilog", + ".sv": "verilog", + ".vhdl": "vhdl", + ".asm": "assembly", + ".s": "assembly", + ". Dockerfile": "dockerfile", +} + +def hash_id(text: str, path: str, start: int, end: int) -> str: + """Generate hash ID for content (from ingest_code.py).""" + h = hashlib.sha1( + f"{path}:{start}-{end}\n{text}".encode("utf-8", errors="ignore") + ).hexdigest() + return h[:16] + +def get_collection_name(repo_name: Optional[str] = None) -> str: + """Generate collection name with 8-char hash for local workspaces. + + Simplified version from workspace_state.py. + """ + if not repo_name: + return "default-collection" + hash_obj = hashlib.sha256(repo_name.encode()) + short_hash = hash_obj.hexdigest()[:8] + return f"{repo_name}-{short_hash}" + +def _extract_repo_name_from_path(workspace_path: str) -> str: + """Extract repository name from workspace path. + + Simplified version from workspace_state.py. 
+ """ + try: + path = Path(workspace_path).resolve() + # Get the directory name as repo name + return path.name + except Exception: + return "unknown-repo" + +# Simple file-based hash cache (simplified from workspace_state.py) +class SimpleHashCache: + """Simple file-based hash cache for tracking file changes.""" + + def __init__(self, workspace_path: str, repo_name: str): + self.workspace_path = Path(workspace_path).resolve() + self.repo_name = repo_name + self.cache_dir = self.workspace_path / ".context-engine" + self.cache_file = self.cache_dir / "file_cache.json" + self.cache_dir.mkdir(exist_ok=True) + + def _load_cache(self) -> Dict[str, str]: + """Load cache from disk.""" + if not self.cache_file.exists(): + return {} + try: + with open(self.cache_file, 'r', encoding='utf-8') as f: + data = json.load(f) + return data.get("file_hashes", {}) + except Exception: + return {} + + def _save_cache(self, file_hashes: Dict[str, str]): + """Save cache to disk.""" + try: + data = { + "file_hashes": file_hashes, + "updated_at": datetime.now().isoformat() + } + with open(self.cache_file, 'w', encoding='utf-8') as f: + json.dump(data, f, indent=2) + except Exception: + pass + + def get_hash(self, file_path: str) -> str: + """Get cached file hash.""" + file_hashes = self._load_cache() + abs_path = str(Path(file_path).resolve()) + return file_hashes.get(abs_path, "") + + def set_hash(self, file_path: str, file_hash: str): + """Set cached file hash.""" + file_hashes = self._load_cache() + abs_path = str(Path(file_path).resolve()) + file_hashes[abs_path] = file_hash + self._save_cache(file_hashes) + +# Create global cache instance (will be initialized in RemoteUploadClient) +_hash_cache: Optional[SimpleHashCache] = None + +def get_cached_file_hash(file_path: str, repo_name: Optional[str] = None) -> str: + """Get cached file hash for tracking changes.""" + global _hash_cache + if _hash_cache: + return _hash_cache.get_hash(file_path) + return "" + +def 
set_cached_file_hash(file_path: str, file_hash: str, repo_name: Optional[str] = None): + """Set cached file hash for tracking changes.""" + global _hash_cache + if _hash_cache: + _hash_cache.set_hash(file_path, file_hash) + + +class RemoteUploadClient: + """Client for uploading delta bundles to remote server.""" + + def _translate_to_container_path(self, host_path: str) -> str: + """Translate host path to container path for API communication.""" + # Use environment variable for path mapping if available + host_root = os.environ.get("HOST_ROOT", "/home/coder/project/Context-Engine/dev-workspace") + container_root = os.environ.get("CONTAINER_ROOT", "/work") + + if host_path.startswith(host_root): + return host_path.replace(host_root, container_root) + else: + # Fallback: if path doesn't match expected pattern, use as-is + return host_path + + def __init__(self, + upload_endpoint: str, + workspace_path: str, + collection_name: str, + max_retries: int = 3, + timeout: int = 30, + metadata_path: Optional[str] = None): + """ + Initialize remote upload client. 
+ + Args: + upload_endpoint: HTTP endpoint for delta uploads + workspace_path: Absolute path to workspace (where files are located) + collection_name: Target collection name + max_retries: Maximum number of upload retries + timeout: Request timeout in seconds + metadata_path: Absolute path to metadata directory (for delta bundles) + If None, uses workspace_path/.codebase/delta_bundles + """ + self.upload_endpoint = upload_endpoint.rstrip('/') + self.workspace_path = workspace_path + self.collection_name = collection_name + self.max_retries = max_retries + self.timeout = timeout + + # Use temporary directory for bundle creation - CLI should be stateless + # Temporary bundles are cleaned up after upload + self.temp_dir = None + self.bundle_dir = None # No persistent bundle directory in CLI mode + + # Store repo name for cache operations + self.repo_name = _extract_repo_name_from_path(workspace_path) + + # Initialize hash cache + global _hash_cache + _hash_cache = SimpleHashCache(workspace_path, self.repo_name) + + # Setup HTTP session with retry strategy + self.session = requests.Session() + retry_strategy = Retry( + total=max_retries, + backoff_factor=1, + status_forcelist=[429, 500, 502, 503, 504], + ) + adapter = HTTPAdapter(max_retries=retry_strategy) + self.session.mount("http://", adapter) + self.session.mount("https://", adapter) + + def __enter__(self): + """Context manager entry.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit with cleanup.""" + self.cleanup() + + def cleanup(self): + """Clean up temporary directories.""" + if self.temp_dir and os.path.exists(self.temp_dir): + try: + import shutil + shutil.rmtree(self.temp_dir) + logger.debug(f"[remote_upload] Cleaned up temporary directory: {self.temp_dir}") + except Exception as e: + logger.warning(f"[remote_upload] Failed to cleanup temp directory {self.temp_dir}: {e}") + finally: + self.temp_dir = None + + def get_mapping_summary(self) -> Dict[str, Any]: + 
"""Return derived collection mapping details.""" + container_path = self._translate_to_container_path(self.workspace_path) + return { + "repo_name": self.repo_name, + "collection_name": self.collection_name, + "source_path": self.workspace_path, + "container_path": container_path, + "upload_endpoint": self.upload_endpoint, + } + + def log_mapping_summary(self) -> None: + """Log mapping summary for user visibility.""" + info = self.get_mapping_summary() + logger.info("[remote_upload] Collection mapping:") + logger.info(f" repo_name : {info['repo_name']}") + logger.info(f" collection_name : {info['collection_name']}") + logger.info(f" source_path : {info['source_path']}") + logger.info(f" container_path : {info['container_path']}") + logger.info("[remote_upload] To query remote state later, call the MCP `collection_map` tool.") + + def _get_temp_bundle_dir(self) -> Path: + """Get or create temporary directory for bundle creation.""" + if not self.temp_dir: + self.temp_dir = tempfile.mkdtemp(prefix="delta_bundle_") + return Path(self.temp_dir) + # CLI is stateless - sequence tracking is handled by server + + def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: + """ + Detect what type of changes occurred for each file path. 
+ + Args: + changed_paths: List of changed file paths + + Returns: + Dictionary with change types: created, updated, deleted, moved, unchanged + """ + changes = { + "created": [], + "updated": [], + "deleted": [], + "moved": [], + "unchanged": [] + } + + for path in changed_paths: + abs_path = str(path.resolve()) + cached_hash = get_cached_file_hash(abs_path, self.repo_name) + + if not path.exists(): + # File was deleted + if cached_hash: + changes["deleted"].append(path) + else: + # File exists - calculate current hash + try: + with open(path, 'rb') as f: + content = f.read() + current_hash = hashlib.sha1(content).hexdigest() + + if not cached_hash: + # New file + changes["created"].append(path) + elif cached_hash != current_hash: + # Modified file + changes["updated"].append(path) + else: + # Unchanged (might be a move detection candidate) + changes["unchanged"].append(path) + + # Update cache + set_cached_file_hash(abs_path, current_hash, self.repo_name) + except Exception: + # Skip files that can't be read + continue + + # Detect moves by looking for files with same content hash + # but different paths (requires additional tracking) + changes["moved"] = self._detect_moves(changes["created"], changes["deleted"]) + + return changes + + def _detect_moves(self, created_files: List[Path], deleted_files: List[Path]) -> List[Tuple[Path, Path]]: + """ + Detect file moves by matching content hashes between created and deleted files. 
+ + Args: + created_files: List of newly created files + deleted_files: List of deleted files + + Returns: + List of (source, destination) path tuples for detected moves + """ + moves = [] + deleted_hashes = {} + + # Build hash map for deleted files + for deleted_path in deleted_files: + try: + # Try to get cached hash first, fallback to file content + cached_hash = get_cached_file_hash(str(deleted_path), self.repo_name) + if cached_hash: + deleted_hashes[cached_hash] = deleted_path + continue + + # If no cached hash, try to read from file if it still exists + if deleted_path.exists(): + with open(deleted_path, 'rb') as f: + content = f.read() + file_hash = hashlib.sha1(content).hexdigest() + deleted_hashes[file_hash] = deleted_path + except Exception: + continue + + # Match created files with deleted files by hash + for created_path in created_files: + try: + with open(created_path, 'rb') as f: + content = f.read() + file_hash = hashlib.sha1(content).hexdigest() + + if file_hash in deleted_hashes: + source_path = deleted_hashes[file_hash] + moves.append((source_path, created_path)) + # Remove from consideration + del deleted_hashes[file_hash] + except Exception: + continue + + return moves + + def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, Any]]: + """ + Create a delta bundle from detected changes. 
+ + Args: + changes: Dictionary of file changes by type + + Returns: + Tuple of (bundle_path, manifest_metadata) + """ + bundle_id = str(uuid.uuid4()) + # CLI is stateless - server handles sequence numbers + created_at = datetime.now().isoformat() + + # Create temporary directory for bundle + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Create directory structure + files_dir = temp_path / "files" + metadata_dir = temp_path / "metadata" + files_dir.mkdir() + metadata_dir.mkdir() + + # Create subdirectories + (files_dir / "created").mkdir() + (files_dir / "updated").mkdir() + (files_dir / "moved").mkdir() + + operations = [] + total_size = 0 + file_hashes = {} + + # Process created files + for path in changes["created"]: + rel_path = str(path.relative_to(Path(self.workspace_path))) + try: + with open(path, 'rb') as f: + content = f.read() + file_hash = hashlib.sha1(content).hexdigest() + content_hash = f"sha1:{file_hash}" + + # Write file to bundle + bundle_file_path = files_dir / "created" / rel_path + bundle_file_path.parent.mkdir(parents=True, exist_ok=True) + bundle_file_path.write_bytes(content) + + # Get file info + stat = path.stat() + language = CODE_EXTS.get(path.suffix.lower(), "unknown") + + operation = { + "operation": "created", + "path": rel_path, + "relative_path": rel_path, + "absolute_path": str(path.resolve()), + "size_bytes": stat.st_size, + "content_hash": content_hash, + "file_hash": f"sha1:{hash_id(content.decode('utf-8', errors='ignore'), rel_path, 1, len(content.splitlines()))}", + "modified_time": datetime.fromtimestamp(stat.st_mtime).isoformat(), + "language": language + } + operations.append(operation) + file_hashes[rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + + except Exception as e: + print(f"[bundle_create] Error processing created file {path}: {e}") + continue + + # Process updated files + for path in changes["updated"]: + rel_path = str(path.relative_to(Path(self.workspace_path))) 
+ try: + with open(path, 'rb') as f: + content = f.read() + file_hash = hashlib.sha1(content).hexdigest() + content_hash = f"sha1:{file_hash}" + previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name) + + # Write file to bundle + bundle_file_path = files_dir / "updated" / rel_path + bundle_file_path.parent.mkdir(parents=True, exist_ok=True) + bundle_file_path.write_bytes(content) + + # Get file info + stat = path.stat() + language = CODE_EXTS.get(path.suffix.lower(), "unknown") + + operation = { + "operation": "updated", + "path": rel_path, + "relative_path": rel_path, + "absolute_path": str(path.resolve()), + "size_bytes": stat.st_size, + "content_hash": content_hash, + "previous_hash": f"sha1:{previous_hash}" if previous_hash else None, + "file_hash": f"sha1:{hash_id(content.decode('utf-8', errors='ignore'), rel_path, 1, len(content.splitlines()))}", + "modified_time": datetime.fromtimestamp(stat.st_mtime).isoformat(), + "language": language + } + operations.append(operation) + file_hashes[rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + + except Exception as e: + print(f"[bundle_create] Error processing updated file {path}: {e}") + continue + + # Process moved files + for source_path, dest_path in changes["moved"]: + dest_rel_path = str(dest_path.relative_to(Path(self.workspace_path))) + source_rel_path = str(source_path.relative_to(Path(self.workspace_path))) + try: + with open(dest_path, 'rb') as f: + content = f.read() + file_hash = hashlib.sha1(content).hexdigest() + content_hash = f"sha1:{file_hash}" + + # Write file to bundle + bundle_file_path = files_dir / "moved" / dest_rel_path + bundle_file_path.parent.mkdir(parents=True, exist_ok=True) + bundle_file_path.write_bytes(content) + + # Get file info + stat = dest_path.stat() + language = CODE_EXTS.get(dest_path.suffix.lower(), "unknown") + + operation = { + "operation": "moved", + "path": dest_rel_path, + "relative_path": dest_rel_path, + "absolute_path": 
str(dest_path.resolve()), + "source_path": source_rel_path, + "source_relative_path": source_rel_path, + "source_absolute_path": str(source_path.resolve()), + "size_bytes": stat.st_size, + "content_hash": content_hash, + "file_hash": f"sha1:{idx.hash_id(content.decode('utf-8', errors='ignore'), dest_rel_path, 1, len(content.splitlines()))}", + "modified_time": datetime.fromtimestamp(stat.st_mtime).isoformat(), + "language": language + } + operations.append(operation) + file_hashes[dest_rel_path] = f"sha1:{file_hash}" + total_size += stat.st_size + + except Exception as e: + print(f"[bundle_create] Error processing moved file {source_path} -> {dest_path}: {e}") + continue + + # Process deleted files + for path in changes["deleted"]: + rel_path = str(path.relative_to(Path(self.workspace_path))) + try: + previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name) + + operation = { + "operation": "deleted", + "path": rel_path, + "relative_path": rel_path, + "absolute_path": str(path.resolve()), + "previous_hash": f"sha1:{previous_hash}" if previous_hash else None, + "file_hash": None, + "modified_time": datetime.now().isoformat(), + "language": idx.CODE_EXTS.get(path.suffix.lower(), "unknown") + } + operations.append(operation) + + except Exception as e: + print(f"[bundle_create] Error processing deleted file {path}: {e}") + continue + + # Create manifest + manifest = { + "version": "1.0", + "bundle_id": bundle_id, + "workspace_path": self.workspace_path, + "collection_name": self.collection_name, + "created_at": created_at, + # CLI is stateless - server will assign sequence numbers + "sequence_number": None, # Server will assign + "parent_sequence": None, # Server will determine + "operations": { + "created": len(changes["created"]), + "updated": len(changes["updated"]), + "deleted": len(changes["deleted"]), + "moved": len(changes["moved"]) + }, + "total_files": len(operations), + "total_size_bytes": total_size, + "compression": "gzip", + "encoding": 
"utf-8" + } + + # Write manifest + (temp_path / "manifest.json").write_text(json.dumps(manifest, indent=2)) + + # Write operations metadata + operations_metadata = { + "operations": operations + } + (metadata_dir / "operations.json").write_text(json.dumps(operations_metadata, indent=2)) + + # Write hashes + hashes_metadata = { + "workspace_path": self.workspace_path, + "updated_at": created_at, + "file_hashes": file_hashes + } + (metadata_dir / "hashes.json").write_text(json.dumps(hashes_metadata, indent=2)) + + # Create tarball in temporary directory + temp_bundle_dir = self._get_temp_bundle_dir() + bundle_path = temp_bundle_dir / f"{bundle_id}.tar.gz" + with tarfile.open(bundle_path, "w:gz") as tar: + tar.add(temp_path, arcname=f"{bundle_id}") + + return str(bundle_path), manifest + + def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, Any]: + """ + Upload delta bundle to remote server with exponential backoff retry. + + Args: + bundle_path: Path to the bundle tarball + manifest: Bundle manifest metadata + + Returns: + Server response dictionary + """ + last_error = None + + for attempt in range(self.max_retries + 1): + try: + # Calculate backoff delay (exponential with jitter) + if attempt > 0: + base_delay = 2 ** (attempt - 1) # 1, 2, 4, 8... 
+ jitter = base_delay * 0.1 * (0.5 + (hash(str(time.time())) % 100) / 100) + delay = min(base_delay + jitter, 30) # Cap at 30 seconds + logger.info(f"[remote_upload] Retry attempt {attempt + 1}/{self.max_retries + 1} after {delay:.2f}s delay") + time.sleep(delay) + + # Verify bundle exists before attempting upload + if not os.path.exists(bundle_path): + return { + "success": False, + "error": { + "code": "BUNDLE_NOT_FOUND", + "message": f"Bundle file not found: {bundle_path}" + } + } + + # Check bundle size + bundle_size = os.path.getsize(bundle_path) + max_size_mb = 100 # Default max size + max_size_bytes = max_size_mb * 1024 * 1024 + + if bundle_size > max_size_bytes: + return { + "success": False, + "error": { + "code": "BUNDLE_TOO_LARGE", + "message": f"Bundle size {bundle_size} bytes exceeds maximum {max_size_bytes} bytes" + } + } + + with open(bundle_path, 'rb') as bundle_file: + files = { + 'bundle': (f"{manifest['bundle_id']}.tar.gz", bundle_file, 'application/gzip') + } + + data = { + 'workspace_path': self._translate_to_container_path(self.workspace_path), + 'collection_name': self.collection_name, + # CLI is stateless - server handles sequence numbers + 'force': 'false', + 'source_path': self.workspace_path, + } + + logger.info(f"[remote_upload] Uploading bundle {manifest['bundle_id']} (size: {bundle_size} bytes)") + + response = self.session.post( + f"{self.upload_endpoint}/api/v1/delta/upload", + files=files, + data=data, + timeout=self.timeout + ) + + if response.status_code == 200: + result = response.json() + logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") + return result + else: + error_msg = f"Upload failed with status {response.status_code}" + try: + error_detail = response.json() + error_detail_msg = error_detail.get('error', {}).get('message', 'Unknown error') + error_msg += f": {error_detail_msg}" + error_code = error_detail.get('error', {}).get('code', 'HTTP_ERROR') + except: + error_msg += f": 
{response.text[:200]}" # Truncate long responses + error_code = "HTTP_ERROR" + + last_error = { + "success": False, + "error": { + "code": error_code, + "message": error_msg, + "status_code": response.status_code + } + } + + # Don't retry on client errors (4xx) + if 400 <= response.status_code < 500 and response.status_code != 429: + logger.warning(f"[remote_upload] Client error {response.status_code}, not retrying: {error_msg}") + return last_error + + logger.warning(f"[remote_upload] Upload attempt {attempt + 1} failed: {error_msg}") + + except requests.exceptions.Timeout as e: + last_error = { + "success": False, + "error": { + "code": "TIMEOUT_ERROR", + "message": f"Upload timeout after {self.timeout}s: {str(e)}" + } + } + logger.warning(f"[remote_upload] Upload timeout on attempt {attempt + 1}: {e}") + + except requests.exceptions.ConnectionError as e: + last_error = { + "success": False, + "error": { + "code": "CONNECTION_ERROR", + "message": f"Connection error during upload: {str(e)}" + } + } + logger.warning(f"[remote_upload] Connection error on attempt {attempt + 1}: {e}") + + except requests.exceptions.RequestException as e: + last_error = { + "success": False, + "error": { + "code": "NETWORK_ERROR", + "message": f"Network error during upload: {str(e)}" + } + } + logger.warning(f"[remote_upload] Network error on attempt {attempt + 1}: {e}") + + except Exception as e: + last_error = { + "success": False, + "error": { + "code": "UPLOAD_ERROR", + "message": f"Unexpected error during upload: {str(e)}" + } + } + logger.error(f"[remote_upload] Unexpected error on attempt {attempt + 1}: {e}") + + # All retries exhausted + logger.error(f"[remote_upload] All {self.max_retries + 1} upload attempts failed for bundle {manifest.get('bundle_id', 'unknown')}") + return last_error or { + "success": False, + "error": { + "code": "MAX_RETRIES_EXCEEDED", + "message": f"Upload failed after {self.max_retries + 1} attempts" + } + } + + def get_server_status(self) -> Dict[str, 
Any]: + """Get server status and last sequence number with enhanced error handling.""" + try: + logger.debug(f"[remote_upload] Checking server status at {self.upload_endpoint}") + + # Translate host path to container path for API communication + container_workspace_path = self._translate_to_container_path(self.workspace_path) + + response = self.session.get( + f"{self.upload_endpoint}/api/v1/delta/status", + params={'workspace_path': container_workspace_path}, + timeout=min(self.timeout, 10) # Use shorter timeout for status checks + ) + + if response.status_code == 200: + status_data = response.json() + logger.debug(f"[remote_upload] Server status: {status_data}") + return status_data + else: + error_msg = f"Status check failed with HTTP {response.status_code}" + try: + error_detail = response.json() + error_detail_msg = error_detail.get('error', {}).get('message', 'Unknown error') + error_msg += f": {error_detail_msg}" + except: + error_msg += f": {response.text[:100]}" + + logger.warning(f"[remote_upload] {error_msg}") + return { + "success": False, + "error": { + "code": "STATUS_ERROR", + "message": error_msg, + "status_code": response.status_code + } + } + + except requests.exceptions.Timeout as e: + error_msg = f"Status check timeout after {min(self.timeout, 10)}s" + logger.warning(f"[remote_upload] {error_msg}: {e}") + return { + "success": False, + "error": { + "code": "STATUS_TIMEOUT", + "message": error_msg + } + } + except requests.exceptions.ConnectionError as e: + error_msg = f"Cannot connect to server at {self.upload_endpoint}" + logger.warning(f"[remote_upload] {error_msg}: {e}") + return { + "success": False, + "error": { + "code": "CONNECTION_ERROR", + "message": error_msg + } + } + except requests.exceptions.RequestException as e: + error_msg = f"Network error during status check: {str(e)}" + logger.warning(f"[remote_upload] {error_msg}") + return { + "success": False, + "error": { + "code": "NETWORK_ERROR", + "message": error_msg + } + } + except 
Exception as e: + error_msg = f"Unexpected error during status check: {str(e)}" + logger.error(f"[remote_upload] {error_msg}") + return { + "success": False, + "error": { + "code": "STATUS_CHECK_ERROR", + "message": error_msg + } + } + + def has_meaningful_changes(self, changes: Dict[str, List]) -> bool: + """Check if changes warrant a delta upload.""" + total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged") + return total_changes > 0 + + def process_changes_and_upload(self, changes: Dict[str, List]) -> bool: + """ + Process pre-computed changes and upload delta bundle. + Includes comprehensive error handling and graceful fallback. + + Args: + changes: Dictionary of file changes by type + + Returns: + True if upload was successful, False otherwise + """ + try: + logger.info(f"[remote_upload] Processing pre-computed changes") + + # Validate input + if not changes: + logger.info("[remote_upload] No changes provided") + return True + + if not self.has_meaningful_changes(changes): + logger.info("[remote_upload] No meaningful changes detected, skipping upload") + return True + + # Log change summary + total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged") + logger.info(f"[remote_upload] Detected {total_changes} meaningful changes: " + f"{len(changes['created'])} created, {len(changes['updated'])} updated, " + f"{len(changes['deleted'])} deleted, {len(changes['moved'])} moved") + + # Create delta bundle + bundle_path = None + try: + bundle_path, manifest = self.create_delta_bundle(changes) + logger.info(f"[remote_upload] Created delta bundle: {manifest['bundle_id']} " + f"(size: {manifest['total_size_bytes']} bytes)") + + # Validate bundle was created successfully + if not bundle_path or not os.path.exists(bundle_path): + raise RuntimeError(f"Failed to create bundle at {bundle_path}") + + except Exception as e: + logger.error(f"[remote_upload] Error creating delta bundle: {e}") + # Clean up any temporary files 
on failure + self.cleanup() + return False + + # Upload bundle with retry logic + try: + response = self.upload_bundle(bundle_path, manifest) + + if response.get("success", False): + processed_ops = response.get('processed_operations', {}) + logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") + logger.info(f"[remote_upload] Processed operations: {processed_ops}") + + # Clean up temporary bundle after successful upload + try: + if os.path.exists(bundle_path): + os.remove(bundle_path) + logger.debug(f"[remote_upload] Cleaned up temporary bundle: {bundle_path}") + # Also clean up the entire temp directory if this is the last bundle + self.cleanup() + except Exception as cleanup_error: + logger.warning(f"[remote_upload] Failed to cleanup bundle {bundle_path}: {cleanup_error}") + + return True + else: + error_msg = response.get('error', {}).get('message', 'Unknown upload error') + logger.error(f"[remote_upload] Upload failed: {error_msg}") + return False + + except Exception as e: + logger.error(f"[remote_upload] Error uploading bundle: {e}") + return False + + except Exception as e: + logger.error(f"[remote_upload] Unexpected error in process_changes_and_upload: {e}") + return False + + def watch_loop(self, interval: int = 5): + """Main file watching loop using existing detection and upload methods.""" + logger.info(f"[watch] Starting file monitoring (interval: {interval}s)") + logger.info(f"[watch] Monitoring: {self.workspace_path}") + logger.info(f"[watch] Press Ctrl+C to stop") + + try: + while True: + try: + # Use existing change detection (get all files in workspace) + all_files = self.get_all_code_files() + changes = self.detect_file_changes(all_files) + + # Count only meaningful changes (exclude unchanged) + meaningful_changes = len(changes.get("created", [])) + len(changes.get("updated", [])) + len(changes.get("deleted", [])) + len(changes.get("moved", [])) + + if meaningful_changes > 0: + logger.info(f"[watch] Detected 
{meaningful_changes} changes: { {k: len(v) for k, v in changes.items() if k != 'unchanged'} }") + + # Use existing upload method + success = self.process_changes_and_upload(changes) + + if success: + logger.info(f"[watch] Successfully uploaded changes") + else: + logger.error(f"[watch] Failed to upload changes") + else: + logger.debug(f"[watch] No changes detected") # Debug level to avoid spam + + # Sleep until next check + time.sleep(interval) + + except KeyboardInterrupt: + logger.info(f"[watch] Received interrupt signal, stopping...") + break + except Exception as e: + logger.error(f"[watch] Error in watch loop: {e}") + time.sleep(interval) # Continue even after errors + + except KeyboardInterrupt: + logger.info(f"[watch] File monitoring stopped by user") + + def get_all_code_files(self) -> List[Path]: + """Get all code files in the workspace.""" + all_files = [] + try: + workspace_path = Path(self.workspace_path) + for ext in CODE_EXTS: + all_files.extend(workspace_path.rglob(f"*{ext}")) + + # Filter out directories and hidden files + all_files = [ + f for f in all_files + if f.is_file() + and not any(part.startswith('.') for part in f.parts) + and '.context-engine' not in str(f) + ] + except Exception as e: + logger.error(f"[watch] Error scanning files: {e}") + + return all_files + + def process_and_upload_changes(self, changed_paths: List[Path]) -> bool: + """ + Process changed paths and upload delta bundle if meaningful changes exist. + Includes comprehensive error handling and graceful fallback. 
+ + Args: + changed_paths: List of changed file paths + + Returns: + True if upload was successful, False otherwise + """ + try: + logger.info(f"[remote_upload] Processing {len(changed_paths)} changed paths") + + # Validate input + if not changed_paths: + logger.info("[remote_upload] No changed paths provided") + return True + + # Detect changes + try: + changes = self.detect_file_changes(changed_paths) + except Exception as e: + logger.error(f"[remote_upload] Error detecting file changes: {e}") + return False + + if not self.has_meaningful_changes(changes): + logger.info("[remote_upload] No meaningful changes detected, skipping upload") + return True + + # Log change summary + total_changes = sum(len(files) for op, files in changes.items() if op != "unchanged") + logger.info(f"[remote_upload] Detected {total_changes} meaningful changes: " + f"{len(changes['created'])} created, {len(changes['updated'])} updated, " + f"{len(changes['deleted'])} deleted, {len(changes['moved'])} moved") + + # Create delta bundle + bundle_path = None + try: + bundle_path, manifest = self.create_delta_bundle(changes) + logger.info(f"[remote_upload] Created delta bundle: {manifest['bundle_id']} " + f"(size: {manifest['total_size_bytes']} bytes)") + + # Validate bundle was created successfully + if not bundle_path or not os.path.exists(bundle_path): + raise RuntimeError(f"Failed to create bundle at {bundle_path}") + + except Exception as e: + logger.error(f"[remote_upload] Error creating delta bundle: {e}") + # Clean up any temporary files on failure + self.cleanup() + return False + + # Upload bundle with retry logic + try: + response = self.upload_bundle(bundle_path, manifest) + + if response.get("success", False): + processed_ops = response.get('processed_operations', {}) + logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") + logger.info(f"[remote_upload] Processed operations: {processed_ops}") + + # Clean up temporary bundle after successful upload + 
try: + if os.path.exists(bundle_path): + os.remove(bundle_path) + logger.debug(f"[remote_upload] Cleaned up temporary bundle: {bundle_path}") + # Also clean up the entire temp directory if this is the last bundle + self.cleanup() + except Exception as cleanup_error: + logger.warning(f"[remote_upload] Failed to cleanup bundle {bundle_path}: {cleanup_error}") + + return True + else: + error = response.get("error", {}) + error_code = error.get("code", "UNKNOWN") + error_msg = error.get("message", "Unknown error") + + logger.error(f"[remote_upload] Upload failed: {error_msg}") + + # Handle specific error types + # CLI is stateless - server handles sequence management + if error_code in ["BUNDLE_TOO_LARGE", "BUNDLE_NOT_FOUND"]: + # These are unrecoverable errors + logger.error(f"[remote_upload] Unrecoverable error ({error_code}): {error_msg}") + return False + elif error_code in ["TIMEOUT_ERROR", "CONNECTION_ERROR", "NETWORK_ERROR"]: + # These might be temporary, suggest fallback + logger.warning(f"[remote_upload] Network-related error ({error_code}): {error_msg}") + logger.warning("[remote_upload] Consider falling back to local mode if this persists") + return False + else: + # Other errors + logger.error(f"[remote_upload] Upload error ({error_code}): {error_msg}") + return False + + except Exception as e: + logger.error(f"[remote_upload] Unexpected error during upload: {e}") + return False + + except Exception as e: + logger.error(f"[remote_upload] Critical error in process_and_upload_changes: {e}") + logger.exception("[remote_upload] Full traceback:") + return False + + # CLI is stateless - sequence mismatch handling is done by server + + +def is_remote_mode_enabled() -> bool: + """Check if remote upload mode is enabled via environment variables.""" + return os.environ.get("REMOTE_UPLOAD_ENABLED", "").lower() in {"1", "true", "yes", "on"} + + +def get_remote_config(cli_path: Optional[str] = None) -> Dict[str, str]: + """Get remote upload configuration from 
environment variables and command-line arguments.""" + # Use command-line path if provided, otherwise fall back to environment variables + if cli_path: + workspace_path = cli_path + else: + workspace_path = os.environ.get("WATCH_ROOT", os.environ.get("WORKSPACE_PATH", "/work")) + + # Use auto-generated collection name based on repo name + repo_name = _extract_repo_name_from_path(workspace_path) + # Fallback to directory name if repo detection fails + if not repo_name: + repo_name = Path(workspace_path).name + collection_name = get_collection_name(repo_name) + + return { + "upload_endpoint": os.environ.get("REMOTE_UPLOAD_ENDPOINT", "http://localhost:8080"), + "workspace_path": workspace_path, + "collection_name": collection_name, + "max_retries": int(os.environ.get("REMOTE_UPLOAD_MAX_RETRIES", "3")), + "timeout": int(os.environ.get("REMOTE_UPLOAD_TIMEOUT", "30")) + } + + +def main(): + """Main entry point for the remote upload client.""" + parser = argparse.ArgumentParser( + description="Remote upload client for delta bundles in Context-Engine", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Upload from current directory or environment variables + python remote_upload_client.py + + # Upload from specific directory + python remote_upload_client.py --path /path/to/repo + + # Upload from specific directory with custom endpoint + python remote_upload_client.py --path /path/to/repo --endpoint http://remote-server:8080 + """ + ) + + parser.add_argument( + "--path", + type=str, + help="Path to the directory to upload (overrides WATCH_ROOT/WORKSPACE_PATH environment variables)" + ) + + parser.add_argument( + "--endpoint", + type=str, + help="Remote upload endpoint (overrides REMOTE_UPLOAD_ENDPOINT environment variable)" + ) + + parser.add_argument( + "--max-retries", + type=int, + help="Maximum number of upload retries (overrides REMOTE_UPLOAD_MAX_RETRIES environment variable)" + ) + + parser.add_argument( + "--timeout", + type=int, + 
help="Request timeout in seconds (overrides REMOTE_UPLOAD_TIMEOUT environment variable)" + ) + + parser.add_argument( + "--force", + action="store_true", + help="Force upload of all files (ignore cached state and treat all files as new)" + ) + + parser.add_argument( + "--show-mapping", + action="store_true", + help="Print collection↔workspace mapping information and exit" + ) + + parser.add_argument( + "--watch", "-w", + action="store_true", + help="Watch for file changes and upload automatically (continuous mode)" + ) + + parser.add_argument( + "--interval", "-i", + type=int, + default=5, + help="Watch interval in seconds (default: 5)" + ) + + args = parser.parse_args() + + # Validate path if provided + if args.path: + if not os.path.exists(args.path): + logger.error(f"Path does not exist: {args.path}") + return 1 + + if not os.path.isdir(args.path): + logger.error(f"Path is not a directory: {args.path}") + return 1 + + args.path = os.path.abspath(args.path) + logger.info(f"Using specified path: {args.path}") + + # Get configuration + config = get_remote_config(args.path) + + # Override config with command-line arguments if provided + if args.endpoint: + config["upload_endpoint"] = args.endpoint + if args.max_retries is not None: + config["max_retries"] = args.max_retries + if args.timeout is not None: + config["timeout"] = args.timeout + + logger.info(f"Workspace path: {config['workspace_path']}") + logger.info(f"Collection name: {config['collection_name']}") + logger.info(f"Upload endpoint: {config['upload_endpoint']}") + + if args.show_mapping: + with RemoteUploadClient( + upload_endpoint=config["upload_endpoint"], + workspace_path=config["workspace_path"], + collection_name=config["collection_name"], + max_retries=config["max_retries"], + timeout=config["timeout"], + ) as client: + client.log_mapping_summary() + return 0 + + # Check if remote mode is enabled + if not is_remote_mode_enabled(): + logger.error("Remote upload mode is not enabled. 
Set REMOTE_UPLOAD_ENABLED=1 in environment variables.") + return 1 + + # Handle watch mode + if args.watch: + logger.info("Starting watch mode for continuous file monitoring") + try: + with RemoteUploadClient( + upload_endpoint=config["upload_endpoint"], + workspace_path=config["workspace_path"], + collection_name=config["collection_name"], + max_retries=config["max_retries"], + timeout=config["timeout"] + ) as client: + + logger.info("Remote upload client initialized successfully") + client.log_mapping_summary() + + # Test server connection first + logger.info("Checking server status...") + status = client.get_server_status() + is_success = ( + isinstance(status, dict) and + 'workspace_path' in status and + 'collection_name' in status and + status.get('status') == 'ready' + ) + if not is_success: + error = status.get("error", {}) + logger.error(f"Cannot connect to server: {error.get('message', 'Unknown error')}") + return 1 + + logger.info("Server connection successful") + logger.info(f"Starting file monitoring with {args.interval}s interval") + + # Start the watch loop + client.watch_loop(interval=args.interval) + + return 0 + + except KeyboardInterrupt: + logger.info("Watch mode stopped by user") + return 0 + except Exception as e: + logger.error(f"Watch mode failed: {e}") + return 1 + + # Single upload mode (original logic) + # Initialize client with context manager for cleanup + try: + with RemoteUploadClient( + upload_endpoint=config["upload_endpoint"], + workspace_path=config["workspace_path"], + collection_name=config["collection_name"], + max_retries=config["max_retries"], + timeout=config["timeout"] + ) as client: + + logger.info("Remote upload client initialized successfully") + + client.log_mapping_summary() + + # Test server connection + logger.info("Checking server status...") + status = client.get_server_status() + # For delta endpoint, success is indicated by having expected fields (not a "success" boolean) + is_success = ( + isinstance(status, 
dict) and + 'workspace_path' in status and + 'collection_name' in status and + status.get('status') == 'ready' + ) + if not is_success: + error = status.get("error", {}) + logger.error(f"Cannot connect to server: {error.get('message', 'Unknown error')}") + return 1 + + logger.info("Server connection successful") + + # Scan repository and upload files + logger.info("Scanning repository for files...") + workspace_path = Path(config['workspace_path']) + + # Find all files in the repository + all_files = [] + for file_path in workspace_path.rglob('*'): + if file_path.is_file() and not file_path.name.startswith('.'): + rel_path = file_path.relative_to(workspace_path) + # Skip .codebase directory and other metadata + if not str(rel_path).startswith('.codebase'): + all_files.append(file_path) + + logger.info(f"Found {len(all_files)} files to upload") + + if not all_files: + logger.warning("No files found to upload") + return 0 + + # Detect changes (treat all files as changes for initial upload) + if args.force: + # Force mode: treat all files as created + changes = {"created": all_files, "updated": [], "deleted": [], "moved": [], "unchanged": []} + else: + changes = client.detect_file_changes(all_files) + + if not client.has_meaningful_changes(changes): + logger.info("No meaningful changes to upload") + return 0 + + logger.info(f"Changes detected: {len(changes.get('created', []))} created, {len(changes.get('updated', []))} updated, {len(changes.get('deleted', []))} deleted") + + # Process and upload changes + logger.info("Uploading files to remote server...") + success = client.process_changes_and_upload(changes) + + if success: + logger.info("Repository upload completed successfully!") + logger.info(f"Collection name: {config['collection_name']}") + logger.info(f"Files uploaded: {len(all_files)}") + else: + logger.error("Repository upload failed!") + return 1 + + return 0 + + except Exception as e: + logger.error(f"Failed to initialize remote upload client: {e}") + 
return 1 + + +if __name__ == "__main__": + import sys + sys.exit(main()) From 79c0eca689c2adc680af8cb83f5631bf3905e83a Mon Sep 17 00:00:00 2001 From: Reese Date: Wed, 12 Nov 2025 11:36:10 +0000 Subject: [PATCH 12/16] refactor(upload): simplify error handling and retry logic Streamline upload client implementations by: - Removing complex jitter calculations in favor of simple exponential backoff - Consolidating error response formatting and dictionary structures - Simplifying exception handling across upload and status check methods - Reducing code verbosity while maintaining identical functionality - Making error messages more concise and consistent --- scripts/remote_upload_client.py | 251 ++++++++-------------------- scripts/standalone_upload_client.py | 247 +++++++-------------------- 2 files changed, 132 insertions(+), 366 deletions(-) diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index ec433715..39337bc0 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -18,7 +18,6 @@ import hashlib import tarfile import tempfile -import threading import logging import argparse from pathlib import Path @@ -59,52 +58,32 @@ def _translate_to_container_path(self, host_path: str) -> str: # Fallback: if path doesn't match expected pattern, use as-is return host_path - def __init__(self, - upload_endpoint: str, - workspace_path: str, - collection_name: str, - max_retries: int = 3, - timeout: int = 30, - metadata_path: Optional[str] = None): - """ - Initialize remote upload client. 
- - Args: - upload_endpoint: HTTP endpoint for delta uploads - workspace_path: Absolute path to workspace (where files are located) - collection_name: Target collection name - max_retries: Maximum number of upload retries - timeout: Request timeout in seconds - metadata_path: Absolute path to metadata directory (for delta bundles) - If None, uses workspace_path/.codebase/delta_bundles - """ + def __init__(self, upload_endpoint: str, workspace_path: str, collection_name: str, + max_retries: int = 3, timeout: int = 30, metadata_path: Optional[str] = None): + """Initialize remote upload client.""" self.upload_endpoint = upload_endpoint.rstrip('/') self.workspace_path = workspace_path self.collection_name = collection_name self.max_retries = max_retries self.timeout = timeout - - # Use temporary directory for bundle creation - CLI should be stateless - # Temporary bundles are cleaned up after upload self.temp_dir = None - self.bundle_dir = None # No persistent bundle directory in CLI mode - # Store repo name for cache operations - # Import here to avoid circular imports + # Set environment variables for cache functions + os.environ["WORKSPACE_PATH"] = workspace_path + + # Get repo name for cache operations try: from scripts.workspace_state import _extract_repo_name_from_path self.repo_name = _extract_repo_name_from_path(workspace_path) + # Fallback to directory name if repo detection fails (for non-git repos) + if not self.repo_name: + self.repo_name = Path(workspace_path).name except ImportError: - # Fallback: use directory name as repo name self.repo_name = Path(workspace_path).name - # Setup HTTP session with retry strategy + # Setup HTTP session with simple retry self.session = requests.Session() - retry_strategy = Retry( - total=max_retries, - backoff_factor=1, - status_forcelist=[429, 500, 502, 503, 504], - ) + retry_strategy = Retry(total=max_retries, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]) adapter = HTTPAdapter(max_retries=retry_strategy) 
self.session.mount("http://", adapter) self.session.mount("https://", adapter) @@ -144,11 +123,10 @@ def log_mapping_summary(self) -> None: """Log mapping summary for user visibility.""" info = self.get_mapping_summary() logger.info("[remote_upload] Collection mapping:") - logger.info(f" repo_name : {info['repo_name']}") - logger.info(f" collection_name : {info['collection_name']}") - logger.info(f" source_path : {info['source_path']}") - logger.info(f" container_path : {info['container_path']}") - logger.info("[remote_upload] To query remote state later, call the MCP `collection_map` tool.") + logger.info(f" repo_name: {info['repo_name']}") + logger.info(f" collection_name: {info['collection_name']}") + logger.info(f" source_path: {info['source_path']}") + logger.info(f" container_path: {info['container_path']}") def _get_temp_bundle_dir(self) -> Path: """Get or create temporary directory for bundle creation.""" @@ -495,37 +473,20 @@ def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, for attempt in range(self.max_retries + 1): try: - # Calculate backoff delay (exponential with jitter) + # Simple exponential backoff if attempt > 0: - base_delay = 2 ** (attempt - 1) # 1, 2, 4, 8... - jitter = base_delay * 0.1 * (0.5 + (hash(str(time.time())) % 100) / 100) - delay = min(base_delay + jitter, 30) # Cap at 30 seconds - logger.info(f"[remote_upload] Retry attempt {attempt + 1}/{self.max_retries + 1} after {delay:.2f}s delay") + delay = min(2 ** (attempt - 1), 30) # 1, 2, 4, 8... 
capped at 30s + logger.info(f"[remote_upload] Retry attempt {attempt + 1}/{self.max_retries + 1} after {delay}s delay") time.sleep(delay) - # Verify bundle exists before attempting upload + # Verify bundle exists if not os.path.exists(bundle_path): - return { - "success": False, - "error": { - "code": "BUNDLE_NOT_FOUND", - "message": f"Bundle file not found: {bundle_path}" - } - } + return {"success": False, "error": {"code": "BUNDLE_NOT_FOUND", "message": f"Bundle not found: {bundle_path}"}} - # Check bundle size + # Check bundle size (100MB limit) bundle_size = os.path.getsize(bundle_path) - max_size_mb = 100 # Default max size - max_size_bytes = max_size_mb * 1024 * 1024 - - if bundle_size > max_size_bytes: - return { - "success": False, - "error": { - "code": "BUNDLE_TOO_LARGE", - "message": f"Bundle size {bundle_size} bytes exceeds maximum {max_size_bytes} bytes" - } - } + if bundle_size > 100 * 1024 * 1024: + return {"success": False, "error": {"code": "BUNDLE_TOO_LARGE", "message": f"Bundle too large: {bundle_size} bytes"}} with open(bundle_path, 'rb') as bundle_file: files = { @@ -553,71 +514,40 @@ def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, result = response.json() logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") return result - else: - error_msg = f"Upload failed with status {response.status_code}" - try: - error_detail = response.json() - error_detail_msg = error_detail.get('error', {}).get('message', 'Unknown error') - error_msg += f": {error_detail_msg}" - error_code = error_detail.get('error', {}).get('code', 'HTTP_ERROR') - except: - error_msg += f": {response.text[:200]}" # Truncate long responses - error_code = "HTTP_ERROR" - - last_error = { - "success": False, - "error": { - "code": error_code, - "message": error_msg, - "status_code": response.status_code - } - } - - # Don't retry on client errors (4xx) - if 400 <= response.status_code < 500 and response.status_code != 429: 
- logger.warning(f"[remote_upload] Client error {response.status_code}, not retrying: {error_msg}") - return last_error - - logger.warning(f"[remote_upload] Upload attempt {attempt + 1} failed: {error_msg}") + + # Handle error + error_msg = f"Upload failed with status {response.status_code}" + try: + error_detail = response.json() + error_detail_msg = error_detail.get('error', {}).get('message', 'Unknown error') + error_msg += f": {error_detail_msg}" + error_code = error_detail.get('error', {}).get('code', 'HTTP_ERROR') + except: + error_msg += f": {response.text[:200]}" + error_code = "HTTP_ERROR" + + last_error = {"success": False, "error": {"code": error_code, "message": error_msg, "status_code": response.status_code}} + + # Don't retry on client errors (except 429) + if 400 <= response.status_code < 500 and response.status_code != 429: + return last_error + + logger.warning(f"[remote_upload] Upload attempt {attempt + 1} failed: {error_msg}") except requests.exceptions.Timeout as e: - last_error = { - "success": False, - "error": { - "code": "TIMEOUT_ERROR", - "message": f"Upload timeout after {self.timeout}s: {str(e)}" - } - } + last_error = {"success": False, "error": {"code": "TIMEOUT_ERROR", "message": f"Upload timeout: {str(e)}"}} logger.warning(f"[remote_upload] Upload timeout on attempt {attempt + 1}: {e}") except requests.exceptions.ConnectionError as e: - last_error = { - "success": False, - "error": { - "code": "CONNECTION_ERROR", - "message": f"Connection error during upload: {str(e)}" - } - } + last_error = {"success": False, "error": {"code": "CONNECTION_ERROR", "message": f"Connection error: {str(e)}"}} logger.warning(f"[remote_upload] Connection error on attempt {attempt + 1}: {e}") except requests.exceptions.RequestException as e: - last_error = { - "success": False, - "error": { - "code": "NETWORK_ERROR", - "message": f"Network error during upload: {str(e)}" - } - } + last_error = {"success": False, "error": {"code": "NETWORK_ERROR", "message": 
f"Network error: {str(e)}"}} logger.warning(f"[remote_upload] Network error on attempt {attempt + 1}: {e}") except Exception as e: - last_error = { - "success": False, - "error": { - "code": "UPLOAD_ERROR", - "message": f"Unexpected error during upload: {str(e)}" - } - } + last_error = {"success": False, "error": {"code": "UPLOAD_ERROR", "message": f"Upload error: {str(e)}"}} logger.error(f"[remote_upload] Unexpected error on attempt {attempt + 1}: {e}") # All retries exhausted @@ -631,82 +561,35 @@ def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, } def get_server_status(self) -> Dict[str, Any]: - """Get server status and last sequence number with enhanced error handling.""" + """Get server status with simplified error handling.""" try: - logger.debug(f"[remote_upload] Checking server status at {self.upload_endpoint}") - - # Translate host path to container path for API communication container_workspace_path = self._translate_to_container_path(self.workspace_path) response = self.session.get( f"{self.upload_endpoint}/api/v1/delta/status", params={'workspace_path': container_workspace_path}, - timeout=min(self.timeout, 10) # Use shorter timeout for status checks + timeout=min(self.timeout, 10) ) if response.status_code == 200: - status_data = response.json() - logger.debug(f"[remote_upload] Server status: {status_data}") - return status_data - else: - error_msg = f"Status check failed with HTTP {response.status_code}" - try: - error_detail = response.json() - error_detail_msg = error_detail.get('error', {}).get('message', 'Unknown error') - error_msg += f": {error_detail_msg}" - except: - error_msg += f": {response.text[:100]}" - - logger.warning(f"[remote_upload] {error_msg}") - return { - "success": False, - "error": { - "code": "STATUS_ERROR", - "message": error_msg, - "status_code": response.status_code - } - } - - except requests.exceptions.Timeout as e: - error_msg = f"Status check timeout after {min(self.timeout, 10)}s" - 
logger.warning(f"[remote_upload] {error_msg}: {e}") - return { - "success": False, - "error": { - "code": "STATUS_TIMEOUT", - "message": error_msg - } - } - except requests.exceptions.ConnectionError as e: - error_msg = f"Cannot connect to server at {self.upload_endpoint}" - logger.warning(f"[remote_upload] {error_msg}: {e}") - return { - "success": False, - "error": { - "code": "CONNECTION_ERROR", - "message": error_msg - } - } - except requests.exceptions.RequestException as e: - error_msg = f"Network error during status check: {str(e)}" - logger.warning(f"[remote_upload] {error_msg}") - return { - "success": False, - "error": { - "code": "NETWORK_ERROR", - "message": error_msg - } - } + return response.json() + + # Handle error response + error_msg = f"Status check failed with HTTP {response.status_code}" + try: + error_detail = response.json() + error_msg += f": {error_detail.get('error', {}).get('message', 'Unknown error')}" + except: + error_msg += f": {response.text[:100]}" + + return {"success": False, "error": {"code": "STATUS_ERROR", "message": error_msg}} + + except requests.exceptions.Timeout: + return {"success": False, "error": {"code": "STATUS_TIMEOUT", "message": "Status check timeout"}} + except requests.exceptions.ConnectionError: + return {"success": False, "error": {"code": "CONNECTION_ERROR", "message": f"Cannot connect to server"}} except Exception as e: - error_msg = f"Unexpected error during status check: {str(e)}" - logger.error(f"[remote_upload] {error_msg}") - return { - "success": False, - "error": { - "code": "STATUS_CHECK_ERROR", - "message": error_msg - } - } + return {"success": False, "error": {"code": "STATUS_CHECK_ERROR", "message": f"Status check error: {str(e)}"}} def has_meaningful_changes(self, changes: Dict[str, List]) -> bool: """Check if changes warrant a delta upload.""" @@ -1082,7 +965,7 @@ def main(): # Get configuration config = get_remote_config(args.path) - # Override config with command-line arguments if provided + # 
Override with command-line arguments if args.endpoint: config["upload_endpoint"] = args.endpoint if args.max_retries is not None: diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index c697b425..0162f0fa 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -214,50 +214,30 @@ def _translate_to_container_path(self, host_path: str) -> str: # Fallback: if path doesn't match expected pattern, use as-is return host_path - def __init__(self, - upload_endpoint: str, - workspace_path: str, - collection_name: str, - max_retries: int = 3, - timeout: int = 30, - metadata_path: Optional[str] = None): - """ - Initialize remote upload client. - - Args: - upload_endpoint: HTTP endpoint for delta uploads - workspace_path: Absolute path to workspace (where files are located) - collection_name: Target collection name - max_retries: Maximum number of upload retries - timeout: Request timeout in seconds - metadata_path: Absolute path to metadata directory (for delta bundles) - If None, uses workspace_path/.codebase/delta_bundles - """ + def __init__(self, upload_endpoint: str, workspace_path: str, collection_name: str, + max_retries: int = 3, timeout: int = 30, metadata_path: Optional[str] = None): + """Initialize remote upload client.""" self.upload_endpoint = upload_endpoint.rstrip('/') self.workspace_path = workspace_path self.collection_name = collection_name self.max_retries = max_retries self.timeout = timeout - - # Use temporary directory for bundle creation - CLI should be stateless - # Temporary bundles are cleaned up after upload self.temp_dir = None - self.bundle_dir = None # No persistent bundle directory in CLI mode - # Store repo name for cache operations - self.repo_name = _extract_repo_name_from_path(workspace_path) + # Set environment variables for cache functions + os.environ["WORKSPACE_PATH"] = workspace_path - # Initialize hash cache + # Store repo name and initialize hash cache + 
self.repo_name = _extract_repo_name_from_path(workspace_path) + # Fallback to directory name if repo detection fails (for non-git repos) + if not self.repo_name: + self.repo_name = Path(workspace_path).name global _hash_cache _hash_cache = SimpleHashCache(workspace_path, self.repo_name) - # Setup HTTP session with retry strategy + # Setup HTTP session with simple retry self.session = requests.Session() - retry_strategy = Retry( - total=max_retries, - backoff_factor=1, - status_forcelist=[429, 500, 502, 503, 504], - ) + retry_strategy = Retry(total=max_retries, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]) adapter = HTTPAdapter(max_retries=retry_strategy) self.session.mount("http://", adapter) self.session.mount("https://", adapter) @@ -297,11 +277,10 @@ def log_mapping_summary(self) -> None: """Log mapping summary for user visibility.""" info = self.get_mapping_summary() logger.info("[remote_upload] Collection mapping:") - logger.info(f" repo_name : {info['repo_name']}") - logger.info(f" collection_name : {info['collection_name']}") - logger.info(f" source_path : {info['source_path']}") - logger.info(f" container_path : {info['container_path']}") - logger.info("[remote_upload] To query remote state later, call the MCP `collection_map` tool.") + logger.info(f" repo_name: {info['repo_name']}") + logger.info(f" collection_name: {info['collection_name']}") + logger.info(f" source_path: {info['source_path']}") + logger.info(f" container_path: {info['container_path']}") def _get_temp_bundle_dir(self) -> Path: """Get or create temporary directory for bundle creation.""" @@ -648,37 +627,20 @@ def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, for attempt in range(self.max_retries + 1): try: - # Calculate backoff delay (exponential with jitter) + # Simple exponential backoff if attempt > 0: - base_delay = 2 ** (attempt - 1) # 1, 2, 4, 8... 
- jitter = base_delay * 0.1 * (0.5 + (hash(str(time.time())) % 100) / 100) - delay = min(base_delay + jitter, 30) # Cap at 30 seconds - logger.info(f"[remote_upload] Retry attempt {attempt + 1}/{self.max_retries + 1} after {delay:.2f}s delay") + delay = min(2 ** (attempt - 1), 30) # 1, 2, 4, 8... capped at 30s + logger.info(f"[remote_upload] Retry attempt {attempt + 1}/{self.max_retries + 1} after {delay}s delay") time.sleep(delay) - # Verify bundle exists before attempting upload + # Verify bundle exists if not os.path.exists(bundle_path): - return { - "success": False, - "error": { - "code": "BUNDLE_NOT_FOUND", - "message": f"Bundle file not found: {bundle_path}" - } - } + return {"success": False, "error": {"code": "BUNDLE_NOT_FOUND", "message": f"Bundle not found: {bundle_path}"}} - # Check bundle size + # Check bundle size (100MB limit) bundle_size = os.path.getsize(bundle_path) - max_size_mb = 100 # Default max size - max_size_bytes = max_size_mb * 1024 * 1024 - - if bundle_size > max_size_bytes: - return { - "success": False, - "error": { - "code": "BUNDLE_TOO_LARGE", - "message": f"Bundle size {bundle_size} bytes exceeds maximum {max_size_bytes} bytes" - } - } + if bundle_size > 100 * 1024 * 1024: + return {"success": False, "error": {"code": "BUNDLE_TOO_LARGE", "message": f"Bundle too large: {bundle_size} bytes"}} with open(bundle_path, 'rb') as bundle_file: files = { @@ -706,71 +668,39 @@ def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, result = response.json() logger.info(f"[remote_upload] Successfully uploaded bundle {manifest['bundle_id']}") return result - else: - error_msg = f"Upload failed with status {response.status_code}" - try: - error_detail = response.json() - error_detail_msg = error_detail.get('error', {}).get('message', 'Unknown error') - error_msg += f": {error_detail_msg}" - error_code = error_detail.get('error', {}).get('code', 'HTTP_ERROR') - except: - error_msg += f": {response.text[:200]}" # Truncate 
long responses - error_code = "HTTP_ERROR" - - last_error = { - "success": False, - "error": { - "code": error_code, - "message": error_msg, - "status_code": response.status_code - } - } - - # Don't retry on client errors (4xx) - if 400 <= response.status_code < 500 and response.status_code != 429: - logger.warning(f"[remote_upload] Client error {response.status_code}, not retrying: {error_msg}") - return last_error - - logger.warning(f"[remote_upload] Upload attempt {attempt + 1} failed: {error_msg}") + # Handle error + error_msg = f"Upload failed with status {response.status_code}" + try: + error_detail = response.json() + error_detail_msg = error_detail.get('error', {}).get('message', 'Unknown error') + error_msg += f": {error_detail_msg}" + error_code = error_detail.get('error', {}).get('code', 'HTTP_ERROR') + except: + error_msg += f": {response.text[:200]}" + error_code = "HTTP_ERROR" + + last_error = {"success": False, "error": {"code": error_code, "message": error_msg, "status_code": response.status_code}} + + # Don't retry on client errors (except 429) + if 400 <= response.status_code < 500 and response.status_code != 429: + return last_error + + logger.warning(f"[remote_upload] Upload attempt {attempt + 1} failed: {error_msg}") except requests.exceptions.Timeout as e: - last_error = { - "success": False, - "error": { - "code": "TIMEOUT_ERROR", - "message": f"Upload timeout after {self.timeout}s: {str(e)}" - } - } + last_error = {"success": False, "error": {"code": "TIMEOUT_ERROR", "message": f"Upload timeout: {str(e)}"}} logger.warning(f"[remote_upload] Upload timeout on attempt {attempt + 1}: {e}") except requests.exceptions.ConnectionError as e: - last_error = { - "success": False, - "error": { - "code": "CONNECTION_ERROR", - "message": f"Connection error during upload: {str(e)}" - } - } + last_error = {"success": False, "error": {"code": "CONNECTION_ERROR", "message": f"Connection error: {str(e)}"}} logger.warning(f"[remote_upload] Connection error on 
attempt {attempt + 1}: {e}") except requests.exceptions.RequestException as e: - last_error = { - "success": False, - "error": { - "code": "NETWORK_ERROR", - "message": f"Network error during upload: {str(e)}" - } - } + last_error = {"success": False, "error": {"code": "NETWORK_ERROR", "message": f"Network error: {str(e)}"}} logger.warning(f"[remote_upload] Network error on attempt {attempt + 1}: {e}") except Exception as e: - last_error = { - "success": False, - "error": { - "code": "UPLOAD_ERROR", - "message": f"Unexpected error during upload: {str(e)}" - } - } + last_error = {"success": False, "error": {"code": "UPLOAD_ERROR", "message": f"Upload error: {str(e)}"}} logger.error(f"[remote_upload] Unexpected error on attempt {attempt + 1}: {e}") # All retries exhausted @@ -784,82 +714,35 @@ def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, } def get_server_status(self) -> Dict[str, Any]: - """Get server status and last sequence number with enhanced error handling.""" + """Get server status with simplified error handling.""" try: - logger.debug(f"[remote_upload] Checking server status at {self.upload_endpoint}") - - # Translate host path to container path for API communication container_workspace_path = self._translate_to_container_path(self.workspace_path) response = self.session.get( f"{self.upload_endpoint}/api/v1/delta/status", params={'workspace_path': container_workspace_path}, - timeout=min(self.timeout, 10) # Use shorter timeout for status checks + timeout=min(self.timeout, 10) ) if response.status_code == 200: - status_data = response.json() - logger.debug(f"[remote_upload] Server status: {status_data}") - return status_data - else: - error_msg = f"Status check failed with HTTP {response.status_code}" - try: - error_detail = response.json() - error_detail_msg = error_detail.get('error', {}).get('message', 'Unknown error') - error_msg += f": {error_detail_msg}" - except: - error_msg += f": {response.text[:100]}" - - 
logger.warning(f"[remote_upload] {error_msg}") - return { - "success": False, - "error": { - "code": "STATUS_ERROR", - "message": error_msg, - "status_code": response.status_code - } - } - - except requests.exceptions.Timeout as e: - error_msg = f"Status check timeout after {min(self.timeout, 10)}s" - logger.warning(f"[remote_upload] {error_msg}: {e}") - return { - "success": False, - "error": { - "code": "STATUS_TIMEOUT", - "message": error_msg - } - } - except requests.exceptions.ConnectionError as e: - error_msg = f"Cannot connect to server at {self.upload_endpoint}" - logger.warning(f"[remote_upload] {error_msg}: {e}") - return { - "success": False, - "error": { - "code": "CONNECTION_ERROR", - "message": error_msg - } - } - except requests.exceptions.RequestException as e: - error_msg = f"Network error during status check: {str(e)}" - logger.warning(f"[remote_upload] {error_msg}") - return { - "success": False, - "error": { - "code": "NETWORK_ERROR", - "message": error_msg - } - } + return response.json() + + # Handle error response + error_msg = f"Status check failed with HTTP {response.status_code}" + try: + error_detail = response.json() + error_msg += f": {error_detail.get('error', {}).get('message', 'Unknown error')}" + except: + error_msg += f": {response.text[:100]}" + + return {"success": False, "error": {"code": "STATUS_ERROR", "message": error_msg}} + + except requests.exceptions.Timeout: + return {"success": False, "error": {"code": "STATUS_TIMEOUT", "message": "Status check timeout"}} + except requests.exceptions.ConnectionError: + return {"success": False, "error": {"code": "CONNECTION_ERROR", "message": f"Cannot connect to server"}} except Exception as e: - error_msg = f"Unexpected error during status check: {str(e)}" - logger.error(f"[remote_upload] {error_msg}") - return { - "success": False, - "error": { - "code": "STATUS_CHECK_ERROR", - "message": error_msg - } - } + return {"success": False, "error": {"code": "STATUS_CHECK_ERROR", "message": 
f"Status check error: {str(e)}"}} def has_meaningful_changes(self, changes: Dict[str, List]) -> bool: """Check if changes warrant a delta upload.""" From 5845cf1bf74bd3593a6a6db7c09e3db256f34282 Mon Sep 17 00:00:00 2001 From: Reese Date: Wed, 12 Nov 2025 13:04:41 +0000 Subject: [PATCH 13/16] feat(memory): add memory backup and restore utilities Add comprehensive utilities for backing up and restoring memories (non-code points) from Qdrant collections. The backup utility exports user-added notes and context to JSON with optional vector embeddings, while the restore utility can import these backups to existing or new collections with support for re-embedding when vectors are not included in the backup. Both tools provide batch processing, CLI interfaces, and robust error handling for production use. --- scripts/memory_backup.py | 319 ++++++++++++++++++++++++++++++++ scripts/memory_restore.py | 379 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 698 insertions(+) create mode 100644 scripts/memory_backup.py create mode 100644 scripts/memory_restore.py diff --git a/scripts/memory_backup.py b/scripts/memory_backup.py new file mode 100644 index 00000000..410ed90a --- /dev/null +++ b/scripts/memory_backup.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 +""" +Memory Backup Utility for Qdrant Collections + +Exports memories (non-code points) from Qdrant collections to JSON for backup purposes. +Memories are identified as points without file path metadata - typically user-added notes, +context, or other information that's not tied to specific code files. 
+ +Usage: + python scripts/memory_backup.py --collection test-repo-58ecbbc8 --output memories_backup.json + python scripts/memory_backup.py --collection test-repo-58ecbbc8 --output memories_backup_$(date +%Y%m%d).json +""" + +import os +import sys +import json +import argparse +from datetime import datetime +from typing import List, Dict, Any, Optional +from pathlib import Path + +# Add project root to path for imports +ROOT_DIR = Path(__file__).resolve().parent.parent +if str(ROOT_DIR) not in sys.path: + sys.path.insert(0, str(ROOT_DIR)) + +try: + from qdrant_client import QdrantClient + from qdrant_client.models import Filter, FieldCondition, MatchValue +except ImportError: + print("ERROR: qdrant-client not installed. Install with: pip install qdrant-client") + sys.exit(1) + + +def get_qdrant_client() -> QdrantClient: + """Initialize Qdrant client with environment configuration.""" + qdrant_url = os.environ.get("QDRANT_URL", "http://localhost:6333") + api_key = os.environ.get("QDRANT_API_KEY") + + return QdrantClient(url=qdrant_url, api_key=api_key or None) + + +def is_memory_point(payload: Dict[str, Any]) -> bool: + """ + Determine if a point is a memory (user-added) rather than code-indexed content. 
+ + Memory points typically: + - Have no 'path' in metadata (not tied to a file) + - May have 'source' set to 'memory' + - Have 'content' field that's not extracted from code + + Args: + payload: Point payload from Qdrant + + Returns: + True if this appears to be a memory point, False if it's code content + """ + if not payload: + return False + + metadata = payload.get("metadata", {}) + + # Primary indicator: no file path means it's likely a memory + if not metadata.get("path"): + return True + + # Secondary indicator: explicit source marking + if metadata.get("source") == "memory": + return True + + # Tertiary: content-based heuristics + content = payload.get("information", "") + if content and not metadata.get("language") and not metadata.get("kind"): + # Content without language/kind metadata is likely user-added + return True + + return False + + +def export_memories( + collection_name: str, + output_file: str, + client: Optional[QdrantClient] = None, + include_vectors: bool = True, + batch_size: int = 1000 +) -> Dict[str, Any]: + """ + Export memories from a Qdrant collection to JSON. 
+ + Args: + collection_name: Qdrant collection name + output_file: Output JSON file path + client: Qdrant client instance (will create if None) + include_vectors: Whether to include vector embeddings in backup + batch_size: Number of points to fetch per request + + Returns: + Dict with backup statistics + """ + if client is None: + client = get_qdrant_client() + + # Verify collection exists + try: + collections = client.get_collections().collections + if collection_name not in [c.name for c in collections]: + raise ValueError(f"Collection '{collection_name}' not found") + except Exception as e: + raise RuntimeError(f"Failed to access Qdrant: {e}") + + print(f"Exporting memories from collection: {collection_name}") + print(f"Output file: {output_file}") + + # Get all points from collection + all_points = [] + total_count = 0 + memory_count = 0 + + # Use scroll to get all points efficiently + next_page_offset = None + while True: + points, next_page_offset = client.scroll( + collection_name=collection_name, + offset=next_page_offset, + limit=batch_size, + with_payload=True, + with_vectors=include_vectors + ) + + if not points: + break + + all_points.extend(points) + total_count += len(points) + + # Filter for memory points + memory_points = [] + for point in points: + if is_memory_point(point.payload or {}): + memory_points.append(point) + memory_count += 1 + + print(f"Fetched {len(points)} points (total: {total_count}), found {len(memory_points)} memories (total: {memory_count})") + + if next_page_offset is None: + break + + if memory_count == 0: + print("No memories found in collection!") + return { + "collection": collection_name, + "total_points": total_count, + "memory_count": 0, + "backup_file": output_file, + "success": True + } + + # Prepare backup data + backup_data = { + "backup_info": { + "collection_name": collection_name, + "export_date": datetime.now().isoformat(), + "total_points_exported": total_count, + "memory_points_found": memory_count, + 
"include_vectors": include_vectors, + "vector_dimension": None # Will be set if vectors included + }, + "memories": [] + } + + # Process memory points + for point in all_points: + if not is_memory_point(point.payload or {}): + continue + + payload = point.payload or {} + memory_entry = { + "id": str(point.id), + "content": payload.get("information", ""), + "metadata": payload.get("metadata", {}), + } + + # Include vector if requested + if include_vectors and point.vector: + if hasattr(point.vector, 'tolist'): + memory_entry["vector"] = point.vector.tolist() + else: + memory_entry["vector"] = point.vector + + # Set vector dimension from first memory + if backup_data["backup_info"]["vector_dimension"] is None: + vector_data = memory_entry["vector"] + if isinstance(vector_data, dict): + # Named vector format: {"memory": [values]} + first_vector = next(iter(vector_data.values())) + backup_data["backup_info"]["vector_dimension"] = len(first_vector) + else: + # Direct vector list format + backup_data["backup_info"]["vector_dimension"] = len(vector_data) + + backup_data["memories"].append(memory_entry) + + # Write backup file + output_path = Path(output_file) + output_path.parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, 'w') as f: + json.dump(backup_data, f, indent=2) + + print(f"✅ Backup completed successfully!") + print(f" Total points processed: {total_count}") + print(f" Memory points exported: {memory_count}") + print(f" Backup file: {output_path}") + print(f" File size: {output_path.stat().st_size / 1024:.1f} KB") + + return { + "collection": collection_name, + "total_points": total_count, + "memory_count": memory_count, + "backup_file": str(output_path), + "file_size": output_path.stat().st_size, + "success": True + } + + +def list_collections() -> None: + """List all available Qdrant collections.""" + client = get_qdrant_client() + + try: + collections = client.get_collections().collections + print("Available collections:") + for collection in 
collections: + info = client.get_collection(collection.name) + point_count = info.points_count + print(f" - {collection.name} ({point_count:,} points)") + except Exception as e: + print(f"Error listing collections: {e}") + + +def main(): + parser = argparse.ArgumentParser( + description="Backup memories (non-code points) from Qdrant collections", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s --collection test-repo-58ecbbc8 --output memories_backup.json + %(prog)s --list-collections + %(prog)s --collection test-repo-58ecbbc8 --output backup_$(date +%Y%m%d_%H%M%S).json --no-vectors + """ + ) + + parser.add_argument( + "--collection", "-c", + required=False, + help="Qdrant collection name to backup memories from" + ) + + parser.add_argument( + "--output", "-o", + help="Output JSON file path for backup" + ) + + parser.add_argument( + "--list-collections", "-l", + action="store_true", + help="List all available collections" + ) + + parser.add_argument( + "--no-vectors", + action="store_true", + help="Don't include vector embeddings in backup (smaller file, requires re-embedding)" + ) + + parser.add_argument( + "--batch-size", + type=int, + default=1000, + help="Number of points to fetch per request (default: 1000)" + ) + + args = parser.parse_args() + + if args.list_collections: + list_collections() + return + + if not args.collection: + parser.error("--collection required unless using --list-collections") + + if not args.output: + # Generate default filename with timestamp + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + args.output = f"{args.collection}_memories_{timestamp}.json" + + try: + result = export_memories( + collection_name=args.collection, + output_file=args.output, + include_vectors=not args.no_vectors, + batch_size=args.batch_size + ) + + if result["success"]: + print(f"\n🎉 Memory backup completed successfully!") + if result["memory_count"] == 0: + print(" (No memories found to backup)") + else: + 
print(f"\n❌ Memory backup failed!") + sys.exit(1) + + except Exception as e: + print(f"\n❌ Error during backup: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/memory_restore.py b/scripts/memory_restore.py new file mode 100644 index 00000000..cacddeda --- /dev/null +++ b/scripts/memory_restore.py @@ -0,0 +1,379 @@ +#!/usr/bin/env python3 +""" +Memory Restore Utility for Qdrant Collections + +Imports previously backed up memories into Qdrant collections. +Can restore to existing collections (append) or new ones. +Supports re-embedding memories if vectors were not included in backup. + +Usage: + python scripts/memory_restore.py --backup memories_backup.json --collection test-repo-58ecbbc8 + python scripts/memory_restore.py --backup memories_backup.json --collection new-test-repo --embedding-model BAAI/bge-large-en-v1.5 + python scripts/memory_restore.py --backup memories_backup.json --collection new-collection --new-collection +""" + +import os +import sys +import json +import argparse +from datetime import datetime +from typing import List, Dict, Any, Optional +from pathlib import Path + +# Add project root to path for imports +ROOT_DIR = Path(__file__).resolve().parent.parent +if str(ROOT_DIR) not in sys.path: + sys.path.insert(0, str(ROOT_DIR)) + +try: + from qdrant_client import QdrantClient + from qdrant_client.models import VectorParams, Distance + from fastembed import TextEmbedding +except ImportError as e: + print(f"ERROR: Missing required dependency: {e}") + print("Install with: pip install qdrant-client fastembed") + sys.exit(1) + + +def get_qdrant_client() -> QdrantClient: + """Initialize Qdrant client with environment configuration.""" + qdrant_url = os.environ.get("QDRANT_URL", "http://localhost:6333") + api_key = os.environ.get("QDRANT_API_KEY") + + return QdrantClient(url=qdrant_url, api_key=api_key or None) + + +def get_embedding_model(model_name: str): + """Initialize embedding model with 
the given name.""" + try: + return TextEmbedding(model_name=model_name) + except Exception as e: + raise RuntimeError(f"Failed to load embedding model '{model_name}': {e}") + + +def ensure_collection_exists( + client: QdrantClient, + collection_name: str, + vector_dimension: int, + vector_name: str = "memory" +) -> None: + """ + Ensure the target collection exists with appropriate vector configuration. + + Args: + client: Qdrant client instance + collection_name: Collection name + vector_dimension: Vector dimensions for memories + vector_name: Name for the memory vector + """ + try: + # Check if collection exists + collections = client.get_collections().collections + if collection_name in [c.name for c in collections]: + print(f"Collection '{collection_name}' already exists") + return + except Exception as e: + print(f"Warning: Could not check collection existence: {e}") + + # Create collection with memory vector + try: + client.create_collection( + collection_name=collection_name, + vectors_config={ + vector_name: VectorParams( + size=vector_dimension, + distance=Distance.COSINE + ) + } + ) + print(f"✅ Created collection '{collection_name}' with {vector_dimension}-dim vectors") + except Exception as e: + raise RuntimeError(f"Failed to create collection '{collection_name}': {e}") + + +def restore_memories( + backup_file: str, + collection_name: str, + client: Optional[QdrantClient] = None, + embedding_model_name: Optional[str] = None, + vector_name: str = "memory", + batch_size: int = 100, + skip_existing: bool = True +) -> Dict[str, Any]: + """ + Restore memories from backup file to Qdrant collection. 
+ + Args: + backup_file: Path to backup JSON file + collection_name: Target collection name + client: Qdrant client instance (will create if None) + embedding_model_name: Model name for re-embedding (if vectors not in backup) + vector_name: Name for the memory vector in collection + batch_size: Number of memories to upload per batch + skip_existing: Skip memories that already exist in collection + + Returns: + Dict with restore statistics + """ + if client is None: + client = get_qdrant_client() + + # Load backup file + backup_path = Path(backup_file) + if not backup_path.exists(): + raise FileNotFoundError(f"Backup file not found: {backup_file}") + + try: + with open(backup_path, 'r') as f: + backup_data = json.load(f) + except Exception as e: + raise ValueError(f"Invalid backup file format: {e}") + + # Validate backup structure + if "memories" not in backup_data: + raise ValueError("Invalid backup file: missing 'memories' section") + + memories = backup_data["memories"] + backup_info = backup_data.get("backup_info", {}) + + print(f"Restoring memories from: {backup_file}") + print(f"Target collection: {collection_name}") + print(f"Memories in backup: {len(memories)}") + + if backup_info: + print(f"Original collection: {backup_info.get('collection_name', 'unknown')}") + print(f"Backup date: {backup_info.get('export_date', 'unknown')}") + print(f"Vector dimension: {backup_info.get('vector_dimension', 'unknown')}") + + # Determine vector configuration + vectors_included = backup_info.get("include_vectors", True) and memories and "vector" in memories[0] + + if not vectors_included: + if not embedding_model_name: + # Use default model + embedding_model_name = os.environ.get("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5") + + print(f"Vectors not included in backup, will re-embed with: {embedding_model_name}") + embedding_model = get_embedding_model(embedding_model_name) + + # Get vector dimension from model + test_vector = next(embedding_model.embed(["test"])).tolist() + 
vector_dimension = len(test_vector) + print(f"Embedding model vector dimension: {vector_dimension}") + else: + # Use dimension from backup + vector_dimension = backup_info.get("vector_dimension", len(memories[0]["vector"])) + embedding_model = None + print(f"Using vectors from backup, dimension: {vector_dimension}") + + # Ensure collection exists + ensure_collection_exists(client, collection_name, vector_dimension, vector_name) + + # Check for existing memories if skip_existing is True + existing_ids = set() + if skip_existing: + try: + # Get all existing point IDs + all_points, _ = client.scroll( + collection_name=collection_name, + limit=None, + with_payload=False, + with_vectors=False + ) + existing_ids = {str(point.id) for point in all_points} + print(f"Found {len(existing_ids)} existing points in collection") + except Exception as e: + print(f"Warning: Could not check existing points: {e}") + skip_existing = False + + # Process and upload memories in batches + restored_count = 0 + skipped_count = 0 + error_count = 0 + + for i in range(0, len(memories), batch_size): + batch = memories[i:i + batch_size] + batch_points = [] + + for memory in batch: + memory_id = memory.get("id", "") + + # Skip if already exists + if skip_existing and memory_id in existing_ids: + skipped_count += 1 + continue + + try: + # Prepare vector + if vectors_included: + vector = memory.get("vector") + if not vector: + raise ValueError("Memory missing vector data") + # Vector from backup is already in the correct format: {"memory": [values]} + else: + # Re-embed content + content = memory.get("content", "") + if not content: + raise ValueError("Memory missing content for embedding") + + vector = next(embedding_model.embed([content])).tolist() + # For re-embedded vectors, we need to structure them with the vector name + vector = {vector_name: vector} + + # Prepare point data + point_data = { + "id": memory_id, + "vector": vector, + "payload": { + "information": memory.get("content", ""), + 
"metadata": memory.get("metadata", {}) + } + } + + batch_points.append(point_data) + + except Exception as e: + print(f"Error processing memory {memory_id}: {e}") + error_count += 1 + continue + + # Upload batch + if batch_points: + try: + client.upsert(collection_name=collection_name, points=batch_points) + restored_count += len(batch_points) + print(f" Uploaded batch {i//batch_size + 1}: +{len(batch_points)} memories (total: {restored_count})") + except Exception as e: + print(f"Error uploading batch {i//batch_size + 1}: {e}") + error_count += len(batch_points) + + # Final statistics + print(f"\n✅ Memory restore completed!") + print(f" Total memories in backup: {len(memories)}") + print(f" Successfully restored: {restored_count}") + print(f" Skipped (already exists): {skipped_count}") + print(f" Errors: {error_count}") + print(f" Target collection: {collection_name}") + + # Verify final count + try: + final_count = client.count(collection_name).count + print(f" Final collection size: {final_count:,} points") + except Exception as e: + print(f" Warning: Could not get final count: {e}") + + return { + "collection": collection_name, + "backup_file": backup_file, + "total_memories": len(memories), + "restored": restored_count, + "skipped": skipped_count, + "errors": error_count, + "success": True + } + + +def main(): + parser = argparse.ArgumentParser( + description="Restore memories from backup to Qdrant collections", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s --backup memories_backup.json --collection test-repo-58ecbbc8 + %(prog)s --backup memories_backup.json --collection new-test-repo --embedding-model BAAI/bge-large-en-v1.5 + %(prog)s --backup memories_backup.json --collection new-collection --new-collection --no-skip-existing + """ + ) + + parser.add_argument( + "--backup", "-b", + required=True, + help="Path to backup JSON file" + ) + + parser.add_argument( + "--collection", "-c", + required=True, + help="Target 
Qdrant collection name" + ) + + parser.add_argument( + "--embedding-model", "-m", + help="Embedding model for re-embedding (if vectors not in backup)" + ) + + parser.add_argument( + "--vector-name", + default="memory", + help="Name for the memory vector in collection (default: memory)" + ) + + parser.add_argument( + "--batch-size", + type=int, + default=100, + help="Number of memories to upload per batch (default: 100)" + ) + + parser.add_argument( + "--no-skip-existing", + action="store_true", + help="Don't skip memories that already exist in collection" + ) + + parser.add_argument( + "--list-backup-info", + action="store_true", + help="Show backup file information without restoring" + ) + + args = parser.parse_args() + + try: + # Load backup to show info + with open(args.backup, 'r') as f: + backup_data = json.load(f) + + if args.list_backup_info: + print("Backup Information:") + print("=" * 50) + backup_info = backup_data.get("backup_info", {}) + for key, value in backup_info.items(): + print(f" {key}: {value}") + + memories = backup_data.get("memories", []) + print(f" Memory count: {len(memories)}") + + if memories: + sample = memories[0] + has_vector = "vector" in sample + print(f" Has vectors: {has_vector}") + if has_vector: + vector_dim = len(sample["vector"]) + print(f" Vector dimension: {vector_dim}") + + return + + # Restore memories + result = restore_memories( + backup_file=args.backup, + collection_name=args.collection, + embedding_model_name=args.embedding_model, + vector_name=args.vector_name, + batch_size=args.batch_size, + skip_existing=not args.no_skip_existing + ) + + if result["success"]: + print(f"\n🎉 Memory restoration completed successfully!") + else: + print(f"\n❌ Memory restoration failed!") + sys.exit(1) + + except Exception as e: + print(f"\n❌ Error during restoration: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() \ No newline at end of file From d0ee44e04a852532bce98fa1fca0cc2e027c122d Mon Sep 17 00:00:00 2001 From: Reese 
Date: Wed, 12 Nov 2025 13:08:37 +0000 Subject: [PATCH 14/16] docs: update README with collection mapping and naming strategies Add documentation for new collection mapping features and detailed explanation of collection naming strategies for local workspaces versus remote uploads. Includes information about collision avoidance and hash lengths used for different workspace types. --- README.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d0929b7b..205fb817 100644 --- a/README.md +++ b/README.md @@ -345,6 +345,8 @@ Indexer/Search MCP (8001 SSE, 8003 RMCP): - search_callers_for — intent wrapper for probable callers/usages - search_importers_for — intent wrapper for files importing a module/symbol - change_history_for_path(path) — summarize recent changes using stored metadata +- collection_map - return collection↔repo mappings +- default_collection - set the collection to use for the session Notes: - Most search tools accept filters like language, under, path_glob, kind, symbol, ext. @@ -502,11 +504,25 @@ For production-grade backup/migration strategies, see the official Qdrant docume Operational notes: - Collection name comes from `COLLECTION_NAME` (see .env). This stack defaults to a single collection for both code and memories; filtering uses `metadata.kind`. -- If you switch to a dedicated memory collection, update the MCP Memory server and the Indexer’s memory blending env to point at it. +- If you switch to a dedicated memory collection, update the MCP Memory server and the Indexer's memory blending env to point at it. - Consider pruning expired memories by filtering `expires_at < now`. - Call `context_search` on :8001 (SSE) or :8003 (RMCP) with `{ "include_memories": true }` to return both memory and code results. 
+### Collection Naming Strategies + +Different hash lengths are used for different workspace types: + +**Local Workspaces:** `repo-name-8charhash` +- Example: `Anesidara-e8d0f5fc` +- Used by local indexer/watcher +- Assumes unique repo names within workspace + +**Remote Uploads:** `folder-name-16charhash-8charhash` +- Example: `testupload2-04e680d5939dd035-b8b8d4cc` +- Collision avoidance for duplicate folder names for different codebases +- 16-char hash identifies workspace, 8-char hash identifies collection + ### Enable memory blending (for context_search) From 5888b2ebbb49e30c67801686e3a60a0b39533bd9 Mon Sep 17 00:00:00 2001 From: Reese Date: Sat, 15 Nov 2025 06:12:08 +0000 Subject: [PATCH 15/16] chore: drop remote upload env gate - remove REMOTE_UPLOAD_ENABLED guard from standalone_upload_client - do the same for remote_upload_client so both run without extra env setup --- scripts/remote_upload_client.py | 13 ------------- scripts/standalone_upload_client.py | 13 ------------- 2 files changed, 26 deletions(-) diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 39337bc0..aac98034 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -842,14 +842,6 @@ def process_and_upload_changes(self, changed_paths: List[Path]) -> bool: logger.exception("[remote_upload] Full traceback:") return False - # CLI is stateless - sequence mismatch handling is done by server - - -def is_remote_mode_enabled() -> bool: - """Check if remote upload mode is enabled via environment variables.""" - return os.environ.get("REMOTE_UPLOAD_ENABLED", "").lower() in {"1", "true", "yes", "on"} - - def get_remote_config(cli_path: Optional[str] = None) -> Dict[str, str]: """Get remote upload configuration from environment variables and command-line arguments.""" # Use command-line path if provided, otherwise fall back to environment variables @@ -988,11 +980,6 @@ def main(): client.log_mapping_summary() return 0 - # Check if remote mode 
is enabled - if not is_remote_mode_enabled(): - logger.error("Remote upload mode is not enabled. Set REMOTE_UPLOAD_ENABLED=1 in environment variables.") - return 1 - # Handle watch mode if args.watch: logger.info("Starting watch mode for continuous file monitoring") diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index 0162f0fa..76a42432 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -995,14 +995,6 @@ def process_and_upload_changes(self, changed_paths: List[Path]) -> bool: logger.exception("[remote_upload] Full traceback:") return False - # CLI is stateless - sequence mismatch handling is done by server - - -def is_remote_mode_enabled() -> bool: - """Check if remote upload mode is enabled via environment variables.""" - return os.environ.get("REMOTE_UPLOAD_ENABLED", "").lower() in {"1", "true", "yes", "on"} - - def get_remote_config(cli_path: Optional[str] = None) -> Dict[str, str]: """Get remote upload configuration from environment variables and command-line arguments.""" # Use command-line path if provided, otherwise fall back to environment variables @@ -1135,11 +1127,6 @@ def main(): client.log_mapping_summary() return 0 - # Check if remote mode is enabled - if not is_remote_mode_enabled(): - logger.error("Remote upload mode is not enabled. 
Set REMOTE_UPLOAD_ENABLED=1 in environment variables.") - return 1 - # Handle watch mode if args.watch: logger.info("Starting watch mode for continuous file monitoring") From b45b582ba1edbc9457eaaaa6bf63f6fea565894e Mon Sep 17 00:00:00 2001 From: Reese Date: Sat, 15 Nov 2025 06:54:49 +0000 Subject: [PATCH 16/16] Remove claude workflow for upstream --- .github/workflows/claude.yaml | 68 ----------------------------------- 1 file changed, 68 deletions(-) delete mode 100644 .github/workflows/claude.yaml diff --git a/.github/workflows/claude.yaml b/.github/workflows/claude.yaml deleted file mode 100644 index 732de78c..00000000 --- a/.github/workflows/claude.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: Claude Code - -on: - issue_comment: - types: [created] - pull_request_review_comment: - types: [created] - issues: - types: [opened] - pull_request_review: - types: [submitted] - pull_request_target: - types: [opened, synchronize] - -jobs: - claude: - # This simplified condition is more robust and correctly checks permissions. - if: > - (contains(github.event.comment.body, '@claude') || - contains(github.event.review.body, '@claude') || - contains(github.event.issue.body, '@claude') || - contains(github.event.pull_request.body, '@claude')) && - (github.event.sender.type == 'User' && ( - github.event.comment.author_association == 'OWNER' || - github.event.comment.author_association == 'MEMBER' || - github.event.comment.author_association == 'COLLABORATOR' - )) - runs-on: ubuntu-latest - permissions: - # CRITICAL: Write permissions are required for the action to push branches and update issues/PRs. - contents: write - pull-requests: write - issues: write - id-token: write # Required for OIDC token exchange - actions: read # Required for Claude to read CI results on PRs - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - # This correctly checks out the PR's head commit for pull_request_target events. 
- ref: ${{ github.event.pull_request.head.sha }} - - - name: Create Claude settings file - run: | - mkdir -p /home/runner/.claude - cat > /home/runner/.claude/settings.json << 'EOF' - { - "env": { - "ANTHROPIC_BASE_URL": "https://api.z.ai/api/anthropic", - "ANTHROPIC_AUTH_TOKEN": "${{ secrets.CUSTOM_ENDPOINT_API_KEY }}" - } - } - EOF - - - name: Run Claude Code - id: claude - uses: anthropics/claude-code-action@v1 - with: - # Still need this to satisfy the action's validation - anthropic_api_key: ${{ secrets.CUSTOM_ENDPOINT_API_KEY }} - - # Use the same variable names as your local setup - settings: '{"env": {"ANTHROPIC_BASE_URL": "https://api.z.ai/api/anthropic", "ANTHROPIC_AUTH_TOKEN": "${{ secrets.CUSTOM_ENDPOINT_API_KEY }}"}}' - - track_progress: true - claude_args: | - --allowedTools "Bash,Edit,Read,Write,Glob,Grep"