diff --git a/.codebase/.gitkeep b/.codebase/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/.env.example b/.env.example index 52f6b595..3da5e865 100644 --- a/.env.example +++ b/.env.example @@ -7,6 +7,12 @@ QDRANT_URL=http://localhost:6333 # Multi-repo: Each subdirectory gets its own collection MULTI_REPO_MODE=0 +# Logical repo reuse (experimental): 0=disabled (default), 1=enable logical_repo_id-based +# collection reuse across git worktrees / clones. When enabled, indexer, watcher, and +# upload service will try to reuse a canonical collection per logical repository and +# use (repo_id + repo_rel_path) for skip-unchanged across worktrees. +#LOGICAL_REPO_REUSE=0 + # Single unified collection for seamless cross-repo search (default: "codebase") # Leave unset or use "codebase" for unified search across all your code COLLECTION_NAME=codebase @@ -153,6 +159,8 @@ QDRANT_TIMEOUT=20 MEMORY_AUTODETECT=1 MEMORY_COLLECTION_TTL_SECS=300 +# Smarter re-indexing for symbol cache, reuse embeddings and reduce decoder/pseudo tags to re-index +SMART_SYMBOL_REINDEXING=0 # Watcher-safe defaults (recommended) # Applied to watcher via compose; uncomment to apply globally. @@ -166,3 +174,14 @@ MEMORY_COLLECTION_TTL_SECS=300 # INDEX_UPSERT_BACKOFF=0.5 # Debounce file events to coalesce bursts # WATCH_DEBOUNCE_SECS=1.5 + +# Remote upload git history (used by upload clients) +# Max number of commits to include per bundle (0 disables git history) +# REMOTE_UPLOAD_GIT_MAX_COMMITS=500 +# Optional git log since filter, e.g. 
'6 months ago' or '2024-01-01' +# REMOTE_UPLOAD_GIT_SINCE= +# Enable commit lineage goals for indexing +REFRAG_COMMIT_DESCRIBE=1 + +STRICT_MEMORY_RESTORE=0 + diff --git a/.gitignore b/.gitignore index 0ddd7594..4c548379 100644 --- a/.gitignore +++ b/.gitignore @@ -17,7 +17,8 @@ __pycache__/ paper.md /semantic-search /codebase-index-cli -/.codebase +/.codebase/* +!/.codebase/.gitkeep scripts/.codebase/cache.json scripts/.codebase/state.json tests/.codebase/cache.json @@ -29,4 +30,5 @@ tests/.codebase/state.json CLAUDE.md .qodo/.cursor/rules /.augment -/dev-workspace +/dev-workspace/* +!/dev-workspace/.gitkeep diff --git a/README.md b/README.md index 96106fdf..c6150e29 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ [![CI](https://github.com/m1rl0k/Context-Engine/actions/workflows/ci.yml/badge.svg)](https://github.com/m1rl0k/Context-Engine/actions/workflows/ci.yml) -**Documentation:** README · [Configuration](docs/CONFIGURATION.md) · [IDE Clients](docs/IDE_CLIENTS.md) · [MCP API](docs/MCP_API.md) · [ctx CLI](docs/CTX_CLI.md) · [Memory Guide](docs/MEMORY_GUIDE.md) · [Architecture](docs/ARCHITECTURE.md) · [Multi-Repo](docs/MULTI_REPO_COLLECTIONS.md) · [Kubernetes](deploy/kubernetes/README.md) · [VS Code Extension](docs/vscode-extension.md) · [Troubleshooting](docs/TROUBLESHOOTING.md) · [Development](docs/DEVELOPMENT.md) +**Documentation:** [Getting Started](docs/GETTING_STARTED.md) · README · [Configuration](docs/CONFIGURATION.md) · [IDE Clients](docs/IDE_CLIENTS.md) · [MCP API](docs/MCP_API.md) · [ctx CLI](docs/CTX_CLI.md) · [Memory Guide](docs/MEMORY_GUIDE.md) · [Architecture](docs/ARCHITECTURE.md) · [Multi-Repo](docs/MULTI_REPO_COLLECTIONS.md) · [Kubernetes](deploy/kubernetes/README.md) · [VS Code Extension](docs/vscode-extension.md) · [Troubleshooting](docs/TROUBLESHOOTING.md) · [Development](docs/DEVELOPMENT.md) --- @@ -38,13 +38,19 @@ Context-Engine is a plug-and-play MCP retrieval stack that unifies code indexing | OpenAI Codex | RMCP | TOML config | | 
Augment | SSE | Simple JSON configs | | AmpCode | SSE | Simple URL for SSE endpoints | -| Claude Code CLI | SSE | Simple JSON configs | +| Claude Code CLI | SSE / HTTP (RMCP) | Simple JSON configs via .mcp.json | > **See [docs/IDE_CLIENTS.md](docs/IDE_CLIENTS.md) for detailed configuration examples.** ## Getting Started +If you're a VS Code user trying Context-Engine locally, start with the low-friction dev-remote + extension guide: + +- **[docs/GETTING_STARTED.md](docs/GETTING_STARTED.md)** + +The options below describe the `docker compose` + CLI workflows. + ### Option 1: Deploy & Connect (Recommended) Deploy Context-Engine once, connect any IDE. No need to clone this repo into your project. diff --git a/ctx-hook-simple.sh b/ctx-hook-simple.sh index 1d4cdb0e..e6c07bd4 100755 --- a/ctx-hook-simple.sh +++ b/ctx-hook-simple.sh @@ -105,16 +105,17 @@ fi # Read all settings from ctx_config.json if [ -n "$CONFIG_FILE" ] && [ -f "$CONFIG_FILE" ]; then - CTX_COLLECTION=$(grep -o '"default_collection"[[:space:]]*:[[:space:]]*"[^"]*"' "$CONFIG_FILE" | sed 's/.*"default_collection"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/' ) - REFRAG_RUNTIME=$(grep -o '"refrag_runtime"[[:space:]]*:[[:space:]]*"[^"]*"' "$CONFIG_FILE" | sed 's/.*"refrag_runtime"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/' || echo "glm") - GLM_API_KEY=$(grep -o '"glm_api_key"[[:space:]]*:[[:space:]]*"[^"]*"' "$CONFIG_FILE" | sed 's/.*"glm_api_key"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/' ) - GLM_API_BASE=$(grep -o '"glm_api_base"[[:space:]]*:[[:space:]]*"[^"]*"' "$CONFIG_FILE" | sed 's/.*"glm_api_base"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/') - GLM_MODEL=$(grep -o '"glm_model"[[:space:]]*:[[:space:]]*"[^\"]*"' "$CONFIG_FILE" | sed 's/.*"glm_model"[[:space:]]*:[[:space:]]*"\([^\"]*\)".*/\1/' || echo "glm-4.6") - CTX_DEFAULT_MODE=$(grep -o '"default_mode"[[:space:]]*:[[:space:]]*"[^\"]*"' "$CONFIG_FILE" | sed 's/.*"default_mode"[[:space:]]*:[[:space:]]*"\([^\"]*\)".*/\1/') - CTX_REQUIRE_CONTEXT=$(grep -o 
'"require_context"[[:space:]]*:[[:space:]]*\(true\|false\)' "$CONFIG_FILE" | sed 's/.*"require_context"[[:space:]]*:[[:space:]]*\(true\|false\).*/\1/') - CTX_RELEVANCE_GATE=$(grep -o '"relevance_gate_enabled"[[:space:]]*:[[:space:]]*\(true\|false\)' "$CONFIG_FILE" | sed 's/.*"relevance_gate_enabled"[[:space:]]*:[[:space:]]*\(true\|false\).*/\1/') - CTX_MIN_RELEVANCE=$(grep -o '"min_relevance"[[:space:]]*:[[:space:]]*[0-9.][0-9.]*' "$CONFIG_FILE" | sed 's/.*"min_relevance"[[:space:]]*:[[:space:]]*\([0-9.][0-9.]*\).*/\1/') - CTX_REWRITE_MAX_TOKENS=$(grep -o '"rewrite_max_tokens"[[:space:]]*:[[:space:]]*[0-9][0-9]*' "$CONFIG_FILE" | sed 's/.*"rewrite_max_tokens"[[:space:]]*:[[:space:]]*\([0-9][0-9]*\).*/\1/') + CTX_COLLECTION=$(grep -o '"default_collection"[[:space:]]*:[[:space:]]*"[^"]*"' "$CONFIG_FILE" | sed 's/.*"default_collection"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/' ) + REFRAG_RUNTIME=$(grep -o '"refrag_runtime"[[:space:]]*:[[:space:]]*"[^"]*"' "$CONFIG_FILE" | sed 's/.*"refrag_runtime"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/' || echo "glm") + GLM_API_KEY=$(grep -o '"glm_api_key"[[:space:]]*:[[:space:]]*"[^"]*"' "$CONFIG_FILE" | sed 's/.*"glm_api_key"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/' ) + GLM_API_BASE=$(grep -o '"glm_api_base"[[:space:]]*:[[:space:]]*"[^"]*"' "$CONFIG_FILE" | sed 's/.*"glm_api_base"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/') + GLM_MODEL=$(grep -o '"glm_model"[[:space:]]*:[[:space:]]*"[^\"]*"' "$CONFIG_FILE" | sed 's/.*"glm_model"[[:space:]]*:[[:space:]]*"\([^\"]*\)".*/\1/' || echo "glm-4.6") + CTX_DEFAULT_MODE=$(grep -o '"default_mode"[[:space:]]*:[[:space:]]*"[^\"]*"' "$CONFIG_FILE" | sed 's/.*"default_mode"[[:space:]]*:[[:space:]]*"\([^\"]*\)".*/\1/') + CTX_REQUIRE_CONTEXT=$(grep -o '"require_context"[[:space:]]*:[[:space:]]*\(true\|false\)' "$CONFIG_FILE" | sed 's/.*"require_context"[[:space:]]*:[[:space:]]*\(true\|false\).*/\1/') + CTX_RELEVANCE_GATE=$(grep -o 
'"relevance_gate_enabled"[[:space:]]*:[[:space:]]*\(true\|false\)' "$CONFIG_FILE" | sed 's/.*"relevance_gate_enabled"[[:space:]]*:[[:space:]]*\(true\|false\).*/\1/') + CTX_MIN_RELEVANCE=$(grep -o '"min_relevance"[[:space:]]*:[[:space:]]*[0-9.][0-9.]*' "$CONFIG_FILE" | sed 's/.*"min_relevance"[[:space:]]*:[[:space:]]*\([0-9.][0-9.]*\).*/\1/') + CTX_REWRITE_MAX_TOKENS=$(grep -o '"rewrite_max_tokens"[[:space:]]*:[[:space:]]*[0-9][0-9]*' "$CONFIG_FILE" | sed 's/.*"rewrite_max_tokens"[[:space:]]*:[[:space:]]*\([0-9][0-9]*\).*/\1/') + CTX_SURFACE_COLLECTION_CFG=$(grep -o '"surface_qdrant_collection_hint"[[:space:]]*:[[:space:]]*\(true\|false\)' "$CONFIG_FILE" | sed 's/.*"surface_qdrant_collection_hint"[[:space:]]*:[[:space:]]*\(true\|false\).*/\1/') fi # Set defaults if not found in config @@ -129,6 +130,25 @@ CTX_RELEVANCE_GATE=${CTX_RELEVANCE_GATE:-false} CTX_MIN_RELEVANCE=${CTX_MIN_RELEVANCE:-0.1} CTX_REWRITE_MAX_TOKENS=${CTX_REWRITE_MAX_TOKENS:-320} +# Normalize surface_qdrant_collection_hint from config (true/false) into 1/0 +CFG_HINT="" +if [ -n "$CTX_SURFACE_COLLECTION_CFG" ]; then + if [ "$CTX_SURFACE_COLLECTION_CFG" = "true" ]; then + CFG_HINT="1" + elif [ "$CTX_SURFACE_COLLECTION_CFG" = "false" ]; then + CFG_HINT="0" + fi +fi + +# Precedence: explicit env override > ctx_config flag > auto-on when collection known +if [ -n "${CTX_SURFACE_COLLECTION_HINT:-}" ]; then + : +elif [ -n "$CFG_HINT" ]; then + CTX_SURFACE_COLLECTION_HINT="$CFG_HINT" +elif [ -n "$CTX_COLLECTION" ]; then + CTX_SURFACE_COLLECTION_HINT="1" +fi + # Export GLM/context environment variables from config export REFRAG_RUNTIME GLM_API_KEY GLM_API_BASE GLM_MODEL CTX_REQUIRE_CONTEXT CTX_RELEVANCE_GATE CTX_MIN_RELEVANCE CTX_REWRITE_MAX_TOKENS diff --git a/deploy/kubernetes/indexer-services.yaml b/deploy/kubernetes/indexer-services.yaml index c0a73e00..271e3578 100644 --- a/deploy/kubernetes/indexer-services.yaml +++ b/deploy/kubernetes/indexer-services.yaml @@ -47,6 +47,10 @@ spec: configMapKeyRef: 
name: context-engine-config key: EMBEDDING_MODEL + - name: HF_HOME + value: /work/models/hf-cache + - name: XDG_CACHE_HOME + value: /work/models/hf-cache - name: WATCH_ROOT value: /work - name: QDRANT_TIMEOUT @@ -74,6 +78,10 @@ spec: configMapKeyRef: name: context-engine-config key: WATCH_DEBOUNCE_SECS + - name: HF_HOME + value: /work/models/hf-cache + - name: XDG_CACHE_HOME + value: /work/models/hf-cache resources: requests: memory: 512Mi @@ -86,6 +94,8 @@ spec: mountPath: /work - name: metadata-volume mountPath: /work/.codebase + - name: models-volume + mountPath: /work/models envFrom: - configMapRef: name: context-engine-config @@ -96,6 +106,9 @@ spec: - name: metadata-volume persistentVolumeClaim: claimName: code-metadata-pvc + - name: models-volume + persistentVolumeClaim: + claimName: code-models-pvc --- apiVersion: batch/v1 kind: Job @@ -142,6 +155,10 @@ spec: configMapKeyRef: name: context-engine-config key: EMBEDDING_MODEL + - name: HF_HOME + value: /work/models/hf-cache + - name: XDG_CACHE_HOME + value: /work/models/hf-cache resources: requests: memory: 1Gi @@ -154,6 +171,8 @@ spec: mountPath: /work - name: metadata-volume mountPath: /work/.codebase + - name: models-volume + mountPath: /work/models envFrom: - configMapRef: name: context-engine-config @@ -164,6 +183,9 @@ spec: - name: metadata-volume persistentVolumeClaim: claimName: code-metadata-pvc + - name: models-volume + persistentVolumeClaim: + claimName: code-models-pvc --- apiVersion: batch/v1 kind: Job diff --git a/deploy/kubernetes/mcp-http.yaml b/deploy/kubernetes/mcp-http.yaml index 5d60bf4b..ae6a707c 100644 --- a/deploy/kubernetes/mcp-http.yaml +++ b/deploy/kubernetes/mcp-http.yaml @@ -53,6 +53,10 @@ spec: configMapKeyRef: name: context-engine-config key: EMBEDDING_MODEL + - name: HF_HOME + value: /work/models/hf-cache + - name: XDG_CACHE_HOME + value: /work/models/hf-cache - name: EMBEDDING_PROVIDER valueFrom: configMapKeyRef: @@ -216,6 +220,12 @@ spec: configMapKeyRef: name: 
context-engine-config key: EMBEDDING_MODEL + - name: HF_HOME + value: /work/models/hf-cache + - name: XDG_CACHE_HOME + value: /work/models/hf-cache + - name: HF_HUB_CACHE + value: /work/models/hf-cache/huggingface - name: INDEX_MICRO_CHUNKS valueFrom: configMapKeyRef: @@ -277,6 +287,8 @@ spec: mountPath: /work - name: codebase-volume mountPath: /work/.codebase + - name: models-volume + mountPath: /work/models livenessProbe: httpGet: path: /readyz @@ -303,6 +315,9 @@ spec: - name: codebase-volume persistentVolumeClaim: claimName: code-metadata-pvc + - name: models-volume + persistentVolumeClaim: + claimName: code-models-pvc --- apiVersion: v1 kind: Service diff --git a/deploy/kubernetes/mcp-indexer.yaml b/deploy/kubernetes/mcp-indexer.yaml index 505eaed5..c8ade75a 100644 --- a/deploy/kubernetes/mcp-indexer.yaml +++ b/deploy/kubernetes/mcp-indexer.yaml @@ -67,6 +67,12 @@ spec: configMapKeyRef: name: context-engine-config key: EMBEDDING_MODEL + - name: HF_HOME + value: /work/models/hf-cache + - name: XDG_CACHE_HOME + value: /work/models/hf-cache + - name: HF_HUB_CACHE + value: /work/models/hf-cache/huggingface resources: requests: memory: 512Mi @@ -79,6 +85,8 @@ spec: mountPath: /work - name: codebase-volume mountPath: /work/.codebase + - name: models-volume + mountPath: /work/models livenessProbe: httpGet: path: /readyz @@ -105,6 +113,9 @@ spec: - name: codebase-volume persistentVolumeClaim: claimName: code-metadata-pvc + - name: models-volume + persistentVolumeClaim: + claimName: code-models-pvc --- apiVersion: v1 kind: Service diff --git a/deploy/kubernetes/qdrant.yaml b/deploy/kubernetes/qdrant.yaml index 180ed637..7ec599f7 100644 --- a/deploy/kubernetes/qdrant.yaml +++ b/deploy/kubernetes/qdrant.yaml @@ -47,12 +47,6 @@ spec: volumeMounts: - name: qdrant-storage mountPath: /qdrant/storage - livenessProbe: - httpGet: - path: /healthz - port: http - initialDelaySeconds: 30 - periodSeconds: 10 readinessProbe: httpGet: path: /readyz diff --git a/dev-workspace/.gitkeep 
b/dev-workspace/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/docker-compose.dev-remote.yml b/docker-compose.dev-remote.yml index 7b1630a4..437ec4a0 100644 --- a/docker-compose.dev-remote.yml +++ b/docker-compose.dev-remote.yml @@ -34,7 +34,7 @@ services: - FASTMCP_PORT=${FASTMCP_PORT} - QDRANT_URL=${QDRANT_URL} - COLLECTION_NAME=${COLLECTION_NAME} - - PATH_EMIT_MODE=container + - PATH_EMIT_MODE=auto - HF_HOME=/work/.cache/huggingface - TRANSFORMERS_CACHE=/work/.cache/huggingface - HUGGINGFACE_HUB_CACHE=/work/.cache/huggingface @@ -82,7 +82,7 @@ services: - GLM_MODEL=${GLM_MODEL:-glm-4.6} - LLAMACPP_URL=${LLAMACPP_URL:-http://llamacpp:8080} - COLLECTION_NAME=${COLLECTION_NAME} - - PATH_EMIT_MODE=container + - PATH_EMIT_MODE=auto - HF_HOME=/tmp/huggingface - HF_HUB_CACHE=/tmp/huggingface/hub - TRANSFORMERS_CACHE=/tmp/huggingface/transformers @@ -121,7 +121,7 @@ services: - FASTMCP_TRANSPORT=${FASTMCP_HTTP_TRANSPORT} - QDRANT_URL=${QDRANT_URL} - COLLECTION_NAME=${COLLECTION_NAME} - - PATH_EMIT_MODE=container + - PATH_EMIT_MODE=auto - HF_HOME=/work/.cache/huggingface - TRANSFORMERS_CACHE=/work/.cache/huggingface - HUGGINGFACE_HUB_CACHE=/work/.cache/huggingface @@ -170,7 +170,7 @@ services: - LLAMACPP_URL=${LLAMACPP_URL:-http://llamacpp:8080} - FASTMCP_HEALTH_PORT=18001 - COLLECTION_NAME=${COLLECTION_NAME} - - PATH_EMIT_MODE=container + - PATH_EMIT_MODE=auto - HF_HOME=/tmp/huggingface - HF_HUB_CACHE=/tmp/huggingface/hub - TRANSFORMERS_CACHE=/tmp/huggingface/transformers @@ -215,6 +215,7 @@ services: context: . 
dockerfile: Dockerfile.indexer container_name: indexer-dev-remote + user: "1000:1000" depends_on: - qdrant env_file: diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 9de3318c..cc6edd43 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -1,6 +1,6 @@ # Context Engine Architecture -**Documentation:** [README](../README.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) --- diff --git a/docs/CLAUDE.example.md b/docs/CLAUDE.example.md index ce132d4e..baa1f3c1 100644 --- a/docs/CLAUDE.example.md +++ b/docs/CLAUDE.example.md @@ -1,25 +1,26 @@ -This file is intended for AI agents (Claude, etc.) using the Context‑Engine Qdrant‑Indexer and Memory MCP tools. It encodes project‑specific best practices; adapt it per‑repo. +This file is intended for AI agents (Claude, etc.) using the Context‑Engine Qdrant‑Indexer and Memory MCP tools. Agentic AI Project Rules: When to Use MCP Qdrant-Indexer vs Grep Core Decision Rules (for AI agents) + Workspace default: For this repo, MCP Qdrant-Indexer tools are the primary way to explore code and history. 
Always start with MCP for exploration, debugging, or "where/why" questions; use literal search/file-open only for narrow exact-literal lookups. + - Use MCP Qdrant-Indexer when: - You are exploring or don't know exact strings/symbols. - You need semantic or cross-file understanding (relationships, patterns, architecture). - You want ranked results with surrounding context, not just line hits. - - Use grep when: - - You know the exact string/function/variable or error message. - - You need fast literal search or are extremely token/latency constrained. + - Use literal search/file-open when (and only when): + - You know the exact string/function/variable or error message, and you only need to confirm its existence or a file/line quickly (not to understand behavior or architecture). Quick Heuristics: - - If you know the exact string → start with grep, then switch to MCP for broader context. - - If the question is conceptual/architectural → start with MCP. + - If the question is conceptual/architectural or about "where/why" behavior changed → start with MCP. - If you need rich context/snippets around matches → MCP. - - If you just need to confirm existence/location → grep. + - If you only need to confirm existence/location of a specific literal (error message, env var, exact function name) → literal search/file-open. + - If in doubt → start with MCP. Grep Anti-Patterns: @@ -86,6 +87,20 @@ Agentic AI Project Rules: When to Use MCP Qdrant-Indexer vs Grep - Use for: short natural-language summaries/explanations of specific modules or tools, grounded in code/docs with citations. - Good for: "What does scripts/standalone_upload_client.py do at a high level?", "Summarize the remote upload client pipeline.". + Advanced lineage workflow (code + history): + + - Goal: answer "when/why did behavior X change?" without flooding context. + - Step 1 – Find current implementation (code): + - Use repo_search to locate the relevant file/symbol, e.g. 
`repo_search(query: "upload client timeout", language: "python", under: "scripts")`. + - Step 2 – Summarize recent change activity for a file: + - Call change_history_for_path with `include_commits=true` to get churn stats and a small list of recent commits, e.g. `change_history_for_path(path: "scripts/remote_upload_client.py", include_commits: true)`. + - Step 3 – Pull commit lineage for a specific behavior: + - Use search_commits_for with short behavior phrases plus an optional path filter, e.g. `search_commits_for(query: "remote upload timeout retry", path: "scripts/remote_upload_client.py")`. + - Read lineage_goal / lineage_symbols / lineage_tags to understand intent and related concepts. + - Step 4 – Optionally summarize current behavior: + - After you have the right file/symbol from repo_search, use context_answer to explain what the module does now; treat commit lineage as background, not as primary code context. + - For exact line-level changes (e.g. "when did this literal constant change?"), use lineage tools to narrow candidate commits, then inspect diffs with git tooling; do not guess purely from summaries. + Query Phrasing Tips for context_answer: - Prefer behavior/architecture questions about a single module or tool: @@ -100,9 +115,6 @@ Agentic AI Project Rules: When to Use MCP Qdrant-Indexer vs Grep - Then call context_answer to summarize behavior, using a behavior-focused question that doesn't over-specify filenames. - Avoid using context_answer as a primary debugger for low-level helper/env behavior; prefer repo_search + direct code reading for detailed semantics. - Remember: the MCP tools themselves expose detailed descriptions and parameter docs. - Use those for exact knobs; this guide is about choosing the right tool and shaping good queries. 
- MCP Tool Families (for AI agents) - Indexer / Qdrant tools: diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index dda96fdd..13c3a502 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -2,7 +2,7 @@ Complete environment variable reference for Context Engine. -**Documentation:** [README](../README.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) --- diff --git a/docs/CTX_CLI.md b/docs/CTX_CLI.md index 2a0f620c..a9e87938 100644 --- a/docs/CTX_CLI.md +++ b/docs/CTX_CLI.md @@ -2,7 +2,7 @@ A thin CLI that retrieves code context and rewrites your input into a better, context-aware prompt using the local LLM decoder. Works with both questions and commands/instructions. 
-**Documentation:** [README](../README.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) --- diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md index 51e8041a..b38e0b1a 100644 --- a/docs/DEVELOPMENT.md +++ b/docs/DEVELOPMENT.md @@ -2,7 +2,7 @@ This guide covers setting up a development environment, understanding the codebase structure, and contributing to Context Engine. 
-**Documentation:** [README](../README.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) --- @@ -66,6 +66,30 @@ curl http://localhost:8000/sse # Memory server SSE curl http://localhost:8001/sse # Indexer server SSE ``` +### 4. IDE MCP Configuration + +For MCP-aware IDEs (Claude Code, Windsurf, etc.), prefer the HTTP MCP endpoints: + +```bash +# Memory MCP (HTTP) +http://localhost:8002/mcp + +# Indexer MCP (HTTP) +http://localhost:8003/mcp + +# Health checks +curl http://localhost:18002/readyz # Memory health +curl http://localhost:18003/readyz # Indexer health +``` + +SSE endpoints (`/sse`) remain available and are typically used via `mcp-remote`, but some clients that send MCP messages in parallel on a fresh session can hit a FastMCP initialization guard and intermittently log: + +```text +Failed to validate request: Received request before initialization was complete +``` + +If you see tools/resources only appearing after a second reconnect, switch your IDE configuration to use the HTTP `/mcp` endpoints instead of SSE. 
+ ## Project Structure ``` diff --git a/docs/GETTING_STARTED.md b/docs/GETTING_STARTED.md new file mode 100644 index 00000000..8d22bafb --- /dev/null +++ b/docs/GETTING_STARTED.md @@ -0,0 +1,153 @@ +# Getting Started (VS Code + Dev-Remote) + +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) + +This guide is for developers who want the lowest-friction way to try Context-Engine: + +- Run a single Docker Compose stack +- Install one VS Code extension +- Open a project and start asking questions about your code + +--- + +## 1. Prerequisites + +- **Docker** (Docker Desktop or equivalent) +- **Git** +- **VS Code** (to use the Context Engine Uploader extension) +- **An MCP-enabled IDE or client** to talk to Context-Engine via MCP, for example: + - Claude Code, Windsurf, Cursor, Roo, Cline, Zed (via `mcp-remote`), etc. + +CLI-only workflows using `ctx.py` and hybrid search tools are supported but are documented separately. This guide assumes you will talk to Context-Engine through an MCP-enabled assistant. + +You do *not* need to clone this repo into every project. You run Context-Engine once, then point it at whatever code you care about. + +--- + +## 2. Start the dev-remote stack + +In a terminal (from wherever you want the stack to live): + +```bash +git clone https://github.com/m1rl0k/Context-Engine.git +cd Context-Engine + +# Start the dev-remote stack (Qdrant, MCPs, upload service, watcher, etc.) 
+docker compose -f docker-compose.dev-remote.yml up -d +``` + +This brings up, on your host machine: + +- Qdrant on `http://localhost:6333` +- Memory MCP: + - SSE: `http://localhost:8000/sse` + - HTTP / RMCP: `http://localhost:8002/mcp` +- Indexer MCP: + - SSE: `http://localhost:8001/sse` + - HTTP / RMCP: `http://localhost:8003/mcp` +- Upload service (used by the VS Code extension) on `http://localhost:8004` + +--- + +## 3. Index your code (via VS Code extension) + +In the dev-remote flow, you normally do **not** run the indexer manually. + +Instead, the VS Code extension uploads your workspace to the dev-remote stack, and the `indexer` + `watcher` services handle: + +- Mirroring your project into the container under `/work` (in the `dev-workspace` folder) +- Walking files, chunking them, and writing vectors + metadata into Qdrant +- Tracking per-file hashes under `.codebase` so unchanged files are skipped + +If you prefer CLI-based indexing, see the README and advanced docs (Multi-Repo, Kubernetes, etc.) for `docker compose run --rm indexer` usage. + +--- + +## 4. Connect your IDE + +The normal way to use Context-Engine is through an MCP-enabled assistant. The simplest config is via the HTTP MCP endpoints below; the VS Code extension can also scaffold these configs for you. + +### Example: Claude Code / generic RMCP client + +Add to your MCP config (for example, a project-level `.mcp.json` for Claude Code): + +```json +{ + "mcpServers": { + "memory": { "url": "http://localhost:8002/mcp" }, + "qdrant-indexer": { "url": "http://localhost:8003/mcp" } + } +} +``` + +### Example: Windsurf / Cursor (SSE) + +If your client prefers SSE: + +```json +{ + "mcpServers": { + "memory": { "type": "sse", "url": "http://localhost:8000/sse", "disabled": false }, + "qdrant-indexer": { "type": "sse", "url": "http://localhost:8001/sse", "disabled": false } + } +} +``` + +See [docs/IDE_CLIENTS.md](IDE_CLIENTS.md) for copy-paste configs for specific IDEs. + +--- + +## 5.
Try a few example queries + +Once the MCP servers are connected in your IDE and indexing has finished, just *ask your assistant* questions like the ones below; it will call the MCP tools on your behalf. + +### Code search examples (qdrant-indexer) + +Ask your assistant to run something like: + +- "Find places where remote uploads are retried" +- "Show functions that call ingest_code.index_repo" +- "Search for performance bottlenecks in the upload_service script" + +Under the hood, the client will call tools such as `repo_search` or `context_answer` on the `qdrant-indexer` server. + +### Commit history / lineage examples (qdrant-indexer) + +If you have git history ingestion enabled, you can also ask: + +- "When did we add git history ingestion to the upload client?" +- "When did we optimize git history collection to fetch only commits since last upload?" +- "What commits mention Windows UnicodeDecodeError in git history collection?" + +These eventually call `search_commits_for` and related tools, which use the commit index and lineage summaries. + +--- + +## 6.
VS Code extension (recommended) + +For this dev-remote flow, the **Context Engine Uploader** VS Code extension is the primary way to sync and index code: + +- Install it from the VS Code Marketplace (search for "Context Engine Uploader") +- Point it at your project (or let it auto-detect the current workspace root) +- Configure the upload endpoint to `http://localhost:8004` +- Start the uploader; it will force an initial upload and then watch for changes + +Under **Settings → Extensions → Context Engine Uploader** you will typically use: + +- `endpoint`: `http://localhost:8004` (dev-remote upload_service) +- Optional MCP settings: `mcpIndexerUrl`, `mcpMemoryUrl`, and `mcpTransportMode` (`sse-remote` or `http`) pointing at the dev-remote memory/indexer URLs listed above +- Optional auto-config: enable `mcpClaudeEnabled` / `mcpWindsurfEnabled` and `autoWriteMcpConfigOnStartup` to have the extension write Claude Code/Windsurf MCP configs (and an optional `/ctx` hook) for you + +Once running, your code is kept in sync with the dev-remote stack without any manual indexer commands. + +## 7.
Where to go next + +Once you have the basic flow working (dev-remote stack up → VS Code extension syncing → IDE connected via MCP → run a few queries), you can explore: + +- [Configuration](CONFIGURATION.md) — environment variables and tuning knobs +- [IDE Clients](IDE_CLIENTS.md) — detailed configs for specific IDEs +- [Multi-Repo](MULTI_REPO_COLLECTIONS.md) — multi-repo collections, remote servers, Kubernetes +- [Memory Guide](MEMORY_GUIDE.md) — how to use the Memory MCP server alongside the indexer +- [Architecture](ARCHITECTURE.md) — deeper dive into how the components fit together +- [ctx CLI](CTX_CLI.md) — CLI workflows and prompt hooks; see `ctx/claude-hook-example.json` for a Claude Code `/ctx` hook wired to `ctx.py` +- [VS Code Extension](vscode-extension.md) — full extension capabilities and settings diff --git a/docs/IDE_CLIENTS.md b/docs/IDE_CLIENTS.md index 97f14c56..468138f6 100644 --- a/docs/IDE_CLIENTS.md +++ b/docs/IDE_CLIENTS.md @@ -2,7 +2,7 @@ Connect your IDE to a running Context-Engine stack. No need to clone this repo into your project. 
-**Documentation:** [README](../README.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) --- @@ -21,7 +21,7 @@ Connect your IDE to a running Context-Engine stack. No need to clone this repo i **Prerequisites:** Context-Engine running somewhere (localhost, remote server, or Kubernetes). -**Minimal config** — add to your IDE's MCP settings: +**Minimal config (SSE)** — for clients that only understand SSE or use `mcp-remote`: ```json { "mcpServers": { @@ -30,6 +30,25 @@ Connect your IDE to a running Context-Engine stack. 
No need to clone this repo i } ``` +**HTTP (recommended for RMCP-capable IDEs)** — prefer this when your IDE supports HTTP MCP / RMCP (Claude Code, Windsurf, Qodo, etc.): + +```json +{ + "mcpServers": { + "memory": { "url": "http://localhost:8002/mcp" }, + "qdrant-indexer": { "url": "http://localhost:8003/mcp" } + } +} +``` + +Using HTTP `/mcp` avoids a FastMCP initialization race that some SSE clients hit when they send `listTools` in parallel with `initialize`, which can log: + +```text +Failed to validate request: Received request before initialization was complete +``` + +If you see tools/resources only appearing after a second reconnect when using SSE, switch your IDE configuration to these HTTP endpoints instead. + Replace `localhost` with your server IP/hostname for remote setups. --- @@ -47,7 +66,9 @@ Replace `localhost` with your server IP/hostname for remote setups. | OpenAI Codex | RMCP | TOML config | | Augment | SSE | Simple JSON configs | | AmpCode | SSE | Simple URL for SSE endpoints | -| Claude Code CLI | SSE | Simple JSON configs | +| Claude Code CLI | SSE / HTTP (RMCP) | Simple JSON configs via .mcp.json | + +**Claude Desktop (Connectors):** Claude Desktop also supports remote MCP servers over SSE and streamable HTTP, but configuration happens via the Claude Connectors UI (Settings → Connectors on claude.ai), not local `.mcp.json`. Treat Context-Engine as a normal remote MCP server there; this guide focuses on IDEs where you control MCP URLs/config files directly (Claude Code, Windsurf, etc.). 
--- @@ -120,7 +141,7 @@ Add to your Zed `settings.json` (Command Palette → "Settings: Open Settings (J { "qdrant-indexer": { "type": "http", - "url": "http://localhost:8001/sse" + "url": "http://localhost:8003/mcp" } } ``` @@ -200,6 +221,19 @@ When Context-Engine runs on a remote server (e.g., `context.yourcompany.com`): } ``` +If your IDE supports HTTP MCP / RMCP, prefer the HTTP endpoints instead: + +```json +{ + "mcpServers": { + "memory": { "url": "http://context.yourcompany.com:8002/mcp" }, + "qdrant-indexer": { "url": "http://context.yourcompany.com:8003/mcp" } + } +} +``` + +This uses the HTTP `/mcp` transport and avoids the initialization race described above. + **Indexing your local project to the remote server:** ```bash # Using VS Code extension (recommended) diff --git a/docs/MCP_API.md b/docs/MCP_API.md index 73b1b8ef..ac12ea69 100644 --- a/docs/MCP_API.md +++ b/docs/MCP_API.md @@ -2,7 +2,7 @@ This document provides comprehensive API documentation for all MCP (Model Context Protocol) tools exposed by Context Engine's dual-server architecture. 
-**Documentation:** [README](../README.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) --- @@ -24,6 +24,33 @@ Context Engine exposes two MCP servers: Both servers support SSE and HTTP RMCP transports simultaneously. +### Transports & IDE Integration + +For each server, two transports are available: + +- **SSE (Server-Sent Events)** + - Memory: `http://localhost:8000/sse` + - Indexer: `http://localhost:8001/sse` + - Typically used via `mcp-remote` or legacy MCP clients. + +- **HTTP (streamable MCP over HTTP)** + - Memory: `http://localhost:8002/mcp` + - Indexer: `http://localhost:8003/mcp` + - Health: + - Memory: `http://localhost:18002/readyz` + - Indexer: `http://localhost:18003/readyz` + - Tools (for debugging): `GET /tools` on the health ports. + +**Recommendation for IDEs:** Prefer the HTTP `/mcp` endpoints when integrating with IDE clients (Claude Code, Windsurf, etc.). HTTP uses a simple request/response pattern where `initialize` completes before `listTools` and other calls, avoiding initialization races. 
+ +When using SSE via `mcp-remote`, some clients may send MCP messages (for example `listTools`) in parallel on a fresh session before `initialize` has fully completed. FastMCP enforces that only `initialize` may be processed during initialization; if a non-initialize request arrives too early, the server can log: + +```text +Failed to validate request: Received request before initialization was complete +``` + +This manifests as tools/resources only appearing after a second reconnect. Switching the IDE to talk directly to the HTTP `/mcp` endpoints avoids this class of issue. + ## Memory Server API ### store() @@ -657,16 +684,11 @@ All API methods follow consistent error handling patterns: ## Transport-Specific Behavior -### SSE (Server-Sent Events) -- Real-time bidirectional communication -- Automatic reconnection on disconnect -- Streaming responses for long operations +Both SSE and HTTP RMCP transports expose the **same tools, arguments, and response shapes**. The choice of transport affects only how MCP messages are carried, not what the tools do. -### HTTP RMCP -- JSON-RPC over HTTP -- Request/response pattern -- Better for batch operations and integrations +- **SSE (`/sse`)** is primarily intended for use behind `mcp-remote` or legacy clients. +- **HTTP (`/mcp`)** is recommended for IDE integrations and direct tooling because it uses a simple request/response pattern where `initialize` completes before `listTools` and other calls, avoiding known initialization races in some SSE clients. -Both transports provide identical API semantics and response formats. +When in doubt, prefer the HTTP `/mcp` endpoints described in the Overview. This API reference should enable developers to effectively integrate Context Engine's MCP tools into their applications and workflows. 
\ No newline at end of file diff --git a/docs/MEMORY_GUIDE.md b/docs/MEMORY_GUIDE.md index 8f6ee1de..5f65d599 100644 --- a/docs/MEMORY_GUIDE.md +++ b/docs/MEMORY_GUIDE.md @@ -2,7 +2,7 @@ Best practices for using Context Engine's memory system effectively. -**Documentation:** [README](../README.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) --- diff --git a/docs/MULTI_REPO_COLLECTIONS.md b/docs/MULTI_REPO_COLLECTIONS.md index 991d2cac..9aba4852 100644 --- a/docs/MULTI_REPO_COLLECTIONS.md +++ b/docs/MULTI_REPO_COLLECTIONS.md @@ -1,6 +1,6 @@ # Multi-Repository Collection Architecture -**Documentation:** [README](../README.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP 
API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) --- diff --git a/docs/TROUBLESHOOTING.md b/docs/TROUBLESHOOTING.md index 34913d72..ad044bb8 100644 --- a/docs/TROUBLESHOOTING.md +++ b/docs/TROUBLESHOOTING.md @@ -2,7 +2,7 @@ Common issues and solutions for Context Engine. -**Documentation:** [README](../README.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) --- diff --git a/docs/commit-indexing/cmds.md b/docs/commit-indexing/cmds.md new file mode 100644 index 00000000..fa819886 --- /dev/null +++ b/docs/commit-indexing/cmds.md @@ -0,0 +1,29 @@ +curl -s "http://localhost:6333/collections/Context-Engine-41e67959/points/scroll" -H "Content-Type: application/json" -d '{"filter":{"must":[{"key":"metadata.language","match":{"value":"git"}},{"key":"metadata.kind","match":{"value":"git_message"}}]},"limit":5,"with_payload":true,"with_vector":false}' + + + + +set -a; . 
.env; set +a; REFRAG_DECODER=1 REFRAG_RUNTIME=glm REFRAG_COMMIT_DESCRIBE=1 python3 - << 'PY' +from scripts.refrag_llamacpp import is_decoder_enabled, get_runtime_kind +from scripts.ingest_history import commit_metadata, generate_commit_summary, run + +print('is_decoder_enabled:', is_decoder_enabled()) +print('runtime:', get_runtime_kind()) + +sha = run('git rev-list --max-count=1 HEAD').strip() +md = commit_metadata(sha) + +diff = run(f'git show --stat --patch --unified=3 {sha}') +print('Testing commit:', sha) +print('Files:', md.get('files')) + +goal, symbols, tags = generate_commit_summary(md, diff) +print('goal:', repr(goal)) +print('symbols:', symbols) +print('tags:', tags) +PY + + + +Index commits: +set -a; . .env; set +a; COLLECTION_NAME=Context-Engine-41e67959 QDRANT_URL=http://localhost:6333 REFRAG_DECODER=1 REFRAG_RUNTIME=glm REFRAG_COMMIT_DESCRIBE=1 python3 -m scripts.ingest_history --since '6 months ago' --max-commits 10 --per-batch 10 \ No newline at end of file diff --git a/docs/commit-indexing/experiments.md b/docs/commit-indexing/experiments.md new file mode 100644 index 00000000..64d77e2b --- /dev/null +++ b/docs/commit-indexing/experiments.md @@ -0,0 +1,281 @@ +--- + +## 1. High-level agent recipe + +Target question: +**“When and why did behavior X change in file Y?”** + +Recommended steps: + +1. **Localize the behavior now (code search).** + Use [repo_search](cci:1://file:///home/coder/project/Context-Engine/scripts/mcp_indexer_server.py:1663:0-2468:5) to find the current implementation of X. + +2. **Shortlist relevant commits (lineage search).** + Use [search_commits_for(path=Y, query=X)](cci:1://file:///home/coder/project/Context-Engine/scripts/mcp_indexer_server.py:2760:0-2898:65) to get a tiny set of candidate commits with `lineage_goal` / tags. + +3. **Decide if lineage summary is enough.** + Sometimes `lineage_goal` already answers the “why” without diffs. + +4. 
**If needed, pull small diffs and reason.** + For 1–2 chosen SHAs, fetch compact diffs for that file and let the LLM explain _how_ the behavior changed. + +Everything else is just detail and guardrails around these four steps. + +--- + +## 2. Step-by-step, with knobs + +### Step 1: Localize behavior (repo_search) + +- **Inputs:** + - Behavior symbol: [ensureIndexedWatcher](cci:1://file:///home/coder/project/Context-Engine/vscode-extension/context-engine-uploader/extension.js:751:0-781:1), `index_repo`, etc. + - Optional context: `"status bar"`, `"upload delta"`, etc. + +- **Call pattern:** + + ```jsonc + repo_search( + query: "ensureIndexedWatcher status bar", + under: "vscode-extension/context-engine-uploader", + limit: 8, + per_path: 2 + ) + ``` + +- **Goal:** + - Identify: + - Canonical file path, e.g. `"vscode-extension/context-engine-uploader/extension.js"`. + - Specific function or symbol (span) you care about. + +Agents should **store**: + +- `target_file`: relative path under repo. +- `target_symbol` or a short description of the behavior X. + +### Step 2: Shortlist commits (search_commits_for) + +- **Normalize path:** + - [search_commits_for](cci:1://file:///home/coder/project/Context-Engine/scripts/mcp_indexer_server.py:2760:0-2898:65) uses commit metadata `files` like: + - `"scripts/ingest_code.py"` + - `"vscode-extension/context-engine-uploader/scripts/ctx.py"` + - So pass exactly that style: **no `/work/...` prefix**. + +- **Call pattern:** + + ```jsonc + search_commits_for( + query: "ensureIndexedWatcher status bar", // or a simpler keyword + path: "vscode-extension/context-engine-uploader/extension.js", + collection: "Context-Engine-41e67959", + limit: 5, + max_points: 1000 + ) + ``` + +- **What you get back:** + + ```json + { + "commit_id": "...", + "message": "...", + "files": ["..."], + "lineage_goal": "short intent summary", + "lineage_symbols": [...], + "lineage_tags": [...] 
+ } + ``` + +Agents should: + +- Prefer commits where: + - `files` includes the target file, and + - `lineage_goal` / `lineage_symbols` mention relevant concepts. + +This step is the **semantic pre-filter**: instead of scrolling `git log -- path`, you pick 1–3 promising SHAs. + +### Step 3: Answer “why” from lineage, if possible + +For many questions, you don’t need diffs at all: + +- Example from your run: + + ```json + { + "commit_id": "e9d5...", + "message": "Remove duplicate ctx script in extension ...", + "lineage_goal": "Remove duplicate ctx script from extension as it's bundled at build time", + "lineage_tags": ["cleanup","duplicate","build","extension","script"] + } + ``` + +For a question like: + +> “Why did ctx.py disappear from the extension folder?” + +An agent can answer almost entirely from: + +- `lineage_goal` + `message` + filename, maybe with a tiny code/context snippet. + +**Rule of thumb:** +If the question is “why was this introduced/removed/renamed?”, try to answer from `lineage_goal` before reaching for diffs. + +### Step 4: When you actually need diffs + +Only when: + +- The question is about **behavior changes** (“when did it start returning null?”, “when did it stop calling X?”), or +- `lineage_goal` is too high-level, + +should you pull real diffs. + +In this repo, a local agent can: + +```bash +git show -p <commit_id> -- <path> +# or with smaller context: +git show --unified=3 <commit_id> -- <path> +``` + +Then: + +- Extract only the hunks around the target symbol or lines (to save tokens). +- Ask the model: + + > “Given this diff for `<commit_id>` and the current code for `<path>`, explain how the behavior of X changed.” + +For a future remote/MCP world, this would be a natural small MCP tool: + +- `get_commit_diff(commit_id, path, context_lines=3)` → returns only the relevant diff hunks as text. + +But you don’t need that implemented yet to exercise the pattern locally. + +--- + +## 3. Example in this repo (ctx.py cleanup) + +Concrete run we just saw: + +1. 
**Behavior:** “What happened to `ctx.py` in the VS Code extension?” + +2. **repo_search (hypothetical):** + + - Find `vscode-extension/context-engine-uploader/scripts/ctx.py`. + +3. **search_commits_for:** + + ```json + search_commits_for( + query: "ctx script build time", + path: "vscode-extension/context-engine-uploader/scripts/ctx.py", + limit: 3 + ) + ``` + + One of the results: + + ```json + { + "commit_id": "e9d5...", + "message": "Remove duplicate ctx script in extension - bundled at build time - ctx is available in-repo (scripts/ctx.py)", + "lineage_goal": "Remove duplicate ctx script from extension as it's bundled at build time", + "lineage_symbols": ["ctx.py","vscode-extension","build-time","bundled"], + "lineage_tags": ["cleanup","duplicate","build","extension","script"] + } + ``` + +4. **Answer “why”**: + + - No diff needed: you can say: + - It was removed as a duplicate because the script is bundled at build time and already available in-repo. + - If you *also* want “how did the code change?”: + - Pull `git show -p e9d5... -- vscode-extension/context-engine-uploader/scripts/ctx.py`. + - Let the LLM confirm that the extension now uses the in-repo `scripts/ctx.py` and no longer ships a copy. + +--- + +## 4. Where this goes next + +We don’t need more MCP tools immediately; we have: + +- [repo_search](cci:1://file:///home/coder/project/Context-Engine/scripts/mcp_indexer_server.py:1663:0-2468:5) → code now. +- [search_commits_for](cci:1://file:///home/coder/project/Context-Engine/scripts/mcp_indexer_server.py:2760:0-2898:65) → commit shortlist with lineage summaries. +- [change_history_for_path(include_commits=true)](cci:1://file:///home/coder/project/Context-Engine/scripts/mcp_indexer_server.py:2901:0-3015:56) → file-level view with recent commits. 
+ +Polish / next actions (conceptual, not coding yet): + +- Encode this **4-step playbook** into an “advanced: context lineage” section in [CLAUDE.md](cci:7://file:///home/coder/project/CLAUDE.md:0:0-0:0) for agents. +- Later, if needed, introduce a tiny `get_commit_diff` MCP tool for remote setups; locally, continue to use `git show` directly. + +If you want, next step we can actually draft that “advanced lineage workflow” section text for [CLAUDE.md](cci:7://file:///home/coder/project/CLAUDE.md:0:0-0:0), using the above structure but even more compressed for agents. + +--- + +## 5. "Bad message" / good summary sanity check + +Question: *Is GLM just parroting commit messages, or is it actually reading diffs and/or detailed bodies?* + +### Commit under test + +- **SHA:** `6adced4ed83adf75ad8f8c2649b4599a68fb53ae` +- **Subject:** `fix` +- **Body (excerpt):** + - `What this fixes:` + - `stopProcesses will not resolve prematurely` + - `runSequence cannot start a new watcher while the previous one is still alive` + - `Resilient to processes that ignore SIGTERM` +- **Files touched (relevant):** + - `vscode-extension/context-engine-uploader/extension.js` + +Diff (abridged) shows changes to `terminateProcess(proc, label)`: + +- Introduces `termTimer` / `killTimer` and a `cleanup()` helper. +- Makes `finalize(reason)` idempotent and ensures timers are cleared. +- Hooks `exit` / `close` handlers into a shared `onExit` that calls `finalize` with exit status / signal. +- Keeps an initial `proc.kill()` (SIGTERM), then: + - Waits `waitSigtermMs` (4s), then tries `proc.kill('SIGKILL')` and logs a message. + - After an additional `waitSigkillMs` (2s), forces `finalize` with a “forced after X ms” reason. 
+ +### Lineage summary produced by GLM + +From `search_commits_for(query="fix", path="", limit=5)` we see, for this SHA: + +```json +{ + "commit_id": "6adced4ed83adf75ad8f8c2649b4599a68fb53ae", + "message": "fix", + "files": [".env.example", "vscode-extension/context-engine-uploader/extension.js"], + "lineage_goal": "Fix process termination and watcher lifecycle issues", + "lineage_symbols": [ + "SIGTERM", + "SIGKILL", + "watchProcess", + "forceProcess" + ], + "lineage_tags": [ + "process-management", + "termination", + "watcher", + "lifecycle", + "signal-handling" + ] +} +``` + +### Interpretation + +- The **subject** alone (`fix`) is non-informative. +- The **body** gives some English hints about watcher behavior and SIGTERM resilience. +- The **diff** clearly shows: + - A more robust termination sequence (SIGTERM → SIGKILL → forced finalize). + - Explicit references to `watchProcess`, `forceProcess`, and signal names. + +The GLM summary: + +- Captures the high-level intent (`process termination and watcher lifecycle issues`). +- Names concrete symbols seen in the diff (`SIGTERM`, `SIGKILL`, `watchProcess`, `forceProcess`). +- Adds tags (`process-management`, `signal-handling`, etc.) that do not appear verbatim in the subject. + +Conclusion for this case: + +- `lineage_goal` is *not* just a restatement of the one-word subject; it reflects both the commit body and the structure of the diff. +- `lineage_symbols` / `lineage_tags` show that GLM is paying attention to changed identifiers and behavior, making this commit discoverable via queries like `"watcher lifecycle"`, `"SIGTERM"`, or `"process termination"` even though the subject is just `fix`. \ No newline at end of file diff --git a/docs/commit-indexing/overview.md b/docs/commit-indexing/overview.md new file mode 100644 index 00000000..3d4ea088 --- /dev/null +++ b/docs/commit-indexing/overview.md @@ -0,0 +1,144 @@ +# Commit Indexing & Context Lineage: Goals and Status + +## 1. 
Motivation + +- **Historical context for agents** + - Modern agents are good at reading current files but struggle when the answer is buried in months of commit history. + - Goal: expose a compact, queryable view of *how and why* the code evolved, not just what it looks like now. +- **Complement, not replace, git log** + - Humans and local tools can always use `git log` / `git show` / `git diff` directly. + - Commit indexing and lineage should add value by: + - Making history available to remote/agent clients that cannot run git. + - Providing structured summaries and tags so agents can quickly find and explain relevant changes. + +## 2. Current architecture (v1) + +- **Commit harvesting (`scripts/ingest_history.py`)** + - Walks git history with configurable filters (`--since`, `--until`, `--author`, `--path`, `--max-commits`). + - For each commit: + - Captures `commit_id`, `author_name`, `authored_date`, `message` (subject + body, redacted), and `files` touched. + - Builds a short `document` and `information` string. + - Embeds into Qdrant in the same collection as code, with metadata: + - `language="git"`, `kind="git_message"`. + - `symbol` / `symbol_path` = `commit_id`. + - `files`, `repo`, `path=.git`, `ingested_at`, etc. + +- **GLM-backed diff summarization (`generate_commit_summary`)** + - Opt-in via `REFRAG_COMMIT_DESCRIBE=1` and decoder flags (`REFRAG_DECODER=1`, `REFRAG_RUNTIME=glm`). + - For each commit, fetches a truncated `git show --stat --patch --unified=3 <commit_id>` and sends it to the decoder (GLM or llama.cpp). + - Asks for compact JSON: + - `goal`: short explanation of the commit’s intent / behavior change. + - `symbols`: 1–6 key functions/flags/terms. + - `tags`: 3–6 short keywords to aid retrieval. + - On success, stores these as metadata on the commit point: + - `lineage_goal`, `lineage_symbols`, `lineage_tags`. + - On failure or when disabled, falls back gracefully and leaves these fields empty. 
+ +- **Indexer-facing Qdrant schema** + - Commit points live in the same Qdrant collection as code spans (e.g. `Context-Engine-41e67959`). + - This allows hybrid flows that combine code search and commit search within one collection. + +## 3. MCP tools and usage + +- **`search_commits_for` (indexer MCP)** + - Purpose: search git commit history stored in Qdrant. + - Filters: + - Always restricts to `language="git"`, `kind="git_message"`. + - Optional `path` filter: only keep commits whose `files` list contains the path substring. + - Optional `query`: lexical match against a composite blob containing: + - `message` + `information`. + - `lineage_goal`, `lineage_symbols`, `lineage_tags`. + - Output (per commit): + - `commit_id`, `author_name`, `authored_date`, `message`, `files`. + - `lineage_goal`, `lineage_symbols`, `lineage_tags`. + - Dedupes by `commit_id` so each commit appears at most once per response. + +- **`change_history_for_path(include_commits=true)` (indexer MCP)** + - Base behavior: + - Scans Qdrant points whose `metadata.path == <path>` (code index), summarizing: + - `points_scanned`, `distinct_hashes`, `last_modified_min/max`, `ingested_min/max`, `churn_count_max`. + - With `include_commits=true`: + - Calls `search_commits_for(path=<path>)` and attaches a small list of recent commits: + - Each entry includes commit metadata plus any `lineage_*` fields. + - Dedupes by `commit_id` before attaching. + - Intended usage: + - Fast “what changed and how hot is this file?” view for agents. + - Entry point for deeper lineage questions when combined with `repo_search` and git diffs. + +## 4. Current experiments and evaluation + +See: +- `cmds.md` for handy one-liner commands (curl, ingest, local GLM tests). +- `experiments.md` for a detailed “when/why did behavior X change?” recipe and worked examples. + +Key experiments so far: + +- **GLM summarization sanity-check** + - Local script that: + - Picks `HEAD` via `git rev-list --max-count=1 HEAD`. 
+ - Calls `commit_metadata` + `generate_commit_summary`. + - Observed: with valid GLM API keys and flags, we get reasonable `goal/symbols/tags` for real commits. + +- **Qdrant payload inspection** + - Direct `curl` scroll over `Context-Engine-41e67959` for `language="git"`, `kind="git_message"`. + - Verified commit points include: + - Baseline metadata (message, files, etc.). + - Newly-added `lineage_goal`, `lineage_symbols`, `lineage_tags` after reindexing. + +- **MCP round-trip tests** + - `search_commits_for(query="pseudo tag boost")` → surfaces the hybrid_search commit with clear lineage fields. + - `search_commits_for(query="ctx script", path="vscode-extension/context-engine-uploader/scripts/ctx.py")` → surfaces the ctx cleanup commit and explains its intent. + - `change_history_for_path(path="vscode-extension/context-engine-uploader/scripts/ctx.py", include_commits=true)` → returns a deduped list of relevant commits with lineage summaries. + +These confirm the end-to-end path: +- Git → ingest_history → GLM → Qdrant → MCP → agent. + +## 5. Target workflows (what we are aiming for) + +Our north star is the "Context Lineage" behavior from the Augment blog: + +- **Hero question:** + - “When and why did behavior X change in file Y?” + +- **Recommended agent flow:** + 1. **Localize X in current code** + - Use `repo_search` to find the symbol / behavior in the current tree. + 2. **Shortlist commits about X** + - Use `search_commits_for(path=<path>, query=<X>)` to get a compact list of relevant commits with `lineage_goal`/tags. + 3. **Try to answer "why" from summaries** + - Many “why was this introduced/removed/renamed?” questions can be answered from `lineage_goal` plus minimal code context. + 4. **If necessary, pull diffs to answer "how"** + - Use `git show --unified=3 <commit_id> -- <path>` (or a future MCP diff tool) and let the LLM explain the behavior change in detail. + +This should: +- Reduce reliance on raw `git log` grepping in larger repos. 
+- Give agents a semantic, compact view of history they can reason over. + +## 6. Open questions and future improvements + +- **Prompt quality and consistency** + - Are `lineage_goal` strings consistently helpful across many commits, or do they drift toward restating the subject line? + - Do `lineage_symbols` and `lineage_tags` give agents enough hooks to connect history with current code (e.g., flags, functions, config keys)? + +- **Search behavior and ranking** + - How often does `search_commits_for` surface the right commit(s) in the top N for real questions? + - Do we need semantic reranking or additional filters (date ranges, authors, etc.) in practice? + +- **Higher-level `lineage_answer` helper** + - Today: agents compose `repo_search` + `change_history_for_path(include_commits=true)` + `search_commits_for` + optional `context_answer` themselves. + - Future: a thin MCP wrapper (e.g., `lineage_answer(query, path=...)`) could orchestrate those calls and ask the decoder to produce a short "when/why did this change" answer, returning both the text and the underlying commit/code citations. + +- **Diff access for remote agents** + - Today: local workflows can rely on `git show` from the shell. + - Future: a small, token-conscious MCP tool like `get_commit_diff(commit_id, path, context_lines)` could make lineage usable from fully remote contexts when precise line-level inspection is required. + +- **Remote git metadata via upload pipeline** + - Current commit ingest assumes direct access to a local `.git` repo (`ingest_history.py` running alongside the indexer). + - Future: the standalone upload client could optionally parse a compact git log view (e.g., JSON-ified commit metadata + diffs) and bundle it with delta uploads, with the upload_service and watcher feeding that into commit indexing for remote/non-git workspaces. 
+ +- **Docs and agent guidance** + - CLAUDE.md (and related examples) should clearly document when to: + - Prefer lineage summaries over raw diffs for "why" questions. + - Fall back to `repo_search + git show` (or a future diff MCP tool) for detailed "how" questions. + +This document is meant as the high-level tracker for commit indexing and context-lineage work. Use `cmds.md` for concrete commands and `experiments.md` for detailed workflows and notes on specific runs. diff --git a/docs/vscode-extension.md b/docs/vscode-extension.md index 605fd85a..396fbd49 100644 --- a/docs/vscode-extension.md +++ b/docs/vscode-extension.md @@ -2,7 +2,7 @@ Context Engine Uploader extension for automatic workspace sync and Prompt+ integration. -**Documentation:** [README](../README.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) --- @@ -19,7 +19,7 @@ Context Engine Uploader extension for automatic workspace sync and Prompt+ integ ## Quick Start 1. **Install**: Build the `.vsix` and install in VS Code (see [Installation](#installation)) -2. **Configure server**: Settings → `contextEngineUploader.endpoint` → `http://localhost:9090` (or remote server) +2. 
**Configure server**: Settings → `contextEngineUploader.endpoint` → `http://localhost:8004` for the dev-remote Docker stack (or your upload_service URL) 3. **Index workspace**: Click status bar button or run `Context Engine Uploader: Start` 4. **Use Prompt+**: Select code, click `Prompt+` in status bar to enhance with AI @@ -30,14 +30,15 @@ Context Engine Uploader extension for automatic workspace sync and Prompt+ integ - **Output channel**: Real-time logs for force-sync and watch operations - **GPU decoder support**: Configure llama.cpp, Ollama, or GLM as decoder backend - **Remote server support**: Index to any Context-Engine server (local, remote, Kubernetes) +- **MCP + ctx scaffolding**: Optionally auto-writes Claude Code/Windsurf MCP configs, an optional Claude prompt hook, and a `ctx_config.json` wired to the right collection and decoder settings. ## Workflow Examples -### Local Development -Context-Engine running on same machine: +### Local Development (dev-remote stack) +Context-Engine running via `docker-compose.dev-remote.yml` on the same machine: ``` -Endpoint: http://localhost:9090 -Target Path: (leave empty - uses current workspace) +Endpoint: http://localhost:8004 +Target Path: (leave empty - uses current workspace or let the extension auto-detect) ``` Open any project → extension auto-syncs → MCP tools have your code context. @@ -76,6 +77,13 @@ Looking at upload_service.py lines 120-180, the upload_file() function currently Reference the existing error patterns in remote_upload_client.py lines 45-67 which use structured logging via logger.error(). ``` +### Claude Code hook (optional) + +For Claude Code, you can also enable a `/ctx` hook so that each prompt is expanded via `ctx.py` before it reaches Claude: + +- The extension can auto-write MCP config and, on Linux/dev-remote, a Claude hook when `claudeHookEnabled` is turned on. +- See `docs/ctx/claude-hook-example.json` for a minimal `UserPromptSubmit` hook that shells out to `ctx-hook-simple.sh`. 
+ ## Installation ### Build Prerequisites @@ -108,7 +116,7 @@ Key Settings After Install -------------------------- - `Context Engine Upload` output channel shows force-sync and watch logs. - `Context Engine Uploader: Index Codebase` command or status bar button runs a force sync followed by watch. -- Configure `contextEngineUploader.targetPath`, `endpoint`, and other options under Settings → Extensions → Context Engine Uploader. +- Configure `contextEngineUploader.targetPath`, `endpoint`, and (optionally) MCP settings (`mcpIndexerUrl`, `mcpMemoryUrl`, `mcpTransportMode`, `mcpClaudeEnabled`, `mcpWindsurfEnabled`, `autoWriteMcpConfigOnStartup`) under Settings → Extensions → Context Engine Uploader. ## Prerequisites Python 3.8+ must be available on the host so the bundled client can run. @@ -130,6 +138,14 @@ All settings live under `Context Engine Uploader` in the VS Code settings UI or | `contextEngineUploader.intervalSeconds` | Poll interval for watch mode. Set to `5` to match the previous command file. | | `contextEngineUploader.extraForceArgs` | Optional string array appended to the force invocation. Leave empty for the standard workflow. | | `contextEngineUploader.extraWatchArgs` | Optional string array appended to the watch invocation. | +| `contextEngineUploader.mcpClaudeEnabled` | Enable writing the project-local `.mcp.json` used by Claude Code MCP clients. | +| `contextEngineUploader.mcpWindsurfEnabled` | Enable writing Windsurf’s global MCP config. | +| `contextEngineUploader.autoWriteMcpConfigOnStartup` | Automatically run “Write MCP Config” on activation to keep `.mcp.json`, Windsurf config, and Claude hook in sync with these settings. | +| `contextEngineUploader.mcpTransportMode` | Transport for MCP configs: `sse-remote` (SSE via mcp-remote) or `http` (direct `/mcp` endpoints). | +| `contextEngineUploader.mcpIndexerUrl` | MCP indexer URL used when writing configs. 
For dev-remote, typical values are `http://localhost:8001/sse` (SSE) or `http://localhost:8003/mcp` (HTTP). | +| `contextEngineUploader.mcpMemoryUrl` | MCP memory URL used when writing configs. For dev-remote, typical values are `http://localhost:8000/sse` (SSE) or `http://localhost:8002/mcp` (HTTP). | +| `contextEngineUploader.ctxIndexerUrl` | HTTP MCP indexer endpoint used by `ctx.py` in the Claude Code `/ctx` hook, typically `http://localhost:8003/mcp` for dev-remote. | +| `contextEngineUploader.claudeHookEnabled` | Enable writing a Claude Code `/ctx` hook in `.claude/settings.local.json`. | ## Commands and lifecycle @@ -156,7 +172,7 @@ The extension logs all subprocess output to the **Context Engine Upload** output ### Connection refused ```bash # Verify upload service is running -curl http://localhost:9090/health +curl http://localhost:8004/health # Check Docker logs docker compose logs upload_service diff --git a/scripts/ctx.py b/scripts/ctx.py index c5dda3d3..b1cc3c00 100755 --- a/scripts/ctx.py +++ b/scripts/ctx.py @@ -306,8 +306,14 @@ def format_search_results(results: List[Dict[str, Any]], include_snippets: bool """ lines: List[str] = [] for hit in results: - # Prefer client-facing host_path, fall back to container path - path = hit.get("host_path") or hit.get("path", "unknown") + # Prefer the server-chosen display path; fall back to host/container paths + raw_path = ( + hit.get("path") + or hit.get("host_path") + or hit.get("container_path") + or "unknown" + ) + path = raw_path start = hit.get("start_line", "?") end = hit.get("end_line", "?") language = hit.get("language") or "" @@ -513,25 +519,19 @@ def sanitize_citations(text: str, allowed_paths: Set[str]) -> str: if _b: basename_to_paths.setdefault(_b, set()).add(_p) + # For now, keep allowed paths exactly as they appear in the context refs. 
+ # Earlier versions tried to be clever by rewriting absolute paths to + # workspace-relative forms (e.g., "Context-Engine/scripts/ctx.py"), which + # could produce confusing hybrids when multiple workspace roots or + # slugged/collection-hash directories were involved. To simplify behavior + # and avoid mixing host/container/hash paths, we preserve the original + # full path strings for any citation that is known to come from the + # formatted context. root = (os.environ.get("CTX_WORKSPACE_DIR") or "").strip() def _to_display_path(full_path: str) -> str: - if not full_path: - return full_path - if not root: - return full_path - try: - root_norm = root.rstrip("/\\") - repo_name = os.path.basename(root_norm) if root_norm else "" - if full_path == root_norm: - return repo_name or "." - if full_path.startswith(root_norm + os.sep): - rel = os.path.relpath(full_path, root_norm) - if repo_name: - return repo_name + os.sep + (rel or "") - return rel or "." - except Exception: - return full_path + # Identity mapping: leave allowed paths as-is so the LLM sees the same + # absolute/host paths that appeared in the Context refs. 
return full_path def _repl(m): diff --git a/scripts/ingest_code.py b/scripts/ingest_code.py index 3bdcdcb9..dc769dcd 100644 --- a/scripts/ingest_code.py +++ b/scripts/ingest_code.py @@ -40,11 +40,15 @@ def _detect_repo_name_from_path(path: Path) -> str: from scripts.workspace_state import ( is_multi_repo_mode, get_collection_name, + logical_repo_reuse_enabled, ) except ImportError: is_multi_repo_mode = None # type: ignore get_collection_name = None # type: ignore + def logical_repo_reuse_enabled() -> bool: # type: ignore[no-redef] + return False + # Import watcher's repo detection for surgical fix try: from scripts.watch_index import _detect_repo_for_file, _get_collection_for_file @@ -61,6 +65,14 @@ def _detect_repo_name_from_path(path: Path) -> str: remove_cached_file, update_indexing_status, update_workspace_state, + get_cached_symbols, + set_cached_symbols, + remove_cached_symbols, + compare_symbol_changes, + get_cached_pseudo, + set_cached_pseudo, + update_symbols_with_pseudo, + get_workspace_state, ) except ImportError: # State integration is optional; continue if not available @@ -70,6 +82,14 @@ def _detect_repo_name_from_path(path: Path) -> str: remove_cached_file = None # type: ignore update_indexing_status = None # type: ignore update_workspace_state = None # type: ignore + get_cached_symbols = None # type: ignore + set_cached_symbols = None # type: ignore + remove_cached_symbols = None # type: ignore + get_cached_pseudo = None # type: ignore + set_cached_pseudo = None # type: ignore + update_symbols_with_pseudo = None # type: ignore + compare_symbol_changes = None # type: ignore + get_workspace_state = None # type: ignore # Optional Tree-sitter import (graceful fallback) try: @@ -140,7 +160,6 @@ def _use_tree_sitter() -> bool: ".csproj": "xml", ".config": "xml", ".resx": "xml", - } # --- Named vector config --- @@ -292,7 +311,6 @@ def _git_metadata(file_path: Path) -> tuple[int, int, int]: "/.vs", "/.cache", "/.codebase", - "/node_modules", "/dist", 
"/build", @@ -302,7 +320,6 @@ def _git_metadata(file_path: Path) -> tuple[int, int, int]: "bin", "obj", "TestResults", - "/.git", ] _DEFAULT_EXCLUDE_FILES = [ @@ -439,7 +456,6 @@ def chunk_lines(text: str, max_lines: int = 120, overlap: int = 20) -> List[Dict if j == n: break - i = max(j - overlap, i + 1) return chunks @@ -455,7 +471,6 @@ def chunk_semantic( lines = text.splitlines() n = len(lines) - # Extract symbols with line ranges symbols = _extract_symbols(language, text) if not symbols: @@ -515,7 +530,6 @@ def chunk_by_tokens( except Exception: Tokenizer = None # type: ignore - try: k = int(os.environ.get("MICRO_CHUNK_TOKENS", str(k_tokens or 16)) or 16) except Exception: @@ -624,11 +638,112 @@ def char_to_line(c: int) -> int: def _pseudo_describe_enabled() -> bool: try: - return str(os.environ.get("REFRAG_PSEUDO_DESCRIBE", "0")).strip().lower() in {"1","true","yes","on"} + return str(os.environ.get("REFRAG_PSEUDO_DESCRIBE", "0")).strip().lower() in { + "1", + "true", + "yes", + "on", + } + except Exception: + return False + + +# ===== Symbol Extraction for Smart Reindexing ===== + +def _smart_symbol_reindexing_enabled() -> bool: + """Check if symbol-aware reindexing is enabled.""" + try: + return str(os.environ.get("SMART_SYMBOL_REINDEXING", "0")).strip().lower() in { + "1", + "true", + "yes", + "on", + } except Exception: return False +def extract_symbols_with_tree_sitter(file_path: str) -> dict: + """Extract functions, classes, methods from file using tree-sitter or fallback. 
+ + Returns: + dict: {symbol_id: {name, type, start_line, end_line, content_hash, pseudo, tags}} + """ + try: + # Read file content + text = Path(file_path).read_text(encoding="utf-8", errors="ignore") + language = detect_language(Path(file_path)) + + # Use existing symbol extraction infrastructure + symbols_list = _extract_symbols(language, text) + + # Convert to our expected dict format + symbols = {} + for sym in symbols_list: + symbol_id = f"{sym['kind']}_{sym['name']}_{sym['start']}" + + # Extract actual content for hashing + content_lines = text.split("\n")[sym["start"] - 1 : sym["end"]] + content = "\n".join(content_lines) + content_hash = hashlib.sha1(content.encode("utf-8", errors="ignore")).hexdigest() + + symbols[symbol_id] = { + "name": sym["name"], + "type": sym["kind"], + "start_line": sym["start"], + "end_line": sym["end"], + "content_hash": content_hash, + "content": content, + # These will be populated during processing + "pseudo": "", + "tags": [], + "qdrant_ids": [], # Will store Qdrant point IDs for this symbol + } + + return symbols + + except Exception as e: + print(f"[SYMBOL_EXTRACTION] Failed to extract symbols from {file_path}: {e}") + return {} + + +def should_use_smart_reindexing(file_path: str, file_hash: str) -> tuple[bool, str]: + """Determine if smart reindexing should be used for a file. 
+ + Returns: + (use_smart, reason) + """ + if not _smart_symbol_reindexing_enabled(): + return False, "smart_reindexing_disabled" + + if not get_cached_symbols or not set_cached_symbols: + return False, "symbol_cache_unavailable" + + # Load cached symbols + cached_symbols = get_cached_symbols(file_path) + if not cached_symbols: + return False, "no_cached_symbols" + + # Extract current symbols + current_symbols = extract_symbols_with_tree_sitter(file_path) + if not current_symbols: + return False, "no_current_symbols" + + # Compare symbols + unchanged_symbols, changed_symbols = compare_symbol_changes(cached_symbols, current_symbols) + + total_symbols = len(current_symbols) + changed_ratio = len(changed_symbols) / max(total_symbols, 1) + + # Use thresholds to decide strategy + max_changed_ratio = float(os.environ.get("MAX_CHANGED_SYMBOLS_RATIO", "0.3")) + if changed_ratio > max_changed_ratio: + return False, f"too_many_changes_{changed_ratio:.2f}" + + print(f"[SMART_REINDEX] {file_path}: {len(unchanged_symbols)} unchanged, {len(changed_symbols)} changed") + return True, f"smart_reindex_{len(changed_symbols)}/{total_symbols}" + + def generate_pseudo_tags(text: str) -> tuple[str, list[str]]: """Best-effort: ask local decoder to produce a short label and 3-6 tags. Returns (pseudo, tags). On failure returns ("", []).""" @@ -697,11 +812,69 @@ def generate_pseudo_tags(text: str) -> tuple[str, list[str]]: return pseudo, tags +def should_process_pseudo_for_chunk( + file_path: str, chunk: dict, changed_symbols: set +) -> tuple[bool, str, list[str]]: + """Determine if a chunk needs pseudo processing based on symbol changes AND pseudo cache. + + Uses existing symbol change detection and pseudo cache lookup for optimal performance. 
+ + Args: + file_path: Path to the file containing this chunk + chunk: Chunk dict with symbol information + changed_symbols: Set of symbol IDs that changed (from compare_symbol_changes) + + Returns: + (needs_processing, cached_pseudo, cached_tags) + """ + # For chunks without symbol info, process them (fallback - no symbol to reuse from) + symbol_name = chunk.get("symbol", "") + if not symbol_name: + return True, "", [] + + # Create symbol ID matching the format used in symbol cache + kind = chunk.get("kind", "unknown") + start_line = chunk.get("start", 0) + symbol_id = f"{kind}_{symbol_name}_{start_line}" + + # If we don't have any change information, best effort: try reusing cached pseudo when present + if not changed_symbols and get_cached_pseudo: + try: + cached_pseudo, cached_tags = get_cached_pseudo(file_path, symbol_id) + if cached_pseudo or cached_tags: + return False, cached_pseudo, cached_tags + except Exception: + pass + return True, "", [] + + # Unchanged symbol: prefer reuse when cached pseudo/tags exist + if symbol_id not in changed_symbols: + if get_cached_pseudo: + try: + cached_pseudo, cached_tags = get_cached_pseudo(file_path, symbol_id) + if cached_pseudo or cached_tags: + return False, cached_pseudo, cached_tags + except Exception: + pass + # Unchanged but no cached data yet – process once + return True, "", [] + + # Symbol content changed: always re-run pseudo; do not reuse stale cached values + return True, "", [] + + +class CollectionNeedsRecreateError(Exception): + """Raised when a collection needs to be recreated to add new vector types.""" + pass + + def ensure_collection(client: QdrantClient, name: str, dim: int, vector_name: str): """Ensure collection exists with named vectors. Always includes dense (vector_name) and lexical (LEX_VECTOR_NAME). When REFRAG_MODE=1, also includes a compact mini vector (MINI_VECTOR_NAME). 
""" + # Track backup file path for this ensure_collection call (per-collection, per-process) + backup_file = None try: info = client.get_collection(name) # Prevent I/O storm - only update vectors if they actually don't exist @@ -744,9 +917,58 @@ def ensure_collection(client: QdrantClient, name: str, dim: int, vector_name: st client.update_collection( collection_name=name, vectors_config=missing ) - except Exception: - # Best-effort; if server doesn't support adding vectors, leave to recreate path - pass + print(f"[COLLECTION_SUCCESS] Successfully updated collection {name} with missing vectors") + except Exception as update_e: + # Qdrant doesn't support adding new vector names to existing collections + # Fall back to recreating the collection with the correct vector configuration + print(f"[COLLECTION_WARNING] Cannot add missing vectors to {name} ({update_e}). Recreating collection...") + + # Backup memories before recreating collection using dedicated backup script + backup_file = None + try: + import tempfile + import subprocess + import sys + + # Create temporary backup file + with tempfile.NamedTemporaryFile(mode='w', suffix='_memories_backup.json', delete=False) as f: + backup_file = f.name + + print(f"[MEMORY_BACKUP] Backing up memories from {name} to {backup_file}") + + # Use battle-tested backup script + backup_script = Path(__file__).parent / "memory_backup.py" + result = subprocess.run([ + sys.executable, str(backup_script), + "--collection", name, + "--output", backup_file + ], capture_output=True, text=True, cwd=Path(__file__).parent.parent) + + if result.returncode == 0: + print(f"[MEMORY_BACKUP] Successfully backed up memories using {backup_script.name}") + else: + print(f"[MEMORY_BACKUP_WARNING] Backup script failed: {result.stderr}") + backup_file = None + + except Exception as backup_e: + print(f"[MEMORY_BACKUP_WARNING] Failed to backup memories: {backup_e}") + backup_file = None + + try: + client.delete_collection(name) + print(f"[COLLECTION_INFO] 
Deleted existing collection {name}") + except Exception: + pass + + # Store backup info for restoration + # backup_file remains bound for this function call; used after collection creation + + # Proceed to recreate with full vector configuration + raise CollectionNeedsRecreateError(f"Collection {name} needs recreation for new vectors") + except CollectionNeedsRecreateError: + # Let this fall through to collection creation logic + print(f"[COLLECTION_INFO] Collection {name} needs recreation - proceeding...") + raise except Exception as e: print(f"[COLLECTION_ERROR] Failed to update collection {name}: {e}") pass @@ -780,6 +1002,65 @@ def ensure_collection(client: QdrantClient, name: str, dim: int, vector_name: st vectors_config=vectors_cfg, hnsw_config=models.HnswConfigDiff(m=16, ef_construct=256), ) + print(f"[COLLECTION_INFO] Successfully created new collection {name} with vectors: {list(vectors_cfg.keys())}") + + # Restore memories if we have a backup from recreation using dedicated restore script + strict_restore = False + try: + val = os.environ.get("STRICT_MEMORY_RESTORE", "") + strict_restore = str(val or "").strip().lower() in {"1", "true", "yes", "on"} + except Exception: + strict_restore = False + + try: + if backup_file and os.path.exists(backup_file): + print(f"[MEMORY_RESTORE] Restoring memories from {backup_file}") + import subprocess + import sys + + # Use battle-tested restore script (skip collection creation since ingest_code.py already handles it) + restore_script = Path(__file__).parent / "memory_restore.py" + result = subprocess.run( + [ + sys.executable, + str(restore_script), + "--backup", + backup_file, + "--collection", + name, + "--skip-collection-creation", + ], + capture_output=True, + text=True, + cwd=Path(__file__).parent.parent, + ) + + if result.returncode == 0: + print(f"[MEMORY_RESTORE] Successfully restored memories using {restore_script.name}") + else: + msg = result.stderr or result.stdout or "unknown error" + 
print(f"[MEMORY_RESTORE_WARNING] Restore script failed: {msg}") + if strict_restore: + raise RuntimeError(f"Memory restore failed for collection {name}: {msg}") + + # Clean up backup file once we've attempted restore + try: + os.unlink(backup_file) + print(f"[MEMORY_RESTORE] Cleaned up backup file {backup_file}") + except Exception: + pass + finally: + backup_file = None + + elif backup_file: + print(f"[MEMORY_RESTORE_WARNING] Backup file {backup_file} not found") + backup_file = None + + except Exception as restore_e: + print(f"[MEMORY_RESTORE_ERROR] Failed to restore memories: {restore_e}") + # Optionally fail hard when STRICT_MEMORY_RESTORE is enabled + if strict_restore: + raise def recreate_collection(client: QdrantClient, name: str, dim: int, vector_name: str): @@ -819,6 +1100,8 @@ def ensure_payload_indexes(client: QdrantClient, collection: str): for field in ( "metadata.language", "metadata.path_prefix", + "metadata.repo_id", + "metadata.repo_rel_path", "metadata.repo", "metadata.kind", "metadata.symbol", @@ -841,6 +1124,23 @@ def ensure_payload_indexes(client: QdrantClient, collection: str): except Exception: pass +ENSURED_COLLECTIONS: set[str] = set() + + +def ensure_collection_and_indexes_once( + client: QdrantClient, + collection: str, + dim: int, + vector_name: str | None, +) -> None: + if not collection: + return + if collection in ENSURED_COLLECTIONS: + return + ensure_collection(client, collection, dim, vector_name) + ensure_payload_indexes(client, collection) + ENSURED_COLLECTIONS.add(collection) + # Lightweight import extraction per language (best-effort) def _extract_imports(language: str, text: str) -> list: @@ -980,8 +1280,50 @@ def _extract_calls(language: str, text: str) -> list: return out[:200] -def get_indexed_file_hash(client: QdrantClient, collection: str, file_path: str) -> str: - """Return previously indexed file hash for this path, or empty string.""" +def get_indexed_file_hash( + client: QdrantClient, + collection: str, + file_path: 
str, + *, + repo_id: str | None = None, + repo_rel_path: str | None = None, +) -> str: + """Return previously indexed file hash for this logical path, or empty string. + + Prefers logical identity (repo_id + repo_rel_path) when available so that + worktrees sharing a logical repo can reuse existing index state, but falls + back to metadata.path for backwards compatibility. + """ + # Prefer logical identity when both repo_id and repo_rel_path are provided + if logical_repo_reuse_enabled() and repo_id and repo_rel_path: + try: + filt = models.Filter( + must=[ + models.FieldCondition( + key="metadata.repo_id", match=models.MatchValue(value=repo_id) + ), + models.FieldCondition( + key="metadata.repo_rel_path", + match=models.MatchValue(value=repo_rel_path), + ), + ] + ) + points, _ = client.scroll( + collection_name=collection, + scroll_filter=filt, + with_payload=True, + limit=1, + ) + if points: + md = (points[0].payload or {}).get("metadata") or {} + fh = md.get("file_hash") + if fh: + return str(fh) + except Exception: + # Fall back to path-based lookup below + pass + + # Backwards-compatible path-based lookup try: filt = models.Filter( must=[ @@ -998,7 +1340,9 @@ def get_indexed_file_hash(client: QdrantClient, collection: str, file_path: str) ) if points: md = (points[0].payload or {}).get("metadata") or {} - return str(md.get("file_hash") or "") + fh = md.get("file_hash") + if fh: + return str(fh) except Exception: return "" return "" @@ -1693,6 +2037,50 @@ def index_single_file( repo_tag = _detect_repo_name_from_path(file_path) + # Derive logical repo identity and repo-relative path for cross-worktree reuse. 
+ repo_id: str | None = None + repo_rel_path: str | None = None + if logical_repo_reuse_enabled() and get_workspace_state is not None: + try: + ws_root = os.environ.get("WATCH_ROOT") or os.environ.get("WORKSPACE_PATH") or "/work" + # Resolve workspace state for this repo to read logical_repo_id + state = get_workspace_state(ws_root, repo_tag) + lrid = state.get("logical_repo_id") if isinstance(state, dict) else None + if isinstance(lrid, str) and lrid: + repo_id = lrid + # Compute repo-relative path within the current workspace tree + try: + fp = file_path.resolve() + except Exception: + fp = file_path + try: + ws_base = Path(os.environ.get("WATCH_ROOT") or os.environ.get("WORKSPACE_PATH") or "/work").resolve() + repo_root = ws_base + if repo_tag: + # In multi-repo scenarios, repos live under /work/ + candidate = ws_base / repo_tag + if candidate.exists(): + repo_root = candidate + rel = fp.relative_to(repo_root) + repo_rel_path = rel.as_posix() + except Exception: + repo_rel_path = None + except Exception as e: + print(f"[logical_repo] Failed to derive logical identity for {file_path}: {e}") + + # Get changed symbols for pseudo processing optimization + changed_symbols = set() + if get_cached_symbols and set_cached_symbols: + cached_symbols = get_cached_symbols(str(file_path)) + if cached_symbols: + current_symbols = extract_symbols_with_tree_sitter(str(file_path)) + _, changed = compare_symbol_changes(cached_symbols, current_symbols) + # Convert symbol names to IDs for lookup + for symbol_data in current_symbols.values(): + symbol_id = f"{symbol_data['type']}_{symbol_data['name']}_{symbol_data['start_line']}" + if symbol_id in changed: + changed_symbols.add(symbol_id) + if skip_unchanged: # Prefer local workspace cache to avoid Qdrant lookups ws_path = os.environ.get("WATCH_ROOT") or os.environ.get("WORKSPACE_PATH") or "/work" @@ -1704,7 +2092,13 @@ def index_single_file( return False except Exception: pass - prev = get_indexed_file_hash(client, collection, 
str(file_path)) + prev = get_indexed_file_hash( + client, + collection, + str(file_path), + repo_id=repo_id, + repo_rel_path=repo_rel_path, + ) if prev and prev == file_hash: print(f"Skipping unchanged file: {file_path}") return False @@ -1804,9 +2198,18 @@ def make_point(pid, dense_vec, lex_vec, payload): sym = ch.get("symbol") or sym if "symbol_path" in ch and ch.get("symbol_path"): sym_path = ch.get("symbol_path") or sym_path + # Ensure chunks always carry symbol metadata so pseudo gating can work for all chunking modes + if not ch.get("kind") and kind: + ch["kind"] = kind + if not ch.get("symbol") and sym: + ch["symbol"] = sym + if not ch.get("symbol_path") and sym_path: + ch["symbol_path"] = sym_path # Track both container path (/work mirror) and original host path for clarity across environments _cur_path = str(file_path) _host_root = str(os.environ.get("HOST_INDEX_PATH") or "").strip().rstrip("/") + if ":" in _host_root: # Windows drive letter (e.g., "C:") + _host_root = "" _host_path = None _container_path = None @@ -1874,21 +2277,41 @@ def make_point(pid, dense_vec, lex_vec, payload): "last_modified_at": int(last_mod), "churn_count": int(churn_count), "author_count": int(author_count), + # Logical identity for cross-worktree reuse + "repo_id": repo_id, + "repo_rel_path": repo_rel_path, # New: explicit dual-path tracking "host_path": _host_path, "container_path": _container_path, }, } # Optional LLM enrichment for lexical retrieval: pseudo + tags per micro-chunk - pseudo, tags = ("", []) - try: - pseudo, tags = generate_pseudo_tags(ch.get("text") or "") - if pseudo: - payload["pseudo"] = pseudo - if tags: - payload["tags"] = tags - except Exception: - pass + # Use symbol-aware gating and cached pseudo/tags where possible + needs_pseudo, cached_pseudo, cached_tags = should_process_pseudo_for_chunk( + str(file_path), ch, changed_symbols + ) + pseudo, tags = cached_pseudo, cached_tags + if needs_pseudo: + try: + pseudo, tags = 
generate_pseudo_tags(ch.get("text") or "") + if pseudo or tags: + # Cache the pseudo data for this symbol + symbol_name = ch.get("symbol", "") + if symbol_name: + kind = ch.get("kind", "unknown") + start_line = ch.get("start", 0) + symbol_id = f"{kind}_{symbol_name}_{start_line}" + + if set_cached_pseudo: + set_cached_pseudo(str(file_path), symbol_id, pseudo, tags, file_hash) + except Exception: + # Fall back to cached values (if any) or empty pseudo/tags + pass + # Attach whichever pseudo/tags we ended up with (cached or freshly generated) + if pseudo: + payload["pseudo"] = pseudo + if tags: + payload["tags"] = tags batch_texts.append(info) batch_meta.append(payload) batch_ids.append(hash_id(ch["text"], str(file_path), ch["start"], ch["end"])) @@ -2039,10 +2462,8 @@ def index_repo( if not use_per_repo_collections: if recreate: recreate_collection(client, collection, dim, vector_name) - else: - ensure_collection(client, collection, dim, vector_name) # Ensure useful payload indexes exist (idempotent) - ensure_payload_indexes(client, collection) + ensure_collection_and_indexes_once(client, collection, dim, vector_name) else: print("[multi_repo] Skipping single collection setup - will create per-repo collections during indexing") # Repo tag for filtering: auto-detect from git or folder name @@ -2107,8 +2528,7 @@ def make_point(pid, dense_vec, lex_vec, payload): if _get_collection_for_file: current_collection = _get_collection_for_file(file_path) # Ensure collection exists on first use - ensure_collection(client, current_collection, dim, vector_name) - ensure_payload_indexes(client, current_collection) + ensure_collection_and_indexes_once(client, current_collection, dim, vector_name) else: current_collection = get_collection_name(ws_path) if get_collection_name else "default-collection" @@ -2132,6 +2552,35 @@ def make_point(pid, dense_vec, lex_vec, payload): str(Path(workspace_root).resolve() / per_file_repo), ) + # Derive logical repo identity and repo-relative path 
for cross-worktree reuse. + repo_id: str | None = None + repo_rel_path: str | None = None + try: + if get_workspace_state is not None: + ws_root = os.environ.get("WATCH_ROOT") or os.environ.get("WORKSPACE_PATH") or "/work" + state = get_workspace_state(ws_root, per_file_repo) + lrid = state.get("logical_repo_id") if isinstance(state, dict) else None + if isinstance(lrid, str) and lrid: + repo_id = lrid + try: + fp_resolved = file_path.resolve() + except Exception: + fp_resolved = file_path + try: + ws_base = Path(workspace_root).resolve() + repo_root = ws_base + if per_file_repo: + candidate = ws_base / per_file_repo + if candidate.exists(): + repo_root = candidate + rel = fp_resolved.relative_to(repo_root) + repo_rel_path = rel.as_posix() + except Exception: + repo_rel_path = None + except Exception: + repo_id = None + repo_rel_path = None + # Skip unchanged files if enabled (default) if skip_unchanged: # Prefer local workspace cache to avoid Qdrant lookups @@ -2168,7 +2617,15 @@ def make_point(pid, dense_vec, lex_vec, payload): continue except Exception: pass - prev = get_indexed_file_hash(client, current_collection, str(file_path)) + + # Check existing indexed hash in Qdrant (logical identity when available) + prev = get_indexed_file_hash( + client, + current_collection, + str(file_path), + repo_id=repo_id, + repo_rel_path=repo_rel_path, + ) if prev and file_hash and prev == file_hash: # File exists in Qdrant with same hash - cache it locally for next time try: @@ -2205,6 +2662,35 @@ def make_point(pid, dense_vec, lex_vec, payload): print(f"Skipping unchanged file: {file_path}") continue + # At this point, file content has changed vs previous index; attempt smart reindex when enabled + if _smart_symbol_reindexing_enabled(): + try: + use_smart, smart_reason = should_use_smart_reindexing(str(file_path), file_hash) + if use_smart: + print(f"[SMART_REINDEX] Using smart reindexing for {file_path} ({smart_reason})") + status = process_file_with_smart_reindexing( + 
file_path, + text, + language, + client, + current_collection, + per_file_repo, + model, + vector_name, + ) + if status == "success": + files_indexed += 1 + # Smart path handles point counts internally; skip full reindex for this file + continue + else: + print( + f"[SMART_REINDEX] Smart reindex failed for {file_path} (status={status}), falling back to full reindex" + ) + else: + print(f"[SMART_REINDEX] Using full reindexing for {file_path} ({smart_reason})") + except Exception as e: + print(f"[SMART_REINDEX] Smart reindexing failed, falling back to full reindex: {e}") + # Dedupe per-file by deleting previous points for this path (default) if dedupe: delete_points_by_path(client, current_collection, str(file_path)) @@ -2214,6 +2700,19 @@ def make_point(pid, dense_vec, lex_vec, payload): imports, calls = _get_imports_calls(language, text) last_mod, churn_count, author_count = _git_metadata(file_path) + # Get changed symbols for pseudo processing optimization (reuse existing pattern) + changed_symbols = set() + if get_cached_symbols and set_cached_symbols: + cached_symbols = get_cached_symbols(str(file_path)) + if cached_symbols: + current_symbols = extract_symbols_with_tree_sitter(str(file_path)) + _, changed = compare_symbol_changes(cached_symbols, current_symbols) + # Convert symbol names to IDs for lookup + for symbol_data in current_symbols.values(): + symbol_id = f"{symbol_data['type']}_{symbol_data['name']}_{symbol_data['start_line']}" + if symbol_id in changed: + changed_symbols.add(symbol_id) + # Micro-chunking (token-based) takes precedence; else semantic; else line-based use_micro = os.environ.get("INDEX_MICRO_CHUNKS", "0").lower() in { "1", @@ -2272,6 +2771,13 @@ def make_point(pid, dense_vec, lex_vec, payload): sym = ch.get("symbol") or sym if "symbol_path" in ch and ch.get("symbol_path"): sym_path = ch.get("symbol_path") or sym_path + # Ensure chunks carry symbol metadata so pseudo gating works across all chunking modes + if not ch.get("kind") and kind: 
+ ch["kind"] = kind + if not ch.get("symbol") and sym: + ch["symbol"] = sym + if not ch.get("symbol_path") and sym_path: + ch["symbol_path"] = sym_path # Track both container path (/work mirror) and original host path _cur_path = str(file_path) _host_root = str(os.environ.get("HOST_INDEX_PATH") or "").strip().rstrip("/") @@ -2297,7 +2803,10 @@ def make_point(pid, dense_vec, lex_vec, payload): _rel = _cur_path[len("/work/"):] # Prioritize client path from origin metadata over HOST_INDEX_PATH if _origin_client_path: - _host_path = os.path.realpath(os.path.join(_origin_client_path, _rel)) + _parts = _rel.split("/", 1) + _tail = _parts[1] if len(_parts) > 1 else "" + _base = _origin_client_path.rstrip("/") + _host_path = os.path.realpath(os.path.join(_base, _tail)) if _tail else _base else: _host_path = os.path.realpath(os.path.join(_host_root, _rel)) _container_path = _cur_path @@ -2333,21 +2842,37 @@ def make_point(pid, dense_vec, lex_vec, payload): "last_modified_at": int(last_mod), "churn_count": int(churn_count), "author_count": int(author_count), + # Logical identity for cross-worktree reuse + "repo_id": repo_id, + "repo_rel_path": repo_rel_path, # New: dual-path tracking "host_path": _host_path, "container_path": _container_path, }, } # Optional LLM enrichment for lexical retrieval: pseudo + tags per micro-chunk - pseudo, tags = ("", []) - try: - pseudo, tags = generate_pseudo_tags(ch.get("text") or "") - if pseudo: - payload["pseudo"] = pseudo - if tags: - payload["tags"] = tags - except Exception: - pass + # Use symbol-aware gating and cached pseudo/tags where possible + needs_pseudo, cached_pseudo, cached_tags = should_process_pseudo_for_chunk( + str(file_path), ch, changed_symbols + ) + pseudo, tags = cached_pseudo, cached_tags + if needs_pseudo: + try: + pseudo, tags = generate_pseudo_tags(ch.get("text") or "") + if pseudo or tags: + symbol_name = ch.get("symbol", "") + if symbol_name: + kind = ch.get("kind", "unknown") + start_line = ch.get("start", 0) + 
symbol_id = f"{kind}_{symbol_name}_{start_line}" + if set_cached_pseudo: + set_cached_pseudo(str(file_path), symbol_id, pseudo, tags, file_hash) + except Exception: + pass + if pseudo: + payload["pseudo"] = pseudo + if tags: + payload["tags"] = tags batch_texts.append(info) batch_meta.append(payload) # Track per-file latest hash once we add the first chunk to any batch @@ -2447,6 +2972,30 @@ def make_point(pid, dense_vec, lex_vec, payload): set_cached_file_hash(_p, _h, per_file_repo) except Exception: continue + + # NEW: Update symbol cache for files that were processed + if set_cached_symbols and _smart_symbol_reindexing_enabled(): + try: + # Process files that had chunks and extract/update their symbol cache + processed_files = set(str(Path(_p).resolve()) for _p in batch_file_hashes.keys()) + + for file_path_str in processed_files: + try: + # Extract current symbols for this file + current_symbols = extract_symbols_with_tree_sitter(file_path_str) + if current_symbols: + # Generate file hash for this file + with open(file_path_str, 'r', encoding='utf-8') as f: + content = f.read() + file_hash = hashlib.sha1(content.encode('utf-8', errors='ignore')).hexdigest() + + # Save symbol cache + set_cached_symbols(file_path_str, current_symbols, file_hash) + print(f"[SYMBOL_CACHE] Updated symbols for {Path(file_path_str).name}: {len(current_symbols)} symbols") + except Exception as e: + print(f"[SYMBOL_CACHE] Failed to update symbols for {Path(_p).name}: {e}") + except Exception as e: + print(f"[SYMBOL_CACHE] Symbol cache update failed: {e}") except Exception: pass @@ -2495,6 +3044,385 @@ def make_point(pid, dense_vec, lex_vec, payload): print(f"[ERROR] Traceback: {traceback.format_exc()}") +def process_file_with_smart_reindexing( + file_path, + text: str, + language: str, + client: QdrantClient, + current_collection: str, + per_file_repo, + model: TextEmbedding, + vector_name: str | None, +) -> str: + """Smart, chunk-level reindexing for a single file. 
+ + Rebuilds all points for the file with *accurate* line numbers while: + - Reusing existing embeddings/lexical vectors for unchanged chunks (by code content), and + - Re-embedding only for changed chunks. + + Symbol cache is used to gate pseudo/tag generation, but embedding reuse is decided + at the chunk level by matching previous chunk code. + + TODO(logical_repo): consider loading existing points by logical identity + (repo_id + repo_rel_path) instead of metadata.path so worktrees/branches + sharing a repo can reuse embeddings across slugs, not just per-path. + """ + try: + print(f"[SMART_REINDEX] Processing {file_path} with chunk-level reindexing") + + # Normalize path / types + try: + fp = str(file_path) + except Exception: + fp = str(file_path) + try: + if not isinstance(file_path, Path): + file_path = Path(fp) + except Exception: + file_path = Path(fp) + + # Compute current file hash + file_hash = hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest() + + # Extract current symbols for diffing (dict) and for chunk mapping (List[_Sym]) + symbol_meta = extract_symbols_with_tree_sitter(fp) + if not symbol_meta: + print(f"[SMART_REINDEX] No symbols found in {file_path}, falling back to full reindex") + return "failed" + + # Use the dict-style symbol_meta for cache diffing + cached_symbols = get_cached_symbols(fp) if get_cached_symbols else {} + unchanged_symbols: list[str] = [] + changed_symbols: list[str] = [] + if cached_symbols and compare_symbol_changes: + try: + unchanged_symbols, changed_symbols = compare_symbol_changes( + cached_symbols, symbol_meta + ) + except Exception: + # On failure, treat everything as changed + unchanged_symbols = [] + changed_symbols = list(symbol_meta.keys()) + else: + changed_symbols = list(symbol_meta.keys()) + changed_set = set(changed_symbols) + + # Load existing points for this file (for embedding reuse) + existing_points = [] + try: + filt = models.Filter( + must=[ + models.FieldCondition( + key="metadata.path", 
match=models.MatchValue(value=fp) + ) + ] + ) + next_offset = None + while True: + pts, next_offset = client.scroll( + collection_name=current_collection, + scroll_filter=filt, + with_payload=True, + with_vectors=True, + limit=256, + offset=next_offset, + ) + if not pts: + break + existing_points.extend(pts) + if next_offset is None: + break + except Exception as e: + print(f"[SMART_REINDEX] Failed to load existing points for {file_path}: {e}") + existing_points = [] + + # Index existing points by (symbol_id, code) for reuse + points_by_code: dict[tuple[str, str], list[models.Record]] = {} + try: + for rec in existing_points: + payload = rec.payload or {} + md = payload.get("metadata") or {} + code_text = md.get("code") or "" + kind = md.get("kind") or "" + sym_name = md.get("symbol") or "" + start_line = md.get("start_line") or 0 + symbol_id = ( + f"{kind}_{sym_name}_{start_line}" + if kind and sym_name and start_line + else "" + ) + key = (symbol_id, code_text) if symbol_id else ("", code_text) + points_by_code.setdefault(key, []).append(rec) + except Exception: + points_by_code = {} + + # Chunk current file using the same strategy as normal indexing + CHUNK_LINES = int(os.environ.get("INDEX_CHUNK_LINES", "120") or 120) + CHUNK_OVERLAP = int(os.environ.get("INDEX_CHUNK_OVERLAP", "20") or 20) + use_micro = os.environ.get("INDEX_MICRO_CHUNKS", "0").lower() in { + "1", + "true", + "yes", + "on", + } + use_semantic = os.environ.get("INDEX_SEMANTIC_CHUNKS", "1").lower() in { + "1", + "true", + "yes", + "on", + } + + if use_micro: + chunks = chunk_by_tokens(text) + symbol_spans: list[_Sym] = _extract_symbols(language, text) + elif use_semantic: + chunks = chunk_semantic(text, language, CHUNK_LINES, CHUNK_OVERLAP) + symbol_spans = _extract_symbols(language, text) + else: + chunks = chunk_lines(text, CHUNK_LINES, CHUNK_OVERLAP) + symbol_spans = _extract_symbols(language, text) + + # Prepare collections for reused vs newly embedded points + reused_points: 
list[models.PointStruct] = [] + embed_texts: list[str] = [] + embed_payloads: list[dict] = [] + embed_ids: list[int] = [] + embed_lex: list[list[float]] = [] + + imports, calls = _get_imports_calls(language, text) + last_mod, churn_count, author_count = _git_metadata(file_path) + + for ch in chunks: + info = build_information( + language, + file_path, + ch["start"], + ch["end"], + ch["text"].splitlines()[0] if ch["text"] else "", + ) + # Use span-style symbols for mapping chunks to symbols + kind, sym, sym_path = _choose_symbol_for_chunk( + ch["start"], ch["end"], symbol_spans + ) + # Prefer embedded symbol metadata from semantic chunker when present + if "kind" in ch and ch.get("kind"): + kind = ch.get("kind") or kind + if "symbol" in ch and ch.get("symbol"): + sym = ch.get("symbol") or sym + if "symbol_path" in ch and ch.get("symbol_path"): + sym_path = ch.get("symbol_path") or sym_path + # Ensure chunks carry symbol metadata so pseudo gating works + if not ch.get("kind") and kind: + ch["kind"] = kind + if not ch.get("symbol") and sym: + ch["symbol"] = sym + if not ch.get("symbol_path") and sym_path: + ch["symbol_path"] = sym_path + + # Basic metadata payload + _cur_path = str(file_path) + _host_root = str(os.environ.get("HOST_INDEX_PATH") or "").strip().rstrip("/") + _host_path = None + _container_path = None + _origin_client_path = None + try: + if _cur_path.startswith("/work/"): + _parts = _cur_path[6:].split("/") + if len(_parts) >= 2: + _repo_name = _parts[0] + _workspace_path = f"/work/{_repo_name}" + _origin_client_path = _get_host_path_from_origin( + _workspace_path, _repo_name + ) + except Exception: + pass + try: + if _cur_path.startswith("/work/") and (_host_root or _origin_client_path): + _rel = _cur_path[len("/work/") :] + if _origin_client_path: + _host_path = os.path.realpath( + os.path.join(_origin_client_path, _rel) + ) + else: + _host_path = os.path.realpath(os.path.join(_host_root, _rel)) + _container_path = _cur_path + else: + _host_path = 
_cur_path + if ( + (_host_root or _origin_client_path) + and _cur_path.startswith( + ((_origin_client_path or _host_root) + "/") + ) + ): + _rel = _cur_path[ + len((_origin_client_path or _host_root)) + 1 : + ] + _container_path = "/work/" + _rel + except Exception: + _host_path = _cur_path + _container_path = ( + _cur_path if _cur_path.startswith("/work/") else None + ) + + payload = { + "document": info, + "information": info, + "metadata": { + "path": str(file_path), + "path_prefix": str(file_path.parent), + "ext": str(file_path.suffix).lstrip(".").lower(), + "language": language, + "kind": kind, + "symbol": sym, + "symbol_path": sym_path or "", + "repo": per_file_repo, + "start_line": ch["start"], + "end_line": ch["end"], + "code": ch["text"], + "file_hash": file_hash, + "imports": imports, + "calls": calls, + "ingested_at": int(time.time()), + "last_modified_at": int(last_mod), + "churn_count": int(churn_count), + "author_count": int(author_count), + "host_path": _host_path, + "container_path": _container_path, + }, + } + + # Pseudo / tags with symbol-aware gating + needs_pseudo, cached_pseudo, cached_tags = should_process_pseudo_for_chunk( + fp, ch, changed_set + ) + pseudo, tags = cached_pseudo, cached_tags + if needs_pseudo: + try: + pseudo, tags = generate_pseudo_tags(ch.get("text") or "") + if pseudo or tags: + symbol_name = ch.get("symbol", "") + if symbol_name: + k = ch.get("kind", "unknown") + start_line = ch.get("start", 0) + sid = f"{k}_{symbol_name}_{start_line}" + if set_cached_pseudo: + set_cached_pseudo(fp, sid, pseudo, tags, file_hash) + except Exception: + pass + if pseudo: + payload["pseudo"] = pseudo + if tags: + payload["tags"] = tags + + # Decide whether we can reuse an existing embedding for this chunk + code_text = ch.get("text") or "" + chunk_symbol_id = "" + if sym and kind: + chunk_symbol_id = f"{kind}_{sym}_{ch['start']}" + + reuse_key = (chunk_symbol_id, code_text) + fallback_key = ("", code_text) + reused_rec = None + bucket = 
points_by_code.get(reuse_key) or points_by_code.get(fallback_key) + if bucket: + try: + reused_rec = bucket.pop() + if not bucket: + # Clean up empty bucket + points_by_code.pop(reuse_key, None) + points_by_code.pop(fallback_key, None) + except Exception: + reused_rec = None + + if reused_rec is not None: + try: + vec = reused_rec.vector + pid = hash_id(code_text, fp, ch["start"], ch["end"]) + reused_points.append( + models.PointStruct(id=pid, vector=vec, payload=payload) + ) + continue + except Exception: + # Fall through to re-embedding path + pass + + # Need to embed this chunk + embed_texts.append(info) + embed_payloads.append(payload) + embed_ids.append( + hash_id(code_text, fp, ch["start"], ch["end"]) + ) + aug_lex_text = (code_text or "") + ( + " " + pseudo if pseudo else "" + ) + (" " + " ".join(tags) if tags else "") + embed_lex.append(_lex_hash_vector_text(aug_lex_text)) + + # Embed changed/new chunks and build final point set + new_points: list[models.PointStruct] = [] + if embed_texts: + vectors = embed_batch(model, embed_texts) + for pid, v, lx, pl in zip( + embed_ids, + vectors, + embed_lex, + embed_payloads, + ): + if vector_name: + vecs = {vector_name: v, LEX_VECTOR_NAME: lx} + try: + if os.environ.get("REFRAG_MODE", "").strip().lower() in { + "1", + "true", + "yes", + "on", + }: + vecs[MINI_VECTOR_NAME] = project_mini( + list(v), MINI_VEC_DIM + ) + except Exception: + pass + new_points.append( + models.PointStruct(id=pid, vector=vecs, payload=pl) + ) + else: + new_points.append( + models.PointStruct(id=pid, vector=v, payload=pl) + ) + + all_points = reused_points + new_points + + # Replace existing points for this file with the new set + try: + delete_points_by_path(client, current_collection, fp) + except Exception as e: + print(f"[SMART_REINDEX] Failed to delete old points for {file_path}: {e}") + + if all_points: + upsert_points(client, current_collection, all_points) + + # Update caches with the new state + try: + if set_cached_symbols: + 
set_cached_symbols(fp, symbol_meta, file_hash) + except Exception as e: + print(f"[SMART_REINDEX] Failed to update symbol cache for {file_path}: {e}") + try: + if set_cached_file_hash: + set_cached_file_hash(fp, file_hash, per_file_repo) + except Exception: + pass + + print( + f"[SMART_REINDEX] Completed {file_path}: chunks={len(chunks)}, reused_points={len(reused_points)}, embedded_points={len(new_points)}" + ) + return "success" + + except Exception as e: + print(f"[SMART_REINDEX] Failed to process {file_path}: {e}") + import traceback + print(f"[SMART_REINDEX] Traceback: {traceback.format_exc()}") + return "failed" + def main(): parser = argparse.ArgumentParser( description="Index code into Qdrant with metadata for MCP code search." diff --git a/scripts/ingest_history.py b/scripts/ingest_history.py index 99645386..10079253 100644 --- a/scripts/ingest_history.py +++ b/scripts/ingest_history.py @@ -7,6 +7,9 @@ from typing import List, Dict, Any import re import time +import json +import sys +from pathlib import Path from qdrant_client import QdrantClient, models from fastembed import TextEmbedding @@ -17,6 +20,9 @@ API_KEY = os.environ.get("QDRANT_API_KEY") REPO_NAME = os.environ.get("REPO_NAME", "workspace") +ROOT_DIR = Path(__file__).resolve().parent.parent +if str(ROOT_DIR) not in sys.path: + sys.path.insert(0, str(ROOT_DIR)) from scripts.utils import sanitize_vector_name as _sanitize_vector_name @@ -108,6 +114,119 @@ def stable_id(commit_id: str) -> int: return int(h[:16], 16) +def _commit_summary_enabled() -> bool: + """Check REFRAG_COMMIT_DESCRIBE to decide if commit summarization is enabled. + + This is an opt-in feature: set REFRAG_COMMIT_DESCRIBE=1 (and enable the decoder) + to generate per-commit lineage summaries at ingest time. 
+ """ + try: + return str(os.environ.get("REFRAG_COMMIT_DESCRIBE", "0")).strip().lower() in { + "1", + "true", + "yes", + "on", + } + except Exception: + return False + + +def generate_commit_summary(md: Dict[str, Any], diff_text: str) -> tuple[str, list[str], list[str]]: + """Best-effort: ask local decoder to summarize a git commit. + + Returns (goal, symbols, tags). On failure returns ("", [], []). + + The summary is designed to be compact and search-friendly, mirroring the + Context Lineage goals: high-level intent, key symbols, and short tags. + """ + goal: str = "" + symbols: list[str] = [] + tags: list[str] = [] + if not _commit_summary_enabled() or not diff_text.strip(): + return goal, symbols, tags + try: + from scripts.refrag_llamacpp import ( # type: ignore + LlamaCppRefragClient, + is_decoder_enabled, + get_runtime_kind, + ) + + if not is_decoder_enabled(): + return "", [], [] + runtime = get_runtime_kind() + commit_id = str(md.get("commit_id") or "") + message = str(md.get("message") or "") + files = md.get("files") or [] + try: + files_str = "\n".join(str(f) for f in files[:50]) + except Exception: + files_str = "" + # Truncate diff text to keep summarization fast/token-efficient + try: + max_chars = int(os.environ.get("COMMIT_SUMMARY_DIFF_CHARS", "6000") or 6000) + except Exception: + max_chars = 6000 + body = diff_text[:max_chars] + + if runtime == "glm": + from scripts.refrag_glm import GLMRefragClient # type: ignore + + client = GLMRefragClient() + prompt = ( + "You are a JSON-only function that summarizes git commits for search enrichment.\n" + "Respond with a single JSON object and nothing else (no prose, no markdown).\n" + "Exact format: {\"goal\": string (<=200 chars), \"symbols\": [1-6 short strings], \"tags\": [3-6 short strings]}.\n" + f"Commit id: {commit_id}\n" + f"Message:\n{message}\n" + f"Files:\n{files_str}\n" + "Diff:\n" + body + ) + out = client.generate_with_soft_embeddings( + prompt=prompt, + 
max_tokens=int(os.environ.get("COMMIT_SUMMARY_MAX_TOKENS", "128") or 128), + temperature=float(os.environ.get("COMMIT_SUMMARY_TEMPERATURE", "0.10") or 0.10), + top_p=float(os.environ.get("COMMIT_SUMMARY_TOP_P", "0.9") or 0.9), + stop=["\n\n"], + force_json=True, + ) + else: + client = LlamaCppRefragClient() + prompt = ( + "You summarize git commits for search enrichment.\n" + "Return strictly JSON: {\"goal\": string (<=200 chars), \"symbols\": [1-6 short strings], \"tags\": [3-6 short strings]}.\n" + f"Commit id: {commit_id}\n" + f"Message:\n{message}\n" + f"Files:\n{files_str}\n" + "Diff:\n" + body + ) + out = client.generate_with_soft_embeddings( + prompt=prompt, + max_tokens=int(os.environ.get("COMMIT_SUMMARY_MAX_TOKENS", "128") or 128), + temperature=float(os.environ.get("COMMIT_SUMMARY_TEMPERATURE", "0.10") or 0.10), + top_k=int(os.environ.get("COMMIT_SUMMARY_TOP_K", "30") or 30), + top_p=float(os.environ.get("COMMIT_SUMMARY_TOP_P", "0.9") or 0.9), + stop=["\n\n"], + ) + import json as _json + try: + obj = _json.loads(out) + if isinstance(obj, dict): + g = obj.get("goal") + s = obj.get("symbols") + t = obj.get("tags") + if isinstance(g, str): + goal = g.strip()[:200] + if isinstance(s, list): + symbols = [str(x).strip() for x in s if str(x).strip()][:6] + if isinstance(t, list): + tags = [str(x).strip() for x in t if str(x).strip()][:6] + except Exception: + pass + except Exception: + return "", [], [] + return goal, symbols, tags + + def build_text( md: Dict[str, Any], max_files: int = 200, include_body: bool = True ) -> str: @@ -120,6 +239,107 @@ def build_text( return (head + "\n\nFiles:\n" + files_part).strip() +def _ingest_from_manifest( + manifest_path: str, + model: TextEmbedding, + client: QdrantClient, + vec_name: str, + include_body: bool, + per_batch: int, +) -> int: + try: + with open(manifest_path, "r", encoding="utf-8") as f: + data = json.load(f) + except Exception as e: + print(f"Failed to read manifest {manifest_path}: {e}") + return 0 + + 
commits = data.get("commits") or [] + if not commits: + print("No commits in manifest.") + return 0 + + points: List[models.PointStruct] = [] + count = 0 + for c in commits: + try: + if not isinstance(c, dict): + continue + commit_id = str(c.get("commit_id") or "").strip() + if not commit_id: + continue + author_name = str(c.get("author_name") or "") + authored_date = str(c.get("authored_date") or "") + message = str(c.get("message") or "") + files = c.get("files") or [] + if not isinstance(files, list): + files = [] + md: Dict[str, Any] = { + "commit_id": commit_id, + "author_name": author_name, + "authored_date": authored_date, + "message": message, + "files": files, + } + text = build_text(md, include_body=include_body) + try: + vec = next(model.embed([text])).tolist() + except Exception: + continue + + goal: str = "" + sym: List[str] = [] + tgs: List[str] = [] + diff_text = str(c.get("diff") or "") + if diff_text.strip(): + try: + goal, sym, tgs = generate_commit_summary(md, diff_text) + except Exception: + goal, sym, tgs = "", [], [] + + md_payload: Dict[str, Any] = { + "language": "git", + "kind": "git_message", + "symbol": commit_id, + "symbol_path": commit_id, + "repo": REPO_NAME, + "commit_id": commit_id, + "author_name": author_name, + "authored_date": authored_date, + "message": message, + "files": files, + "path": ".git", + "path_prefix": ".git", + "ingested_at": int(time.time()), + } + if goal: + md_payload["lineage_goal"] = goal + if sym: + md_payload["lineage_symbols"] = sym + if tgs: + md_payload["lineage_tags"] = tgs + + payload = { + "document": (message.splitlines()[0] if message else commit_id), + "information": text[:512], + "metadata": md_payload, + } + pid = stable_id(commit_id) + pt = models.PointStruct(id=pid, vector={vec_name: vec}, payload=payload) + points.append(pt) + count += 1 + if len(points) >= per_batch: + client.upsert(collection_name=COLLECTION, points=points) + points.clear() + except Exception: + continue + + if points: + 
client.upsert(collection_name=COLLECTION, points=points) + print(f"Ingested {count} commits into {COLLECTION} from manifest {manifest_path}.") + return count + + def main(): ap = argparse.ArgumentParser( description="Ingest Git history into Qdrant deterministically" @@ -146,6 +366,12 @@ def main(): default="origin", help="Remote to fetch from if no local HEAD is present", ) + ap.add_argument( + "--manifest-json", + type=str, + default=None, + help="Path to git history manifest JSON produced by upload client", + ) ap.add_argument( "--fetch-depth", type=int, @@ -158,6 +384,17 @@ def main(): vec_name = _sanitize_vector_name(MODEL_NAME) client = QdrantClient(url=QDRANT_URL, api_key=API_KEY or None) + if args.manifest_json: + _ingest_from_manifest( + args.manifest_json, + model, + client, + vec_name, + args.include_body, + args.per_batch, + ) + return + commits = list_commits(args) if not commits: print("No commits matched filters.") @@ -168,6 +405,35 @@ def main(): md = commit_metadata(sha) text = build_text(md, include_body=args.include_body) vec = next(model.embed([text])).tolist() + goal, sym, tgs = "", [], [] + try: + diff = run(f"git show --stat --patch --unified=3 {sha}") + goal, sym, tgs = generate_commit_summary(md, diff) + except Exception: + pass + + md_payload: Dict[str, Any] = { + "language": "git", + "kind": "git_message", + "symbol": md["commit_id"], + "symbol_path": md["commit_id"], + "repo": REPO_NAME, + "commit_id": md["commit_id"], + "author_name": md["author_name"], + "authored_date": md["authored_date"], + "message": md["message"], + "files": md["files"], + "path": ".git", + "path_prefix": ".git", + "ingested_at": int(time.time()), + } + if goal: + md_payload["lineage_goal"] = goal + if sym: + md_payload["lineage_symbols"] = sym + if tgs: + md_payload["lineage_tags"] = tgs + payload = { "document": ( md.get("message", "").splitlines()[0] @@ -175,21 +441,7 @@ def main(): else md["commit_id"] ), "information": text[:512], - "metadata": { - "language": 
"git", - "kind": "git_message", - "symbol": md["commit_id"], - "symbol_path": md["commit_id"], - "repo": REPO_NAME, - "commit_id": md["commit_id"], - "author_name": md["author_name"], - "authored_date": md["authored_date"], - "message": md["message"], - "files": md["files"], - "path": ".git", - "path_prefix": ".git", - "ingested_at": int(time.time()), - }, + "metadata": md_payload, } pid = stable_id(md["commit_id"]) # deterministic per-commit point = models.PointStruct(id=pid, vector={vec_name: vec}, payload=payload) diff --git a/scripts/mcp_indexer_server.py b/scripts/mcp_indexer_server.py index 54a63292..8e20a7b0 100644 --- a/scripts/mcp_indexer_server.py +++ b/scripts/mcp_indexer_server.py @@ -1963,6 +1963,16 @@ def _to_str_list(x): if include_snippet: compact = False + # Default behavior: exclude commit-history docs (which use path=".git") from + # generic repo_search calls, unless the caller explicitly asks for git + # content. This prevents normal code queries from surfacing commit-index + # points as if they were source files. + if (not language or language.lower() != "git") and ( + not kind or kind.lower() != "git_message" + ): + if ".git" not in not_globs: + not_globs.append(".git") + # Accept top-level alias `queries` as a drop-in for `query` # Many clients send queries=[...] instead of query=[...] if kwargs and "queries" in kwargs and kwargs.get("queries") is not None: @@ -2363,7 +2373,11 @@ def _read_snip(args): el = int(item.get("end_line") or 0) if not path or not sl: return (i, "") - raw_path = str(path) + raw_path = ( + str(item.get("container_path")) + if item.get("container_path") + else str(path) + ) p = ( raw_path if os.path.isabs(raw_path) @@ -2759,11 +2773,175 @@ async def search_importers_for( ) +@mcp.tool() +async def search_commits_for( + query: Any = None, + path: Any = None, + collection: Any = None, + limit: Any = None, + max_points: Any = None, +) -> Dict[str, Any]: + """Search git commit history indexed in Qdrant. 
+ + What it does: + - Queries commit documents ingested by scripts/ingest_history.py + - Filters by optional file path (metadata.files contains path) + + Parameters: + - query: str or list[str]; matched lexically against commit message/text + - path: str (optional). Relative path under /work; filters commits that touched this file + - collection: str (optional). Defaults to env/WS collection + - limit: int (optional, default 10). Max commits to return + - max_points: int (optional). Safety cap on scanned points (default 1000) + + Returns: + - {"ok": true, "results": [{"commit_id", "author_name", "authored_date", "message", "files"}, ...], "scanned": int} + - On error: {"ok": false, "error": "..."} + """ + # Normalize inputs + # query may be a string ("ctx script build") or a list of terms; + # in both cases we normalize to lowercase tokens and require all of + # them to appear somewhere in the composite text. + q_terms: list[str] = [] + if isinstance(query, (list, tuple)): + for x in query: + for tok in str(x).strip().split(): + if tok.strip(): + q_terms.append(tok.strip().lower()) + elif query is not None: + qs = str(query).strip() + if qs: + for tok in qs.split(): + if tok.strip(): + q_terms.append(tok.strip().lower()) + p = str(path or "").strip() + coll = str(collection or "").strip() or _default_collection() + try: + lim = int(limit) if limit not in (None, "") else 10 + except (ValueError, TypeError): + lim = 10 + try: + mcap = int(max_points) if max_points not in (None, "") else 1000 + except (ValueError, TypeError): + mcap = 1000 + + try: + from qdrant_client import QdrantClient # type: ignore + from qdrant_client import models as qmodels # type: ignore + + client = QdrantClient( + url=QDRANT_URL, + api_key=os.environ.get("QDRANT_API_KEY"), + timeout=float(os.environ.get("QDRANT_TIMEOUT", "20") or 20), + ) + + # Restrict to commit documents ingested by ingest_history.py + filt = qmodels.Filter( + must=[ + qmodels.FieldCondition( + key="metadata.language", 
match=qmodels.MatchValue(value="git") + ), + qmodels.FieldCondition( + key="metadata.kind", match=qmodels.MatchValue(value="git_message") + ), + ] + ) + + page = None + scanned = 0 + out: list[dict[str, Any]] = [] + seen_ids: set[str] = set() + while scanned < mcap and len(seen_ids) < lim: + sc, page = await asyncio.to_thread( + lambda: client.scroll( + collection_name=coll, + with_payload=True, + with_vectors=False, + limit=200, + offset=page, + scroll_filter=filt, + ) + ) + if not sc: + break + for pt in sc: + scanned += 1 + if scanned > mcap: + break + payload = getattr(pt, "payload", {}) or {} + md = payload.get("metadata") or {} + msg = str(md.get("message") or "") + info = str(payload.get("information") or "") + files = md.get("files") or [] + try: + files_list = [str(f) for f in files] + except Exception: + files_list = [] + # Optional lineage-style metadata from ingest_history (GLM/decoder-backed) + lg = md.get("lineage_goal") + if isinstance(lg, str): + lineage_goal = lg.strip() + else: + lineage_goal = "" + ls_raw = md.get("lineage_symbols") or [] + if isinstance(ls_raw, list): + lineage_symbols = [ + str(x).strip() for x in ls_raw if str(x).strip() + ][:6] + else: + lineage_symbols = [] + lt_raw = md.get("lineage_tags") or [] + if isinstance(lt_raw, list): + lineage_tags = [ + str(x).strip() for x in lt_raw if str(x).strip() + ][:6] + else: + lineage_tags = [] + # Build a composite lowercase text blob for simple lexical matching + lineage_text_parts = [] + if lineage_goal: + lineage_text_parts.append(lineage_goal) + if lineage_symbols: + lineage_text_parts.extend(lineage_symbols) + if lineage_tags: + lineage_text_parts.extend(lineage_tags) + text_l = (msg + "\n" + info + "\n" + " ".join(lineage_text_parts)).lower() + if q_terms and not all(t in text_l for t in q_terms): + continue + if p: + # Require the path substring to appear in at least one touched file + if not any(p in f for f in files_list): + continue + cid = md.get("commit_id") or 
md.get("symbol") + scid = str(cid) if cid is not None else "" + if not scid or scid in seen_ids: + continue + seen_ids.add(scid) + out.append( + { + "commit_id": cid, + "author_name": md.get("author_name"), + "authored_date": md.get("authored_date"), + "message": msg.splitlines()[0] if msg else "", + "files": files_list, + "lineage_goal": lineage_goal, + "lineage_symbols": lineage_symbols, + "lineage_tags": lineage_tags, + } + ) + if len(seen_ids) >= lim: + break + return {"ok": True, "results": out, "scanned": scanned, "collection": coll} + except Exception as e: + return {"ok": False, "error": str(e), "collection": coll} + + @mcp.tool() async def change_history_for_path( path: Any, collection: Any = None, max_points: Any = None, + include_commits: Any = None, ) -> Dict[str, Any]: """Summarize recent change metadata for a file path from the index. @@ -2771,6 +2949,8 @@ async def change_history_for_path( - path: str. Relative path under /work. - collection: str (optional). Defaults to env/WS default. - max_points: int (optional). Safety cap on scanned points. + - include_commits: bool (optional). If true, attach a small list of recent commits + touching this path based on the commit index. Returns: - {"ok": true, "summary": {...}} or {"ok": false, "error": "..."}. 
@@ -2783,6 +2963,14 @@ async def change_history_for_path( mcap = int(max_points) if max_points not in (None, "") else 200 except (ValueError, TypeError): mcap = 200 + # Treat include_commits as a loose boolean flag + inc_commits = False + if include_commits not in (None, ""): + try: + inc_commits = str(include_commits).strip().lower() in {"1", "true", "yes", "on"} + except Exception: + inc_commits = False + try: from qdrant_client import QdrantClient # type: ignore from qdrant_client import models as qmodels # type: ignore @@ -2836,7 +3024,7 @@ async def change_history_for_path( total += 1 if total >= mcap: break - summary = { + summary: Dict[str, Any] = { "path": p, "points_scanned": total, "distinct_hashes": len(hashes), @@ -2846,6 +3034,30 @@ async def change_history_for_path( "ingested_max": max(ingested) if ingested else None, "churn_count_max": max(churns) if churns else None, } + if inc_commits: + try: + commits = await search_commits_for( + query=None, + path=p, + collection=coll, + limit=10, + max_points=1000, + ) + if isinstance(commits, dict) and commits.get("ok"): + raw = commits.get("results") or [] + seen: set[str] = set() + uniq: list[dict[str, Any]] = [] + for c in raw: + cid = c.get("commit_id") if isinstance(c, dict) else None + scid = str(cid) if cid is not None else "" + if not scid or scid in seen: + continue + seen.add(scid) + uniq.append(c) + summary["commits"] = uniq + except Exception: + # Best-effort: change-history summary is still useful without commit details + pass return {"ok": True, "summary": summary} except Exception as e: return {"ok": False, "error": str(e), "path": p} @@ -4433,6 +4645,7 @@ def _ca_prepare_filters_and_retrieve( ".kiro/", "node_modules/", ".git/", + ".git", ] def _variants(p: str) -> list[str]: @@ -5509,12 +5722,14 @@ def _read_span_snippet(span: Dict[str, Any]) -> str: return "" try: path = str(span.get("path") or "") + container_path = str(span.get("container_path") or "") sline = int(span.get("start_line") or 
0) eline = int(span.get("end_line") or 0) - if not path or sline <= 0: + if not (path or container_path) or sline <= 0: span["_ident_snippet"] = "" return "" - fp = path + raw_path = container_path or path + fp = raw_path if not os.path.isabs(fp): fp = os.path.join("/work", fp) realp = os.path.realpath(fp) diff --git a/scripts/mcp_memory_server.py b/scripts/mcp_memory_server.py index 6777f16a..6644fe07 100644 --- a/scripts/mcp_memory_server.py +++ b/scripts/mcp_memory_server.py @@ -241,7 +241,24 @@ def _ensure_collection(name: str): VECTOR_NAME: models.VectorParams(size=int(dense_dim or 768), distance=models.Distance.COSINE), LEX_VECTOR_NAME: models.VectorParams(size=LEX_VECTOR_DIM, distance=models.Distance.COSINE), } + + # Add mini vector for ReFRAG mode (same logic as ingest_code.py) + try: + if os.environ.get("REFRAG_MODE", "").strip().lower() in { + "1", "true", "yes", "on" + }: + mini_vector_name = os.environ.get("MINI_VECTOR_NAME", "mini") + mini_vec_dim = int(os.environ.get("MINI_VEC_DIM", "64")) + vectors_cfg[mini_vector_name] = models.VectorParams( + size=mini_vec_dim, + distance=models.Distance.COSINE, + ) + except Exception: + pass + client.create_collection(collection_name=name, vectors_config=vectors_cfg) + vector_names = list(vectors_cfg.keys()) + print(f"[MEMORY_SERVER] Created collection '{name}' with vectors: {vector_names}") return True diff --git a/scripts/memory_restore.py b/scripts/memory_restore.py index cacddeda..c2f4c01e 100644 --- a/scripts/memory_restore.py +++ b/scripts/memory_restore.py @@ -98,7 +98,8 @@ def restore_memories( embedding_model_name: Optional[str] = None, vector_name: str = "memory", batch_size: int = 100, - skip_existing: bool = True + skip_existing: bool = True, + skip_collection_creation: bool = False ) -> Dict[str, Any]: """ Restore memories from backup file to Qdrant collection. 
@@ -111,6 +112,7 @@ def restore_memories( vector_name: Name for the memory vector in collection batch_size: Number of memories to upload per batch skip_existing: Skip memories that already exist in collection + skip_collection_creation: Skip collection creation (useful when collection is already configured) Returns: Dict with restore statistics @@ -166,8 +168,18 @@ def restore_memories( embedding_model = None print(f"Using vectors from backup, dimension: {vector_dimension}") - # Ensure collection exists - ensure_collection_exists(client, collection_name, vector_dimension, vector_name) + # Ensure collection exists (unless skipped) + if not skip_collection_creation: + ensure_collection_exists(client, collection_name, vector_dimension, vector_name) + else: + print(f"Skipping collection creation for '{collection_name}' (as requested)") + + # Verify collection actually exists when skipping creation + try: + client.get_collection(collection_name) + print(f"Confirmed collection '{collection_name}' exists") + except Exception: + raise RuntimeError(f"Collection '{collection_name}' does not exist but creation was skipped") # Check for existing memories if skip_existing is True existing_ids = set() @@ -196,7 +208,17 @@ def restore_memories( batch_points = [] for memory in batch: - memory_id = memory.get("id", "") + raw_id = memory.get("id", "") + + # Qdrant HTTP API expects point IDs to be either an unsigned integer + # or a UUID string. Backups store IDs as strings, so we convert + # purely numeric IDs back to integers to match the original type. 
+ memory_id = raw_id + try: + if isinstance(raw_id, str) and raw_id.isdigit(): + memory_id = int(raw_id) + except Exception: + memory_id = raw_id # Skip if already exists if skip_existing and memory_id in existing_ids: @@ -327,6 +349,12 @@ def main(): help="Show backup file information without restoring" ) + parser.add_argument( + "--skip-collection-creation", + action="store_true", + help="Skip collection creation (useful when collection is already configured by other processes)" + ) + args = parser.parse_args() try: @@ -361,7 +389,8 @@ def main(): embedding_model_name=args.embedding_model, vector_name=args.vector_name, batch_size=args.batch_size, - skip_existing=not args.no_skip_existing + skip_existing=not args.no_skip_existing, + skip_collection_creation=args.skip_collection_creation ) if result["success"]: diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 449aa0bc..a8e8a5be 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -20,6 +20,9 @@ import tempfile import logging import argparse +import subprocess +import shlex +import re from pathlib import Path, PurePosixPath from typing import Dict, List, Any, Optional, Tuple from datetime import datetime @@ -44,6 +47,271 @@ import scripts.ingest_code as idx +def _find_git_root(start: Path) -> Optional[Path]: + """Best-effort detection of the git repository root for a workspace. + + Walks up from the given path looking for a .git directory. Returns None if + no repo is found or git metadata is unavailable. 
+ """ + try: + cur = start.resolve() + except Exception: + cur = start + try: + for p in [cur] + list(cur.parents): + try: + if (p / ".git").exists(): + return p + except Exception: + continue + except Exception: + return None + return None + + +def _compute_logical_repo_id(workspace_path: str) -> str: + try: + p = Path(workspace_path).resolve() + except Exception: + p = Path(workspace_path) + + try: + r = subprocess.run( + ["git", "-C", str(p), "rev-parse", "--git-common-dir"], + capture_output=True, + text=True, + ) + raw = (r.stdout or "").strip() + if r.returncode == 0 and raw: + common = Path(raw) + if not common.is_absolute(): + base = p if p.is_dir() else p.parent + common = base / common + key = str(common.resolve()) + prefix = "git:" + else: + raise RuntimeError + except Exception: + key = str(p) + prefix = "fs:" + + h = hashlib.sha1(key.encode("utf-8", errors="ignore")).hexdigest()[:16] + return f"{prefix}{h}" + + +def _redact_emails(text: str) -> str: + """Redact email addresses from commit messages for privacy.""" + try: + return re.sub( + r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", "", text or "", + ) + except Exception: + return text + + +def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str, Any]]: + """Best-effort collection of recent git history for a workspace. + + Uses REMOTE_UPLOAD_GIT_MAX_COMMITS (0/empty disables) and + REMOTE_UPLOAD_GIT_SINCE (optional) to bound history. Returns a + serializable dict suitable for writing as metadata/git_history.json, or + None when git metadata is unavailable. 
+ """ + # Read configuration from environment + try: + raw_max = (os.environ.get("REMOTE_UPLOAD_GIT_MAX_COMMITS", "") or "").strip() + max_commits = int(raw_max) if raw_max else 0 + except Exception: + max_commits = 0 + since = (os.environ.get("REMOTE_UPLOAD_GIT_SINCE", "") or "").strip() + force_full = str(os.environ.get("REMOTE_UPLOAD_GIT_FORCE", "") or "").strip().lower() in { + "1", + "true", + "yes", + "on", + } + + if max_commits <= 0: + return None + + root = _find_git_root(Path(workspace_path)) + if not root: + return None + + # Git history cache: avoid emitting identical manifests when HEAD/settings are unchanged + base = Path(os.environ.get("WORKSPACE_PATH") or workspace_path).resolve() + git_cache_path = base / ".context-engine" / "git_history_cache.json" + current_head = "" + try: + head_proc = subprocess.run( + ["git", "rev-parse", "HEAD"], + cwd=str(root), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + encoding="utf-8", + errors="replace", + ) + if head_proc.returncode == 0 and head_proc.stdout.strip(): + current_head = head_proc.stdout.strip() + except Exception: + current_head = "" + + cache: Dict[str, Any] = {} + if not force_full: + try: + if git_cache_path.exists(): + with git_cache_path.open("r", encoding="utf-8") as f: + obj = json.load(f) + if isinstance(obj, dict): + cache = obj + except Exception: + cache = {} + + if current_head and cache.get("last_head") == current_head and cache.get("max_commits") == max_commits and str(cache.get("since") or "") == since: + return None + + base_head = "" + if not force_full: + try: + prev_head = str(cache.get("last_head") or "").strip() + if current_head and prev_head and prev_head != current_head: + base_head = prev_head + except Exception: + base_head = "" + + # Build git rev-list command (simple HEAD-based history) + cmd: List[str] = ["git", "rev-list", "--no-merges"] + if since: + cmd.append(f"--since={since}") + if base_head and current_head: + 
cmd.append(f"{base_head}..{current_head}") + else: + cmd.append("HEAD") + + try: + proc = subprocess.run( + cmd, + cwd=str(root), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + encoding="utf-8", + errors="replace", + ) + if proc.returncode != 0 or not proc.stdout.strip(): + return None + commits = [l.strip() for l in proc.stdout.splitlines() if l.strip()] + except Exception: + return None + + if not commits: + return None + if len(commits) > max_commits: + commits = commits[:max_commits] + + records: List[Dict[str, Any]] = [] + for sha in commits: + try: + fmt = "%H%x1f%an%x1f%ae%x1f%ad%x1f%s%x1f%b" + show_proc = subprocess.run( + ["git", "show", "-s", f"--format={fmt}", sha], + cwd=str(root), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + encoding="utf-8", + errors="replace", + ) + if show_proc.returncode != 0 or not show_proc.stdout.strip(): + continue + parts = show_proc.stdout.strip().split("\x1f") + c_sha, an, _ae, ad, subj, body = (parts + [""] * 6)[:6] + + files_proc = subprocess.run( + ["git", "diff-tree", "--no-commit-id", "--name-only", "-r", sha], + cwd=str(root), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + encoding="utf-8", + errors="replace", + ) + files: List[str] = [] + if files_proc.returncode == 0 and files_proc.stdout: + files = [f for f in files_proc.stdout.splitlines() if f] + + diff_text = "" + try: + diff_proc = subprocess.run( + ["git", "show", "--stat", "--patch", "--unified=3", sha], + cwd=str(root), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + encoding="utf-8", + errors="replace", + ) + if diff_proc.returncode == 0 and diff_proc.stdout: + try: + max_chars = int(os.environ.get("COMMIT_SUMMARY_DIFF_CHARS", "6000") or 6000) + except Exception: + max_chars = 6000 + diff_text = diff_proc.stdout[:max_chars] + except Exception: + diff_text = "" + + msg = _redact_emails((subj + ("\n" + body if body else "")).strip()) + if len(msg) > 2000: + msg = msg[:2000] + 
"\u2026" + + records.append( + { + "commit_id": c_sha or sha, + "author_name": an, + "authored_date": ad, + "message": msg, + "files": files, + "diff": diff_text, + } + ) + except Exception: + continue + + if not records: + return None + + try: + repo_name = root.name + except Exception: + repo_name = "workspace" + + manifest = { + "version": 1, + "repo_name": repo_name, + "generated_at": datetime.now().isoformat(), + "max_commits": max_commits, + "since": since, + "commits": records, + } + + # Update git history cache with the HEAD and settings used for this manifest + try: + git_cache_path.parent.mkdir(parents=True, exist_ok=True) + cache_out = { + "last_head": current_head or (commits[0] if commits else ""), + "max_commits": max_commits, + "since": since, + "updated_at": datetime.now().isoformat(), + } + with git_cache_path.open("w", encoding="utf-8") as f: + json.dump(cache_out, f, indent=2) + except Exception: + pass + + return manifest + + def _load_local_cache_file_hashes(workspace_path: str, repo_name: Optional[str]) -> Dict[str, str]: """Best-effort read of the local cache.json file_hashes map. 
@@ -108,7 +376,8 @@ def _translate_to_container_path(self, host_path: str) -> str: return host_path.replace('\\', '/').replace(':', '') def __init__(self, upload_endpoint: str, workspace_path: str, collection_name: str, - max_retries: int = 3, timeout: int = 30, metadata_path: Optional[str] = None): + max_retries: int = 3, timeout: int = 30, metadata_path: Optional[str] = None, + logical_repo_id: Optional[str] = None): """Initialize remote upload client.""" self.upload_endpoint = upload_endpoint.rstrip('/') self.workspace_path = workspace_path @@ -116,6 +385,7 @@ def __init__(self, upload_endpoint: str, workspace_path: str, collection_name: s self.max_retries = max_retries self.timeout = timeout self.temp_dir = None + self.logical_repo_id = logical_repo_id # Set environment variables for cache functions os.environ["WORKSPACE_PATH"] = workspace_path @@ -323,7 +593,7 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, # Process created files for path in changes["created"]: - rel_path = str(path.relative_to(Path(self.workspace_path))) + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() try: with open(path, 'rb') as f: content = f.read() @@ -360,7 +630,7 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, # Process updated files for path in changes["updated"]: - rel_path = str(path.relative_to(Path(self.workspace_path))) + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() try: with open(path, 'rb') as f: content = f.read() @@ -399,8 +669,8 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, # Process moved files for source_path, dest_path in changes["moved"]: - dest_rel_path = str(dest_path.relative_to(Path(self.workspace_path))) - source_rel_path = str(source_path.relative_to(Path(self.workspace_path))) + dest_rel_path = dest_path.relative_to(Path(self.workspace_path)).as_posix() + source_rel_path = 
source_path.relative_to(Path(self.workspace_path)).as_posix() try: with open(dest_path, 'rb') as f: content = f.read() @@ -440,7 +710,7 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, # Process deleted files for path in changes["deleted"]: - rel_path = str(path.relative_to(Path(self.workspace_path))) + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() try: previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name) @@ -506,6 +776,17 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, } (metadata_dir / "hashes.json").write_text(json.dumps(hashes_metadata, indent=2)) + # Optional: attach recent git history for this workspace + try: + git_history = _collect_git_history_for_workspace(self.workspace_path) + if git_history: + (metadata_dir / "git_history.json").write_text( + json.dumps(git_history, indent=2) + ) + except Exception: + # Best-effort only; never fail bundle creation on git history issues + pass + # Create tarball in temporary directory temp_bundle_dir = self._get_temp_bundle_dir() bundle_path = temp_bundle_dir / f"{bundle_id}.tar.gz" @@ -554,6 +835,8 @@ def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, 'force': 'false', 'source_path': self.workspace_path, } + if getattr(self, "logical_repo_id", None): + data['logical_repo_id'] = self.logical_repo_id logger.info(f"[remote_upload] Uploading bundle {manifest['bundle_id']} (size: {bundle_size} bytes)") @@ -1078,6 +1361,8 @@ def get_remote_config(cli_path: Optional[str] = None) -> Dict[str, str]: else: workspace_path = os.environ.get("WATCH_ROOT", os.environ.get("WORKSPACE_PATH", "/work")) + logical_repo_id = _compute_logical_repo_id(workspace_path) + # Use auto-generated collection name based on repo name repo_name = _extract_repo_name_from_path(workspace_path) # Fallback to directory name if repo detection fails @@ -1089,6 +1374,7 @@ def get_remote_config(cli_path: Optional[str] = 
None) -> Dict[str, str]: "upload_endpoint": os.environ.get("REMOTE_UPLOAD_ENDPOINT", "http://localhost:8080"), "workspace_path": workspace_path, "collection_name": collection_name, + "logical_repo_id": logical_repo_id, # Use higher, more robust defaults but still allow env overrides "max_retries": int(os.environ.get("REMOTE_UPLOAD_MAX_RETRIES", "5")), "timeout": int(os.environ.get("REMOTE_UPLOAD_TIMEOUT", "1800")), @@ -1205,6 +1491,7 @@ def main(): collection_name=config["collection_name"], max_retries=config["max_retries"], timeout=config["timeout"], + logical_repo_id=config.get("logical_repo_id"), ) as client: client.log_mapping_summary() return 0 @@ -1218,7 +1505,8 @@ def main(): workspace_path=config["workspace_path"], collection_name=config["collection_name"], max_retries=config["max_retries"], - timeout=config["timeout"] + timeout=config["timeout"], + logical_repo_id=config.get("logical_repo_id"), ) as client: logger.info("Remote upload client initialized successfully") @@ -1260,7 +1548,8 @@ def main(): workspace_path=config["workspace_path"], collection_name=config["collection_name"], max_retries=config["max_retries"], - timeout=config["timeout"] + timeout=config["timeout"], + logical_repo_id=config.get("logical_repo_id"), ) as client: logger.info("Remote upload client initialized successfully") diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index da852723..979244ad 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -19,6 +19,8 @@ import tempfile import logging import argparse +import subprocess +import re from pathlib import Path, PurePosixPath from typing import Dict, List, Any, Optional, Tuple from datetime import datetime @@ -231,6 +233,271 @@ def remove_cached_file(file_path: str, repo_name: Optional[str] = None) -> None: _hash_cache.remove_hash(file_path) +def _find_git_root(start: Path) -> Optional[Path]: + """Best-effort detection of the git repository root for a 
workspace. + + Walks up from the given path looking for a .git directory. Returns None if + no repo is found or git metadata is unavailable. + """ + try: + cur = start.resolve() + except Exception: + cur = start + try: + for p in [cur] + list(cur.parents): + try: + if (p / ".git").exists(): + return p + except Exception: + continue + except Exception: + return None + return None + + +def _compute_logical_repo_id(workspace_path: str) -> str: + try: + p = Path(workspace_path).resolve() + except Exception: + p = Path(workspace_path) + + try: + r = subprocess.run( + ["git", "-C", str(p), "rev-parse", "--git-common-dir"], + capture_output=True, + text=True, + ) + raw = (r.stdout or "").strip() + if r.returncode == 0 and raw: + common = Path(raw) + if not common.is_absolute(): + base = p if p.is_dir() else p.parent + common = base / common + key = str(common.resolve()) + prefix = "git:" + else: + raise RuntimeError + except Exception: + key = str(p) + prefix = "fs:" + + h = hashlib.sha1(key.encode("utf-8", errors="ignore")).hexdigest()[:16] + return f"{prefix}{h}" + + +def _redact_emails(text: str) -> str: + """Redact email addresses from commit messages for privacy.""" + try: + return re.sub( + r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", "", text or "", + ) + except Exception: + return text + + +def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str, Any]]: + """Best-effort collection of recent git history for a workspace. + + Uses REMOTE_UPLOAD_GIT_MAX_COMMITS (0/empty disables) and + REMOTE_UPLOAD_GIT_SINCE (optional) to bound history. Returns a + serializable dict suitable for writing as metadata/git_history.json, or + None when git metadata is unavailable. 
+ """ + # Read configuration from environment + try: + raw_max = (os.environ.get("REMOTE_UPLOAD_GIT_MAX_COMMITS", "") or "").strip() + max_commits = int(raw_max) if raw_max else 0 + except Exception: + max_commits = 0 + since = (os.environ.get("REMOTE_UPLOAD_GIT_SINCE", "") or "").strip() + force_full = str(os.environ.get("REMOTE_UPLOAD_GIT_FORCE", "") or "").strip().lower() in { + "1", + "true", + "yes", + "on", + } + + if max_commits <= 0: + return None + + root = _find_git_root(Path(workspace_path)) + if not root: + return None + + # Git history cache: avoid emitting identical manifests when HEAD/settings are unchanged + base = Path(os.environ.get("WORKSPACE_PATH") or workspace_path).resolve() + git_cache_path = base / ".context-engine" / "git_history_cache.json" + current_head = "" + try: + head_proc = subprocess.run( + ["git", "rev-parse", "HEAD"], + cwd=str(root), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + encoding="utf-8", + errors="replace", + ) + if head_proc.returncode == 0 and head_proc.stdout.strip(): + current_head = head_proc.stdout.strip() + except Exception: + current_head = "" + + cache: Dict[str, Any] = {} + if not force_full: + try: + if git_cache_path.exists(): + with git_cache_path.open("r", encoding="utf-8") as f: + obj = json.load(f) + if isinstance(obj, dict): + cache = obj + except Exception: + cache = {} + + if current_head and cache.get("last_head") == current_head and cache.get("max_commits") == max_commits and str(cache.get("since") or "") == since: + return None + + base_head = "" + if not force_full: + try: + prev_head = str(cache.get("last_head") or "").strip() + if current_head and prev_head and prev_head != current_head: + base_head = prev_head + except Exception: + base_head = "" + + # Build git rev-list command (simple HEAD-based history) + cmd: List[str] = ["git", "rev-list", "--no-merges"] + if since: + cmd.append(f"--since={since}") + if base_head and current_head: + 
cmd.append(f"{base_head}..{current_head}") + else: + cmd.append("HEAD") + + try: + proc = subprocess.run( + cmd, + cwd=str(root), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + encoding="utf-8", + errors="replace", + ) + if proc.returncode != 0 or not proc.stdout.strip(): + return None + commits = [l.strip() for l in proc.stdout.splitlines() if l.strip()] + except Exception: + return None + + if not commits: + return None + if len(commits) > max_commits: + commits = commits[:max_commits] + + records: List[Dict[str, Any]] = [] + for sha in commits: + try: + fmt = "%H%x1f%an%x1f%ae%x1f%ad%x1f%s%x1f%b" + show_proc = subprocess.run( + ["git", "show", "-s", f"--format={fmt}", sha], + cwd=str(root), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + encoding="utf-8", + errors="replace", + ) + if show_proc.returncode != 0 or not show_proc.stdout.strip(): + continue + parts = show_proc.stdout.strip().split("\x1f") + c_sha, an, _ae, ad, subj, body = (parts + [""] * 6)[:6] + + files_proc = subprocess.run( + ["git", "diff-tree", "--no-commit-id", "--name-only", "-r", sha], + cwd=str(root), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + encoding="utf-8", + errors="replace", + ) + files: List[str] = [] + if files_proc.returncode == 0 and files_proc.stdout: + files = [f for f in files_proc.stdout.splitlines() if f] + + diff_text = "" + try: + diff_proc = subprocess.run( + ["git", "show", "--stat", "--patch", "--unified=3", sha], + cwd=str(root), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + encoding="utf-8", + errors="replace", + ) + if diff_proc.returncode == 0 and diff_proc.stdout: + try: + max_chars = int(os.environ.get("COMMIT_SUMMARY_DIFF_CHARS", "6000") or 6000) + except Exception: + max_chars = 6000 + diff_text = diff_proc.stdout[:max_chars] + except Exception: + diff_text = "" + + msg = _redact_emails((subj + ("\n" + body if body else "")).strip()) + if len(msg) > 2000: + msg = msg[:2000] + 
"\u2026" + + records.append( + { + "commit_id": c_sha or sha, + "author_name": an, + "authored_date": ad, + "message": msg, + "files": files, + "diff": diff_text, + } + ) + except Exception: + continue + + if not records: + return None + + try: + repo_name = root.name + except Exception: + repo_name = "workspace" + + manifest = { + "version": 1, + "repo_name": repo_name, + "generated_at": datetime.now().isoformat(), + "max_commits": max_commits, + "since": since, + "commits": records, + } + + # Update git history cache with the HEAD and settings used for this manifest + try: + git_cache_path.parent.mkdir(parents=True, exist_ok=True) + cache_out = { + "last_head": current_head or (commits[0] if commits else ""), + "max_commits": max_commits, + "since": since, + "updated_at": datetime.now().isoformat(), + } + with git_cache_path.open("w", encoding="utf-8") as f: + json.dump(cache_out, f, indent=2) + except Exception: + pass + + return manifest + + class RemoteUploadClient: """Client for uploading delta bundles to remote server.""" @@ -266,7 +533,8 @@ def _translate_to_container_path(self, host_path: str) -> str: return host_path.replace('\\', '/').replace(':', '') def __init__(self, upload_endpoint: str, workspace_path: str, collection_name: str, - max_retries: int = 3, timeout: int = 30, metadata_path: Optional[str] = None): + max_retries: int = 3, timeout: int = 30, metadata_path: Optional[str] = None, + logical_repo_id: Optional[str] = None): """Initialize remote upload client.""" self.upload_endpoint = upload_endpoint.rstrip('/') self.workspace_path = workspace_path @@ -274,6 +542,7 @@ def __init__(self, upload_endpoint: str, workspace_path: str, collection_name: s self.max_retries = max_retries self.timeout = timeout self.temp_dir = None + self.logical_repo_id = logical_repo_id # Set environment variables for cache functions os.environ["WORKSPACE_PATH"] = workspace_path @@ -479,7 +748,7 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, 
Dict[str, # Process created files for path in changes["created"]: - rel_path = str(path.relative_to(Path(self.workspace_path))) + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() try: with open(path, 'rb') as f: content = f.read() @@ -517,7 +786,7 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, # Process updated files for path in changes["updated"]: - rel_path = str(path.relative_to(Path(self.workspace_path))) + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() try: with open(path, 'rb') as f: content = f.read() @@ -557,8 +826,8 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, # Process moved files for source_path, dest_path in changes["moved"]: - dest_rel_path = str(dest_path.relative_to(Path(self.workspace_path))) - source_rel_path = str(source_path.relative_to(Path(self.workspace_path))) + dest_rel_path = dest_path.relative_to(Path(self.workspace_path)).as_posix() + source_rel_path = source_path.relative_to(Path(self.workspace_path)).as_posix() try: with open(dest_path, 'rb') as f: content = f.read() @@ -599,7 +868,7 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, # Process deleted files for path in changes["deleted"]: - rel_path = str(path.relative_to(Path(self.workspace_path))) + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() try: previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name) @@ -661,6 +930,15 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, } (metadata_dir / "hashes.json").write_text(json.dumps(hashes_metadata, indent=2)) + try: + git_history = _collect_git_history_for_workspace(self.workspace_path) + if git_history: + (metadata_dir / "git_history.json").write_text( + json.dumps(git_history, indent=2) + ) + except Exception: + pass + # Create tarball in temporary directory temp_bundle_dir = self._get_temp_bundle_dir() bundle_path = 
temp_bundle_dir / f"{bundle_id}.tar.gz" @@ -710,6 +988,9 @@ def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, 'source_path': self.workspace_path, } + if getattr(self, "logical_repo_id", None): + data['logical_repo_id'] = self.logical_repo_id + logger.info(f"[remote_upload] Uploading bundle {manifest['bundle_id']} (size: {bundle_size} bytes)") response = self.session.post( @@ -1232,6 +1513,8 @@ def get_remote_config(cli_path: Optional[str] = None) -> Dict[str, str]: else: workspace_path = os.environ.get("WATCH_ROOT", os.environ.get("WORKSPACE_PATH", "/work")) + logical_repo_id = _compute_logical_repo_id(workspace_path) + # Use auto-generated collection name based on repo name repo_name = _extract_repo_name_from_path(workspace_path) # Fallback to directory name if repo detection fails @@ -1243,6 +1526,7 @@ def get_remote_config(cli_path: Optional[str] = None) -> Dict[str, str]: "upload_endpoint": os.environ.get("REMOTE_UPLOAD_ENDPOINT", "http://localhost:8080"), "workspace_path": workspace_path, "collection_name": collection_name, + "logical_repo_id": logical_repo_id, # Use higher, more robust defaults but still allow env overrides "max_retries": int(os.environ.get("REMOTE_UPLOAD_MAX_RETRIES", "5")), "timeout": int(os.environ.get("REMOTE_UPLOAD_TIMEOUT", "1800")), @@ -1353,6 +1637,7 @@ def main(): collection_name=config["collection_name"], max_retries=config["max_retries"], timeout=config["timeout"], + logical_repo_id=config.get("logical_repo_id"), ) as client: client.log_mapping_summary() return 0 @@ -1366,7 +1651,8 @@ def main(): workspace_path=config["workspace_path"], collection_name=config["collection_name"], max_retries=config["max_retries"], - timeout=config["timeout"] + timeout=config["timeout"], + logical_repo_id=config.get("logical_repo_id"), ) as client: logger.info("Remote upload client initialized successfully") @@ -1409,7 +1695,8 @@ def main(): workspace_path=config["workspace_path"], 
collection_name=config["collection_name"], max_retries=config["max_retries"], - timeout=config["timeout"] + timeout=config["timeout"], + logical_repo_id=config.get("logical_repo_id"), ) as client: logger.info("Remote upload client initialized successfully") diff --git a/scripts/upload_service.py b/scripts/upload_service.py index f9eb416b..ad386faf 100644 --- a/scripts/upload_service.py +++ b/scripts/upload_service.py @@ -33,6 +33,9 @@ _extract_repo_name_from_path, update_repo_origin, get_collection_mappings, + find_collection_for_logical_repo, + update_workspace_state, + logical_repo_reuse_enabled, ) except ImportError: # Fallback for testing without full environment @@ -43,6 +46,11 @@ _extract_repo_name_from_path = None update_repo_origin = None get_collection_mappings = None + find_collection_for_logical_repo = None + update_workspace_state = None + + def logical_repo_reuse_enabled() -> bool: # type: ignore[no-redef] + return False # Configure logging @@ -214,6 +222,28 @@ def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: Dict[ operations_data = json.loads(ops_file.read().decode('utf-8')) operations = operations_data.get("operations", []) + # Best-effort: extract git history metadata for watcher to ingest + try: + git_member = None + for member in tar.getnames(): + if member.endswith("metadata/git_history.json"): + git_member = member + break + if git_member: + git_file = tar.extractfile(git_member) + if git_file: + history_bytes = git_file.read() + history_dir = workspace / ".remote-git" + history_dir.mkdir(parents=True, exist_ok=True) + bundle_id = manifest.get("bundle_id") or "unknown" + history_path = history_dir / f"git_history_{bundle_id}.json" + try: + history_path.write_bytes(history_bytes) + except Exception as write_err: + logger.debug(f"[upload_service] Failed to write git history manifest: {write_err}") + except Exception as git_err: + logger.debug(f"[upload_service] Error extracting git history metadata: {git_err}") + # 
Process each operation for operation in operations: op_type = operation.get("operation") @@ -402,6 +432,7 @@ async def upload_delta_bundle( sequence_number: Optional[int] = Form(None), force: Optional[bool] = Form(False), source_path: Optional[str] = Form(None), + logical_repo_id: Optional[str] = Form(None), ): """Upload and process delta bundle.""" start_time = datetime.now() @@ -421,8 +452,51 @@ async def upload_delta_bundle( if not repo_name: repo_name = Path(workspace_path).name - # Get collection name (respect client-supplied name when provided) - if not collection_name: + # Preserve any client-supplied collection name but allow server-side overrides + client_collection_name = collection_name + resolved_collection: Optional[str] = None + + # Resolve collection name, preferring server-side mapping for logical_repo_id when enabled + if logical_repo_reuse_enabled() and logical_repo_id and find_collection_for_logical_repo: + try: + existing = find_collection_for_logical_repo(logical_repo_id, search_root=WORK_DIR) + except Exception: + existing = None + if existing: + resolved_collection = existing + + # Latent migration: when no explicit mapping exists yet for this logical_repo_id, but there is a + # single existing collection mapping, prefer reusing it rather than creating a fresh collection. 
+ if logical_repo_reuse_enabled() and logical_repo_id and resolved_collection is None and get_collection_mappings: + try: + mappings = get_collection_mappings(search_root=WORK_DIR) or [] + except Exception: + mappings = [] + + if len(mappings) == 1: + canonical = mappings[0] + canonical_coll = canonical.get("collection_name") + if canonical_coll: + resolved_collection = canonical_coll + if update_workspace_state: + try: + update_workspace_state( + workspace_path=canonical.get("container_path") or canonical.get("state_file"), + updates={"logical_repo_id": logical_repo_id}, + repo_name=canonical.get("repo_name"), + ) + except Exception as migrate_err: + logger.debug( + f"[upload_service] Failed to migrate logical_repo_id for existing mapping: {migrate_err}" + ) + + # Finalize collection_name: prefer resolved server-side mapping, then client-supplied name, + # then standard get_collection_name/DEFAULT_COLLECTION fallbacks. + if resolved_collection is not None: + collection_name = resolved_collection + elif client_collection_name: + collection_name = client_collection_name + else: if get_collection_name and repo_name: collection_name = get_collection_name(repo_name) else: @@ -431,17 +505,35 @@ async def upload_delta_bundle( # Persist origin metadata for remote lookups (including client source_path) # Use slugged repo name (repo+16) for state so it matches ingest/watch_index usage try: - if update_repo_origin and repo_name: + if repo_name: workspace_key = get_workspace_key(workspace_path) slug_repo_name = f"{repo_name}-{workspace_key}" container_workspace = str(Path(WORK_DIR) / slug_repo_name) - update_repo_origin( - workspace_path=container_workspace, - repo_name=slug_repo_name, - container_path=container_workspace, - source_path=source_path or workspace_path, - collection_name=collection_name, - ) + + # Persist logical_repo_id mapping for this slug/workspace when provided (feature-gated) + if logical_repo_reuse_enabled() and logical_repo_id and update_workspace_state: 
+ try: + update_workspace_state( + workspace_path=container_workspace, + updates={ + "logical_repo_id": logical_repo_id, + "qdrant_collection": collection_name, + }, + repo_name=slug_repo_name, + ) + except Exception as state_err: + logger.debug( + f"[upload_service] Failed to persist logical_repo_id mapping: {state_err}" + ) + + if update_repo_origin: + update_repo_origin( + workspace_path=container_workspace, + repo_name=slug_repo_name, + container_path=container_workspace, + source_path=source_path or workspace_path, + collection_name=collection_name, + ) except Exception as origin_err: logger.debug(f"[upload_service] Failed to persist origin info: {origin_err}") diff --git a/scripts/watch_index.py b/scripts/watch_index.py index c9e94c57..ba373d82 100644 --- a/scripts/watch_index.py +++ b/scripts/watch_index.py @@ -2,8 +2,10 @@ import os import time import threading +import json +import subprocess from pathlib import Path -from typing import Optional, Set +from typing import Optional, Set, Dict, List, Any from qdrant_client import QdrantClient, models from fastembed import TextEmbedding @@ -29,6 +31,10 @@ remove_cached_file, update_indexing_status, update_workspace_state, + get_workspace_state, + ensure_logical_repo_id, + find_collection_for_logical_repo, + logical_repo_reuse_enabled, ) import hashlib from datetime import datetime @@ -68,13 +74,75 @@ def _detect_repo_for_file(file_path: Path) -> Optional[Path]: def _get_collection_for_repo(repo_path: Path) -> str: + """Resolve Qdrant collection for a repo, with logical_repo_id-aware reuse. + + In multi-repo mode, prefer reusing an existing canonical collection that has + already been associated with this logical repository (same git common dir) + by consulting workspace_state. Falls back to the legacy per-repo hashed + collection naming when no mapping exists. 
+ """ + default_coll = os.environ.get("COLLECTION_NAME", "my-collection") try: repo_name = _extract_repo_name_from_path(str(repo_path)) + except Exception: + repo_name = None + + # Multi-repo: try to reuse a canonical collection based on logical_repo_id + if repo_name and is_multi_repo_mode() and logical_repo_reuse_enabled(): + workspace_root = os.environ.get("WORKSPACE_PATH") or os.environ.get("WATCH_ROOT") or "/work" + try: + ws_root_path = Path(workspace_root).resolve() + except Exception: + ws_root_path = Path(workspace_root) + ws_path = str((ws_root_path / repo_name).resolve()) + + state: Dict[str, Any] + try: + state = get_workspace_state(ws_path, repo_name) or {} + except Exception: + state = {} + + if isinstance(state, dict): + try: + state = ensure_logical_repo_id(state, ws_path) + except Exception: + pass + lrid = state.get("logical_repo_id") + if isinstance(lrid, str) and lrid: + coll: Optional[str] + try: + coll = find_collection_for_logical_repo(lrid, search_root=str(ws_root_path)) + except Exception: + coll = None + if isinstance(coll, str) and coll: + try: + update_workspace_state( + workspace_path=ws_path, + updates={"qdrant_collection": coll, "logical_repo_id": lrid}, + repo_name=repo_name, + ) + except Exception: + pass + return coll + + # Fallback to any explicit collection stored in state for this repo + coll2 = state.get("qdrant_collection") + if isinstance(coll2, str) and coll2: + return coll2 + + # Legacy behaviour: derive per-repo collection name + try: + return get_collection_name(repo_name) + except Exception: + return default_coll + + # Single-repo mode or repo_name detection failed: use existing helpers/env + try: if repo_name: return get_collection_name(repo_name) except Exception: pass - return os.environ.get("COLLECTION_NAME", "my-collection") + return default_coll def _get_collection_for_file(file_path: Path) -> str: @@ -237,6 +305,9 @@ def _maybe_enqueue(self, src_path: str): rel_dir = "/" if self.excl.exclude_dir(rel_dir): return + 
if any(part == ".remote-git" for part in p.parts) and p.suffix.lower() == ".json": + self.queue.add(p) + return # only code files if p.suffix.lower() not in idx.CODE_EXTS: return @@ -261,6 +332,8 @@ def on_deleted(self, event): p = Path(event.src_path).resolve() except Exception: return + if any(part == ".codebase" for part in p.parts): + return # Only attempt deletion for code files we would have indexed if p.suffix.lower() not in idx.CODE_EXTS: return @@ -279,9 +352,25 @@ def on_deleted(self, event): if repo_path: repo_name = _extract_repo_name_from_path(str(repo_path)) remove_cached_file(str(p), repo_name) + + # Remove symbol cache entry + try: + from scripts.workspace_state import remove_cached_symbols + remove_cached_symbols(str(p)) + print(f"[deleted_symbol_cache] {p}") + except Exception as e: + print(f"[symbol_cache_delete_error] {p}: {e}") else: root_repo_name = _extract_repo_name_from_path(str(self.root)) remove_cached_file(str(p), root_repo_name) + + # Remove symbol cache entry (single repo mode) + try: + from scripts.workspace_state import remove_cached_symbols + remove_cached_symbols(str(p)) + print(f"[deleted_symbol_cache] {p}") + except Exception as e: + print(f"[symbol_cache_delete_error] {p}: {e}") except Exception: pass @@ -552,6 +641,8 @@ def _rename_in_store( host_root = ( str(os.environ.get("HOST_INDEX_PATH") or "").strip().rstrip("/") ) + if ":" in host_root: # Windows drive letter (e.g., "C:") + host_root = "" host_path = None container_path = None try: @@ -693,10 +784,9 @@ def main(): vector_name = idx._sanitize_vector_name(MODEL) try: - idx.ensure_collection(client, default_collection, model_dim, vector_name) + idx.ensure_collection_and_indexes_once(client, default_collection, model_dim, vector_name) except Exception: pass - idx.ensure_payload_indexes(client, default_collection) try: if multi_repo_enabled: @@ -730,7 +820,31 @@ def main(): ) handler = IndexHandler(ROOT, q, client, default_collection) - obs = Observer() + use_polling = 
(os.environ.get("WATCH_USE_POLLING") or "").strip().lower() in ( + "1", + "true", + "yes", + "on", + ) + if use_polling: + try: + from watchdog.observers.polling import PollingObserver # type: ignore + + obs = PollingObserver() + try: + print("[watch_mode] Using polling observer for filesystem events") + except Exception: + pass + except Exception: + obs = Observer() + try: + print( + "[watch_mode] Polling observer unavailable, falling back to default Observer" + ) + except Exception: + pass + else: + obs = Observer() obs.schedule(handler, str(ROOT), recursive=True) obs.start() @@ -744,6 +858,39 @@ def main(): obs.join() +def _process_git_history_manifest( + p: Path, + client, + model, + collection: str, + vector_name: str, + repo_name: Optional[str], +): + try: + import sys + + script = ROOT_DIR / "scripts" / "ingest_history.py" + if not script.exists(): + return + cmd = [sys.executable or "python3", str(script), "--manifest-json", str(p)] + env = os.environ.copy() + if collection: + env["COLLECTION_NAME"] = collection + if QDRANT_URL: + env["QDRANT_URL"] = QDRANT_URL + if repo_name: + env["REPO_NAME"] = repo_name + try: + print( + f"[git_history_manifest] launching ingest_history.py for {p} collection={collection} repo={repo_name}" + ) + except Exception: + pass + subprocess.Popen(cmd, env=env) + except Exception: + return + + def _process_paths(paths, client, model, vector_name: str, model_dim: int, workspace_path: str): unique_paths = sorted(set(Path(x) for x in paths)) if not unique_paths: @@ -782,6 +929,27 @@ def _process_paths(paths, client, model, vector_name: str, model_dim: int, works repo_name = _extract_repo_name_from_path(repo_key) collection = _get_collection_for_file(p) + if ".remote-git" in p.parts and p.suffix.lower() == ".json": + try: + _process_git_history_manifest(p, client, model, collection, vector_name, repo_name) + except Exception as e: + try: + print(f"[commit_ingest_error] {p}: {e}") + except Exception: + pass + repo_progress[repo_key] = 
repo_progress.get(repo_key, 0) + 1 + try: + _update_progress( + repo_key, + started_at, + repo_progress[repo_key], + len(repo_files), + p, + ) + except Exception: + pass + continue + if not p.exists(): if client is not None: try: @@ -809,22 +977,83 @@ def _process_paths(paths, client, model, vector_name: str, model_dim: int, works if client is not None and model is not None: try: - idx.ensure_collection(client, collection, model_dim, vector_name) - idx.ensure_payload_indexes(client, collection) + idx.ensure_collection_and_indexes_once(client, collection, model_dim, vector_name) except Exception: pass ok = False try: - ok = idx.index_single_file( - client, - model, - collection, - vector_name, - p, - dedupe=True, - skip_unchanged=False, - ) + # Prefer smart symbol-aware reindexing when enabled and cache is available + try: + if getattr(idx, "_smart_symbol_reindexing_enabled", None) and idx._smart_symbol_reindexing_enabled(): + text: str | None = None + try: + text = p.read_text(encoding="utf-8", errors="ignore") + except Exception: + text = None + if text is not None: + try: + language = idx.detect_language(p) + except Exception: + language = "" + try: + file_hash = hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest() + except Exception: + file_hash = "" + if file_hash: + try: + use_smart, smart_reason = idx.should_use_smart_reindexing(str(p), file_hash) + except Exception: + use_smart, smart_reason = False, "smart_check_failed" + + # Bootstrap: if we have no symbol cache yet, still run smart path once + bootstrap = smart_reason == "no_cached_symbols" + if use_smart or bootstrap: + msg_kind = "smart reindexing" if use_smart else "bootstrap (no_cached_symbols) for smart reindex" + try: + print(f"[SMART_REINDEX][watcher] Using {msg_kind} for {p} ({smart_reason})") + except Exception: + pass + try: + status = idx.process_file_with_smart_reindexing( + p, + text, + language, + client, + collection, + repo_name, + model, + vector_name, + ) + ok = status == 
"success" + except Exception as se: + try: + print(f"[SMART_REINDEX][watcher] Smart reindexing failed for {p}: {se}") + except Exception: + pass + ok = False + else: + try: + print(f"[SMART_REINDEX][watcher] Using full reindexing for {p} ({smart_reason})") + except Exception: + pass + except Exception as e_smart: + try: + print(f"[SMART_REINDEX][watcher] Smart reindexing disabled or preview failed for {p}: {e_smart}") + except Exception: + pass + + # Fallback: full single-file reindex + if not ok: + ok = idx.index_single_file( + client, + model, + collection, + vector_name, + p, + dedupe=True, + skip_unchanged=False, + ) except Exception as e: try: print(f"[index_error] {p}: {e}") diff --git a/scripts/workspace_state.py b/scripts/workspace_state.py index 99209c50..df48059e 100644 --- a/scripts/workspace_state.py +++ b/scripts/workspace_state.py @@ -73,6 +73,7 @@ class WorkspaceState(TypedDict, total=False): last_activity: Optional[LastActivity] qdrant_stats: Optional[Dict[str, Any]] origin: Optional[OriginInfo] + logical_repo_id: Optional[str] def is_multi_repo_mode() -> bool: """Check if multi-repo mode is enabled.""" @@ -80,6 +81,21 @@ def is_multi_repo_mode() -> bool: "1", "true", "yes", "on" } + +def logical_repo_reuse_enabled() -> bool: + """Feature flag for logical-repo / collection reuse. + + Controlled by LOGICAL_REPO_REUSE env var: 1/true/yes/on => enabled. + When disabled, behavior falls back to legacy per-repo collection logic + and does not write logical_repo_id into workspace state. 
+ """ + return os.environ.get("LOGICAL_REPO_REUSE", "").strip().lower() in { + "1", + "true", + "yes", + "on", + } + _state_lock = threading.Lock() # Track last-used timestamps for cleanup of idle workspace locks _state_locks: Dict[str, threading.RLock] = {} @@ -158,6 +174,59 @@ def _sanitize_name(s: str, max_len: int = 64) -> str: return s[:max_len] +def _detect_git_common_dir(start: Path) -> Optional[Path]: + try: + base = start if start.is_dir() else start.parent + r = subprocess.run( + ["git", "-C", str(base), "rev-parse", "--git-common-dir"], + capture_output=True, + text=True, + ) + raw = (r.stdout or "").strip() + if r.returncode != 0 or not raw: + return None + p = Path(raw) + if not p.is_absolute(): + p = base / p + return p.resolve() + except Exception: + return None + + +def compute_logical_repo_id(workspace_path: str) -> str: + try: + p = Path(workspace_path).resolve() + except Exception: + p = Path(workspace_path) + + common = _detect_git_common_dir(p) + if common is not None: + key = str(common) + prefix = "git:" + else: + key = str(p) + prefix = "fs:" + + h = hashlib.sha1(key.encode("utf-8", errors="ignore")).hexdigest()[:16] + return f"{prefix}{h}" + + +def ensure_logical_repo_id(state: WorkspaceState, workspace_path: str) -> WorkspaceState: + if not isinstance(state, dict): + return state + if not logical_repo_reuse_enabled(): + # Gate: when logical repo reuse is disabled, leave state untouched + return state + if state.get("logical_repo_id"): + return state + lrid = compute_logical_repo_id(workspace_path) + state["logical_repo_id"] = lrid + origin = dict(state.get("origin", {}) or {}) + origin.setdefault("logical_repo_id", lrid) + state["origin"] = origin + return state + + # Cross-process file locking (POSIX fcntl), falls back to no-op if unavailable try: import fcntl # type: ignore @@ -252,6 +321,12 @@ def _atomic_write_state(state_path: Path, state: WorkspaceState) -> None: with open(temp_path, 'w', encoding='utf-8') as f: json.dump(state, f, 
indent=2, ensure_ascii=False) temp_path.replace(state_path) + # Ensure state/cache files are group-writable so multiple processes + # (upload service, watcher, indexer) can update them. + try: + os.chmod(state_path, 0o664) + except PermissionError: + pass except Exception: # Clean up temp file if something went wrong try: @@ -304,10 +379,17 @@ def get_workspace_state( try: with open(state_path, "r", encoding="utf-8") as f: state = json.load(f) - if isinstance(state, dict): - return state - except (json.JSONDecodeError, ValueError, OSError): - pass + if isinstance(state, dict): + if logical_repo_reuse_enabled(): + workspace_real = str(Path(workspace_path or _resolve_workspace_root()).resolve()) + state = ensure_logical_repo_id(state, workspace_real) + try: + _atomic_write_state(state_path, state) + except Exception as e: + print(f"[workspace_state] Failed to persist logical_repo_id to {state_path}: {e}") + return state + except (json.JSONDecodeError, ValueError, OSError) as e: + print(f"[workspace_state] Failed to read state from {state_path}: {e}") now = datetime.now().isoformat() collection_name = get_collection_name(repo_name) @@ -320,6 +402,12 @@ def get_workspace_state( "indexing_status": {"state": "idle"}, } + if logical_repo_reuse_enabled(): + try: + state = ensure_logical_repo_id(state, state.get("workspace_path", workspace_path or _resolve_workspace_root())) + except Exception as e: + print(f"[workspace_state] Failed to ensure logical_repo_id for {workspace_path}: {e}") + _atomic_write_state(state_path, state) return state @@ -651,8 +739,12 @@ def set_cached_file_hash(file_path: str, file_hash: str, repo_name: Optional[str try: if cache_path.exists(): - with open(cache_path, "r", encoding="utf-8") as f: - cache = json.load(f) + try: + with open(cache_path, "r", encoding="utf-8") as f: + cache = json.load(f) + except Exception: + # If the existing cache is corrupt/empty, recreate it + cache = {"file_hashes": {}, "created_at": datetime.now().isoformat()} 
else: cache = {"file_hashes": {}, "created_at": datetime.now().isoformat()} @@ -758,7 +850,8 @@ def get_collection_mappings(search_root: Optional[str] = None) -> List[Dict[str, try: with open(state_path, "r", encoding="utf-8") as f: state = json.load(f) or {} - except Exception: + except Exception as e: + print(f"[workspace_state] Failed to read repo state from {state_path}: {e}") continue origin = state.get("origin", {}) or {} @@ -802,4 +895,318 @@ def get_collection_mappings(search_root: Optional[str] = None) -> List[Dict[str, return mappings + +def find_collection_for_logical_repo(logical_repo_id: str, search_root: Optional[str] = None) -> Optional[str]: + if not logical_repo_reuse_enabled(): + return None + + root_path = Path(search_root or _resolve_workspace_root()).resolve() + + try: + if is_multi_repo_mode(): + repos_root = root_path / STATE_DIRNAME / "repos" + if repos_root.exists(): + for repo_dir in repos_root.iterdir(): + if not repo_dir.is_dir(): + continue + state_path = repo_dir / STATE_FILENAME + if not state_path.exists(): + continue + try: + with open(state_path, "r", encoding="utf-8") as f: + state = json.load(f) or {} + except Exception: + continue + + ws = state.get("workspace_path") or str(root_path) + state = ensure_logical_repo_id(state, ws) + if state.get("logical_repo_id") == logical_repo_id: + coll = state.get("qdrant_collection") + if coll: + try: + _atomic_write_state(state_path, state) + except Exception as e: + print(f"[workspace_state] Failed to persist logical_repo_id mapping to {state_path}: {e}") + return coll + + state_path = root_path / STATE_DIRNAME / STATE_FILENAME + if state_path.exists(): + try: + with open(state_path, "r", encoding="utf-8") as f: + state = json.load(f) or {} + except Exception as e: + print(f"[workspace_state] Failed to read workspace state from {state_path}: {e}") + state = {} + + ws = state.get("workspace_path") or str(root_path) + state = ensure_logical_repo_id(state, ws) + if state.get("logical_repo_id") 
== logical_repo_id: + coll = state.get("qdrant_collection") + if coll: + try: + _atomic_write_state(state_path, state) + except Exception as e: + print(f"[workspace_state] Failed to persist logical_repo_id mapping to {state_path}: {e}") + return coll + except Exception as e: + print(f"[workspace_state] Error while searching collections for logical_repo_id={logical_repo_id}: {e}") + return None + + return None + + +def get_or_create_collection_for_logical_repo( + workspace_path: str, + preferred_repo_name: Optional[str] = None, +) -> str: + # Gate entire logical-repo based resolution behind feature flag + if not logical_repo_reuse_enabled(): + base_repo = preferred_repo_name + try: + coll = get_collection_name(base_repo) + except Exception: + coll = get_collection_name(None) + try: + update_workspace_state( + workspace_path=workspace_path, + updates={"qdrant_collection": coll}, + repo_name=preferred_repo_name, + ) + except Exception as e: + print(f"[workspace_state] Failed to persist legacy qdrant_collection for {workspace_path}: {e}") + return coll + try: + ws = Path(workspace_path).resolve() + except Exception: + ws = Path(workspace_path) + + common = _detect_git_common_dir(ws) + if common is not None: + canonical_root = common.parent + else: + canonical_root = ws + + ws_path = str(canonical_root) + + try: + state = get_workspace_state(workspace_path=ws_path, repo_name=preferred_repo_name) + except Exception: + state = {} + + if not isinstance(state, dict): + state = {} + + try: + state = ensure_logical_repo_id(state, ws_path) + except Exception: + pass + + lrid = state.get("logical_repo_id") + if isinstance(lrid, str) and lrid: + coll = find_collection_for_logical_repo(lrid, search_root=ws_path) + if isinstance(coll, str) and coll: + if state.get("qdrant_collection") != coll: + try: + update_workspace_state( + workspace_path=ws_path, + updates={"qdrant_collection": coll, "logical_repo_id": lrid}, + repo_name=preferred_repo_name, + ) + except Exception: + pass + 
return coll + + coll = state.get("qdrant_collection") + if not isinstance(coll, str) or not coll: + base_repo = preferred_repo_name + try: + coll = get_collection_name(base_repo) + except Exception: + coll = get_collection_name(None) + try: + update_workspace_state( + workspace_path=ws_path, + updates={"qdrant_collection": coll}, + repo_name=preferred_repo_name, + ) + except Exception: + pass + + return coll + + +# ===== Symbol-Level Cache for Smart Reindexing ===== + +def _get_symbol_cache_path(file_path: str) -> Path: + """Get symbol cache file path for a given file.""" + try: + fp = str(Path(file_path).resolve()) + # Create symbol cache using file hash to handle renames + file_hash = hashlib.md5(fp.encode('utf-8')).hexdigest()[:8] + if is_multi_repo_mode(): + # Use the same repo-name detection as other state helpers so that + # symbol caches live under the correct per-repo .codebase directory + repo_name = _detect_repo_name_from_path(Path(file_path)) + state_dir = _get_repo_state_dir(repo_name) + return state_dir / f"symbols_{file_hash}.json" + else: + cache_dir = _get_cache_path(_resolve_workspace_root()).parent + return cache_dir / f"symbols_{file_hash}.json" + except Exception: + # Fallback to simple file-based path + cache_dir = _get_cache_path(_resolve_workspace_root()).parent + filename = Path(file_path).name.replace('.', '_').replace('/', '_') + return cache_dir / f"symbols_{filename}.json" + + +def get_cached_symbols(file_path: str) -> dict: + """Load cached symbol metadata for a file.""" + cache_path = _get_symbol_cache_path(file_path) + + if not cache_path.exists(): + return {} + + try: + with open(cache_path, 'r', encoding='utf-8') as f: + cache_data = json.load(f) + return cache_data.get("symbols", {}) + except Exception: + return {} + + +def set_cached_symbols(file_path: str, symbols: dict, file_hash: str) -> None: + """Save symbol metadata for a file. 
Extends existing to include pseudo data.""" + cache_path = _get_symbol_cache_path(file_path) + cache_path.parent.mkdir(parents=True, exist_ok=True) + + try: + cache_data = { + "file_path": str(file_path), + "file_hash": file_hash, + "updated_at": datetime.now().isoformat(), + "symbols": symbols + } + + with open(cache_path, 'w', encoding='utf-8') as f: + json.dump(cache_data, f, indent=2) + + # Ensure symbol cache files are group-writable so both indexer and + # watcher processes (potentially different users sharing a group) + # can update them on shared volumes. + try: + os.chmod(cache_path, 0o664) + except PermissionError: + pass + except Exception as e: + print(f"[SYMBOL_CACHE_WARNING] Failed to save symbol cache for {file_path}: {e}") + + +def get_cached_pseudo(file_path: str, symbol_id: str) -> tuple[str, list[str]]: + """Load cached pseudo description and tags for a specific symbol. + + Returns: + (pseudo, tags) tuple, or ("", []) if not found + """ + cached_symbols = get_cached_symbols(file_path) + + if symbol_id in cached_symbols: + symbol_info = cached_symbols[symbol_id] + pseudo = symbol_info.get("pseudo", "") + tags = symbol_info.get("tags", []) + + # Ensure correct types + if isinstance(pseudo, str): + pseudo = pseudo + else: + pseudo = "" + + if isinstance(tags, list): + tags = [str(tag) for tag in tags] + else: + tags = [] + + return pseudo, tags + + return "", [] + + +def set_cached_pseudo(file_path: str, symbol_id: str, pseudo: str, tags: list[str], file_hash: str) -> None: + """Update pseudo data for a specific symbol in the cache. + + This function updates only the pseudo data without recreating the entire symbol cache, + making it efficient for incremental updates during indexing. 
+ """ + cached_symbols = get_cached_symbols(file_path) + + # Update the symbol with pseudo data + if symbol_id in cached_symbols: + cached_symbols[symbol_id]["pseudo"] = pseudo + cached_symbols[symbol_id]["tags"] = tags + + # Save the updated cache only when we actually have symbol entries, to + # avoid creating empty symbol cache files before the base symbol set + # has been seeded by the indexer/smart reindex path. + set_cached_symbols(file_path, cached_symbols, file_hash) + + +def update_symbols_with_pseudo(file_path: str, symbols_with_pseudo: dict, file_hash: str) -> None: + """Update symbols cache with pseudo data for multiple symbols at once. + + Args: + file_path: Path to the file + symbols_with_pseudo: Dict mapping symbol_id to (symbol_info, pseudo, tags) tuples + file_hash: Current file hash + """ + cached_symbols = get_cached_symbols(file_path) + + # Update symbols with their new pseudo data + for symbol_id, (symbol_info, pseudo, tags) in symbols_with_pseudo.items(): + if symbol_id in cached_symbols: + # Update existing symbol with pseudo data + cached_symbols[symbol_id]["pseudo"] = pseudo + cached_symbols[symbol_id]["tags"] = tags + + # Update content hash from symbol_info if available + if isinstance(symbol_info, dict): + cached_symbols[symbol_id].update(symbol_info) + + # Save the updated cache + set_cached_symbols(file_path, cached_symbols, file_hash) + + +def remove_cached_symbols(file_path: str) -> None: + """Remove symbol cache for a file (when file is deleted).""" + cache_path = _get_symbol_cache_path(file_path) + try: + if cache_path.exists(): + cache_path.unlink() + except Exception: + pass + + +def compare_symbol_changes(old_symbols: dict, new_symbols: dict) -> tuple[list, list]: + """ + Compare old and new symbols to identify changes. 
+ + Returns: + (unchanged_symbols, changed_symbols) + """ + unchanged = [] + changed = [] + + for symbol_id, symbol_info in new_symbols.items(): + if symbol_id in old_symbols: + old_info = old_symbols[symbol_id] + # Compare content hash + if old_info.get("content_hash") == symbol_info.get("content_hash"): + unchanged.append(symbol_id) + else: + changed.append(symbol_id) + else: + # New symbol + changed.append(symbol_id) + + return unchanged, changed + + # Add missing functions that callers expect (already defined above) \ No newline at end of file diff --git a/tests/test_collection_memory_backup_restore.py b/tests/test_collection_memory_backup_restore.py new file mode 100644 index 00000000..6808a222 --- /dev/null +++ b/tests/test_collection_memory_backup_restore.py @@ -0,0 +1,233 @@ +import os +import uuid +import importlib +import subprocess +from types import SimpleNamespace + +import pytest +from qdrant_client import QdrantClient, models + +# Reuse the existing Qdrant testcontainer fixture +from tests.test_integration_qdrant import qdrant_container # noqa: F401 + + +ing = importlib.import_module("scripts.ingest_code") +mem_backup = importlib.import_module("scripts.memory_backup") +mem_restore = importlib.import_module("scripts.memory_restore") + +pytestmark = pytest.mark.integration + + +def _create_collection_with_memory(qdrant_url: str, name: str, dim: int = 8) -> QdrantClient: + """Create a collection with dense+lex vectors and a single memory point. + + The collection is intentionally created without the ReFRAG mini vector so that + ensure_collection(..., REFRAG_MODE=1) must add it, exercising the + backup/recreate/restore path. 
+ """ + client = QdrantClient(url=qdrant_url) + + vectors_cfg = { + "code": models.VectorParams(size=dim, distance=models.Distance.COSINE), + ing.LEX_VECTOR_NAME: models.VectorParams( + size=ing.LEX_VECTOR_DIM, distance=models.Distance.COSINE + ), + } + client.create_collection(collection_name=name, vectors_config=vectors_cfg) + + # One "memory" point (no metadata.path) and one code point (with path). + # Use integer point IDs to match Qdrant's accepted ID types. + points = [ + models.PointStruct( + id=1, + vector={"code": [0.1] * dim}, + payload={"information": "test memory", "metadata": {}}, + ), + models.PointStruct( + id=2, + vector={"code": [0.2] * dim}, + payload={ + "information": "code chunk", + # Mark as real code: has a path and language/kind so is_memory_point() returns False + "metadata": {"path": "/tmp/example.py", "language": "python", "kind": "code"}, + }, + ), + ] + client.upsert(collection_name=name, points=points) + return client + + +def _get_point_ids(client: QdrantClient, collection_name: str) -> set[str]: + pts, _ = client.scroll( + collection_name=collection_name, + limit=None, + with_payload=False, + with_vectors=False, + ) + return {str(p.id) for p in pts} + + +def test_memory_backup_restore_happy_path(qdrant_container, monkeypatch): + """ensure_collection should backup, recreate, and restore memories. + + Scenario: + - Start with a collection that has dense+lex vectors and at least one + "memory" point. + - Enable REFRAG_MODE so ensure_collection wants to add the mini vector. + - Qdrant will reject adding a new vector name via update_collection, so we + exercise the backup -> delete -> recreate -> restore path. + - In tolerant mode (STRICT_MEMORY_RESTORE not set / 0) indexing should + succeed and the memory should still be present. 
+ """ + os.environ["QDRANT_URL"] = qdrant_container + collection = f"test-mem-{uuid.uuid4().hex[:8]}" + + client = _create_collection_with_memory(qdrant_container, collection, dim=8) + + # Force ReFRAG on so ensure_collection tries to add MINI_VECTOR_NAME + os.environ["REFRAG_MODE"] = "1" + os.environ.pop("STRICT_MEMORY_RESTORE", None) + + # Run ensure_collection: this should trigger backup + recreate + restore + ing.ensure_collection(client, collection, dim=8, vector_name="code") + + info = client.get_collection(collection) + cfg = info.config.params.vectors + + # Dense + lex must be present + assert "code" in cfg + assert ing.LEX_VECTOR_NAME in cfg + + # When REFRAG_MODE is on, mini vector should be present too + mini_name = os.environ.get("MINI_VECTOR_NAME", getattr(ing, "MINI_VECTOR_NAME", "mini")) + assert mini_name in cfg + + # Memory id should still exist after restore, but code points are not restored + ids = _get_point_ids(client, collection) + assert "1" in ids + assert "2" not in ids + + +def test_memory_restore_strict_mode_raises_on_failure(qdrant_container, monkeypatch): + """STRICT_MEMORY_RESTORE=1 should turn restore failures into hard errors. + + We let the real backup script run against Qdrant, but we force the restore + subprocess to fail and assert that ensure_collection raises. 
+ """ + os.environ["QDRANT_URL"] = qdrant_container + collection = f"test-mem-strict-{uuid.uuid4().hex[:8]}" + + client = _create_collection_with_memory(qdrant_container, collection, dim=8) + + os.environ["REFRAG_MODE"] = "1" + os.environ["STRICT_MEMORY_RESTORE"] = "1" + + # Patch subprocess.run to: + # - allow the real memory_backup.py to run + # - force memory_restore.py to fail with non-zero exit + orig_run = subprocess.run + + def fake_run(args, **kwargs): # type: ignore[override] + cmd_str = " ".join(map(str, args)) + if "memory_backup.py" in cmd_str: + return orig_run(args, **kwargs) + if "memory_restore.py" in cmd_str: + return SimpleNamespace(returncode=1, stdout="", stderr="simulated restore failure") + return orig_run(args, **kwargs) + + monkeypatch.setattr(subprocess, "run", fake_run) + + with pytest.raises(RuntimeError): + ing.ensure_collection(client, collection, dim=8, vector_name="code") + + +def test_memory_backup_failure_tolerant_mode_still_recreates_collection(qdrant_container, monkeypatch): + """If backup fails but STRICT_MEMORY_RESTORE is not set, ensure_collection + should still recreate the collection with the correct vectors, even though + memories may be dropped. + + This makes the behavior explicit: backup failure is best-effort by default. 
+ """ + os.environ["QDRANT_URL"] = qdrant_container + collection = f"test-mem-backup-fail-{uuid.uuid4().hex[:8]}" + + client = _create_collection_with_memory(qdrant_container, collection, dim=8) + + os.environ["REFRAG_MODE"] = "1" + os.environ.pop("STRICT_MEMORY_RESTORE", None) + + # Patch subprocess.run so memory_backup.py fails, but everything else runs normally + orig_run = subprocess.run + + def fake_run(args, **kwargs): # type: ignore[override] + cmd_str = " ".join(map(str, args)) + if "memory_backup.py" in cmd_str: + return SimpleNamespace(returncode=1, stdout="", stderr="simulated backup failure") + return orig_run(args, **kwargs) + + monkeypatch.setattr(subprocess, "run", fake_run) + + # Should not raise even though backup fails + ing.ensure_collection(client, collection, dim=8, vector_name="code") + + info = client.get_collection(collection) + cfg = info.config.params.vectors + + # Collection should still have the expected vectors (including mini) + assert "code" in cfg + assert ing.LEX_VECTOR_NAME in cfg + mini_name = os.environ.get("MINI_VECTOR_NAME", getattr(ing, "MINI_VECTOR_NAME", "mini")) + assert mini_name in cfg + + # Because backup failed and no restore occurred, the original memory is gone + ids = _get_point_ids(client, collection) + assert "1" not in ids + + +def test_memory_backup_and_restore_scripts_roundtrip(qdrant_container, tmp_path): + """Directly exercise memory_backup.export_memories and + memory_restore.restore_memories without going through ensure_collection. + + This confirms that the backup file contains the expected memory and that + restore_memories can recreate it in a fresh collection. 
+ """ + os.environ["QDRANT_URL"] = qdrant_container + collection = f"test-mem-scripts-{uuid.uuid4().hex[:8]}" + + client = _create_collection_with_memory(qdrant_container, collection, dim=8) + + # Backup memories from the collection + backup_file = tmp_path / "memories_backup.json" + result = mem_backup.export_memories( + collection_name=collection, + output_file=str(backup_file), + client=client, + include_vectors=True, + batch_size=100, + ) + + assert result["success"] is True + assert result["memory_count"] == 1 + assert backup_file.exists() + + # Drop the original collection entirely + client.delete_collection(collection) + + # Restore into a fresh collection; let restore_memories create it + restore_result = mem_restore.restore_memories( + backup_file=str(backup_file), + collection_name=collection, + client=client, + embedding_model_name=None, + vector_name="code", + batch_size=50, + skip_existing=True, + skip_collection_creation=False, + ) + + assert restore_result["success"] is True + + # After restore, there should be exactly one memory point (id 1) and no code point (id 2) + ids = _get_point_ids(client, collection) + assert "1" in ids + assert "2" not in ids diff --git a/vscode-extension/context-engine-uploader/README.md b/vscode-extension/context-engine-uploader/README.md index 356a3d84..50df124a 100644 --- a/vscode-extension/context-engine-uploader/README.md +++ b/vscode-extension/context-engine-uploader/README.md @@ -19,10 +19,19 @@ Configuration - **Python dependencies:** the extension runs the standalone upload client via your configured `pythonPath`. Ensure the interpreter has `requests`, `urllib3`, and `charset_normalizer` installed. Run `python3 -m pip install requests urllib3 charset_normalizer` (or replace `python3` with your configured path) before starting the uploader. - **Path mapping:** `Host Root` + `Container Root` control how local paths are rewritten before reaching the remote service. 
By default the host root mirrors your `Target Path` and the container root is `/work`, which keeps Windows paths working without extra config. - **Prompt+ decoder:** set `Context Engine Uploader: Decoder Url` (default `http://localhost:8081`, auto-appends `/completion`) to point at your local llama.cpp decoder. For Ollama, set it to `http://localhost:11434/api/chat`. Turn on `Use Gpu Decoder` to set `USE_GPU_DECODER=1` so ctx.py prefers the GPU llama.cpp sidecar. Prompt+ automatically runs the bundled `scripts/ctx.py` when an embedded copy is available, falling back to the workspace version if not. -- **Claude Code MCP config:** `MCP Indexer Url` and `MCP Memory Url` control the URLs written into the project-local `.mcp.json` when you run the `Write MCP Config` command. This is only for configuring Claude Code MCP clients; other MCP integrations can be added separately later. +- **Claude/Windsurf MCP config:** + - `MCP Indexer Url` and `MCP Memory Url` control the URLs written into the project-local `.mcp.json` (Claude) and Windsurf `mcp_config.json` when you run the `Write MCP Config` command. These URLs are used **literally** (e.g. `http://localhost:8001/sse` or `http://localhost:8003/mcp`). + - `MCP Transport Mode` (`contextEngineUploader.mcpTransportMode`) chooses how those URLs are wrapped: + - `sse-remote`: emit stdio configs that call `npx mcp-remote --transport sse-only`. + - `http` (default): emit direct HTTP MCP entries of the form `{ "type": "http", "url": "http://localhost:8003/mcp" }` for Claude/Windsurf. Use this when pointing at HTTP `/mcp` endpoints exposed by the Context-Engine MCP services. +- **MCP config on startup:** + - `contextEngineUploader.autoWriteMcpConfigOnStartup` (default `false`) controls whether the extension automatically runs the same logic as `Write MCP Config` on activation. 
When enabled, it refreshes `.mcp.json`, Windsurf `mcp_config.json`, and the Claude hook (`.claude/settings.local.json`) to match your current settings and the installed extension version. If `scaffoldCtxConfig` is also `true`, this startup path will additionally scaffold/update `ctx_config.json` and `.env` as described below. - **CTX + GLM settings:** - `contextEngineUploader.ctxIndexerUrl` is copied into `.env` (as `MCP_INDEXER_URL`) so the embedded `ctx.py` knows which MCP indexer to call when enhancing prompts. - `contextEngineUploader.glmApiKey`, `glmApiBase`, and `glmModel` are used when scaffolding `ctx_config.json`/`.env` to pre-fill GLM decoder options. Existing non-placeholder values are preserved, so you can override them in the files at any time. +- **Git history upload settings:** + - `contextEngineUploader.gitMaxCommits` controls `REMOTE_UPLOAD_GIT_MAX_COMMITS`, bounding how many commits the upload client includes per bundle (set to 0 to disable git history). + - `contextEngineUploader.gitSince` controls `REMOTE_UPLOAD_GIT_SINCE`, letting you constrain the git log window (e.g. `2 years ago` or `2023-01-01`). - **Context scaffolding:** - `contextEngineUploader.scaffoldCtxConfig` (default `true`) controls whether the extension keeps a minimal `ctx_config.json` + `.env` in sync with your workspace. When enabled, running `Write MCP Config` or `Write CTX Config` will reuse the workspace’s existing files (if present) and only backfill placeholder or missing values from the bundled `env.example` plus the inferred collection name. Existing custom values are preserved. - The scaffolder also enforces CTX defaults (e.g., `MULTI_REPO_MODE=1`, `REFRAG_RUNTIME=glm`, `REFRAG_DECODER=1`) so the embedded `ctx.py` is ready for remote uploads, regardless of the “Use GLM Decoder” toggle. 
@@ -44,6 +53,7 @@ Commands - Status-bar button (`Index Codebase`) mirrors Start/Stop/Restart/Index status, while the `Prompt+` status button runs the ctx rewrite command on the current selection. - `Context Engine Uploader: Write MCP Config (.mcp.json)` writes or updates a project-local `.mcp.json` with MCP server entries for the Qdrant indexer and memory/search endpoints, using the configured MCP URLs. - `Context Engine Uploader: Write CTX Config (ctx_config.json/.env)` scaffolds the ctx config + env files as described above. This command runs automatically after `Write MCP Config` if scaffolding is enabled, but it is also exposed in the Command Palette for manual use. +- `Context Engine Uploader: Upload Git History (force sync bundle)` triggers a one-off force sync using the configured git history settings, producing a bundle that includes a `metadata/git_history.json` manifest for remote lineage ingestion. Logs ---- diff --git a/vscode-extension/context-engine-uploader/extension.js b/vscode-extension/context-engine-uploader/extension.js index deb9d355..a72b7cdc 100644 --- a/vscode-extension/context-engine-uploader/extension.js +++ b/vscode-extension/context-engine-uploader/extension.js @@ -69,6 +69,11 @@ function activate(context) { } runSequence('force').catch(error => log(`Index failed: ${error instanceof Error ? error.message : String(error)}`)); }); + const uploadGitHistoryDisposable = vscode.commands.registerCommand('contextEngineUploader.uploadGitHistory', () => { + vscode.window.showInformationMessage('Context Engine git history upload (force sync) started.'); + if (outputChannel) { outputChannel.show(true); } + runSequence('force').catch(error => log(`Git history upload failed: ${error instanceof Error ? error.message : String(error)}`)); + }); const ctxConfigDisposable = vscode.commands.registerCommand('contextEngineUploader.writeCtxConfig', () => { writeCtxConfig().catch(error => log(`CTX config write failed: ${error instanceof Error ? 
error.message : String(error)}`)); }); @@ -96,6 +101,7 @@ function activate(context) { event.affectsConfiguration('contextEngineUploader.mcpMemoryUrl') || event.affectsConfiguration('contextEngineUploader.mcpClaudeEnabled') || event.affectsConfiguration('contextEngineUploader.mcpWindsurfEnabled') || + event.affectsConfiguration('contextEngineUploader.mcpTransportMode') || event.affectsConfiguration('contextEngineUploader.windsurfMcpPath') || event.affectsConfiguration('contextEngineUploader.claudeHookEnabled') || event.affectsConfiguration('contextEngineUploader.surfaceQdrantCollectionHint') @@ -118,6 +124,7 @@ function activate(context) { stopDisposable, restartDisposable, indexDisposable, + uploadGitHistoryDisposable, showLogsDisposable, promptEnhanceDisposable, mcpConfigDisposable, @@ -132,8 +139,11 @@ function activate(context) { runSequence('auto').catch(error => log(`Startup run failed: ${error instanceof Error ? error.message : String(error)}`)); } - // When enabled, best-effort auto-scaffold ctx_config.json/.env for the current targetPath on activation - if (config.get('scaffoldCtxConfig', true)) { + // Optionally keep MCP + hook + ctx config in sync on activation + if (config.get('autoWriteMcpConfigOnStartup')) { + writeMcpConfig().catch(error => log(`MCP config auto-write on activation failed: ${error instanceof Error ? error.message : String(error)}`)); + } else if (config.get('scaffoldCtxConfig', true)) { + // Legacy behavior: scaffold ctx_config.json/.env directly when MCP auto-write is disabled writeCtxConfig().catch(error => log(`CTX config auto-scaffold on activation failed: ${error instanceof Error ? 
error.message : String(error)}`)); } } @@ -279,8 +289,9 @@ function getTargetPath(config) { updateStatusBarTooltip(); return undefined; } - updateStatusBarTooltip(folderPath); - return folderPath; + const autoTarget = detectDefaultTargetPath(folderPath); + updateStatusBarTooltip(autoTarget); + return autoTarget; } function saveTargetPath(config, targetPath) { const hasWorkspace = vscode.workspace.workspaceFolders && vscode.workspace.workspaceFolders.length; @@ -296,6 +307,68 @@ function getWorkspaceFolderPath() { } return folders[0].uri.fsPath; } +function looksLikeRepoRoot(dirPath) { + try { + const codebaseStatePath = path.join(dirPath, '.codebase', 'state.json'); + const gitDir = path.join(dirPath, '.git'); + if (fs.existsSync(codebaseStatePath) || fs.existsSync(gitDir)) { + return true; + } + } catch (error) { + log(`Repo root detection failed for ${dirPath}: ${error instanceof Error ? error.message : String(error)}`); + } + return false; +} +function detectDefaultTargetPath(workspaceFolderPath) { + try { + const resolved = path.resolve(workspaceFolderPath); + if (!fs.existsSync(resolved)) { + return workspaceFolderPath; + } + const rootLooksLikeRepo = looksLikeRepoRoot(resolved); + let entries; + try { + entries = fs.readdirSync(resolved); + } catch (error) { + log(`Auto targetPath discovery failed to read workspace folder: ${error instanceof Error ? 
error.message : String(error)}`); + return resolved; + } + const candidates = []; + for (const name of entries) { + const fullPath = path.join(resolved, name); + let stats; + try { + stats = fs.statSync(fullPath); + } catch (_) { + continue; + } + if (!stats.isDirectory()) { + continue; + } + if (looksLikeRepoRoot(fullPath)) { + candidates.push(path.resolve(fullPath)); + } + } + if (candidates.length === 1) { + const detected = candidates[0]; + log(`Target path auto-detected as ${detected} (under workspace folder).`); + return detected; + } + if (rootLooksLikeRepo) { + if (candidates.length > 1) { + log('Auto targetPath discovery found multiple candidate repos under workspace; using workspace folder instead.'); + } + return resolved; + } + if (candidates.length > 1) { + log('Auto targetPath discovery found multiple candidate repos under workspace; using workspace folder instead.'); + } + return resolved; + } catch (error) { + log(`Auto targetPath discovery failed: ${error instanceof Error ? 
error.message : String(error)}`); + return workspaceFolderPath; + } +} function ensureTargetPathConfigured() { const config = vscode.workspace.getConfiguration('contextEngineUploader'); const current = (config.get('targetPath') || '').trim(); @@ -308,9 +381,10 @@ function ensureTargetPathConfigured() { updateStatusBarTooltip(); return; } - updateStatusBarTooltip(folderPath); + const autoTarget = detectDefaultTargetPath(folderPath); + updateStatusBarTooltip(autoTarget); } -function updateStatusBarTooltip(targetPath) { + function updateStatusBarTooltip(targetPath) { if (!statusBarItem) { return; } @@ -334,9 +408,24 @@ function needsForceSync(targetPath) { } } async function ensurePythonDependencies(pythonPath) { - // Probe current interpreter; if modules missing, offer to create a private venv and install deps - const ok = await checkPythonDeps(pythonPath); - if (ok) return true; + // Probe current interpreter with bundled python_libs first + let ok = await checkPythonDeps(pythonPath); + if (ok) { + return true; + } + + // If that fails, try to auto-detect a better system Python before falling back to a venv + const autoPython = await detectSystemPython(); + if (autoPython && autoPython !== pythonPath) { + log(`Falling back to auto-detected Python interpreter: ${autoPython}`); + ok = await checkPythonDeps(autoPython); + if (ok) { + pythonOverridePath = autoPython; + return true; + } + } + + // As a last resort, offer to create a private venv and install deps via pip const choice = await vscode.window.showErrorMessage( 'Context Engine Uploader: missing Python modules. 
Create isolated environment and auto-install?', 'Auto-install to private venv', @@ -356,7 +445,7 @@ async function ensurePythonDependencies(pythonPath) { if (!installed) return false; pythonOverridePath = venvPython; log(`Using private venv interpreter: ${pythonOverridePath}`); - return await checkPythonDeps(pythonOverridePath); + return await checkPythonDeps(venvPython); } async function checkPythonDeps(pythonPath) { @@ -509,7 +598,9 @@ function setStatusBarState(mode) { function runOnce(options) { return new Promise(resolve => { const args = buildArgs(options, 'force'); - const child = spawn(options.pythonPath, args, { cwd: options.workingDirectory, env: buildChildEnv(options) }); + const baseEnv = buildChildEnv(options); + const childEnv = { ...baseEnv, REMOTE_UPLOAD_GIT_FORCE: '1' }; + const child = spawn(options.pythonPath, args, { cwd: options.workingDirectory, env: childEnv }); forceProcess = child; attachOutput(child, 'force'); let finished = false; @@ -691,6 +782,21 @@ async function enhanceSelectionWithUnicorn() { if (useGpuDecoder) { env.USE_GPU_DECODER = '1'; } + let ctxWorkspaceDir; + try { + ctxWorkspaceDir = getTargetPath(cfg); + } catch (error) { + ctxWorkspaceDir = undefined; + } + if (!ctxWorkspaceDir) { + const wsFolder = getWorkspaceFolderPath(); + if (wsFolder) { + ctxWorkspaceDir = detectDefaultTargetPath(wsFolder); + } + } + if (ctxWorkspaceDir && typeof ctxWorkspaceDir === 'string' && fs.existsSync(ctxWorkspaceDir)) { + env.CTX_WORKSPACE_DIR = ctxWorkspaceDir; + } } catch (_) { // ignore config read failures; fall back to defaults } @@ -886,6 +992,15 @@ function buildChildEnv(options) { env.DEV_REMOTE_MODE = '1'; log('Context Engine Uploader: devRemoteMode enabled (REMOTE_UPLOAD_MODE=development, DEV_REMOTE_MODE=1).'); } + const gitMaxCommits = settings.get('gitMaxCommits'); + if (typeof gitMaxCommits === 'number' && !Number.isNaN(gitMaxCommits)) { + env.REMOTE_UPLOAD_GIT_MAX_COMMITS = String(gitMaxCommits); + } + const gitSinceRaw = 
settings.get('gitSince'); + const gitSince = typeof gitSinceRaw === 'string' ? gitSinceRaw.trim() : ''; + if (gitSince) { + env.REMOTE_UPLOAD_GIT_SINCE = gitSince; + } } catch (error) { log(`Failed to read devRemoteMode setting: ${error instanceof Error ? error.message : String(error)}`); } @@ -917,8 +1032,11 @@ async function writeMcpConfig() { vscode.window.showInformationMessage('Context Engine Uploader: MCP config writing is disabled in settings.'); return; } - const indexerUrl = (settings.get('mcpIndexerUrl') || 'http://localhost:8001/sse').trim(); - const memoryUrl = (settings.get('mcpMemoryUrl') || 'http://localhost:8000/sse').trim(); + const transportModeRaw = (settings.get('mcpTransportMode') || 'sse-remote'); + const transportMode = (typeof transportModeRaw === 'string' ? transportModeRaw.trim() : 'sse-remote') || 'sse-remote'; + + let indexerUrl = (settings.get('mcpIndexerUrl') || 'http://localhost:8001/sse').trim(); + let memoryUrl = (settings.get('mcpMemoryUrl') || 'http://localhost:8000/sse').trim(); let wroteAny = false; let hookWrote = false; if (claudeEnabled) { @@ -926,14 +1044,14 @@ async function writeMcpConfig() { if (!root) { vscode.window.showErrorMessage('Context Engine Uploader: open a folder before writing .mcp.json.'); } else { - const result = await writeClaudeMcpServers(root, indexerUrl, memoryUrl); + const result = await writeClaudeMcpServers(root, indexerUrl, memoryUrl, transportMode); wroteAny = wroteAny || result; } } if (windsurfEnabled) { const customPath = (settings.get('windsurfMcpPath') || '').trim(); const windsPath = customPath || getDefaultWindsurfMcpPath(); - const result = await writeWindsurfMcpServers(windsPath, indexerUrl, memoryUrl); + const result = await writeWindsurfMcpServers(windsPath, indexerUrl, memoryUrl, transportMode); wroteAny = wroteAny || result; } if (claudeHookEnabled) { @@ -975,10 +1093,17 @@ async function writeCtxConfig() { log('CTX config scaffolding skipped because scaffoldCtxConfig is false.'); 
return; } - const options = resolveOptions(); + let options = resolveOptions(); if (!options) { return; } + const depsOk = await ensurePythonDependencies(options.pythonPath); + if (!depsOk) { + return; + } + // ensurePythonDependencies may switch to a better interpreter (pythonOverridePath), + // so re-resolve options to pick up the updated pythonPath and script/working directory. + options = resolveOptions() || options; const collectionName = inferCollectionFromUpload(options); if (!collectionName) { vscode.window.showErrorMessage('Context Engine Uploader: failed to infer collection name from upload client. Check the Output panel for details.'); @@ -1046,6 +1171,8 @@ async function scaffoldCtxConfigFiles(workspaceDir, collectionName) { let glmApiKey = ''; let glmApiBase = 'https://api.z.ai/api/coding/paas/v4/'; let glmModel = 'glm-4.6'; + let gitMaxCommits = 500; + let gitSince = ''; if (uploaderSettings) { try { const runtimeSetting = String(uploaderSettings.get('decoderRuntime') ?? 'glm').trim().toLowerCase(); @@ -1065,6 +1192,14 @@ async function scaffoldCtxConfigFiles(workspaceDir, collectionName) { if (cfgModel) { glmModel = cfgModel; } + const maxCommitsSetting = uploaderSettings.get('gitMaxCommits'); + if (typeof maxCommitsSetting === 'number' && !Number.isNaN(maxCommitsSetting)) { + gitMaxCommits = maxCommitsSetting; + } + const sinceSetting = uploaderSettings.get('gitSince'); + if (typeof sinceSetting === 'string') { + gitSince = sinceSetting.trim(); + } } catch (error) { log(`Failed to read decoder/GLM settings from configuration: ${error instanceof Error ? 
error.message : String(error)}`); } @@ -1311,6 +1446,13 @@ async function scaffoldCtxConfigFiles(workspaceDir, collectionName) { } } + if (typeof gitMaxCommits === 'number' && !Number.isNaN(gitMaxCommits)) { + upsertEnv('REMOTE_UPLOAD_GIT_MAX_COMMITS', String(gitMaxCommits), { overwrite: true }); + } + if (gitSince) { + upsertEnv('REMOTE_UPLOAD_GIT_SINCE', gitSince, { overwrite: true, skipIfDesiredEmpty: true }); + } + if (envChanged) { fs.writeFileSync(envPath, envLines.join('\n') + '\n', 'utf8'); log(`Ensured decoder/GLM/MCP settings in .env at ${envPath}`); @@ -1356,7 +1498,7 @@ function getDefaultWindsurfMcpPath() { return path.join(os.homedir(), '.codeium', 'windsurf', 'mcp_config.json'); } -async function writeClaudeMcpServers(root, indexerUrl, memoryUrl) { +async function writeClaudeMcpServers(root, indexerUrl, memoryUrl, transportMode) { const configPath = path.join(root, '.mcp.json'); let config = { mcpServers: {} }; if (fs.existsSync(configPath)) { @@ -1376,27 +1518,46 @@ async function writeClaudeMcpServers(root, indexerUrl, memoryUrl) { config.mcpServers = {}; } log(`Preparing to write .mcp.json at ${configPath} with indexerUrl=${indexerUrl || '""'} memoryUrl=${memoryUrl || '""'}`); - const isWindows = process.platform === 'win32'; - const makeServer = url => { - if (isWindows) { + const servers = config.mcpServers; + const mode = (typeof transportMode === 'string' ? 
transportMode.trim() : 'sse-remote') || 'sse-remote'; + + if (mode === 'http') { + // Direct HTTP MCP endpoints for Claude (.mcp.json) + if (indexerUrl) { + servers['qdrant-indexer'] = { + type: 'http', + url: indexerUrl + }; + } + if (memoryUrl) { + servers.memory = { + type: 'http', + url: memoryUrl + }; + } + } else { + // Legacy/default: stdio via mcp-remote SSE bridge + const isWindows = process.platform === 'win32'; + const makeServer = url => { + if (isWindows) { + return { + command: 'cmd', + args: ['/c', 'npx', 'mcp-remote', url, '--transport', 'sse-only'], + env: {} + }; + } return { - command: 'cmd', - args: ['/c', 'npx', 'mcp-remote', url, '--transport', 'sse-only'], + command: 'npx', + args: ['mcp-remote', url, '--transport', 'sse-only'], env: {} }; - } - return { - command: 'npx', - args: ['mcp-remote', url, '--transport', 'sse-only'], - env: {} }; - }; - const servers = config.mcpServers; - if (indexerUrl) { - servers['qdrant-indexer'] = makeServer(indexerUrl); - } - if (memoryUrl) { - servers.memory = makeServer(memoryUrl); + if (indexerUrl) { + servers['qdrant-indexer'] = makeServer(indexerUrl); + } + if (memoryUrl) { + servers.memory = makeServer(memoryUrl); + } } try { const json = JSON.stringify(config, null, 2) + '\n'; @@ -1411,7 +1572,7 @@ async function writeClaudeMcpServers(root, indexerUrl, memoryUrl) { } } -async function writeWindsurfMcpServers(configPath, indexerUrl, memoryUrl) { +async function writeWindsurfMcpServers(configPath, indexerUrl, memoryUrl, transportMode) { try { fs.mkdirSync(path.dirname(configPath), { recursive: true }); } catch (error) { @@ -1437,17 +1598,53 @@ async function writeWindsurfMcpServers(configPath, indexerUrl, memoryUrl) { config.mcpServers = {}; } log(`Preparing to write Windsurf mcp_config.json at ${configPath} with indexerUrl=${indexerUrl || '""'} memoryUrl=${memoryUrl || '""'}`); - const makeServer = url => ({ - command: 'npx', - args: ['mcp-remote', url, '--transport', 'sse-only'], - env: {} - }); const 
servers = config.mcpServers; - if (indexerUrl) { - servers['qdrant-indexer'] = makeServer(indexerUrl); - } - if (memoryUrl) { - servers.memory = makeServer(memoryUrl); + const mode = (typeof transportMode === 'string' ? transportMode.trim() : 'sse-remote') || 'sse-remote'; + + if (mode === 'http') { + // Direct HTTP MCP endpoints for Windsurf mcp_config.json + if (indexerUrl) { + servers['qdrant-indexer'] = { + type: 'http', + url: indexerUrl + }; + } + if (memoryUrl) { + servers.memory = { + type: 'http', + url: memoryUrl + }; + } + } else { + // Legacy/default: use mcp-remote SSE bridge + const makeServer = url => { + // Default args for local/HTTPS endpoints + const args = ['mcp-remote', url, '--transport', 'sse-only']; + try { + const u = new URL(url); + const isLocalHost = + u.hostname === 'localhost' || + u.hostname === '127.0.0.1' || + u.hostname === '::1'; + // For non-local HTTP URLs, mcp-remote requires --allow-http + if (u.protocol === 'http:' && !isLocalHost) { + args.push('--allow-http'); + } + } catch (e) { + // If URL parsing fails, fall back to default args without additional flags + } + return { + command: 'npx', + args, + env: {} + }; + }; + if (indexerUrl) { + servers['qdrant-indexer'] = makeServer(indexerUrl); + } + if (memoryUrl) { + servers.memory = makeServer(memoryUrl); + } } try { const json = JSON.stringify(config, null, 2) + '\n'; diff --git a/vscode-extension/context-engine-uploader/package.json b/vscode-extension/context-engine-uploader/package.json index 082bc519..9d6c06ba 100644 --- a/vscode-extension/context-engine-uploader/package.json +++ b/vscode-extension/context-engine-uploader/package.json @@ -2,7 +2,7 @@ "name": "context-engine-uploader", "displayName": "Context Engine Uploader", "description": "Runs the Context-Engine remote upload client with a force sync on startup followed by watch mode. 
Requires Python with pip install requests urllib3 charset_normalizer.", - "version": "0.1.30", + "version": "0.1.31", "publisher": "context-engine", "engines": { "vscode": "^1.85.0" @@ -37,6 +37,10 @@ "command": "contextEngineUploader.indexCodebase", "title": "Context Engine Uploader: Index Codebase" }, + { + "command": "contextEngineUploader.uploadGitHistory", + "title": "Context Engine Uploader: Upload Git History (force sync bundle)" + }, { "command": "contextEngineUploader.writeMcpConfig", "title": "Context Engine Uploader: Write MCP Config (.mcp.json)" @@ -151,6 +155,17 @@ "default": false, "description": "Enable writing Windsurf's global MCP config (requires Windsurf or compatible clients)." }, + "contextEngineUploader.autoWriteMcpConfigOnStartup": { + "type": "boolean", + "default": false, + "description": "When enabled, automatically run 'Write MCP Config' on extension activation to keep .mcp.json, Windsurf mcp_config.json, and .claude/settings.local.json in sync with settings and the current extension version." + }, + "contextEngineUploader.mcpTransportMode": { + "type": "string", + "enum": ["sse-remote", "http"], + "default": "http", + "description": "Transport mode for Claude/Windsurf MCP configs: SSE via mcp-remote (sse-remote) or direct HTTP /mcp endpoints (http)." + }, "contextEngineUploader.mcpIndexerUrl": { "type": "string", "default": "http://localhost:8001/sse", @@ -205,6 +220,17 @@ "type": "string", "default": "glm-4.6", "description": "GLM model name (GLM_MODEL) used by refrag_glm/ctx.py when REFRAG_RUNTIME=glm." }, + "contextEngineUploader.gitMaxCommits": { + "type": "number", + "default": 500, + "minimum": 0, + "description": "Upper bound for REMOTE_UPLOAD_GIT_MAX_COMMITS passed to the upload clients via .env/child process environment. Set to 0 to disable git history collection." 
+ }, + "contextEngineUploader.gitSince": { + "type": "string", + "default": "", + "description": "Optional REMOTE_UPLOAD_GIT_SINCE constraint (e.g. '2 years ago' or '2023-01-01') passed to the upload clients via .env/child process environment. Leave empty to use the clients' default behavior." } } }