From aa4b02d535d782c2664a17942d02ef7f29455d29 Mon Sep 17 00:00:00 2001 From: itsmostafa Date: Fri, 1 May 2026 23:38:23 -0600 Subject: [PATCH 01/16] fix(indexer): reactivate deactivated documents when file is restored Previously, indexFile only searched for active=1 documents. When a deactivated row existed for the same (collection, path), the subsequent INSERT violated the UNIQUE(collection, path) constraint and the file silently stayed unindexed. Fix: drop the active=1 filter from the existence lookup and include the active column. A deactivated row is now reactivated via the existing update branch (which already regenerates chunks), and its stats are counted as FilesAdded since the file was not searchable before. --- internal/indexer/indexer.go | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/internal/indexer/indexer.go b/internal/indexer/indexer.go index 3e43b75..98085e7 100644 --- a/internal/indexer/indexer.go +++ b/internal/indexer/indexer.go @@ -145,15 +145,16 @@ func (idx *Indexer) indexFile(ctx context.Context, col config.Collection, relPat hash := sha256sum(data) - // Check if document exists and unchanged + // Check if document exists (active or deactivated) and whether content changed var existingHash string var docID int64 + var existingActive int row := idx.db.QueryRowContext(ctx, - `SELECT id, content_hash FROM documents WHERE collection=? AND path=? AND active=1`, + `SELECT id, content_hash, active FROM documents WHERE collection=? AND path=?`, col.Name, relPath) - _ = row.Scan(&docID, &existingHash) + _ = row.Scan(&docID, &existingHash, &existingActive) - if existingHash == hash { + if existingActive == 1 && existingHash == hash { return nil // unchanged } @@ -198,9 +199,9 @@ func (idx *Indexer) indexFile(ctx context.Context, col config.Collection, relPat newDocID, _ = res.LastInsertId() stats.FilesAdded++ } else { - // Update + // Update (or reactivate a previously deactivated document) _, err = tx.ExecContext(ctx, - `UPDATE documents SET title=?, content_hash=?, updated_at=datetime('now') WHERE id=?`, + `UPDATE documents SET title=?, content_hash=?, active=1, updated_at=datetime('now') WHERE id=?`, title, hash, docID) if err != nil { return fmt.Errorf("updating document: %w", err) @@ -210,7 +211,11 @@ func (idx *Indexer) indexFile(ctx context.Context, col config.Collection, relPat if _, err := tx.ExecContext(ctx, `DELETE FROM chunks WHERE doc_id=?`, docID); err != nil { return fmt.Errorf("deleting old chunks: %w", err) } - stats.FilesUpdated++ + if existingActive == 0 { + stats.FilesAdded++ + } else { + stats.FilesUpdated++ + } } // Insert chunks From 964166d38b1d486f5af146048040cab19ec9d4b5 Mon Sep 17 00:00:00 2001 From: itsmostafa Date: Fri, 1 May 2026 23:38:26 -0600 Subject: [PATCH 02/16] test(indexer): add regression test for re-indexing a restored file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Covers the full delete-then-restore cycle: index → remove from disk → re-index (FilesRemoved=1) → restore with new content → re-index (FilesAdded=1, active=1, chunks populated). --- internal/indexer/indexer_test.go | 62 ++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/internal/indexer/indexer_test.go b/internal/indexer/indexer_test.go index 384fc22..0cab8c2 100644 --- a/internal/indexer/indexer_test.go +++ b/internal/indexer/indexer_test.go @@ -103,6 +103,68 @@ func TestIndexer_IncrementalUpdate(t *testing.T) { } } +func TestIndexer_ReindexAfterDeletion(t *testing.T) { + database := openTestDB(t) + idx := New(database, 256) + dir := t.TempDir() + path := filepath.Join(dir, "doc.md") + col := config.Collection{Name: "test", Path: dir, Extensions: []string{".md"}} + + // Index the file + if err := os.WriteFile(path, []byte("# Original\nOriginal content."), 0o640); err != nil { + t.Fatal(err) + } + if _, err := idx.Index(context.Background(), col); err != nil { + t.Fatal(err) + } + + // Delete the file → deactivates the document + if err := os.Remove(path); err != nil { + t.Fatal(err) + } + stats, err := idx.Index(context.Background(), col) + if err != nil { + t.Fatal(err) + } + if stats.FilesRemoved != 1 { + t.Fatalf("expected 1 removed, got %d", stats.FilesRemoved) + } + + // Restore the file with new content + if err := os.WriteFile(path, []byte("# Restored\nRestored content."), 0o640); err != nil { + t.Fatal(err) + } + stats, err = idx.Index(context.Background(), col) + if err != nil { + t.Fatalf("index after restore: %v", err) + } + if stats.FilesAdded != 1 { + t.Errorf("expected 1 added after restore, got %d", stats.FilesAdded) + } + + // Verify the document is active with new hash and has chunks + var active int + var hash string + row := database.QueryRowContext(context.Background(), + `SELECT active, content_hash FROM documents WHERE collection='test' AND path='doc.md'`) + if err := row.Scan(&active, &hash); err != nil { + t.Fatalf("querying restored document: %v", err) + } + if active != 1 { + t.Errorf("expected active=1, got %d", active) + } + + var chunkCount int + cRow := database.QueryRowContext(context.Background(), + `SELECT COUNT(*) FROM chunks c JOIN documents d ON d.id=c.doc_id WHERE d.collection='test' AND d.path='doc.md' AND d.active=1`) + if err := cRow.Scan(&chunkCount); err != nil { + t.Fatalf("querying chunks: %v", err) + } + if chunkCount == 0 { + t.Error("expected at least one chunk after restore") + } +} + func TestIndexer_DeactivatesMissingFiles(t *testing.T) { database := openTestDB(t) idx := New(database, 256) From 07a05e87e9940544e357dffea23efe8a35a7e616 Mon Sep 17 00:00:00 2001 From: itsmostafa Date: Fri, 1 May 2026 23:40:35 -0600 Subject: [PATCH 03/16] fix(db): add ON DELETE CASCADE to chunk_vectors and embeddings chunk_vectors and embeddings referenced chunks(id) with NO ACTION, causing FK violations (and a silent rollback) whenever a changed document was reindexed while embeddings existed. SQLite does not support ALTER TABLE to change FK actions, so this migration rebuilds both tables with the correct ON DELETE CASCADE constraint. --- .../db/migrations/003_cascade_chunk_refs.sql | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 internal/db/migrations/003_cascade_chunk_refs.sql diff --git a/internal/db/migrations/003_cascade_chunk_refs.sql b/internal/db/migrations/003_cascade_chunk_refs.sql new file mode 100644 index 0000000..1741959 --- /dev/null +++ b/internal/db/migrations/003_cascade_chunk_refs.sql @@ -0,0 +1,30 @@ +-- Add ON DELETE CASCADE to chunk_vectors and embeddings so reindexing a +-- changed document (which deletes its chunks) does not fail or orphan rows. +-- SQLite requires a table rebuild to change foreign key actions. + +PRAGMA foreign_keys=OFF; + +CREATE TABLE chunk_vectors_new ( + chunk_id INTEGER PRIMARY KEY REFERENCES chunks(id) ON DELETE CASCADE, + vector BLOB NOT NULL +); +INSERT INTO chunk_vectors_new(chunk_id, vector) + SELECT chunk_id, vector FROM chunk_vectors; +DROP TABLE chunk_vectors; +ALTER TABLE chunk_vectors_new RENAME TO chunk_vectors; + +CREATE TABLE embeddings_new ( + chunk_id INTEGER PRIMARY KEY REFERENCES chunks(id) ON DELETE CASCADE, + provider TEXT NOT NULL, + model TEXT NOT NULL, + dimension INTEGER NOT NULL, + embedded_at TEXT NOT NULL DEFAULT (datetime('now')) +); +INSERT INTO embeddings_new(chunk_id, provider, model, dimension, embedded_at) + SELECT chunk_id, provider, model, dimension, embedded_at FROM embeddings; +DROP TABLE embeddings; +ALTER TABLE embeddings_new RENAME TO embeddings; + +PRAGMA foreign_keys=ON; + +INSERT OR IGNORE INTO schema_version(version) VALUES (3); From 3b6a63dd079a4e75588b3a839ae26d01679f8f66 Mon Sep 17 00:00:00 2001 From: itsmostafa Date: Fri, 1 May 2026 23:40:40 -0600 Subject: [PATCH 04/16] test(indexer): regression test for reindex with embeddings present MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TestIndexer_ReindexWithEmbeddings indexes a file, inserts chunk_vectors and embeddings rows for a real chunk, modifies the file, reindexes, and asserts FilesUpdated==1 with zero orphan rows — the failure mode this change prevents. --- internal/indexer/indexer_test.go | 65 ++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/internal/indexer/indexer_test.go b/internal/indexer/indexer_test.go index 0cab8c2..8879687 100644 --- a/internal/indexer/indexer_test.go +++ b/internal/indexer/indexer_test.go @@ -103,6 +103,71 @@ func TestIndexer_IncrementalUpdate(t *testing.T) { } } +func TestIndexer_ReindexWithEmbeddings(t *testing.T) { + database := openTestDB(t) + idx := New(database, 256) + dir := t.TempDir() + path := filepath.Join(dir, "doc.md") + col := config.Collection{Name: "test", Path: dir, Extensions: []string{".md"}} + + if err := os.WriteFile(path, []byte("# Original\nOriginal content."), 0o640); err != nil { + t.Fatal(err) + } + if _, err := idx.Index(context.Background(), col); err != nil { + t.Fatal(err) + } + + // Find a chunk for this document and insert a fake embedding. + var chunkID int64 + row := database.QueryRowContext(context.Background(), + `SELECT c.id FROM chunks c JOIN documents d ON d.id=c.doc_id + WHERE d.collection='test' AND d.path='doc.md' LIMIT 1`) + if err := row.Scan(&chunkID); err != nil { + t.Fatalf("finding chunk: %v", err) + } + if err := database.InsertEmbedding(context.Background(), chunkID, []float32{0.1, 0.2, 0.3, 0.4}); err != nil { + t.Fatalf("inserting chunk_vector: %v", err) + } + if _, err := database.ExecContext(context.Background(), + `INSERT INTO embeddings(chunk_id, provider, model, dimension) VALUES (?, 'test', 'test-model', 4)`, + chunkID); err != nil { + t.Fatalf("inserting embeddings row: %v", err) + } + + // Modify the file so its hash changes. + if err := os.WriteFile(path, []byte("# Updated\nUpdated content."), 0o640); err != nil { + t.Fatal(err) + } + + // Reindex must succeed and report the file as updated. + stats, err := idx.Index(context.Background(), col) + if err != nil { + t.Fatalf("reindex failed: %v", err) + } + if stats.FilesUpdated != 1 { + t.Errorf("expected 1 updated, got %d", stats.FilesUpdated) + } + + // No orphaned rows should remain in chunk_vectors or embeddings. + var orphanVectors int + if err := database.QueryRowContext(context.Background(), + `SELECT COUNT(*) FROM chunk_vectors WHERE chunk_id NOT IN (SELECT id FROM chunks)`).Scan(&orphanVectors); err != nil { + t.Fatalf("querying orphan chunk_vectors: %v", err) + } + if orphanVectors != 0 { + t.Errorf("expected 0 orphan chunk_vectors rows, got %d", orphanVectors) + } + + var orphanEmbeddings int + if err := database.QueryRowContext(context.Background(), + `SELECT COUNT(*) FROM embeddings WHERE chunk_id NOT IN (SELECT id FROM chunks)`).Scan(&orphanEmbeddings); err != nil { + t.Fatalf("querying orphan embeddings: %v", err) + } + if orphanEmbeddings != 0 { + t.Errorf("expected 0 orphan embeddings rows, got %d", orphanEmbeddings) + } +} + func TestIndexer_ReindexAfterDeletion(t *testing.T) { database := openTestDB(t) idx := New(database, 256) From 3c63343abdb6b76fc35d4b477159d70e2385bdaa Mon Sep 17 00:00:00 2001 From: itsmostafa Date: Fri, 1 May 2026 23:42:03 -0600 Subject: [PATCH 05/16] fix(db): make migration 003 idempotent with DROP IF EXISTS guards If the migration failed mid-run (after creating embeddings_new but before completing), schema_version stayed at 2 and the next run would fail with "table embeddings_new already exists". Adding DROP TABLE IF EXISTS before each CREATE TABLE makes the migration safe to retry. --- internal/db/migrations/003_cascade_chunk_refs.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/db/migrations/003_cascade_chunk_refs.sql b/internal/db/migrations/003_cascade_chunk_refs.sql index 1741959..d33d102 100644 --- a/internal/db/migrations/003_cascade_chunk_refs.sql +++ b/internal/db/migrations/003_cascade_chunk_refs.sql @@ -4,6 +4,7 @@ PRAGMA foreign_keys=OFF; +DROP TABLE IF EXISTS chunk_vectors_new; CREATE TABLE chunk_vectors_new ( chunk_id INTEGER PRIMARY KEY REFERENCES chunks(id) ON DELETE CASCADE, vector BLOB NOT NULL @@ -13,6 +14,7 @@ INSERT INTO chunk_vectors_new(chunk_id, vector) DROP TABLE chunk_vectors; ALTER TABLE chunk_vectors_new RENAME TO chunk_vectors; +DROP TABLE IF EXISTS embeddings_new; CREATE TABLE embeddings_new ( chunk_id INTEGER PRIMARY KEY REFERENCES chunks(id) ON DELETE CASCADE, provider TEXT NOT NULL, From 6021475092ccbfd8c79f2c5fcdad090e79c7e879 Mon Sep 17 00:00:00 2001 From: itsmostafa Date: Fri, 1 May 2026 23:43:36 -0600 Subject: [PATCH 06/16] fix(db): handle missing dimension column in legacy embeddings table Migration 002 used CREATE TABLE IF NOT EXISTS, which was a no-op on databases where embeddings already existed without the dimension column. The INSERT in migration 003 then failed with "no such column: dimension". Fix: give dimension a DEFAULT 0 and omit it from the INSERT select list so the migration works regardless of the source table's schema. --- internal/db/migrations/003_cascade_chunk_refs.sql | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/internal/db/migrations/003_cascade_chunk_refs.sql b/internal/db/migrations/003_cascade_chunk_refs.sql index d33d102..11496ab 100644 --- a/internal/db/migrations/003_cascade_chunk_refs.sql +++ b/internal/db/migrations/003_cascade_chunk_refs.sql @@ -19,11 +19,13 @@ CREATE TABLE embeddings_new ( chunk_id INTEGER PRIMARY KEY REFERENCES chunks(id) ON DELETE CASCADE, provider TEXT NOT NULL, model TEXT NOT NULL, - dimension INTEGER NOT NULL, + dimension INTEGER NOT NULL DEFAULT 0, embedded_at TEXT NOT NULL DEFAULT (datetime('now')) ); -INSERT INTO embeddings_new(chunk_id, provider, model, dimension, embedded_at) - SELECT chunk_id, provider, model, dimension, embedded_at FROM embeddings; +-- Omit dimension: older DBs may not have it (002 used CREATE TABLE IF NOT EXISTS, +-- which was a no-op when the table already existed without that column). +INSERT INTO embeddings_new(chunk_id, provider, model, embedded_at) + SELECT chunk_id, provider, model, embedded_at FROM embeddings; DROP TABLE embeddings; ALTER TABLE embeddings_new RENAME TO embeddings; From 36a17ca643387773025713c6f501884a4ec9fcc0 Mon Sep 17 00:00:00 2001 From: itsmostafa Date: Fri, 1 May 2026 23:47:56 -0600 Subject: [PATCH 07/16] feat(indexer): skip all dot-directories during walk Dot-prefixed directories (e.g. .venv, .cache, .mypy_cache) are never user content, so skip them unconditionally rather than relying on an enumerated denylist. --- internal/indexer/indexer.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/internal/indexer/indexer.go b/internal/indexer/indexer.go index 98085e7..b1db939 100644 --- a/internal/indexer/indexer.go +++ b/internal/indexer/indexer.go @@ -95,7 +95,8 @@ func (idx *Indexer) Index(ctx context.Context, col config.Collection) (Stats, er return err } if d.IsDir() { - if defaultIgnoreDirs[d.Name()] || ignoreSet[d.Name()] { + name := d.Name() + if defaultIgnoreDirs[name] || ignoreSet[name] || (strings.HasPrefix(name, ".") && name != ".") { return filepath.SkipDir } return nil From 974548d33afd34d952a59703e6632a31fadbf721 Mon Sep 17 00:00:00 2001 From: itsmostafa Date: Fri, 1 May 2026 23:51:49 -0600 Subject: [PATCH 08/16] docs(skills): add command selection guidance to qi-cli skill Adds a section advising when to use qi search/query vs qi ask, and marks qi ask as to be used sparingly since it consumes LLM tokens. --- skills/qi-cli/SKILL.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/skills/qi-cli/SKILL.md b/skills/qi-cli/SKILL.md index b084669..38f1c4f 100644 --- a/skills/qi-cli/SKILL.md +++ b/skills/qi-cli/SKILL.md @@ -14,11 +14,19 @@ qi index # Index the current directory, or a na qi doctor # Verify setup qi search "your query" # BM25 keyword search (no provider needed) qi query "your semantic question" # Hybrid search (needs embedding provider) -qi ask "what does X do?" # RAG Q&A (needs generation provider) +qi ask "what does X do?" # RAG Q&A; use sparingly (needs generation provider) ``` --- +## Command selection guidance + +Prefer `qi index` when the task is about adding, refreshing, or organizing source material. +Prefer `qi search` or `qi query` when the task is about finding relevant documents, passages, or citations. +Use `qi ask` sparingly, only when the user specifically needs a synthesized answer from an LLM rather than retrieved source results. + +--- + ## Commands ### `qi init` @@ -67,6 +75,7 @@ qi query "question" --explain # show BM25/vector/RRF score breakdo ### `qi ask ` RAG Q&A: searches the knowledge base, sends relevant chunks to an LLM, returns an answer with citations. +Use this sparingly; prefer `qi query` for normal exploration, evidence gathering, and source lookup. ```bash qi ask "What authentication methods are supported?" @@ -240,9 +249,10 @@ qi index notes qi query "how does X work" --explain ``` -**RAG Q&A:** +**RAG Q&A (use sparingly):** ```bash # also add a generation provider to config +# prefer qi query unless you need a synthesized answer qi ask "Summarize the key decisions in my notes" ``` From 16976bb4121200ff541aa74070dff54d9c3d35dd Mon Sep 17 00:00:00 2001 From: itsmostafa Date: Fri, 1 May 2026 23:51:52 -0600 Subject: [PATCH 09/16] chore(plugin): bump plugin version to 0.4.0 --- .claude-plugin/marketplace.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 71e25a4..0ff8c54 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -8,7 +8,7 @@ "name": "qi", "source": "./", "description": "Local knowledge search CLI — index documents and search them using BM25 full-text search, vector embeddings, and LLM-powered Q&A, all running locally with no external dependencies.", - "version": "0.3.0", + "version": "0.4.0", "author": { "name": "itsmostafa" }, From 22060750d63def046cf5c3a2d93933e6cb60bb6b Mon Sep 17 00:00:00 2001 From: itsmostafa Date: Fri, 1 May 2026 23:54:43 -0600 Subject: [PATCH 10/16] update readme --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 722fbd7..9bdb0d9 100644 --- a/README.md +++ b/README.md @@ -105,7 +105,6 @@ qi doctor | `qi delete ` | Delete a named collection and all its indexed data | | `qi stats` | Show index statistics | | `qi doctor` | Health check | -| `qi version` | Print version | ## Search Modes From 70f0d9f2892a12f8529722e2dce0845746c4bd51 Mon Sep 17 00:00:00 2001 From: itsmostafa Date: Sat, 2 May 2026 00:01:07 -0600 Subject: [PATCH 11/16] docs(readme): rewrite tagline for clarity and brevity --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9bdb0d9..d30a957 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ qi logo

-A local-first knowledge search CLI for macOS and Linux. Index and search anything — codebases, documentation, research papers, notes, wikis, datasets, logs, contracts, books — using BM25 full-text search, vector embeddings, and LLM-powered Q&A. Choose your own models via Ollama, LM Studio, llama.cpp, MLX or using OpenAI's cloud models. +qi is an ultra-fast knowledge search CLI for your files on your local machine. No dependencies, no runtime, just a single executable that indexes code, docs, notes, papers, logs, and other text into SQLite, then gives you BM25 search, optional vector search, and grounded LLM Q&A with citations. Use it offline with Ollama, LM Studio, llama.cpp, or MLX, or connect OpenAI for cloud models. ## Features From 83217e3478d889d7e2e6f8bebb81234d3b07f44c Mon Sep 17 00:00:00 2001 From: itsmostafa Date: Sat, 2 May 2026 00:03:13 -0600 Subject: [PATCH 12/16] docs(readme): add AI agent token-saving use case blurb --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index d30a957..0c9c99c 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,8 @@ qi is an ultra-fast knowledge search CLI for your files on your local machine. No dependencies, no runtime, just a single executable that indexes code, docs, notes, papers, logs, and other text into SQLite, then gives you BM25 search, optional vector search, and grounded LLM Q&A with citations. Use it offline with Ollama, LM Studio, llama.cpp, or MLX, or connect OpenAI for cloud models. +Save tokens by delagating some of your AI Agent's work to qi. + ## Features - **Blazing-fast full-text search** — BM25 via SQLite FTS5, no external search engine required From 655480daf0436893476ae02e150c8a7aa7bb541b Mon Sep 17 00:00:00 2001 From: itsmostafa Date: Sat, 2 May 2026 21:53:38 -0600 Subject: [PATCH 13/16] docs(readme): fix typo delegating --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0c9c99c..ac5a49a 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ qi is an ultra-fast knowledge search CLI for your files on your local machine. No dependencies, no runtime, just a single executable that indexes code, docs, notes, papers, logs, and other text into SQLite, then gives you BM25 search, optional vector search, and grounded LLM Q&A with citations. Use it offline with Ollama, LM Studio, llama.cpp, or MLX, or connect OpenAI for cloud models. -Save tokens by delagating some of your AI Agent's work to qi. +Save tokens by delegating some of your AI Agent's work to qi. ## Features From 1cc260747fea131a671cc52ab12b332102168956 Mon Sep 17 00:00:00 2001 From: itsmostafa Date: Sat, 2 May 2026 21:55:50 -0600 Subject: [PATCH 14/16] fix(db): preserve embedding dimension during migration 003 table rebuild The INSERT into embeddings_new omitted dimension, silently writing 0 for all migrated rows even when the source table had real values. Derive dimension from length(cv.vector)/4 via a LEFT JOIN on chunk_vectors so both old schemas (no dimension column) and current ones are handled correctly without touching a potentially-absent source column. --- internal/db/migrations/003_cascade_chunk_refs.sql | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/internal/db/migrations/003_cascade_chunk_refs.sql b/internal/db/migrations/003_cascade_chunk_refs.sql index 11496ab..c0e4bd8 100644 --- a/internal/db/migrations/003_cascade_chunk_refs.sql +++ b/internal/db/migrations/003_cascade_chunk_refs.sql @@ -22,10 +22,14 @@ CREATE TABLE embeddings_new ( dimension INTEGER NOT NULL DEFAULT 0, embedded_at TEXT NOT NULL DEFAULT (datetime('now')) ); --- Omit dimension: older DBs may not have it (002 used CREATE TABLE IF NOT EXISTS, --- which was a no-op when the table already existed without that column). -INSERT INTO embeddings_new(chunk_id, provider, model, embedded_at) - SELECT chunk_id, provider, model, embedded_at FROM embeddings; +-- Derive dimension from vector length (float32 = 4 bytes each) so that both +-- old schemas (no dimension column) and new ones are handled correctly. +INSERT INTO embeddings_new(chunk_id, provider, model, dimension, embedded_at) + SELECT e.chunk_id, e.provider, e.model, + COALESCE(length(cv.vector)/4, 0), + e.embedded_at + FROM embeddings e + LEFT JOIN chunk_vectors cv ON cv.chunk_id = e.chunk_id; DROP TABLE embeddings; ALTER TABLE embeddings_new RENAME TO embeddings; From d58e3d6559bc2461b5b098add1b88ca034e5c31b Mon Sep 17 00:00:00 2001 From: itsmostafa Date: Sat, 2 May 2026 21:58:14 -0600 Subject: [PATCH 15/16] fix(indexer): return error on non-ErrNoRows scan failure in indexFile Swallowing the Scan error left docID=0 on any transient DB failure, causing a fallthrough to INSERT that would hit the UNIQUE(collection,path) constraint and silently leave the file unindexed. --- internal/indexer/indexer.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/internal/indexer/indexer.go b/internal/indexer/indexer.go index b1db939..848b26d 100644 --- a/internal/indexer/indexer.go +++ b/internal/indexer/indexer.go @@ -5,6 +5,7 @@ import ( "crypto/sha256" "database/sql" "encoding/hex" + "errors" "fmt" "io/fs" "log/slog" @@ -153,7 +154,9 @@ func (idx *Indexer) indexFile(ctx context.Context, col config.Collection, relPat row := idx.db.QueryRowContext(ctx, `SELECT id, content_hash, active FROM documents WHERE collection=? AND path=?`, col.Name, relPath) - _ = row.Scan(&docID, &existingHash, &existingActive) + if err := row.Scan(&docID, &existingHash, &existingActive); err != nil && !errors.Is(err, sql.ErrNoRows) { + return fmt.Errorf("looking up existing document: %w", err) + } if existingActive == 1 && existingHash == hash { return nil // unchanged From 40c23765a076d44de4fca87d601315d6a5354456 Mon Sep 17 00:00:00 2001 From: itsmostafa Date: Sat, 2 May 2026 22:03:35 -0600 Subject: [PATCH 16/16] fix(indexer): preserve embeddings when reactivating unchanged documents When a deactivated document was restored with byte-identical content, the indexer still ran DELETE FROM chunks, which cascades into chunk_vectors and embeddings (added by migration 003), forcing unnecessary re-embedding work. Add a fast-path for `docID != 0 && existingActive == 0 && existingHash == hash`: reactivate the document row and return without touching chunks or embeddings. Adds TestIndexer_ReactivateSameContent to guard the preserved-embedding invariant. --- internal/indexer/indexer.go | 12 ++++ internal/indexer/indexer_test.go | 106 +++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) diff --git a/internal/indexer/indexer.go b/internal/indexer/indexer.go index 848b26d..aa28e6d 100644 --- a/internal/indexer/indexer.go +++ b/internal/indexer/indexer.go @@ -162,6 +162,18 @@ func (idx *Indexer) indexFile(ctx context.Context, col config.Collection, relPat return nil // unchanged } + // Fast-path: previously deactivated document restored with byte-identical content. + // Reactivate the row without touching chunks or embeddings — deleting chunks would + // cascade into chunk_vectors/embeddings (migration 003) and force pointless re-embedding. + if docID != 0 && existingActive == 0 && existingHash == hash { + if _, err := idx.db.ExecContext(ctx, + `UPDATE documents SET active=1, updated_at=datetime('now') WHERE id=?`, docID); err != nil { + return fmt.Errorf("reactivating document: %w", err) + } + stats.FilesAdded++ + return nil + } + // Upsert content if _, err := idx.db.ExecContext(ctx, `INSERT OR IGNORE INTO content(hash, body) VALUES (?, ?)`, diff --git a/internal/indexer/indexer_test.go b/internal/indexer/indexer_test.go index 8879687..22f3def 100644 --- a/internal/indexer/indexer_test.go +++ b/internal/indexer/indexer_test.go @@ -230,6 +230,112 @@ func TestIndexer_ReindexAfterDeletion(t *testing.T) { } } +func TestIndexer_ReactivateSameContent(t *testing.T) { + database := openTestDB(t) + idx := New(database, 256) + dir := t.TempDir() + path := filepath.Join(dir, "doc.md") + col := config.Collection{Name: "test", Path: dir, Extensions: []string{".md"}} + + body := []byte("# Original\nOriginal content.") + if err := os.WriteFile(path, body, 0o640); err != nil { + t.Fatal(err) + } + if _, err := idx.Index(context.Background(), col); err != nil { + t.Fatal(err) + } + + // Capture chunk IDs and seed an embedding to detect spurious deletion. + rows, err := database.QueryContext(context.Background(), + `SELECT c.id FROM chunks c JOIN documents d ON d.id=c.doc_id + WHERE d.collection='test' AND d.path='doc.md' ORDER BY c.id`) + if err != nil { + t.Fatalf("listing chunks: %v", err) + } + var originalChunkIDs []int64 + for rows.Next() { + var id int64 + if err := rows.Scan(&id); err != nil { + t.Fatal(err) + } + originalChunkIDs = append(originalChunkIDs, id) + } + rows.Close() + if len(originalChunkIDs) == 0 { + t.Fatal("expected at least one chunk after initial index") + } + + seedID := originalChunkIDs[0] + if err := database.InsertEmbedding(context.Background(), seedID, []float32{0.1, 0.2, 0.3, 0.4}); err != nil { + t.Fatalf("inserting chunk_vector: %v", err) + } + if _, err := database.ExecContext(context.Background(), + `INSERT INTO embeddings(chunk_id, provider, model, dimension) VALUES (?, 'test', 'test-model', 4)`, + seedID); err != nil { + t.Fatalf("inserting embeddings row: %v", err) + } + + // Delete the file → deactivates the document. + if err := os.Remove(path); err != nil { + t.Fatal(err) + } + if _, err := idx.Index(context.Background(), col); err != nil { + t.Fatal(err) + } + + // Restore byte-identical content. + if err := os.WriteFile(path, body, 0o640); err != nil { + t.Fatal(err) + } + stats, err := idx.Index(context.Background(), col) + if err != nil { + t.Fatalf("index after restore: %v", err) + } + if stats.FilesAdded != 1 { + t.Errorf("expected 1 added, got %d", stats.FilesAdded) + } + + // Document must be active again. + var active int + if err := database.QueryRowContext(context.Background(), + `SELECT active FROM documents WHERE collection='test' AND path='doc.md'`). + Scan(&active); err != nil { + t.Fatalf("querying restored document: %v", err) + } + if active != 1 { + t.Errorf("expected active=1, got %d", active) + } + + // Chunk ID must be preserved — proves DELETE FROM chunks did not run. + var preservedCount int + if err := database.QueryRowContext(context.Background(), + `SELECT COUNT(*) FROM chunks WHERE id = ?`, seedID).Scan(&preservedCount); err != nil { + t.Fatalf("querying preserved chunk: %v", err) + } + if preservedCount != 1 { + t.Fatalf("expected seed chunk %d to survive restore, got count %d", seedID, preservedCount) + } + + // Embedding and vector for the seed chunk must still exist. + var embCount int + if err := database.QueryRowContext(context.Background(), + `SELECT COUNT(*) FROM embeddings WHERE chunk_id = ?`, seedID).Scan(&embCount); err != nil { + t.Fatalf("querying embedding: %v", err) + } + if embCount != 1 { + t.Errorf("expected embedding for chunk %d to survive restore, got %d", seedID, embCount) + } + + var vecCount int + if err := database.QueryRowContext(context.Background(), + `SELECT COUNT(*) FROM chunk_vectors WHERE chunk_id = ?`, seedID).Scan(&vecCount); err != nil { + t.Fatalf("querying chunk_vector: %v", err) + } + if vecCount != 1 { + t.Errorf("expected chunk_vector for chunk %d to survive restore, got %d", seedID, vecCount) + } +} + func TestIndexer_DeactivatesMissingFiles(t *testing.T) { database := openTestDB(t) idx := New(database, 256)