Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .claude-plugin/marketplace.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"name": "qi",
"source": "./",
"description": "Local knowledge search CLI — index documents and search them using BM25 full-text search, vector embeddings, and LLM-powered Q&A, all running locally with no external dependencies.",
"version": "0.3.0",
"version": "0.4.0",
Comment thread
itsmostafa marked this conversation as resolved.
"author": {
"name": "itsmostafa"
},
Expand Down
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
<img src="assets/img/qi-logo.png" alt="qi logo" width="200" />
</p>

A local-first knowledge search CLI for macOS and Linux. Index and search anything — codebases, documentation, research papers, notes, wikis, datasets, logs, contracts, books — using BM25 full-text search, vector embeddings, and LLM-powered Q&A. Choose your own models via Ollama, LM Studio, llama.cpp, MLX or using OpenAI's cloud models.
qi is an ultra-fast knowledge search CLI for the files on your local machine. It has no dependencies and no runtime — just a single executable that indexes code, docs, notes, papers, logs, and other text into SQLite, then gives you BM25 search, optional vector search, and grounded LLM Q&A with citations. Use it offline with Ollama, LM Studio, llama.cpp, or MLX, or connect OpenAI for cloud models.

Save tokens by delegating some of your AI agent's work to qi.

## Features

Expand Down Expand Up @@ -105,7 +107,6 @@ qi doctor
| `qi delete <collection>` | Delete a named collection and all its indexed data |
| `qi stats` | Show index statistics |
| `qi doctor` | Health check |
| `qi version` | Print version |

## Search Modes

Expand Down
38 changes: 38 additions & 0 deletions internal/db/migrations/003_cascade_chunk_refs.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
-- Add ON DELETE CASCADE to chunk_vectors and embeddings so reindexing a
-- changed document (which deletes its chunks) does not fail or orphan rows.
-- SQLite requires a table rebuild to change foreign key actions.

Comment thread
itsmostafa marked this conversation as resolved.
-- Disable FK enforcement so the parent tables can be dropped and renamed
-- mid-rebuild without constraint errors.
-- NOTE(review): PRAGMA foreign_keys is a no-op while a transaction is open;
-- confirm the migration runner executes this file outside a transaction.
PRAGMA foreign_keys=OFF;

-- Rebuild chunk_vectors with ON DELETE CASCADE on chunk_id.
-- (Dropped first in case a previous failed run left the scratch table behind.)
DROP TABLE IF EXISTS chunk_vectors_new;
CREATE TABLE chunk_vectors_new (
chunk_id INTEGER PRIMARY KEY REFERENCES chunks(id) ON DELETE CASCADE,
vector BLOB NOT NULL
);
INSERT INTO chunk_vectors_new(chunk_id, vector)
SELECT chunk_id, vector FROM chunk_vectors;
DROP TABLE chunk_vectors;
ALTER TABLE chunk_vectors_new RENAME TO chunk_vectors;

-- Rebuild embeddings the same way; chunk_vectors must already be rebuilt
-- above because the backfill below joins against it.
DROP TABLE IF EXISTS embeddings_new;
CREATE TABLE embeddings_new (
chunk_id INTEGER PRIMARY KEY REFERENCES chunks(id) ON DELETE CASCADE,
provider TEXT NOT NULL,
model TEXT NOT NULL,
dimension INTEGER NOT NULL DEFAULT 0,
embedded_at TEXT NOT NULL DEFAULT (datetime('now'))
);
-- Derive dimension from vector length (float32 = 4 bytes each) so that both
-- old schemas (no dimension column) and new ones are handled correctly.
-- LEFT JOIN + COALESCE: rows with no stored vector fall back to dimension 0.
INSERT INTO embeddings_new(chunk_id, provider, model, dimension, embedded_at)
SELECT e.chunk_id, e.provider, e.model,
COALESCE(length(cv.vector)/4, 0),
e.embedded_at
FROM embeddings e
LEFT JOIN chunk_vectors cv ON cv.chunk_id = e.chunk_id;
DROP TABLE embeddings;
ALTER TABLE embeddings_new RENAME TO embeddings;

-- Re-enable FK enforcement now that both rebuilds are complete.
PRAGMA foreign_keys=ON;

-- Record this migration; OR IGNORE keeps the script idempotent on re-run.
INSERT OR IGNORE INTO schema_version(version) VALUES (3);
37 changes: 29 additions & 8 deletions internal/indexer/indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"crypto/sha256"
"database/sql"
"encoding/hex"
"errors"
"fmt"
"io/fs"
"log/slog"
Expand Down Expand Up @@ -95,7 +96,8 @@ func (idx *Indexer) Index(ctx context.Context, col config.Collection) (Stats, er
return err
}
if d.IsDir() {
if defaultIgnoreDirs[d.Name()] || ignoreSet[d.Name()] {
name := d.Name()
if defaultIgnoreDirs[name] || ignoreSet[name] || (strings.HasPrefix(name, ".") && name != ".") {
Comment thread
itsmostafa marked this conversation as resolved.
return filepath.SkipDir
}
return nil
Expand Down Expand Up @@ -145,18 +147,33 @@ func (idx *Indexer) indexFile(ctx context.Context, col config.Collection, relPat

hash := sha256sum(data)

// Check if document exists and unchanged
// Check if document exists (active or deactivated) and whether content changed
var existingHash string
var docID int64
var existingActive int
row := idx.db.QueryRowContext(ctx,
`SELECT id, content_hash FROM documents WHERE collection=? AND path=? AND active=1`,
`SELECT id, content_hash, active FROM documents WHERE collection=? AND path=?`,
col.Name, relPath)
_ = row.Scan(&docID, &existingHash)
if err := row.Scan(&docID, &existingHash, &existingActive); err != nil && !errors.Is(err, sql.ErrNoRows) {
return fmt.Errorf("looking up existing document: %w", err)
}

if existingHash == hash {
if existingActive == 1 && existingHash == hash {
return nil // unchanged
}

Comment thread
itsmostafa marked this conversation as resolved.
// Fast-path: previously deactivated document restored with byte-identical content.
// Reactivate the row without touching chunks or embeddings — deleting chunks would
// cascade into chunk_vectors/embeddings (migration 003) and force pointless re-embedding.
if docID != 0 && existingActive == 0 && existingHash == hash {
if _, err := idx.db.ExecContext(ctx,
`UPDATE documents SET active=1, updated_at=datetime('now') WHERE id=?`, docID); err != nil {
return fmt.Errorf("reactivating document: %w", err)
}
stats.FilesAdded++
return nil
}

// Upsert content
if _, err := idx.db.ExecContext(ctx,
`INSERT OR IGNORE INTO content(hash, body) VALUES (?, ?)`,
Expand Down Expand Up @@ -198,9 +215,9 @@ func (idx *Indexer) indexFile(ctx context.Context, col config.Collection, relPat
newDocID, _ = res.LastInsertId()
stats.FilesAdded++
} else {
// Update
// Update (or reactivate a previously deactivated document)
_, err = tx.ExecContext(ctx,
`UPDATE documents SET title=?, content_hash=?, updated_at=datetime('now') WHERE id=?`,
`UPDATE documents SET title=?, content_hash=?, active=1, updated_at=datetime('now') WHERE id=?`,
title, hash, docID)
if err != nil {
return fmt.Errorf("updating document: %w", err)
Expand All @@ -210,7 +227,11 @@ func (idx *Indexer) indexFile(ctx context.Context, col config.Collection, relPat
if _, err := tx.ExecContext(ctx, `DELETE FROM chunks WHERE doc_id=?`, docID); err != nil {
return fmt.Errorf("deleting old chunks: %w", err)
}
stats.FilesUpdated++
if existingActive == 0 {
stats.FilesAdded++
} else {
stats.FilesUpdated++
}
}

// Insert chunks
Expand Down
233 changes: 233 additions & 0 deletions internal/indexer/indexer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,239 @@ func TestIndexer_IncrementalUpdate(t *testing.T) {
}
}

// TestIndexer_ReindexWithEmbeddings verifies that reindexing a modified file
// whose chunks already carry vectors and embedding metadata succeeds and
// leaves no orphaned rows in chunk_vectors or embeddings (the cascade path
// introduced by migration 003).
func TestIndexer_ReindexWithEmbeddings(t *testing.T) {
	ctx := context.Background()
	db := openTestDB(t)
	ix := New(db, 256)
	tmp := t.TempDir()
	docPath := filepath.Join(tmp, "doc.md")
	coll := config.Collection{Name: "test", Path: tmp, Extensions: []string{".md"}}

	if err := os.WriteFile(docPath, []byte("# Original\nOriginal content."), 0o640); err != nil {
		t.Fatal(err)
	}
	if _, err := ix.Index(ctx, coll); err != nil {
		t.Fatal(err)
	}

	// Attach a fake vector and embedding record to one chunk of the document.
	var chunkID int64
	if err := db.QueryRowContext(ctx,
		`SELECT c.id FROM chunks c JOIN documents d ON d.id=c.doc_id
		WHERE d.collection='test' AND d.path='doc.md' LIMIT 1`).Scan(&chunkID); err != nil {
		t.Fatalf("finding chunk: %v", err)
	}
	if err := db.InsertEmbedding(ctx, chunkID, []float32{0.1, 0.2, 0.3, 0.4}); err != nil {
		t.Fatalf("inserting chunk_vector: %v", err)
	}
	if _, err := db.ExecContext(ctx,
		`INSERT INTO embeddings(chunk_id, provider, model, dimension) VALUES (?, 'test', 'test-model', 4)`,
		chunkID); err != nil {
		t.Fatalf("inserting embeddings row: %v", err)
	}

	// Rewrite the file so its content hash changes, then reindex; the reindex
	// must succeed and report the file as updated.
	if err := os.WriteFile(docPath, []byte("# Updated\nUpdated content."), 0o640); err != nil {
		t.Fatal(err)
	}
	stats, err := ix.Index(ctx, coll)
	if err != nil {
		t.Fatalf("reindex failed: %v", err)
	}
	if stats.FilesUpdated != 1 {
		t.Errorf("expected 1 updated, got %d", stats.FilesUpdated)
	}

	// Neither table may retain rows pointing at deleted chunks.
	var orphanVectors int
	if err := db.QueryRowContext(ctx,
		`SELECT COUNT(*) FROM chunk_vectors WHERE chunk_id NOT IN (SELECT id FROM chunks)`).Scan(&orphanVectors); err != nil {
		t.Fatalf("querying orphan chunk_vectors: %v", err)
	}
	if orphanVectors != 0 {
		t.Errorf("expected 0 orphan chunk_vectors rows, got %d", orphanVectors)
	}

	var orphanEmbeddings int
	if err := db.QueryRowContext(ctx,
		`SELECT COUNT(*) FROM embeddings WHERE chunk_id NOT IN (SELECT id FROM chunks)`).Scan(&orphanEmbeddings); err != nil {
		t.Fatalf("querying orphan embeddings: %v", err)
	}
	if orphanEmbeddings != 0 {
		t.Errorf("expected 0 orphan embeddings rows, got %d", orphanEmbeddings)
	}
}

// TestIndexer_ReindexAfterDeletion verifies the delete → restore lifecycle:
// removing a file deactivates its document, and re-creating the file with
// different content re-adds it as an active document with a recorded content
// hash and fresh chunks.
func TestIndexer_ReindexAfterDeletion(t *testing.T) {
	database := openTestDB(t)
	idx := New(database, 256)
	dir := t.TempDir()
	path := filepath.Join(dir, "doc.md")
	col := config.Collection{Name: "test", Path: dir, Extensions: []string{".md"}}

	// Index the file
	if err := os.WriteFile(path, []byte("# Original\nOriginal content."), 0o640); err != nil {
		t.Fatal(err)
	}
	if _, err := idx.Index(context.Background(), col); err != nil {
		t.Fatal(err)
	}

	// Delete the file → deactivates the document
	if err := os.Remove(path); err != nil {
		t.Fatal(err)
	}
	stats, err := idx.Index(context.Background(), col)
	if err != nil {
		t.Fatal(err)
	}
	if stats.FilesRemoved != 1 {
		t.Fatalf("expected 1 removed, got %d", stats.FilesRemoved)
	}

	// Restore the file with new content; the indexer must count it as added.
	if err := os.WriteFile(path, []byte("# Restored\nRestored content."), 0o640); err != nil {
		t.Fatal(err)
	}
	stats, err = idx.Index(context.Background(), col)
	if err != nil {
		t.Fatalf("index after restore: %v", err)
	}
	if stats.FilesAdded != 1 {
		t.Errorf("expected 1 added after restore, got %d", stats.FilesAdded)
	}

	// Verify the document is active with new hash and has chunks
	var active int
	var hash string
	row := database.QueryRowContext(context.Background(),
		`SELECT active, content_hash FROM documents WHERE collection='test' AND path='doc.md'`)
	if err := row.Scan(&active, &hash); err != nil {
		t.Fatalf("querying restored document: %v", err)
	}
	if active != 1 {
		t.Errorf("expected active=1, got %d", active)
	}
	// Fix: hash was previously scanned but never asserted; make sure a
	// content hash was actually recorded for the restored document.
	if hash == "" {
		t.Error("expected non-empty content_hash after restore")
	}

	var chunkCount int
	cRow := database.QueryRowContext(context.Background(),
		`SELECT COUNT(*) FROM chunks c JOIN documents d ON d.id=c.doc_id WHERE d.collection='test' AND d.path='doc.md' AND d.active=1`)
	if err := cRow.Scan(&chunkCount); err != nil {
		t.Fatalf("querying chunks: %v", err)
	}
	if chunkCount == 0 {
		t.Error("expected at least one chunk after restore")
	}
}

// TestIndexer_ReactivateSameContent verifies the reactivation fast path:
// deleting a file and restoring byte-identical content must flip the document
// back to active WITHOUT deleting its chunks — which would cascade into
// chunk_vectors/embeddings and force pointless re-embedding.
func TestIndexer_ReactivateSameContent(t *testing.T) {
	database := openTestDB(t)
	idx := New(database, 256)
	dir := t.TempDir()
	path := filepath.Join(dir, "doc.md")
	col := config.Collection{Name: "test", Path: dir, Extensions: []string{".md"}}

	body := []byte("# Original\nOriginal content.")
	if err := os.WriteFile(path, body, 0o640); err != nil {
		t.Fatal(err)
	}
	if _, err := idx.Index(context.Background(), col); err != nil {
		t.Fatal(err)
	}

	// Capture chunk IDs and seed an embedding to detect spurious deletion.
	rows, err := database.QueryContext(context.Background(),
		`SELECT c.id FROM chunks c JOIN documents d ON d.id=c.doc_id
		WHERE d.collection='test' AND d.path='doc.md' ORDER BY c.id`)
	if err != nil {
		t.Fatalf("listing chunks: %v", err)
	}
	var originalChunkIDs []int64
	for rows.Next() {
		var id int64
		if err := rows.Scan(&id); err != nil {
			t.Fatal(err)
		}
		originalChunkIDs = append(originalChunkIDs, id)
	}
	rows.Close()
	// Fix: an error during iteration previously went unchecked and could
	// silently truncate the chunk-ID list; rows.Err is valid after Close.
	if err := rows.Err(); err != nil {
		t.Fatalf("iterating chunks: %v", err)
	}
	if len(originalChunkIDs) == 0 {
		t.Fatal("expected at least one chunk after initial index")
	}

	// Seed one chunk with a vector and an embeddings row.
	seedID := originalChunkIDs[0]
	if err := database.InsertEmbedding(context.Background(), seedID, []float32{0.1, 0.2, 0.3, 0.4}); err != nil {
		t.Fatalf("inserting chunk_vector: %v", err)
	}
	if _, err := database.ExecContext(context.Background(),
		`INSERT INTO embeddings(chunk_id, provider, model, dimension) VALUES (?, 'test', 'test-model', 4)`,
		seedID); err != nil {
		t.Fatalf("inserting embeddings row: %v", err)
	}

	// Delete the file → deactivates the document.
	if err := os.Remove(path); err != nil {
		t.Fatal(err)
	}
	if _, err := idx.Index(context.Background(), col); err != nil {
		t.Fatal(err)
	}

	// Restore byte-identical content; the reactivation path must count it as added.
	if err := os.WriteFile(path, body, 0o640); err != nil {
		t.Fatal(err)
	}
	stats, err := idx.Index(context.Background(), col)
	if err != nil {
		t.Fatalf("index after restore: %v", err)
	}
	if stats.FilesAdded != 1 {
		t.Errorf("expected 1 added, got %d", stats.FilesAdded)
	}

	// Document must be active again.
	var active int
	if err := database.QueryRowContext(context.Background(),
		`SELECT active FROM documents WHERE collection='test' AND path='doc.md'`).
		Scan(&active); err != nil {
		t.Fatalf("querying restored document: %v", err)
	}
	if active != 1 {
		t.Errorf("expected active=1, got %d", active)
	}

	// Chunk ID must be preserved — proves DELETE FROM chunks did not run.
	var preservedCount int
	if err := database.QueryRowContext(context.Background(),
		`SELECT COUNT(*) FROM chunks WHERE id = ?`, seedID).Scan(&preservedCount); err != nil {
		t.Fatalf("querying preserved chunk: %v", err)
	}
	if preservedCount != 1 {
		t.Fatalf("expected seed chunk %d to survive restore, got count %d", seedID, preservedCount)
	}

	// Embedding and vector for the seed chunk must still exist.
	var embCount int
	if err := database.QueryRowContext(context.Background(),
		`SELECT COUNT(*) FROM embeddings WHERE chunk_id = ?`, seedID).Scan(&embCount); err != nil {
		t.Fatalf("querying embedding: %v", err)
	}
	if embCount != 1 {
		t.Errorf("expected embedding for chunk %d to survive restore, got %d", seedID, embCount)
	}

	var vecCount int
	if err := database.QueryRowContext(context.Background(),
		`SELECT COUNT(*) FROM chunk_vectors WHERE chunk_id = ?`, seedID).Scan(&vecCount); err != nil {
		t.Fatalf("querying chunk_vector: %v", err)
	}
	if vecCount != 1 {
		t.Errorf("expected chunk_vector for chunk %d to survive restore, got %d", seedID, vecCount)
	}
}

func TestIndexer_DeactivatesMissingFiles(t *testing.T) {
database := openTestDB(t)
idx := New(database, 256)
Expand Down
Loading