From 47b678b27b3e4406df9f8b2b063911d00c6df6a4 Mon Sep 17 00:00:00 2001 From: Greg von Nessi Date: Fri, 13 Mar 2026 18:13:17 +0000 Subject: [PATCH 1/2] Table-drive loadEnvConfig and toRuntimeConfig Replace ~230 lines of repetitive field-by-field boilerplate with declarative ENV_MAPPINGS (29 entries) and CONFIG_MAPPINGS (24 entries) tables plus generic applicator functions. Tables serve as self-documenting config mapping reference. --- src/config/loader.ts | 399 ++++++++++++++++++++----------------------- 1 file changed, 182 insertions(+), 217 deletions(-) diff --git a/src/config/loader.ts b/src/config/loader.ts index b9f4621..a7058f1 100644 --- a/src/config/loader.ts +++ b/src/config/loader.ts @@ -181,171 +181,115 @@ function loadConfigFile(path: string): ExternalConfig | null { } } -/** - * Load config from environment variables. - * Variables are prefixed with CAUSANTIC_ and use underscores for nesting. - * Examples: - * CAUSANTIC_DECAY_BACKWARD_TYPE=linear - * CAUSANTIC_CLUSTERING_THRESHOLD=0.09 - * CAUSANTIC_STORAGE_DB_PATH=~/.causantic/memory.db - */ -function loadEnvConfig(): ExternalConfig { - const config: ExternalConfig = {}; +/** Mapping from an environment variable to a dot-path in ExternalConfig. */ +export type EnvMapping = { + env: string; + path: string; // dot-separated path into ExternalConfig + type: 'string' | 'int' | 'float' | 'boolean'; +}; +/** All CAUSANTIC_* environment variable mappings. */ +export const ENV_MAPPINGS: EnvMapping[] = [ // Clustering - if (process.env.CAUSANTIC_CLUSTERING_THRESHOLD) { - config.clustering = config.clustering ?? {}; - config.clustering.threshold = parseFloat(process.env.CAUSANTIC_CLUSTERING_THRESHOLD); - } - if (process.env.CAUSANTIC_CLUSTERING_MIN_CLUSTER_SIZE) { - config.clustering = config.clustering ??
{}; - config.clustering.minClusterSize = parseInt( - process.env.CAUSANTIC_CLUSTERING_MIN_CLUSTER_SIZE, - 10, - ); - } - if (process.env.CAUSANTIC_CLUSTERING_INCREMENTAL_THRESHOLD) { - config.clustering = config.clustering ?? {}; - config.clustering.incrementalThreshold = parseFloat( - process.env.CAUSANTIC_CLUSTERING_INCREMENTAL_THRESHOLD, - ); - } - + { env: 'CAUSANTIC_CLUSTERING_THRESHOLD', path: 'clustering.threshold', type: 'float' }, + { env: 'CAUSANTIC_CLUSTERING_MIN_CLUSTER_SIZE', path: 'clustering.minClusterSize', type: 'int' }, + { + env: 'CAUSANTIC_CLUSTERING_INCREMENTAL_THRESHOLD', + path: 'clustering.incrementalThreshold', + type: 'float', + }, // Traversal - if (process.env.CAUSANTIC_TRAVERSAL_MAX_DEPTH) { - config.traversal = config.traversal ?? {}; - config.traversal.maxDepth = parseInt(process.env.CAUSANTIC_TRAVERSAL_MAX_DEPTH, 10); - } - + { env: 'CAUSANTIC_TRAVERSAL_MAX_DEPTH', path: 'traversal.maxDepth', type: 'int' }, // Tokens - if (process.env.CAUSANTIC_TOKENS_CLAUDE_MD_BUDGET) { - config.tokens = config.tokens ?? {}; - config.tokens.claudeMdBudget = parseInt(process.env.CAUSANTIC_TOKENS_CLAUDE_MD_BUDGET, 10); - } - if (process.env.CAUSANTIC_TOKENS_MCP_MAX_RESPONSE) { - config.tokens = config.tokens ?? {}; - config.tokens.mcpMaxResponse = parseInt(process.env.CAUSANTIC_TOKENS_MCP_MAX_RESPONSE, 10); - } - + { env: 'CAUSANTIC_TOKENS_CLAUDE_MD_BUDGET', path: 'tokens.claudeMdBudget', type: 'int' }, + { env: 'CAUSANTIC_TOKENS_MCP_MAX_RESPONSE', path: 'tokens.mcpMaxResponse', type: 'int' }, // Storage - if (process.env.CAUSANTIC_STORAGE_DB_PATH) { - config.storage = config.storage ?? {}; - config.storage.dbPath = process.env.CAUSANTIC_STORAGE_DB_PATH; - } - if (process.env.CAUSANTIC_STORAGE_VECTOR_PATH) { - config.storage = config.storage ?? 
{}; - config.storage.vectorPath = process.env.CAUSANTIC_STORAGE_VECTOR_PATH; - } - + { env: 'CAUSANTIC_STORAGE_DB_PATH', path: 'storage.dbPath', type: 'string' }, + { env: 'CAUSANTIC_STORAGE_VECTOR_PATH', path: 'storage.vectorPath', type: 'string' }, // LLM - if (process.env.CAUSANTIC_LLM_CLUSTER_REFRESH_MODEL) { - config.llm = config.llm ?? {}; - config.llm.clusterRefreshModel = process.env.CAUSANTIC_LLM_CLUSTER_REFRESH_MODEL; - } - if (process.env.CAUSANTIC_LLM_REFRESH_RATE_LIMIT) { - config.llm = config.llm ?? {}; - config.llm.refreshRateLimitPerMin = parseInt(process.env.CAUSANTIC_LLM_REFRESH_RATE_LIMIT, 10); - } - - if (process.env.CAUSANTIC_LLM_ENABLE_LABELLING) { - config.llm = config.llm ?? {}; - config.llm.enableLabelling = process.env.CAUSANTIC_LLM_ENABLE_LABELLING === 'true'; - } - + { env: 'CAUSANTIC_LLM_CLUSTER_REFRESH_MODEL', path: 'llm.clusterRefreshModel', type: 'string' }, + { env: 'CAUSANTIC_LLM_REFRESH_RATE_LIMIT', path: 'llm.refreshRateLimitPerMin', type: 'int' }, + { env: 'CAUSANTIC_LLM_ENABLE_LABELLING', path: 'llm.enableLabelling', type: 'boolean' }, // Encryption - if (process.env.CAUSANTIC_ENCRYPTION_ENABLED) { - config.encryption = config.encryption ?? {}; - config.encryption.enabled = process.env.CAUSANTIC_ENCRYPTION_ENABLED === 'true'; - } - if (process.env.CAUSANTIC_ENCRYPTION_CIPHER) { - config.encryption = config.encryption ?? {}; - config.encryption.cipher = process.env.CAUSANTIC_ENCRYPTION_CIPHER as 'chacha20' | 'sqlcipher'; - } - if (process.env.CAUSANTIC_ENCRYPTION_KEY_SOURCE) { - config.encryption = config.encryption ?? {}; - config.encryption.keySource = process.env.CAUSANTIC_ENCRYPTION_KEY_SOURCE as - | 'keychain' - | 'env' - | 'prompt'; - } - if (process.env.CAUSANTIC_ENCRYPTION_AUDIT_LOG) { - config.encryption = config.encryption ?? 
{}; - config.encryption.auditLog = process.env.CAUSANTIC_ENCRYPTION_AUDIT_LOG === 'true'; - } - + { env: 'CAUSANTIC_ENCRYPTION_ENABLED', path: 'encryption.enabled', type: 'boolean' }, + { env: 'CAUSANTIC_ENCRYPTION_CIPHER', path: 'encryption.cipher', type: 'string' }, + { env: 'CAUSANTIC_ENCRYPTION_KEY_SOURCE', path: 'encryption.keySource', type: 'string' }, + { env: 'CAUSANTIC_ENCRYPTION_AUDIT_LOG', path: 'encryption.auditLog', type: 'boolean' }, // Vectors - if (process.env.CAUSANTIC_VECTORS_TTL_DAYS) { - config.vectors = config.vectors ?? {}; - config.vectors.ttlDays = parseInt(process.env.CAUSANTIC_VECTORS_TTL_DAYS, 10); - } - if (process.env.CAUSANTIC_VECTORS_MAX_COUNT) { - config.vectors = config.vectors ?? {}; - config.vectors.maxCount = parseInt(process.env.CAUSANTIC_VECTORS_MAX_COUNT, 10); - } - + { env: 'CAUSANTIC_VECTORS_TTL_DAYS', path: 'vectors.ttlDays', type: 'int' }, + { env: 'CAUSANTIC_VECTORS_MAX_COUNT', path: 'vectors.maxCount', type: 'int' }, // Maintenance - if (process.env.CAUSANTIC_MAINTENANCE_CLUSTER_HOUR) { - config.maintenance = config.maintenance ?? {}; - config.maintenance.clusterHour = parseInt(process.env.CAUSANTIC_MAINTENANCE_CLUSTER_HOUR, 10); - } - + { env: 'CAUSANTIC_MAINTENANCE_CLUSTER_HOUR', path: 'maintenance.clusterHour', type: 'int' }, // Embedding - if (process.env.CAUSANTIC_EMBEDDING_DEVICE) { - config.embedding = config.embedding ?? {}; - config.embedding.device = process.env.CAUSANTIC_EMBEDDING_DEVICE; - } - if (process.env.CAUSANTIC_EMBEDDING_MODEL) { - config.embedding = config.embedding ?? {}; - config.embedding.model = process.env.CAUSANTIC_EMBEDDING_MODEL; - } - if (process.env.CAUSANTIC_EMBEDDING_EAGER) { - config.embedding = config.embedding ?? 
{}; - config.embedding.eager = process.env.CAUSANTIC_EMBEDDING_EAGER === 'true'; - } - + { env: 'CAUSANTIC_EMBEDDING_DEVICE', path: 'embedding.device', type: 'string' }, + { env: 'CAUSANTIC_EMBEDDING_MODEL', path: 'embedding.model', type: 'string' }, + { env: 'CAUSANTIC_EMBEDDING_EAGER', path: 'embedding.eager', type: 'boolean' }, // Retrieval - if (process.env.CAUSANTIC_RETRIEVAL_MMR_LAMBDA) { - config.retrieval = config.retrieval ?? {}; - config.retrieval.mmrLambda = parseFloat(process.env.CAUSANTIC_RETRIEVAL_MMR_LAMBDA); - } - if (process.env.CAUSANTIC_RETRIEVAL_FEEDBACK_WEIGHT) { - config.retrieval = config.retrieval ?? {}; - config.retrieval.feedbackWeight = parseFloat(process.env.CAUSANTIC_RETRIEVAL_FEEDBACK_WEIGHT); - } - if (process.env.CAUSANTIC_RETRIEVAL_PRIMARY) { - config.retrieval = config.retrieval ?? {}; - config.retrieval.primary = process.env.CAUSANTIC_RETRIEVAL_PRIMARY as - | 'keyword' - | 'vector' - | 'hybrid'; - } - if (process.env.CAUSANTIC_RETRIEVAL_VECTOR_ENRICHMENT) { - config.retrieval = config.retrieval ?? {}; - config.retrieval.vectorEnrichment = - process.env.CAUSANTIC_RETRIEVAL_VECTOR_ENRICHMENT === 'true'; - } - + { env: 'CAUSANTIC_RETRIEVAL_MMR_LAMBDA', path: 'retrieval.mmrLambda', type: 'float' }, + { env: 'CAUSANTIC_RETRIEVAL_FEEDBACK_WEIGHT', path: 'retrieval.feedbackWeight', type: 'float' }, + { env: 'CAUSANTIC_RETRIEVAL_PRIMARY', path: 'retrieval.primary', type: 'string' }, + { + env: 'CAUSANTIC_RETRIEVAL_VECTOR_ENRICHMENT', + path: 'retrieval.vectorEnrichment', + type: 'boolean', + }, // Recency - if (process.env.CAUSANTIC_RECENCY_DECAY_FACTOR) { - config.recency = config.recency ?? {}; - config.recency.decayFactor = parseFloat(process.env.CAUSANTIC_RECENCY_DECAY_FACTOR); - } - if (process.env.CAUSANTIC_RECENCY_HALF_LIFE_HOURS) { - config.recency = config.recency ?? 
{}; - config.recency.halfLifeHours = parseFloat(process.env.CAUSANTIC_RECENCY_HALF_LIFE_HOURS); - } - + { env: 'CAUSANTIC_RECENCY_DECAY_FACTOR', path: 'recency.decayFactor', type: 'float' }, + { env: 'CAUSANTIC_RECENCY_HALF_LIFE_HOURS', path: 'recency.halfLifeHours', type: 'float' }, // Semantic Index - if (process.env.CAUSANTIC_SEMANTIC_INDEX_ENABLED) { - config.semanticIndex = config.semanticIndex ?? {}; - config.semanticIndex.enabled = process.env.CAUSANTIC_SEMANTIC_INDEX_ENABLED === 'true'; - } - if (process.env.CAUSANTIC_SEMANTIC_INDEX_USE_FOR_SEARCH) { - config.semanticIndex = config.semanticIndex ?? {}; - config.semanticIndex.useForSearch = - process.env.CAUSANTIC_SEMANTIC_INDEX_USE_FOR_SEARCH === 'true'; + { env: 'CAUSANTIC_SEMANTIC_INDEX_ENABLED', path: 'semanticIndex.enabled', type: 'boolean' }, + { + env: 'CAUSANTIC_SEMANTIC_INDEX_USE_FOR_SEARCH', + path: 'semanticIndex.useForSearch', + type: 'boolean', + }, +]; + +/** + * Apply environment variable mappings to a config object. + * Skips unset or empty variables; parses the rest by declared type and sets them at the dot-path. + */ +function applyEnvMappings(config: ExternalConfig, mappings: EnvMapping[]): void { + for (const { env, path, type } of mappings) { + const value = process.env[env]; + if (!value) continue; // unset or empty: skip, as the removed `if (process.env.X)` guards did + + const parts = path.split('.'); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + let obj: any = config; + for (let i = 0; i < parts.length - 1; i++) { + obj[parts[i]] = obj[parts[i]] ?? {}; + obj = obj[parts[i]]; + } + + const key = parts[parts.length - 1]; + switch (type) { + case 'int': + obj[key] = parseInt(value, 10); + break; + case 'float': + obj[key] = parseFloat(value); + break; + case 'boolean': + obj[key] = value === 'true'; + break; + case 'string': + obj[key] = value; + break; + } } +} +/** + * Load config from environment variables. + * Variables are prefixed with CAUSANTIC_ and use underscores for nesting.
+ * Examples: + * CAUSANTIC_CLUSTERING_THRESHOLD=0.09 + * CAUSANTIC_STORAGE_DB_PATH=~/.causantic/memory.db + */ +function loadEnvConfig(): ExternalConfig { + const config: ExternalConfig = {}; + applyEnvMappings(config, ENV_MAPPINGS); return config; } @@ -543,6 +487,75 @@ export function getResolvedPaths(config: Required): { }; } +/** Mapping from an ExternalConfig dot-path to a MemoryConfig dot-path. */ +export type ConfigMapping = { + from: string; // dot-path in ExternalConfig + to: string; // dot-path in MemoryConfig +}; + +/** All ExternalConfig -> MemoryConfig field mappings. */ +export const CONFIG_MAPPINGS: ConfigMapping[] = [ + // Clustering + { from: 'clustering.threshold', to: 'clusterThreshold' }, + { from: 'clustering.minClusterSize', to: 'minClusterSize' }, + { from: 'clustering.incrementalThreshold', to: 'incrementalClusterThreshold' }, + // Chain walking + { from: 'traversal.maxDepth', to: 'maxChainDepth' }, + // Tokens + { from: 'tokens.claudeMdBudget', to: 'claudeMdBudgetTokens' }, + { from: 'tokens.mcpMaxResponse', to: 'mcpMaxResponseTokens' }, + // Storage + { from: 'storage.dbPath', to: 'dbPath' }, + { from: 'storage.vectorPath', to: 'vectorStorePath' }, + // LLM + { from: 'llm.clusterRefreshModel', to: 'clusterRefreshModel' }, + { from: 'llm.refreshRateLimitPerMin', to: 'refreshRateLimitPerMin' }, + // Retrieval strategy + { from: 'retrieval.primary', to: 'retrievalPrimary' }, + { from: 'retrieval.vectorEnrichment', to: 'vectorEnrichment' }, + // Embedding + { from: 'embedding.model', to: 'embeddingModel' }, + { from: 'embedding.eager', to: 'embeddingEager' }, + // Retrieval scoring + { from: 'retrieval.mmrLambda', to: 'mmrReranking.lambda' }, + { from: 'retrieval.feedbackWeight', to: 'feedbackWeight' }, + // Recency + { from: 'recency.decayFactor', to: 'recency.decayFactor' }, + { from: 'recency.halfLifeHours', to: 'recency.halfLifeHours' }, + // Length penalty + { from: 'lengthPenalty.enabled', to: 'lengthPenalty.enabled' }, + { from: 
'lengthPenalty.referenceTokens', to: 'lengthPenalty.referenceTokens' }, + // Semantic index + { from: 'semanticIndex.enabled', to: 'semanticIndex.enabled' }, + { from: 'semanticIndex.targetDescriptionTokens', to: 'semanticIndex.targetDescriptionTokens' }, + { from: 'semanticIndex.batchRefreshLimit', to: 'semanticIndex.batchRefreshLimit' }, + { from: 'semanticIndex.useForSearch', to: 'semanticIndex.useForSearch' }, +]; + +/** Read a value from a nested object using a dot-separated path. */ +// eslint-disable-next-line @typescript-eslint/no-explicit-any +function getPath(obj: any, path: string): unknown { + const parts = path.split('.'); + let current = obj; + for (const part of parts) { + if (current === null || current === undefined) return undefined; + current = current[part]; + } + return current; +} + +/** Set a value on a nested object using a dot-separated path, creating intermediates as needed. */ +// eslint-disable-next-line @typescript-eslint/no-explicit-any +function setPath(obj: any, path: string, value: unknown): void { + const parts = path.split('.'); + let current = obj; + for (let i = 0; i < parts.length - 1; i++) { + current[parts[i]] = current[parts[i]] ?? {}; + current = current[parts[i]]; + } + current[parts[parts.length - 1]] = value; +} + /** * Convert ExternalConfig to MemoryConfig (the runtime format). * @@ -550,75 +563,27 @@ export function getResolvedPaths(config: Required): { * DEFAULT_CONFIG so callers get a complete MemoryConfig. */ export function toRuntimeConfig(external: Required): MemoryConfig { - return { + // Start with a full copy of defaults + const runtime: MemoryConfig = { ...DEFAULT_CONFIG, - - // Clustering - clusterThreshold: external.clustering.threshold ?? DEFAULT_CONFIG.clusterThreshold, - minClusterSize: external.clustering.minClusterSize ?? DEFAULT_CONFIG.minClusterSize, - - // Clustering (incremental) - incrementalClusterThreshold: - external.clustering.incrementalThreshold ?? 
DEFAULT_CONFIG.incrementalClusterThreshold, - - // Chain walking - maxChainDepth: external.traversal.maxDepth ?? DEFAULT_CONFIG.maxChainDepth, - - // Tokens - claudeMdBudgetTokens: external.tokens.claudeMdBudget ?? DEFAULT_CONFIG.claudeMdBudgetTokens, - mcpMaxResponseTokens: external.tokens.mcpMaxResponse ?? DEFAULT_CONFIG.mcpMaxResponseTokens, - - // Storage - dbPath: external.storage.dbPath ?? DEFAULT_CONFIG.dbPath, - vectorStorePath: external.storage.vectorPath ?? DEFAULT_CONFIG.vectorStorePath, - - // LLM - clusterRefreshModel: external.llm.clusterRefreshModel ?? DEFAULT_CONFIG.clusterRefreshModel, - refreshRateLimitPerMin: - external.llm.refreshRateLimitPerMin ?? DEFAULT_CONFIG.refreshRateLimitPerMin, - - // Retrieval strategy - retrievalPrimary: external.retrieval?.primary ?? DEFAULT_CONFIG.retrievalPrimary, - vectorEnrichment: external.retrieval?.vectorEnrichment ?? DEFAULT_CONFIG.vectorEnrichment, - - // Embedding - embeddingModel: external.embedding?.model ?? DEFAULT_CONFIG.embeddingModel, - embeddingEager: external.embedding?.eager ?? DEFAULT_CONFIG.embeddingEager, - - // Retrieval - mmrReranking: { - lambda: external.retrieval?.mmrLambda ?? DEFAULT_CONFIG.mmrReranking.lambda, - }, - feedbackWeight: external.retrieval?.feedbackWeight ?? DEFAULT_CONFIG.feedbackWeight, - - // Recency - recency: { - decayFactor: external.recency?.decayFactor ?? DEFAULT_CONFIG.recency.decayFactor, - halfLifeHours: external.recency?.halfLifeHours ?? DEFAULT_CONFIG.recency.halfLifeHours, - }, - - // Length penalty - lengthPenalty: { - enabled: external.lengthPenalty?.enabled ?? DEFAULT_CONFIG.lengthPenalty.enabled, - referenceTokens: - external.lengthPenalty?.referenceTokens ?? DEFAULT_CONFIG.lengthPenalty.referenceTokens, - }, - - // Repo map (uses DEFAULT_CONFIG defaults — no external config mapping yet) - repomap: DEFAULT_CONFIG.repomap, - - // Semantic index - semanticIndex: { - enabled: external.semanticIndex?.enabled ?? 
DEFAULT_CONFIG.semanticIndex.enabled, - targetDescriptionTokens: - external.semanticIndex?.targetDescriptionTokens ?? - DEFAULT_CONFIG.semanticIndex.targetDescriptionTokens, - batchRefreshLimit: - external.semanticIndex?.batchRefreshLimit ?? DEFAULT_CONFIG.semanticIndex.batchRefreshLimit, - useForSearch: - external.semanticIndex?.useForSearch ?? DEFAULT_CONFIG.semanticIndex.useForSearch, - }, + hybridSearch: { ...DEFAULT_CONFIG.hybridSearch }, + clusterExpansion: { ...DEFAULT_CONFIG.clusterExpansion }, + mmrReranking: { ...DEFAULT_CONFIG.mmrReranking }, + recency: { ...DEFAULT_CONFIG.recency }, + lengthPenalty: { ...DEFAULT_CONFIG.lengthPenalty }, + repomap: { ...DEFAULT_CONFIG.repomap, languages: [...DEFAULT_CONFIG.repomap.languages] }, + semanticIndex: { ...DEFAULT_CONFIG.semanticIndex }, }; + + // Apply table-driven mappings: external value wins over default + for (const { from, to } of CONFIG_MAPPINGS) { + const value = getPath(external, from); + if (value !== null && value !== undefined) { + setPath(runtime, to, value); + } + } + + return runtime; } // Re-export for convenience From d0c491c397c6ec0652f87c5ddbbb3b232a903777 Mon Sep 17 00:00:00 2001 From: Greg von Nessi Date: Fri, 13 Mar 2026 18:13:17 +0000 Subject: [PATCH 2/2] Extract VectorStore cleanup into separate module Move DB cleanup logic (removeVectorsAndRelated, findExpiredVectorIds, findOldestVectorIds) to vector-store-cleanup.ts. VectorStore class reduced from 733 to 665 lines. Public API unchanged. --- src/storage/vector-store-cleanup.ts | 137 ++++++++++++++++++++++++++++ src/storage/vector-store.ts | 114 +++++------------------ 2 files changed, 160 insertions(+), 91 deletions(-) create mode 100644 src/storage/vector-store-cleanup.ts diff --git a/src/storage/vector-store-cleanup.ts b/src/storage/vector-store-cleanup.ts new file mode 100644 index 0000000..65c72d5 --- /dev/null +++ b/src/storage/vector-store-cleanup.ts @@ -0,0 +1,137 @@ +/** + * Vector store cleanup and eviction logic. 
+ * + * Extracted from VectorStore to separate TTL/eviction concerns from the + * core vector storage and search responsibilities. + * + * These functions handle the DB-level operations (finding expired/excess + * vectors, deleting them and their related data). The caller is responsible + * for updating in-memory indexes after deletion. + * + * @module storage/vector-store-cleanup + */ + +import type Database from 'better-sqlite3-multiple-ciphers'; +import { sqlPlaceholders } from './db.js'; + +/** + * Remove vectors and all related data (chunks, clusters, index entries) by ID. + * + * Handles: + * - Chunk deletion (FK cascades handle chunk_clusters and edges) + * - Vector deletion from the specified table + * - Orphaned cluster cleanup + * - Orphaned index entry and index vector cleanup + * + * @param db - Database instance + * @param tableName - Vector table name (e.g. 'vectors' or 'index_vectors') + * @param ids - Vector/chunk IDs to remove + * @returns Number of vectors deleted from the DB + */ +export function removeVectorsAndRelated( + db: Database.Database, + tableName: string, + ids: string[], +): number { + const placeholders = sqlPlaceholders(ids.length); + + // Delete chunks first (FK cascades handle chunk_clusters and edges) + db.prepare(`DELETE FROM chunks WHERE id IN (${placeholders})`).run(...ids); + + // Delete vectors + const result = db.prepare(`DELETE FROM ${tableName} WHERE id IN (${placeholders})`).run(...ids); + + // Remove empty clusters (no remaining members after chunk deletion) + db.prepare( + ` + DELETE FROM clusters WHERE id NOT IN ( + SELECT DISTINCT cluster_id FROM chunk_clusters + ) + `, + ).run(); + + // Clean up index entries that referenced the deleted chunks + try { + const placeholdersForCleanup = sqlPlaceholders(ids.length); + db.prepare(`DELETE FROM index_entry_chunks WHERE chunk_id IN (${placeholdersForCleanup})`).run( + ...ids, + ); + + // Delete orphaned index entries (no remaining chunk references) + const orphaned = db + 
.prepare( + `SELECT id FROM index_entries WHERE id NOT IN ( + SELECT DISTINCT index_entry_id FROM index_entry_chunks + )`, + ) + .all() as Array<{ id: string }>; + + if (orphaned.length > 0) { + const orphanIds = orphaned.map((r) => r.id); + const orphanPlaceholders = sqlPlaceholders(orphanIds.length); + db.prepare(`DELETE FROM index_entries WHERE id IN (${orphanPlaceholders})`).run(...orphanIds); + + // Remove from index vector store (if this is the chunk vector store) + if (tableName === 'vectors') { + db.prepare(`DELETE FROM index_vectors WHERE id IN (${orphanPlaceholders})`).run( + ...orphanIds, + ); + } + } + } catch { + // index_entry_chunks table may not exist yet + } + + return result.changes; +} + +/** + * Find vector IDs that have expired based on their last_accessed timestamp. + * + * @param db - Database instance + * @param tableName - Vector table name + * @param ttlDays - Number of days after which unaccessed vectors expire + * @returns Array of expired vector IDs + */ +export function findExpiredVectorIds( + db: Database.Database, + tableName: string, + ttlDays: number, +): string[] { + const expiredRows = db + .prepare( + ` + SELECT id FROM ${tableName} + WHERE last_accessed < datetime('now', '-' || ? || ' days') + `, + ) + .all(ttlDays) as { id: string }[]; + + return expiredRows.map((r) => r.id); +} + +/** + * Find the oldest vector IDs that exceed a maximum count. + * + * @param db - Database instance + * @param tableName - Vector table name + * @param overage - Number of vectors to evict (currentCount - maxCount) + * @returns Array of vector IDs to evict (oldest by last_accessed) + */ +export function findOldestVectorIds( + db: Database.Database, + tableName: string, + overage: number, +): string[] { + const toEvict = db + .prepare( + ` + SELECT id FROM ${tableName} + ORDER BY last_accessed ASC + LIMIT ? 
+ `, + ) + .all(overage) as { id: string }[]; + + return toEvict.map((r) => r.id); +} diff --git a/src/storage/vector-store.ts b/src/storage/vector-store.ts index 6323a0b..220a3f4 100644 --- a/src/storage/vector-store.ts +++ b/src/storage/vector-store.ts @@ -67,6 +67,11 @@ import { angularDistance } from '../utils/angular-distance.js'; import type { VectorSearchResult } from './types.js'; import { serializeEmbedding, deserializeEmbedding } from '../utils/embedding-utils.js'; import { getModel } from '../models/model-registry.js'; +import { + removeVectorsAndRelated, + findExpiredVectorIds, + findOldestVectorIds, +} from './vector-store-cleanup.js'; /** * In-memory vector index backed by SQLite for persistence. * @@ -576,73 +581,23 @@ export class VectorStore { /** * Remove vectors and all related data (chunks, clusters, index entries) by ID. - * Handles DB deletions with FK cascades, orphan cleanup, and in-memory removal. + * Delegates DB operations to the extracted cleanup module; callers then call removeFromMemory.
* * @param ids - Vector/chunk IDs to remove * @returns Number of vectors deleted from the DB */ - private removeVectorsAndRelated(ids: string[]): number { - const db = getDb(); - const placeholders = sqlPlaceholders(ids.length); - - // Delete chunks first (FK cascades handle chunk_clusters and edges) - db.prepare(`DELETE FROM chunks WHERE id IN (${placeholders})`).run(...ids); - - // Delete vectors - const result = db - .prepare(`DELETE FROM ${this.tableName} WHERE id IN (${placeholders})`) - .run(...ids); - - // Remove empty clusters (no remaining members after chunk deletion) - db.prepare( - ` - DELETE FROM clusters WHERE id NOT IN ( - SELECT DISTINCT cluster_id FROM chunk_clusters - ) - `, - ).run(); - - // Clean up index entries that referenced the deleted chunks - try { - const placeholdersForCleanup = sqlPlaceholders(ids.length); - db.prepare( - `DELETE FROM index_entry_chunks WHERE chunk_id IN (${placeholdersForCleanup})`, - ).run(...ids); - - // Delete orphaned index entries (no remaining chunk references) - const orphaned = db - .prepare( - `SELECT id FROM index_entries WHERE id NOT IN ( - SELECT DISTINCT index_entry_id FROM index_entry_chunks - )`, - ) - .all() as Array<{ id: string }>; - - if (orphaned.length > 0) { - const orphanIds = orphaned.map((r) => r.id); - const orphanPlaceholders = sqlPlaceholders(orphanIds.length); - db.prepare(`DELETE FROM index_entries WHERE id IN (${orphanPlaceholders})`).run( - ...orphanIds, - ); - - // Remove from index vector store (if this is the chunk vector store) - if (this.tableName === 'vectors') { - db.prepare(`DELETE FROM index_vectors WHERE id IN (${orphanPlaceholders})`).run( - ...orphanIds, - ); - } - } - } catch { - // index_entry_chunks table may not exist yet - } + private removeFromDb(ids: string[]): number { + return removeVectorsAndRelated(getDb(), this.tableName, ids); + } - // Remove from memory + /** + * Remove IDs from all in-memory indexes. 
+ */ + private removeFromMemory(ids: string[]): void { for (const id of ids) { this.vectors.delete(id); this.chunkProjectIndex.delete(id); } - - return result.changes; } /** @@ -656,24 +611,12 @@ export class VectorStore { async cleanupExpired(ttlDays: number): Promise { await this.load(); - const db = getDb(); - - // Find expired vectors (any vector not accessed within TTL) - const expiredRows = db - .prepare( - ` - SELECT id FROM ${this.tableName} - WHERE last_accessed < datetime('now', '-' || ? || ' days') - `, - ) - .all(ttlDays) as { id: string }[]; - - if (expiredRows.length === 0) { - return 0; - } + const expiredIds = findExpiredVectorIds(getDb(), this.tableName, ttlDays); + if (expiredIds.length === 0) return 0; - const expiredIds = expiredRows.map((r) => r.id); - return this.removeVectorsAndRelated(expiredIds); + const deletedCount = this.removeFromDb(expiredIds); + this.removeFromMemory(expiredIds); + return deletedCount; } /** @@ -692,23 +635,12 @@ export class VectorStore { if (currentCount <= maxCount) return 0; const overage = currentCount - maxCount; - const db = getDb(); - - // Select the oldest vectors by last_accessed - const toEvict = db - .prepare( - ` - SELECT id FROM ${this.tableName} - ORDER BY last_accessed ASC - LIMIT ? - `, - ) - .all(overage) as { id: string }[]; - - if (toEvict.length === 0) return 0; + const evictIds = findOldestVectorIds(getDb(), this.tableName, overage); + if (evictIds.length === 0) return 0; - const evictIds = toEvict.map((r) => r.id); - return this.removeVectorsAndRelated(evictIds); + const deletedCount = this.removeFromDb(evictIds); + this.removeFromMemory(evictIds); + return deletedCount; } }