diff --git a/src/memory/observations.ts b/src/memory/observations.ts
index cacc9b4..16d2d81 100644
--- a/src/memory/observations.ts
+++ b/src/memory/observations.ts
@@ -17,6 +17,8 @@ import {
   generateEmbedding,
   batchGenerateEmbeddings,
   getVectorDimensions,
+  hydrateIndex,
+  isEmbeddingEnabled,
   makeOramaObservationId,
 } from '../store/orama-store.js';
 import { saveObservationsJson, loadObservationsJson, saveIdCounter, loadIdCounter } from '../store/persistence.js';
@@ -562,8 +564,8 @@ export function suggestTopicKey(type: string, title: string): string {
 }
 
 /**
- * Reload observations into the Orama index.
- * Called during server startup to restore the search index.
+ * Reload observations into the Orama index with full corpus embeddings.
+ * Intended for explicit heavy rebuilds, not normal MCP startup.
  *
  * Optimization: uses batch embedding (ONNX processes 64 texts at a time)
  * instead of individual embed calls. This reduces startup CPU from minutes
@@ -641,6 +643,30 @@ export async function reindexObservations(): Promise<number> {
   return count;
 }
 
+/**
+ * Prepare the search index for startup and hot-reload without blocking on
+ * corpus-wide embedding generation.
+ *
+ * This hydrates the lexical/BM25 index immediately so MCP availability is not
+ * coupled to embedding provider throughput. Missing vectors are queued for the
+ * existing background backfill cycle.
+ */
+export async function prepareSearchIndex(): Promise<number> {
+  await resetDb();
+  const count = await hydrateIndex(observations as unknown as any[]);
+
+  vectorMissingIds.clear();
+  if (isEmbeddingEnabled()) {
+    for (const obs of observations) {
+      if ((obs.status ?? 'active') === 'active') {
+        vectorMissingIds.add(obs.id);
+      }
+    }
+  }
+
+  return count;
+}
+
 // ── Vector-missing observability & backfill ─────────────────────────
 
 /**
diff --git a/src/server.ts b/src/server.ts
index 69162fd..1839ab2 100644
--- a/src/server.ts
+++ b/src/server.ts
@@ -20,8 +20,7 @@ import { watchFile } from 'node:fs';
 import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
 import { z } from 'zod';
 import { KnowledgeGraphManager } from './memory/graph.js';
-import { initObservations, storeObservation, reindexObservations, migrateProjectIds, getObservation } from './memory/observations.js';
-import { resetDb } from './store/orama-store.js';
+import { initObservations, storeObservation, prepareSearchIndex, migrateProjectIds, getObservation } from './memory/observations.js';
 import { createAutoRelations } from './memory/auto-relations.js';
 import { extractEntities } from './memory/entity-extractor.js';
 import { compactSearch, compactTimeline, compactDetail } from './compact/engine.js';
@@ -264,9 +263,9 @@ export async function createMemorixServer(
   await graphManager.init();
   await initObservations(projectDir);
 
-  const reindexed = await reindexObservations();
-  if (reindexed > 0) {
-    console.error(`[memorix] Reindexed ${reindexed} observations for project: ${project.id}`);
+  const indexed = await prepareSearchIndex();
+  if (indexed > 0) {
+    console.error(`[memorix] Prepared search index for ${indexed} observations in project: ${project.id}`);
   }
 
   const llmConfig = initLLM();
@@ -3286,11 +3285,10 @@
     if (reloading) return;
     reloading = true;
     try {
-      await resetDb();
       await initObservations(projectDir);
-      const count = await reindexObservations();
+      const count = await prepareSearchIndex();
       if (count > 0) {
-        console.error(`[memorix] Hot-reloaded ${count} observations (external write detected)`);
+        console.error(`[memorix] Hot-reloaded search index for ${count} observations (external write detected)`);
       }
     } catch { /* silent */ }
     reloading = false;
diff --git a/src/store/orama-store.ts b/src/store/orama-store.ts
index e58f50f..2abdd7c 100644
--- a/src/store/orama-store.ts
+++ b/src/store/orama-store.ts
@@ -223,7 +223,6 @@ export async function hydrateIndex(observations: any[]): Promise<number> {
   let inserted = 0;
   for (const obs of observations) {
     if (!obs || !obs.id || !obs.projectId) continue;
-    if ((obs.status ?? 'active') !== 'active') continue;
     try {
       const doc: MemorixDocument = {
         id: makeOramaObservationId(obs.projectId, obs.id),
diff --git a/tests/memory/prepare-search-index.test.ts b/tests/memory/prepare-search-index.test.ts
new file mode 100644
index 0000000..99bfef2
--- /dev/null
+++ b/tests/memory/prepare-search-index.test.ts
@@ -0,0 +1,139 @@
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+const mockResetDb = vi.fn();
+const mockBatchGenerateEmbeddings = vi.fn();
+const mockHydrateIndex = vi.fn();
+const mockInsertObservation = vi.fn();
+const mockLoadObservationsJson = vi.fn();
+const mockLoadIdCounter = vi.fn();
+const mockIsEmbeddingEnabled = vi.fn();
+
+vi.mock('../../src/store/orama-store.js', () => ({
+  insertObservation: mockInsertObservation,
+  removeObservation: vi.fn(),
+  resetDb: mockResetDb,
+  generateEmbedding: vi.fn(),
+  batchGenerateEmbeddings: mockBatchGenerateEmbeddings,
+  hydrateIndex: mockHydrateIndex,
+  isEmbeddingEnabled: mockIsEmbeddingEnabled,
+  makeOramaObservationId: (projectId: string, observationId: number) => `${projectId}:${observationId}`,
+}));
+
+vi.mock('../../src/store/persistence.js', () => ({
+  saveObservationsJson: vi.fn(),
+  loadObservationsJson: mockLoadObservationsJson,
+  saveIdCounter: vi.fn(),
+  loadIdCounter: mockLoadIdCounter,
+}));
+
+vi.mock('../../src/store/file-lock.js', () => ({
+  withFileLock: async <T>(_dir: string, fn: () => Promise<T>) => fn(),
+}));
+
+vi.mock('../../src/compact/token-budget.js', () => ({
+  countTextTokens: () => 0,
+}));
+
+vi.mock('../../src/memory/entity-extractor.js', () => ({
+  extractEntities: () => [],
+  enrichConcepts: (concepts: string[]) => concepts,
+}));
+
+describe('prepareSearchIndex', () => {
+  beforeEach(() => {
+    vi.resetModules();
+    mockResetDb.mockReset();
+    mockBatchGenerateEmbeddings.mockReset();
+    mockHydrateIndex.mockReset();
+    mockInsertObservation.mockReset();
+    mockLoadObservationsJson.mockReset();
+    mockLoadIdCounter.mockReset();
+    mockIsEmbeddingEnabled.mockReset();
+  });
+
+  it('hydrates the lexical index without triggering batch embeddings and queues active docs for backfill', async () => {
+    mockLoadObservationsJson.mockResolvedValue([
+      {
+        id: 1,
+        projectId: 'AVIDS2/memorix',
+        entityName: 'search-layer',
+        type: 'what-changed',
+        title: 'Prepared startup index',
+        narrative: 'Build lexical index first, defer vectors.',
+        facts: ['Startup should not block on embeddings'],
+        filesModified: ['src/server.ts'],
+        concepts: ['startup-index'],
+        tokens: 42,
+        createdAt: '2026-03-18T00:00:00.000Z',
+        status: 'active',
+        source: 'agent',
+      },
+      {
+        id: 2,
+        projectId: 'AVIDS2/memorix',
+        entityName: 'history',
+        type: 'decision',
+        title: 'Resolved old note',
+        narrative: 'Should stay out of the backfill queue.',
+        facts: [],
+        filesModified: [],
+        concepts: ['resolved'],
+        tokens: 12,
+        createdAt: '2026-03-18T00:00:01.000Z',
+        status: 'resolved',
+        source: 'agent',
+      },
+    ]);
+    mockLoadIdCounter.mockResolvedValue(3);
+    mockHydrateIndex.mockResolvedValue(2);
+    mockIsEmbeddingEnabled.mockReturnValue(true);
+
+    const { initObservations, prepareSearchIndex, getVectorMissingIds } = await import('../../src/memory/observations.js');
+
+    await initObservations('E:/tmp/project');
+    const count = await prepareSearchIndex();
+
+    expect(count).toBe(2);
+    expect(mockResetDb).toHaveBeenCalledOnce();
+    expect(mockHydrateIndex).toHaveBeenCalledOnce();
+    expect(mockHydrateIndex).toHaveBeenCalledWith(
+      expect.arrayContaining([
+        expect.objectContaining({ id: 1, title: 'Prepared startup index' }),
+        expect.objectContaining({ id: 2, title: 'Resolved old note' }),
+      ]),
+    );
+    expect(mockBatchGenerateEmbeddings).not.toHaveBeenCalled();
+    expect(getVectorMissingIds()).toEqual([1]);
+  });
+
+  it('leaves the backfill queue empty when vector search is not enabled', async () => {
+    mockLoadObservationsJson.mockResolvedValue([
+      {
+        id: 7,
+        projectId: 'AVIDS2/memorix',
+        entityName: 'fallback',
+        type: 'discovery',
+        title: 'Fulltext only startup',
+        narrative: 'Embedding provider disabled.',
+        facts: [],
+        filesModified: [],
+        concepts: ['bm25'],
+        tokens: 9,
+        createdAt: '2026-03-18T00:00:00.000Z',
+        status: 'active',
+        source: 'agent',
+      },
+    ]);
+    mockLoadIdCounter.mockResolvedValue(8);
+    mockHydrateIndex.mockResolvedValue(1);
+    mockIsEmbeddingEnabled.mockReturnValue(false);
+
+    const { initObservations, prepareSearchIndex, getVectorMissingIds } = await import('../../src/memory/observations.js');
+
+    await initObservations('E:/tmp/project');
+    await prepareSearchIndex();
+
+    expect(mockBatchGenerateEmbeddings).not.toHaveBeenCalled();
+    expect(getVectorMissingIds()).toEqual([]);
+  });
+});
diff --git a/tests/store/hydrate-index.test.ts b/tests/store/hydrate-index.test.ts
new file mode 100644
index 0000000..c207169
--- /dev/null
+++ b/tests/store/hydrate-index.test.ts
@@ -0,0 +1,94 @@
+import { describe, it, expect, beforeEach } from 'vitest';
+import { resetDb, hydrateIndex, makeOramaObservationId } from '../../src/store/orama-store.js';
+import { count, search } from '@orama/orama';
+
+// Minimal observation shape matching what hydrateIndex expects
+function makeObs(id: number, status: string, title: string) {
+  return {
+    id,
+    projectId: 'test/hydrate-project',
+    entityName: `entity-${id}`,
+    type: 'discovery',
+    title,
+    narrative: `Narrative for observation ${id}`,
+    facts: ['fact-a'],
+    filesModified: [],
+    concepts: ['test'],
+    tokens: 100,
+    createdAt: new Date().toISOString(),
+    accessCount: 0,
+    lastAccessedAt: '',
+    status,
+    source: 'agent',
+  };
+}
+
+describe('hydrateIndex – status handling', () => {
+  beforeEach(async () => {
+    await resetDb();
+  });
+
+  it('indexes active, resolved, AND archived observations', async () => {
+    const observations = [
+      makeObs(1, 'active', 'Active observation'),
+      makeObs(2, 'resolved', 'Resolved observation'),
+      makeObs(3, 'archived', 'Archived observation'),
+    ];
+
+    const inserted = await hydrateIndex(observations);
+    expect(inserted).toBe(3);
+  });
+
+  it('stores the status field faithfully in the index', async () => {
+    const observations = [
+      makeObs(10, 'active', 'Status active entry'),
+      makeObs(11, 'resolved', 'Status resolved entry'),
+      makeObs(12, 'archived', 'Status archived entry'),
+    ];
+
+    await hydrateIndex(observations);
+
+    // Import getDb dynamically to access the raw database for verification
+    const { getDb } = await import('../../src/store/orama-store.js');
+    const db = await getDb();
+
+    // Search for each status value to confirm they're indexed
+    const activeHits = await search(db, { term: 'Status active entry', properties: ['title'] });
+    const resolvedHits = await search(db, { term: 'Status resolved entry', properties: ['title'] });
+    const archivedHits = await search(db, { term: 'Status archived entry', properties: ['title'] });
+
+    expect(activeHits.count).toBeGreaterThanOrEqual(1);
+    expect(resolvedHits.count).toBeGreaterThanOrEqual(1);
+    expect(archivedHits.count).toBeGreaterThanOrEqual(1);
+  });
+
+  it('skips malformed observations without crashing', async () => {
+    const observations = [
+      makeObs(20, 'active', 'Good observation'),
+      null,
+      { id: null, projectId: 'x' },
+      { id: 21 }, // missing projectId
+      makeObs(22, 'resolved', 'Another good one'),
+    ];
+
+    const inserted = await hydrateIndex(observations as any[]);
+    expect(inserted).toBe(2);
+  });
+
+  it('is idempotent – second call is a no-op', async () => {
+    const observations = [
+      makeObs(30, 'active', 'First hydration'),
+      makeObs(31, 'resolved', 'First hydration resolved'),
+    ];
+
+    const first = await hydrateIndex(observations);
+    expect(first).toBe(2);
+
+    // Second call with more observations should return 0 (already hydrated)
+    const second = await hydrateIndex([
+      ...observations,
+      makeObs(32, 'archived', 'Late arrival'),
+    ]);
+    expect(second).toBe(0);
+  });
+});