Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -373,8 +373,13 @@ async function handleCliOnly(command: string, args: string[]) {
}
// doctor is handled before connectEngine() above
case 'migrate': {
const { runMigrateEngine } = await import('./commands/migrate-engine.ts');
await runMigrateEngine(engine, args);
if (args.includes('--provider')) {
const { runMigrateProvider } = await import('./commands/migrate-provider.ts');
await runMigrateProvider(engine, args);
} else {
const { runMigrateEngine } = await import('./commands/migrate-engine.ts');
await runMigrateEngine(engine, args);
}
break;
}
case 'eval': {
Expand Down
23 changes: 18 additions & 5 deletions src/commands/init.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@ export async function runInit(args: string[]) {
const apiKey = keyIndex !== -1 ? args[keyIndex + 1] : null;
const pathIndex = args.indexOf('--path');
const customPath = pathIndex !== -1 ? args[pathIndex + 1] : null;
const providerIndex = args.indexOf('--provider');
const embeddingProvider = providerIndex !== -1 ? args[providerIndex + 1] as 'openai' | 'gemini' : undefined;
const dimsIndex = args.indexOf('--dimensions');
const embeddingDimensions = dimsIndex !== -1 ? parseInt(args[dimsIndex + 1], 10) : undefined;
if (embeddingProvider) {
process.env.GBRAIN_EMBEDDING_PROVIDER = embeddingProvider;
if (embeddingDimensions !== undefined) process.env.GBRAIN_EMBEDDING_DIMENSIONS = String(embeddingDimensions);
}

// Schema-only path: apply initSchema against the already-configured engine
// without ever calling saveConfig. Used by apply-migrations, the stopgap
Expand All @@ -47,7 +55,7 @@ export async function runInit(args: string[]) {
}
}

return initPGLite({ jsonOutput, apiKey, customPath });
return initPGLite({ jsonOutput, apiKey, customPath, embeddingProvider, embeddingDimensions });
}

// Supabase/Postgres mode
Expand All @@ -66,7 +74,7 @@ export async function runInit(args: string[]) {
databaseUrl = await supabaseWizard();
}

return initPostgres({ databaseUrl, jsonOutput, apiKey });
return initPostgres({ databaseUrl, jsonOutput, apiKey, embeddingProvider, embeddingDimensions });
}

/**
Expand Down Expand Up @@ -102,9 +110,10 @@ async function initMigrateOnly(opts: { jsonOutput: boolean }) {
}
}

async function initPGLite(opts: { jsonOutput: boolean; apiKey: string | null; customPath: string | null }) {
async function initPGLite(opts: { jsonOutput: boolean; apiKey: string | null; customPath: string | null; embeddingProvider?: 'openai' | 'gemini'; embeddingDimensions?: number }) {
const dbPath = opts.customPath || join(homedir(), '.gbrain', 'brain.pglite');
console.log(`Setting up local brain with PGLite (no server needed)...`);
const providerLabel = opts.embeddingProvider ? ` (provider: ${opts.embeddingProvider})` : '';
console.log(`Setting up local brain with PGLite (no server needed)${providerLabel}...`);

const engine = await createEngine({ engine: 'pglite' });
await engine.connect({ database_path: dbPath, engine: 'pglite' });
Expand All @@ -114,6 +123,8 @@ async function initPGLite(opts: { jsonOutput: boolean; apiKey: string | null; cu
engine: 'pglite',
database_path: dbPath,
...(opts.apiKey ? { openai_api_key: opts.apiKey } : {}),
...(opts.embeddingProvider ? { embedding_provider: opts.embeddingProvider } : {}),
...(opts.embeddingDimensions ? { embedding_dimensions: opts.embeddingDimensions } : {}),
};
saveConfig(config);

Expand All @@ -140,7 +151,7 @@ async function initPGLite(opts: { jsonOutput: boolean; apiKey: string | null; cu
}
}

async function initPostgres(opts: { databaseUrl: string; jsonOutput: boolean; apiKey: string | null }) {
async function initPostgres(opts: { databaseUrl: string; jsonOutput: boolean; apiKey: string | null; embeddingProvider?: 'openai' | 'gemini'; embeddingDimensions?: number }) {
const { databaseUrl } = opts;

// Detect Supabase direct connection URLs and warn about IPv6
Expand Down Expand Up @@ -194,6 +205,8 @@ async function initPostgres(opts: { databaseUrl: string; jsonOutput: boolean; ap
engine: 'postgres',
database_url: databaseUrl,
...(opts.apiKey ? { openai_api_key: opts.apiKey } : {}),
...(opts.embeddingProvider ? { embedding_provider: opts.embeddingProvider } : {}),
...(opts.embeddingDimensions ? { embedding_dimensions: opts.embeddingDimensions } : {}),
};
saveConfig(config);
console.log('Config saved to ~/.gbrain/config.json');
Expand Down
217 changes: 217 additions & 0 deletions src/commands/migrate-provider.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
/**
* FORK: gbrain migrate --provider <openai|gemini> [--dimensions N] [--dry-run]
*
* Migrates an existing brain from one embedding provider to another.
*
* Steps:
* 1. Read current provider from config table
* 2. Alter vector column to new dimensions (if dimensions differ)
* 3. Re-embed all chunks using the new provider
* 4. Update config table (embedding_model, embedding_dimensions, embedding_provider)
* 5. Persist new provider choice to ~/.gbrain/config.json
*
 * This is safe to resume: the re-embed step walks every chunk and upserts
 * the result, so re-running after an interruption simply redoes the work
 * (it does not skip chunks that already have embeddings), provided the
 * ALTER completed.
*/

import type { BrainEngine } from '../core/engine.ts';
import { loadConfig, saveConfig } from '../core/config.ts';
import { getActiveProvider, resetActiveProvider } from '../core/embedding-provider.ts';
import { GeminiEmbedder } from '../core/providers/gemini-embedder.ts';
import { OpenAIEmbedder } from '../core/providers/openai-embedder.ts';
import type { ChunkInput } from '../core/types.ts';
import { PGLiteEngine } from '../core/pglite-engine.ts';
import { PostgresEngine } from '../core/postgres-engine.ts';

const EMBED_BATCH = 50; // conservative for migration (avoids rate-limit spikes)

/**
 * CLI-only command. Must never be called with remote=true (MCP context).
 * This command does destructive DDL (ALTER TABLE, DROP COLUMN) and mutates
 * process.env — both are unsafe in a multi-tenant or remote-caller context.
 *
 * @param engine connected brain engine (PGLite or Postgres)
 * @param args   raw CLI args; recognises --provider, --dimensions, --dry-run
 * @param remote must stay false; guards against accidental MCP exposure
 */
export async function runMigrateProvider(
  engine: BrainEngine,
  args: string[],
  remote = false,
): Promise<void> {
  if (remote) {
    throw new Error('gbrain migrate --provider is a CLI-only command and cannot be called remotely.');
  }
  const providerIdx = args.indexOf('--provider');
  if (providerIdx === -1 || !args[providerIdx + 1]) {
    console.error('Usage: gbrain migrate --provider <openai|gemini> [--dimensions N] [--dry-run]');
    process.exit(1);
  }

  const newProviderName = args[providerIdx + 1] as 'openai' | 'gemini';
  if (newProviderName !== 'openai' && newProviderName !== 'gemini') {
    console.error(`Unknown provider "${newProviderName}". Use: openai or gemini`);
    process.exit(1);
  }

  const dimsIdx = args.indexOf('--dimensions');
  let requestedDims: number | undefined;
  if (dimsIdx !== -1) {
    requestedDims = parseInt(args[dimsIdx + 1], 10);
    // Gemini honours a 1–768 output-dimension override (see embedding-provider
    // docs); keep a generic 3072 ceiling for anything else so the message
    // matches what the provider can actually accept.
    const maxDims = newProviderName === 'gemini' ? 768 : 3072;
    if (!Number.isInteger(requestedDims) || requestedDims < 1 || requestedDims > maxDims) {
      console.error(`Invalid --dimensions "${args[dimsIdx + 1]}": must be an integer 1–${maxDims} for ${newProviderName}`);
      process.exit(1);
    }
    if (newProviderName === 'openai') {
      // OpenAIEmbedder takes no dimension override. Persisting the flag to
      // GBRAIN_EMBEDDING_DIMENSIONS anyway would desync env/config from the
      // vectors actually written, so warn and drop it instead.
      console.error('Warning: --dimensions is ignored for the openai provider (fixed model dimensions).');
      requestedDims = undefined;
    }
  }
  const dryRun = args.includes('--dry-run');

  // Build the new provider instance up front to learn its defaults
  // (GeminiEmbedder defaults to 768 dims when no override is given).
  const newProvider = newProviderName === 'gemini'
    ? new GeminiEmbedder(requestedDims)
    : new OpenAIEmbedder();

  // Read current state from the brain's config table, falling back to the
  // historical OpenAI defaults for brains created before these keys existed.
  const currentModel = await getConfigValue(engine, 'embedding_model') ?? 'text-embedding-3-large';
  const currentDims = parseInt(await getConfigValue(engine, 'embedding_dimensions') ?? '1536', 10);
  const currentProviderName = await getConfigValue(engine, 'embedding_provider') ?? 'openai';

  const newDims = newProvider.dimensions;
  const newModel = newProvider.model;
  const dimsChange = currentDims !== newDims;

  // Count chunks so the plan and the progress percentages are accurate.
  const allSlugs = await engine.getAllSlugs();
  let totalChunks = 0;
  for (const slug of allSlugs) {
    const chunks = await engine.getChunks(slug);
    totalChunks += chunks.length;
  }

  console.log('');
  console.log(`Switching embedding provider:`);
  console.log(` From: ${currentProviderName} — ${currentModel} (${currentDims} dims)`);
  console.log(` To: ${newProviderName} — ${newModel} (${newDims} dims)`);
  console.log('');
  console.log(`Brain has ${allSlugs.size} pages, ${totalChunks} chunks to re-embed.`);
  if (dimsChange) {
    console.log(`Vector column will change: vector(${currentDims}) → vector(${newDims})`);
    console.log('All existing embeddings will be dropped during the alter.');
  }
  const batches = Math.ceil(totalChunks / EMBED_BATCH);
  console.log(`Estimated API batches: ${batches} (${EMBED_BATCH} chunks/batch)`);
  console.log('');

  if (dryRun) {
    console.log('[dry-run] No changes made.');
    return;
  }

  // Step 1: Alter vector column if dimensions change. This drops every
  // existing embedding, making the re-embed below mandatory afterwards.
  if (dimsChange) {
    console.log(`Altering vector column: vector(${currentDims}) → vector(${newDims})...`);
    const alterSql = [
      `DROP INDEX IF EXISTS idx_chunks_embedding`,
      `ALTER TABLE content_chunks DROP COLUMN IF EXISTS embedding`,
      `ALTER TABLE content_chunks ADD COLUMN embedding vector(${newDims})`,
      `CREATE INDEX idx_chunks_embedding ON content_chunks USING hnsw (embedding vector_cosine_ops)`,
    ].join(';\n');
    await execRawSQL(engine, alterSql);
    console.log(' Schema altered.');
  }

  // Step 2: Point the provider factory at the new provider so any later code
  // that embeds via getActiveProvider() agrees with the vectors written here.
  // requestedDims is only ever set for gemini (see validation above).
  process.env.GBRAIN_EMBEDDING_PROVIDER = newProviderName;
  if (requestedDims) process.env.GBRAIN_EMBEDDING_DIMENSIONS = String(requestedDims);
  resetActiveProvider(); // force factory to re-read env

  // Step 3: Re-embed all chunks slug by slug, batching API calls.
  console.log('Re-embedding chunks...');
  let done = 0;
  for (const slug of allSlugs) {
    const chunks = await engine.getChunks(slug);
    if (chunks.length === 0) continue;

    // Embed in sub-batches of EMBED_BATCH to avoid rate-limit spikes.
    const chunkInputs: ChunkInput[] = [];
    for (let i = 0; i < chunks.length; i += EMBED_BATCH) {
      const batch = chunks.slice(i, i + EMBED_BATCH);
      const texts = batch.map(c => c.chunk_text);
      const embeddings = await newProvider.embedBatch(texts);
      for (let j = 0; j < batch.length; j++) {
        chunkInputs.push({
          chunk_index: batch[j].chunk_index,
          chunk_text: batch[j].chunk_text,
          chunk_source: batch[j].chunk_source,
          embedding: embeddings[j],
          model: newModel,
          token_count: batch[j].token_count,
        });
      }
      done += batch.length;
      const pct = Math.round((done / totalChunks) * 100);
      process.stdout.write(`\r Progress: ${done}/${totalChunks} chunks (${pct}%)`);
    }

    // Upsert per slug only after every batch for the slug embedded cleanly.
    await engine.upsertChunks(slug, chunkInputs);
  }
  console.log('\n Done re-embedding.');

  // Step 4: Update config table so the brain self-describes its provider.
  await setConfigValue(engine, 'embedding_model', newModel);
  await setConfigValue(engine, 'embedding_dimensions', String(newDims));
  await setConfigValue(engine, 'embedding_provider', newProviderName);
  console.log('Config table updated.');

  // Step 5: Persist to ~/.gbrain/config.json so future processes pick the
  // new provider up without env vars.
  const fileConfig = loadConfig();
  if (fileConfig) {
    saveConfig({
      ...fileConfig,
      embedding_provider: newProviderName,
      embedding_dimensions: newDims,
    });
    console.log('~/.gbrain/config.json updated.');
  }

  console.log('');
  console.log(`Migration complete. Brain now uses ${newProviderName} (${newModel}, ${newDims} dims).`);
  console.log(`Verify: gbrain query "test"`);
}

// ─── helpers ────────────────────────────────────────────────────────────────

/**
 * Look up a single key in the brain's `config` table.
 * Returns null when the key is absent, the engine type is unsupported,
 * or the table does not exist yet (errors are deliberately swallowed).
 */
async function getConfigValue(engine: BrainEngine, key: string): Promise<string | null> {
  try {
    if (engine instanceof PGLiteEngine) {
      const result = await engine.db.query<{ value: string }>(
        `SELECT value FROM config WHERE key = $1`, [key]
      );
      const first = result.rows[0];
      return first ? first.value : null;
    }
    if (engine instanceof PostgresEngine) {
      const found = await engine.sql`SELECT value FROM config WHERE key = ${key}`;
      const first = found[0] as { value: string } | undefined;
      return first ? first.value : null;
    }
  } catch {
    // config table may not exist yet (pre-migration brain) — treat as unset
  }
  return null;
}

/**
 * Upsert a single key/value pair into the brain's `config` table.
 *
 * @throws Error if the engine is neither PGLite nor Postgres. Silently
 *         dropping the write (previous behavior) would let the migration
 *         report success while the brain's recorded provider stays stale;
 *         throwing matches execRawSQL's handling of unsupported engines.
 */
async function setConfigValue(engine: BrainEngine, key: string, value: string): Promise<void> {
  if (engine instanceof PGLiteEngine) {
    await engine.db.query(
      `INSERT INTO config (key, value) VALUES ($1, $2) ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value`,
      [key, value]
    );
  } else if (engine instanceof PostgresEngine) {
    await engine.sql`
      INSERT INTO config (key, value) VALUES (${key}, ${value})
      ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value
    `;
  } else {
    throw new Error('Unsupported engine for config update');
  }
}

/**
 * Execute a raw (possibly multi-statement) SQL string against either
 * supported engine. Used for the DDL that alters the vector column.
 */
async function execRawSQL(engine: BrainEngine, sql: string): Promise<void> {
  if (engine instanceof PGLiteEngine) {
    await engine.db.exec(sql);
    return;
  }
  if (engine instanceof PostgresEngine) {
    await engine.sql.unsafe(sql);
    return;
  }
  throw new Error('Unsupported engine for raw SQL migration');
}
14 changes: 12 additions & 2 deletions src/core/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ export interface GBrainConfig {
database_path?: string;
openai_api_key?: string;
anthropic_api_key?: string;
embedding_provider?: 'openai' | 'gemini';
embedding_dimensions?: number;
}

/**
Expand Down Expand Up @@ -41,8 +43,16 @@ export function loadConfig(): GBrainConfig | null {
engine: inferredEngine,
...(dbUrl ? { database_url: dbUrl } : {}),
...(process.env.OPENAI_API_KEY ? { openai_api_key: process.env.OPENAI_API_KEY } : {}),
};
return merged as GBrainConfig;
} as GBrainConfig;

if (merged.embedding_provider && !process.env.GBRAIN_EMBEDDING_PROVIDER) {
process.env.GBRAIN_EMBEDDING_PROVIDER = merged.embedding_provider;
}
if (merged.embedding_dimensions && !process.env.GBRAIN_EMBEDDING_DIMENSIONS) {
process.env.GBRAIN_EMBEDDING_DIMENSIONS = String(merged.embedding_dimensions);
}

return merged;
}

export function saveConfig(config: GBrainConfig): void {
Expand Down
56 changes: 56 additions & 0 deletions src/core/embedding-provider.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/**
* FORK: Provider-agnostic embedding abstraction (Option C).
*
* Providers:
* openai — text-embedding-3-large, 1536 dims (default)
* gemini — text-embedding-004, 768 dims (new brains)
*
* Config env vars:
* GBRAIN_EMBEDDING_PROVIDER=openai|gemini (default: openai)
* GBRAIN_EMBEDDING_DIMENSIONS=N (Gemini only: override output dims, 1–768)
*
* Schema note: changing provider on an existing brain requires a re-embed migration
* if dimensions differ. New brains pick up the dimension at init time.
*/

import { OpenAIEmbedder } from './providers/openai-embedder.ts';
import { GeminiEmbedder } from './providers/gemini-embedder.ts';

/**
 * Minimal contract every embedding backend must satisfy.
 */
export interface EmbeddingProvider {
  /** Model identifier recorded with each chunk (e.g. for later migrations). */
  readonly model: string;
  /** Output vector length; must agree with the brain's vector(N) column. */
  readonly dimensions: number;
  /** Embed a single text. */
  embed(text: string): Promise<Float32Array>;
  /** Embed many texts in one call; providers may batch over the wire. */
  embedBatch(texts: string[]): Promise<Float32Array[]>;
}

// Lazily-constructed singleton; cleared by resetActiveProvider() when env changes.
let _active: EmbeddingProvider | null = null;

/**
 * Return the process-wide embedding provider, constructing it on first use
 * from GBRAIN_EMBEDDING_PROVIDER / GBRAIN_EMBEDDING_DIMENSIONS. Any value
 * other than 'gemini' falls back to the OpenAI default.
 */
export function getActiveProvider(): EmbeddingProvider {
  if (!_active) {
    const name = (process.env.GBRAIN_EMBEDDING_PROVIDER ?? 'openai').toLowerCase();
    if (name === 'gemini') {
      // Guard against a malformed env override: parseInt('abc') is NaN and
      // would otherwise be handed straight to GeminiEmbedder. Valid range is
      // 1–768 (documented above); anything else falls back to the 768 default.
      const parsed = parseInt(process.env.GBRAIN_EMBEDDING_DIMENSIONS ?? '768', 10);
      const dims = Number.isInteger(parsed) && parsed >= 1 && parsed <= 768 ? parsed : 768;
      _active = new GeminiEmbedder(dims);
    } else {
      _active = new OpenAIEmbedder();
    }
  }
  return _active;
}

/**
 * Whether the credentials for the currently-selected provider are present
 * in the environment. Prefer this over checking OPENAI_API_KEY directly —
 * it works for every provider the factory knows about.
 */
export function isEmbeddingAvailable(): boolean {
  const selected = (process.env.GBRAIN_EMBEDDING_PROVIDER ?? 'openai').toLowerCase();
  const candidateKeys = selected === 'gemini'
    ? [process.env.GOOGLE_API_KEY, process.env.GEMINI_API_KEY]
    : [process.env.OPENAI_API_KEY];
  return candidateKeys.some(k => !!k);
}

/**
 * Drop the cached provider so the next getActiveProvider() call re-reads the
 * environment. Used in tests when env vars change between cases, and by
 * `gbrain migrate --provider`, which mutates GBRAIN_EMBEDDING_* mid-process.
 */
export function resetActiveProvider(): void {
  _active = null;
}
Loading