Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -373,8 +373,13 @@ async function handleCliOnly(command: string, args: string[]) {
}
// doctor is handled before connectEngine() above
case 'migrate': {
const { runMigrateEngine } = await import('./commands/migrate-engine.ts');
await runMigrateEngine(engine, args);
if (args.includes('--provider')) {
const { runMigrateProvider } = await import('./commands/migrate-provider.ts');
await runMigrateProvider(engine, args);
} else {
const { runMigrateEngine } = await import('./commands/migrate-engine.ts');
await runMigrateEngine(engine, args);
}
break;
}
case 'eval': {
Expand Down
23 changes: 18 additions & 5 deletions src/commands/init.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@ export async function runInit(args: string[]) {
const apiKey = keyIndex !== -1 ? args[keyIndex + 1] : null;
const pathIndex = args.indexOf('--path');
const customPath = pathIndex !== -1 ? args[pathIndex + 1] : null;
const providerIndex = args.indexOf('--provider');
const embeddingProvider = providerIndex !== -1 ? args[providerIndex + 1] as 'openai' | 'gemini' : undefined;
const dimsIndex = args.indexOf('--dimensions');
const embeddingDimensions = dimsIndex !== -1 ? parseInt(args[dimsIndex + 1], 10) : undefined;
if (embeddingProvider) {
process.env.GBRAIN_EMBEDDING_PROVIDER = embeddingProvider;
if (embeddingDimensions !== undefined) process.env.GBRAIN_EMBEDDING_DIMENSIONS = String(embeddingDimensions);
}

// Schema-only path: apply initSchema against the already-configured engine
// without ever calling saveConfig. Used by apply-migrations, the stopgap
Expand All @@ -47,7 +55,7 @@ export async function runInit(args: string[]) {
}
}

return initPGLite({ jsonOutput, apiKey, customPath });
return initPGLite({ jsonOutput, apiKey, customPath, embeddingProvider, embeddingDimensions });
}

// Supabase/Postgres mode
Expand All @@ -66,7 +74,7 @@ export async function runInit(args: string[]) {
databaseUrl = await supabaseWizard();
}

return initPostgres({ databaseUrl, jsonOutput, apiKey });
return initPostgres({ databaseUrl, jsonOutput, apiKey, embeddingProvider, embeddingDimensions });
}

/**
Expand Down Expand Up @@ -102,9 +110,10 @@ async function initMigrateOnly(opts: { jsonOutput: boolean }) {
}
}

async function initPGLite(opts: { jsonOutput: boolean; apiKey: string | null; customPath: string | null }) {
async function initPGLite(opts: { jsonOutput: boolean; apiKey: string | null; customPath: string | null; embeddingProvider?: 'openai' | 'gemini'; embeddingDimensions?: number }) {
const dbPath = opts.customPath || join(homedir(), '.gbrain', 'brain.pglite');
console.log(`Setting up local brain with PGLite (no server needed)...`);
const providerLabel = opts.embeddingProvider ? ` (provider: ${opts.embeddingProvider})` : '';
console.log(`Setting up local brain with PGLite (no server needed)${providerLabel}...`);

const engine = await createEngine({ engine: 'pglite' });
await engine.connect({ database_path: dbPath, engine: 'pglite' });
Expand All @@ -114,6 +123,8 @@ async function initPGLite(opts: { jsonOutput: boolean; apiKey: string | null; cu
engine: 'pglite',
database_path: dbPath,
...(opts.apiKey ? { openai_api_key: opts.apiKey } : {}),
...(opts.embeddingProvider ? { embedding_provider: opts.embeddingProvider } : {}),
...(opts.embeddingDimensions ? { embedding_dimensions: opts.embeddingDimensions } : {}),
};
saveConfig(config);

Expand All @@ -140,7 +151,7 @@ async function initPGLite(opts: { jsonOutput: boolean; apiKey: string | null; cu
}
}

async function initPostgres(opts: { databaseUrl: string; jsonOutput: boolean; apiKey: string | null }) {
async function initPostgres(opts: { databaseUrl: string; jsonOutput: boolean; apiKey: string | null; embeddingProvider?: 'openai' | 'gemini'; embeddingDimensions?: number }) {
const { databaseUrl } = opts;

// Detect Supabase direct connection URLs and warn about IPv6
Expand Down Expand Up @@ -194,6 +205,8 @@ async function initPostgres(opts: { databaseUrl: string; jsonOutput: boolean; ap
engine: 'postgres',
database_url: databaseUrl,
...(opts.apiKey ? { openai_api_key: opts.apiKey } : {}),
...(opts.embeddingProvider ? { embedding_provider: opts.embeddingProvider } : {}),
...(opts.embeddingDimensions ? { embedding_dimensions: opts.embeddingDimensions } : {}),
};
saveConfig(config);
console.log('Config saved to ~/.gbrain/config.json');
Expand Down
217 changes: 217 additions & 0 deletions src/commands/migrate-provider.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
/**
* FORK: gbrain migrate --provider <openai|gemini> [--dimensions N] [--dry-run]
*
* Migrates an existing brain from one embedding provider to another.
*
* Steps:
* 1. Read current provider from config table
* 2. Alter vector column to new dimensions (if dimensions differ)
* 3. Re-embed all chunks using the new provider
* 4. Update config table (embedding_model, embedding_dimensions, embedding_provider)
* 5. Persist new provider choice to ~/.gbrain/config.json
*
 * This is safe to resume: the re-embed step walks every chunk and upserts
 * the result, so re-running after an interruption simply redoes the work
 * (it does not skip chunks that already have embeddings), provided the
 * ALTER completed.
*/

import type { BrainEngine } from '../core/engine.ts';
import { loadConfig, saveConfig } from '../core/config.ts';
import { getActiveProvider, resetActiveProvider } from '../core/embedding-provider.ts';
import { GeminiEmbedder } from '../core/providers/gemini-embedder.ts';
import { OpenAIEmbedder } from '../core/providers/openai-embedder.ts';
import type { ChunkInput } from '../core/types.ts';
import { PGLiteEngine } from '../core/pglite-engine.ts';
import { PostgresEngine } from '../core/postgres-engine.ts';

const EMBED_BATCH = 50; // conservative for migration (avoids rate-limit spikes)

/**
 * CLI-only command. Must never be called with remote=true (MCP context).
 * This command does destructive DDL (ALTER TABLE, DROP COLUMN) and mutates
 * process.env — both are unsafe in a multi-tenant or remote-caller context.
 *
 * @param engine connected brain engine (PGLite or Postgres)
 * @param args   raw CLI args; recognises --provider, --dimensions, --dry-run
 * @param remote must stay false; guards against accidental MCP exposure
 */
export async function runMigrateProvider(
  engine: BrainEngine,
  args: string[],
  remote = false,
): Promise<void> {
  if (remote) {
    throw new Error('gbrain migrate --provider is a CLI-only command and cannot be called remotely.');
  }
  const providerIdx = args.indexOf('--provider');
  if (providerIdx === -1 || !args[providerIdx + 1]) {
    console.error('Usage: gbrain migrate --provider <openai|gemini> [--dimensions N] [--dry-run]');
    process.exit(1);
  }

  const newProviderName = args[providerIdx + 1] as 'openai' | 'gemini';
  if (newProviderName !== 'openai' && newProviderName !== 'gemini') {
    console.error(`Unknown provider "${newProviderName}". Use: openai or gemini`);
    process.exit(1);
  }

  const dimsIdx = args.indexOf('--dimensions');
  let requestedDims: number | undefined;
  if (dimsIdx !== -1) {
    requestedDims = parseInt(args[dimsIdx + 1], 10);
    // Gemini honours a 1–768 output-dimension override (see embedding-provider
    // docs); keep a generic 3072 ceiling for anything else so the message
    // matches what the provider can actually accept.
    const maxDims = newProviderName === 'gemini' ? 768 : 3072;
    if (!Number.isInteger(requestedDims) || requestedDims < 1 || requestedDims > maxDims) {
      console.error(`Invalid --dimensions "${args[dimsIdx + 1]}": must be an integer 1–${maxDims} for ${newProviderName}`);
      process.exit(1);
    }
    if (newProviderName === 'openai') {
      // OpenAIEmbedder takes no dimension override. Persisting the flag to
      // GBRAIN_EMBEDDING_DIMENSIONS anyway would desync env/config from the
      // vectors actually written, so warn and drop it instead.
      console.error('Warning: --dimensions is ignored for the openai provider (fixed model dimensions).');
      requestedDims = undefined;
    }
  }
  const dryRun = args.includes('--dry-run');

  // Build the new provider instance up front to learn its defaults
  // (GeminiEmbedder defaults to 768 dims when no override is given).
  const newProvider = newProviderName === 'gemini'
    ? new GeminiEmbedder(requestedDims)
    : new OpenAIEmbedder();

  // Read current state from the brain's config table, falling back to the
  // historical OpenAI defaults for brains created before these keys existed.
  const currentModel = await getConfigValue(engine, 'embedding_model') ?? 'text-embedding-3-large';
  const currentDims = parseInt(await getConfigValue(engine, 'embedding_dimensions') ?? '1536', 10);
  const currentProviderName = await getConfigValue(engine, 'embedding_provider') ?? 'openai';

  const newDims = newProvider.dimensions;
  const newModel = newProvider.model;
  const dimsChange = currentDims !== newDims;

  // Count chunks so the plan and the progress percentages are accurate.
  const allSlugs = await engine.getAllSlugs();
  let totalChunks = 0;
  for (const slug of allSlugs) {
    const chunks = await engine.getChunks(slug);
    totalChunks += chunks.length;
  }

  console.log('');
  console.log(`Switching embedding provider:`);
  console.log(` From: ${currentProviderName} — ${currentModel} (${currentDims} dims)`);
  console.log(` To: ${newProviderName} — ${newModel} (${newDims} dims)`);
  console.log('');
  console.log(`Brain has ${allSlugs.size} pages, ${totalChunks} chunks to re-embed.`);
  if (dimsChange) {
    console.log(`Vector column will change: vector(${currentDims}) → vector(${newDims})`);
    console.log('All existing embeddings will be dropped during the alter.');
  }
  const batches = Math.ceil(totalChunks / EMBED_BATCH);
  console.log(`Estimated API batches: ${batches} (${EMBED_BATCH} chunks/batch)`);
  console.log('');

  if (dryRun) {
    console.log('[dry-run] No changes made.');
    return;
  }

  // Step 1: Alter vector column if dimensions change. This drops every
  // existing embedding, making the re-embed below mandatory afterwards.
  if (dimsChange) {
    console.log(`Altering vector column: vector(${currentDims}) → vector(${newDims})...`);
    const alterSql = [
      `DROP INDEX IF EXISTS idx_chunks_embedding`,
      `ALTER TABLE content_chunks DROP COLUMN IF EXISTS embedding`,
      `ALTER TABLE content_chunks ADD COLUMN embedding vector(${newDims})`,
      `CREATE INDEX idx_chunks_embedding ON content_chunks USING hnsw (embedding vector_cosine_ops)`,
    ].join(';\n');
    await execRawSQL(engine, alterSql);
    console.log(' Schema altered.');
  }

  // Step 2: Point the provider factory at the new provider so any later code
  // that embeds via getActiveProvider() agrees with the vectors written here.
  // requestedDims is only ever set for gemini (see validation above).
  process.env.GBRAIN_EMBEDDING_PROVIDER = newProviderName;
  if (requestedDims) process.env.GBRAIN_EMBEDDING_DIMENSIONS = String(requestedDims);
  resetActiveProvider(); // force factory to re-read env

  // Step 3: Re-embed all chunks slug by slug, batching API calls.
  console.log('Re-embedding chunks...');
  let done = 0;
  for (const slug of allSlugs) {
    const chunks = await engine.getChunks(slug);
    if (chunks.length === 0) continue;

    // Embed in sub-batches of EMBED_BATCH to avoid rate-limit spikes.
    const chunkInputs: ChunkInput[] = [];
    for (let i = 0; i < chunks.length; i += EMBED_BATCH) {
      const batch = chunks.slice(i, i + EMBED_BATCH);
      const texts = batch.map(c => c.chunk_text);
      const embeddings = await newProvider.embedBatch(texts);
      for (let j = 0; j < batch.length; j++) {
        chunkInputs.push({
          chunk_index: batch[j].chunk_index,
          chunk_text: batch[j].chunk_text,
          chunk_source: batch[j].chunk_source,
          embedding: embeddings[j],
          model: newModel,
          token_count: batch[j].token_count,
        });
      }
      done += batch.length;
      const pct = Math.round((done / totalChunks) * 100);
      process.stdout.write(`\r Progress: ${done}/${totalChunks} chunks (${pct}%)`);
    }

    // Upsert per slug only after every batch for the slug embedded cleanly.
    await engine.upsertChunks(slug, chunkInputs);
  }
  console.log('\n Done re-embedding.');

  // Step 4: Update config table so the brain self-describes its provider.
  await setConfigValue(engine, 'embedding_model', newModel);
  await setConfigValue(engine, 'embedding_dimensions', String(newDims));
  await setConfigValue(engine, 'embedding_provider', newProviderName);
  console.log('Config table updated.');

  // Step 5: Persist to ~/.gbrain/config.json so future processes pick the
  // new provider up without env vars.
  const fileConfig = loadConfig();
  if (fileConfig) {
    saveConfig({
      ...fileConfig,
      embedding_provider: newProviderName,
      embedding_dimensions: newDims,
    });
    console.log('~/.gbrain/config.json updated.');
  }

  console.log('');
  console.log(`Migration complete. Brain now uses ${newProviderName} (${newModel}, ${newDims} dims).`);
  console.log(`Verify: gbrain query "test"`);
}

// ─── helpers ────────────────────────────────────────────────────────────────

/**
 * Look up a single key in the brain's `config` table.
 * Returns null when the key is absent, the engine type is unsupported,
 * or the table does not exist yet (errors are deliberately swallowed).
 */
async function getConfigValue(engine: BrainEngine, key: string): Promise<string | null> {
  try {
    if (engine instanceof PGLiteEngine) {
      const result = await engine.db.query<{ value: string }>(
        `SELECT value FROM config WHERE key = $1`, [key]
      );
      const first = result.rows[0];
      return first ? first.value : null;
    }
    if (engine instanceof PostgresEngine) {
      const found = await engine.sql`SELECT value FROM config WHERE key = ${key}`;
      const first = found[0] as { value: string } | undefined;
      return first ? first.value : null;
    }
  } catch {
    // config table may not exist yet (pre-migration brain) — treat as unset
  }
  return null;
}

/**
 * Upsert a single key/value pair into the brain's `config` table.
 *
 * @throws Error if the engine is neither PGLite nor Postgres. Silently
 *         dropping the write (previous behavior) would let the migration
 *         report success while the brain's recorded provider stays stale;
 *         throwing matches execRawSQL's handling of unsupported engines.
 */
async function setConfigValue(engine: BrainEngine, key: string, value: string): Promise<void> {
  if (engine instanceof PGLiteEngine) {
    await engine.db.query(
      `INSERT INTO config (key, value) VALUES ($1, $2) ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value`,
      [key, value]
    );
  } else if (engine instanceof PostgresEngine) {
    await engine.sql`
      INSERT INTO config (key, value) VALUES (${key}, ${value})
      ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value
    `;
  } else {
    throw new Error('Unsupported engine for config update');
  }
}

/**
 * Execute a raw (possibly multi-statement) SQL string against either
 * supported engine. Used for the DDL that alters the vector column.
 */
async function execRawSQL(engine: BrainEngine, sql: string): Promise<void> {
  if (engine instanceof PGLiteEngine) {
    await engine.db.exec(sql);
    return;
  }
  if (engine instanceof PostgresEngine) {
    await engine.sql.unsafe(sql);
    return;
  }
  throw new Error('Unsupported engine for raw SQL migration');
}
14 changes: 12 additions & 2 deletions src/core/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ export interface GBrainConfig {
database_path?: string;
openai_api_key?: string;
anthropic_api_key?: string;
embedding_provider?: 'openai' | 'gemini';
embedding_dimensions?: number;
}

/**
Expand Down Expand Up @@ -41,8 +43,16 @@ export function loadConfig(): GBrainConfig | null {
engine: inferredEngine,
...(dbUrl ? { database_url: dbUrl } : {}),
...(process.env.OPENAI_API_KEY ? { openai_api_key: process.env.OPENAI_API_KEY } : {}),
};
return merged as GBrainConfig;
} as GBrainConfig;

if (merged.embedding_provider && !process.env.GBRAIN_EMBEDDING_PROVIDER) {
process.env.GBRAIN_EMBEDDING_PROVIDER = merged.embedding_provider;
}
if (merged.embedding_dimensions && !process.env.GBRAIN_EMBEDDING_DIMENSIONS) {
process.env.GBRAIN_EMBEDDING_DIMENSIONS = String(merged.embedding_dimensions);
}

return merged;
}

export function saveConfig(config: GBrainConfig): void {
Expand Down
56 changes: 56 additions & 0 deletions src/core/embedding-provider.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/**
* FORK: Provider-agnostic embedding abstraction (Option C).
*
* Providers:
* openai — text-embedding-3-large, 1536 dims (default)
* gemini — text-embedding-004, 768 dims (new brains)
*
* Config env vars:
* GBRAIN_EMBEDDING_PROVIDER=openai|gemini (default: openai)
* GBRAIN_EMBEDDING_DIMENSIONS=N (Gemini only: override output dims, 1–768)
*
* Schema note: changing provider on an existing brain requires a re-embed migration
* if dimensions differ. New brains pick up the dimension at init time.
*/

import { OpenAIEmbedder } from './providers/openai-embedder.ts';
import { GeminiEmbedder } from './providers/gemini-embedder.ts';

/**
 * Minimal contract every embedding backend must satisfy.
 */
export interface EmbeddingProvider {
  /** Model identifier recorded with each chunk (e.g. for later migrations). */
  readonly model: string;
  /** Output vector length; must agree with the brain's vector(N) column. */
  readonly dimensions: number;
  /** Embed a single text. */
  embed(text: string): Promise<Float32Array>;
  /** Embed many texts in one call; providers may batch over the wire. */
  embedBatch(texts: string[]): Promise<Float32Array[]>;
}

// Lazily-constructed singleton; cleared by resetActiveProvider() when env changes.
let _active: EmbeddingProvider | null = null;

/**
 * Return the process-wide embedding provider, constructing it on first use
 * from GBRAIN_EMBEDDING_PROVIDER / GBRAIN_EMBEDDING_DIMENSIONS. Any value
 * other than 'gemini' falls back to the OpenAI default.
 */
export function getActiveProvider(): EmbeddingProvider {
  if (!_active) {
    const name = (process.env.GBRAIN_EMBEDDING_PROVIDER ?? 'openai').toLowerCase();
    if (name === 'gemini') {
      // Guard against a malformed env override: parseInt('abc') is NaN and
      // would otherwise be handed straight to GeminiEmbedder. Valid range is
      // 1–768 (documented above); anything else falls back to the 768 default.
      const parsed = parseInt(process.env.GBRAIN_EMBEDDING_DIMENSIONS ?? '768', 10);
      const dims = Number.isInteger(parsed) && parsed >= 1 && parsed <= 768 ? parsed : 768;
      _active = new GeminiEmbedder(dims);
    } else {
      _active = new OpenAIEmbedder();
    }
  }
  return _active;
}

/**
 * Whether the credentials for the currently-selected provider are present
 * in the environment. Prefer this over checking OPENAI_API_KEY directly —
 * it works for every provider the factory knows about.
 */
export function isEmbeddingAvailable(): boolean {
  const selected = (process.env.GBRAIN_EMBEDDING_PROVIDER ?? 'openai').toLowerCase();
  const candidateKeys = selected === 'gemini'
    ? [process.env.GOOGLE_API_KEY, process.env.GEMINI_API_KEY]
    : [process.env.OPENAI_API_KEY];
  return candidateKeys.some(k => !!k);
}

/**
 * Drop the cached provider so the next getActiveProvider() call re-reads the
 * environment. Used in tests when env vars change between cases, and by
 * `gbrain migrate --provider`, which mutates GBRAIN_EMBEDDING_* mid-process.
 */
export function resetActiveProvider(): void {
  _active = null;
}
Loading