6 changes: 6 additions & 0 deletions .gitignore
@@ -1,8 +1,14 @@
worker/node_modules/
worker/.dev.vars
worker/.secrets.local
.DS_Store
*.xcuserstate
build/
releases/
.claude/
coding-plans/

# Windows / .NET build output
windows/**/bin/
windows/**/obj/
*.user
19 changes: 15 additions & 4 deletions AGENTS.md
@@ -5,16 +5,24 @@

## Overview

macOS menu bar companion app. Lives entirely in the macOS status bar (no dock icon, no main window). Clicking the menu bar icon opens a custom floating panel with companion voice controls. Uses push-to-talk (ctrl+option) to capture voice input, transcribes it via AssemblyAI streaming, and sends the transcript + a screenshot of the user's screen to Claude. Claude responds with text (streamed via SSE) and voice (ElevenLabs TTS). A blue cursor overlay can fly to and point at UI elements Claude references on any connected monitor.
Cross-platform companion app. Lives entirely in the OS's system tray / menu bar (no dock icon, no main window). Clicking the tray icon opens a custom floating panel with companion voice controls. Uses push-to-talk (ctrl+option on macOS, ctrl+alt on Windows) to capture voice input, transcribes it via AssemblyAI streaming, and sends the transcript + a screenshot of the user's screen to the active AI provider (Claude or Gemini). The AI responds with text (streamed via SSE) and voice (ElevenLabs TTS). A blue cursor overlay can fly to and point at UI elements the AI references on any connected monitor.

All API keys live on a Cloudflare Worker proxy — nothing sensitive ships in the app.
All API keys live on a Cloudflare Worker proxy — nothing sensitive ships in either app.
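
A rough sketch of that round-trip, using hypothetical protocol seams (`ScreenCapturing`, `ChatStreaming`, `Speaking`, and `VoicePipeline` are illustrative stand-ins, not the apps' real types):

```swift
import Foundation

// Hypothetical seams standing in for the real services; illustrative only.
protocol ScreenCapturing { func captureScreenshot() async throws -> Data }
protocol ChatStreaming {
    /// Streams the AI reply; onChunk receives SSE text chunks as they arrive.
    func streamChat(transcript: String, image: Data,
                    onChunk: @escaping (String) -> Void) async throws
}
protocol Speaking { func speak(_ text: String) async throws }

struct VoicePipeline {
    let capture: ScreenCapturing
    let chat: ChatStreaming
    let tts: Speaking

    /// Runs once per push-to-talk release: screenshot, then AI, then TTS.
    func handleRelease(transcript: String) async throws {
        let screenshot = try await capture.captureScreenshot()
        var reply = ""
        try await chat.streamChat(transcript: transcript, image: screenshot) {
            reply += $0 // accumulate streamed text for TTS playback
        }
        try await tts.speak(reply)
    }
}
```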

## Repository layout

| Folder | Purpose |
|--------|---------|
| `leanring-buddy/` + `leanring-buddy.xcodeproj` | macOS menu bar app. Swift + SwiftUI + AppKit. Ships first; most features live here. |
| `windows/` | Windows port. C# + WPF on .NET 8. Currently at Milestone 1 (foundation). See `windows/README.md` for milestone progress. |
| `worker/` | Cloudflare Worker proxy. Shared by both apps unchanged — same routes, same secrets. |

## Architecture

- **App Type**: Menu bar-only (`LSUIElement=true`), no dock icon or main window
- **Framework**: SwiftUI (macOS native) with AppKit bridging for menu bar panel and cursor overlay
- **Pattern**: MVVM with `@StateObject` / `@Published` state management
- **AI Chat**: Claude (Sonnet 4.6 default, Opus 4.6 optional) via Cloudflare Worker proxy with SSE streaming
- **AI Chat**: User-selectable provider — Claude (Sonnet 4.6 default, Opus 4.6 optional) or Gemini (2.5 Flash, 2.5 Pro). Both route through the Cloudflare Worker proxy with SSE streaming.
- **Speech-to-Text**: AssemblyAI real-time streaming (`u3-rt-pro` model) via websocket, with OpenAI and Apple Speech as fallbacks
- **Text-to-Speech**: ElevenLabs (`eleven_flash_v2_5` model) via Cloudflare Worker proxy
- **Screen Capture**: ScreenCaptureKit (macOS 14.2+), multi-monitor support
@@ -30,10 +38,11 @@ The app never calls external APIs directly. All requests go through a Cloudflare
| Route | Upstream | Purpose |
|-------|----------|---------|
| `POST /chat` | `api.anthropic.com/v1/messages` | Claude vision + streaming chat |
| `POST /chat-gemini` | `generativelanguage.googleapis.com/v1beta/models/{model}:streamGenerateContent` | Gemini vision + streaming chat. The `model` field in the request body is used to build the upstream URL path (see the request sketch below). |
| `POST /tts` | `api.elevenlabs.io/v1/text-to-speech/{voiceId}` | ElevenLabs TTS audio |
| `POST /transcribe-token` | `streaming.assemblyai.com/v3/token` | Fetches a short-lived (480s) AssemblyAI websocket token |

Worker secrets: `ANTHROPIC_API_KEY`, `ASSEMBLYAI_API_KEY`, `ELEVENLABS_API_KEY`
Worker secrets: `ANTHROPIC_API_KEY`, `GEMINI_API_KEY`, `ASSEMBLYAI_API_KEY`, `ELEVENLABS_API_KEY`
Worker vars: `ELEVENLABS_VOICE_ID`
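
For illustration, a client call to `/chat-gemini` might look like the sketch below. The `model`-in-body contract is documented in the routes table; the remaining body fields are assumptions based on the public Gemini REST shape, not the app's actual request builder.

```swift
import Foundation

// Sketch of a client request to the Worker's /chat-gemini route. The
// `model` field is the documented contract (the Worker splices it into
// the upstream URL path); the remaining fields are assumed for illustration.
func makeGeminiProxyRequest(workerBaseURL: String, prompt: String) throws -> URLRequest {
    var request = URLRequest(url: URL(string: "\(workerBaseURL)/chat-gemini")!)
    request.httpMethod = "POST"
    request.setValue("application/json", forHTTPHeaderField: "Content-Type")
    // Worker builds .../models/gemini-2.5-flash:streamGenerateContent from `model`.
    let userTurn: [String: Any] = ["role": "user", "parts": [["text": prompt]]]
    let body: [String: Any] = [
        "model": "gemini-2.5-flash",
        "contents": [userTurn]
    ]
    request.httpBody = try JSONSerialization.data(withJSONObject: body)
    return request
}
```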

### Key Architecture Decisions
@@ -67,6 +76,7 @@ Worker vars: `ELEVENLABS_VOICE_ID`
| `BuddyAudioConversionSupport.swift` | ~108 | Audio conversion helpers. Converts live mic buffers to PCM16 mono audio and builds WAV payloads for upload-based providers. |
| `GlobalPushToTalkShortcutMonitor.swift` | ~132 | System-wide push-to-talk monitor. Owns the listen-only `CGEvent` tap and publishes press/release transitions. |
| `ClaudeAPI.swift` | ~291 | Claude vision API client with streaming (SSE) and non-streaming modes. TLS warmup optimization, image MIME detection, conversation history support. |
| `GeminiAPI.swift` | ~240 | Google Gemini vision API client. Mirrors `ClaudeAPI`'s public streaming signature so `CompanionManager` can swap providers transparently. Translates the Gemini-specific request shape (`contents`/`parts`/`inline_data`, `systemInstruction`, `role: "model"`) and parses Gemini's SSE events. Routes through the Worker `/chat-gemini` route — the model ID travels in the body and the Worker plugs it into the upstream URL path (see the translation sketch after this table). |
| `OpenAIAPI.swift` | ~142 | OpenAI GPT vision API client. |
| `ElevenLabsTTSClient.swift` | ~81 | ElevenLabs TTS client. Sends text to the Worker proxy, plays back audio via `AVAudioPlayer`. Exposes `isPlaying` for transient cursor scheduling. |
| `ElementLocationDetector.swift` | ~335 | Detects UI element locations in screenshots for cursor pointing. |
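
The request-shape translation described in the `GeminiAPI.swift` row can be sketched as follows. Field names follow the public Gemini REST API; the Swift structure itself is illustrative, not `GeminiAPI`'s actual code:

```swift
import Foundation

// Illustrative sketch of the Claude-to-Gemini request translation:
// assistant turns become role "model", the screenshot rides along as
// base64 `inline_data`, and the system prompt moves into `systemInstruction`.
func buildGeminiBody(systemPrompt: String,
                     history: [(user: String, assistant: String)],
                     userPrompt: String,
                     imageJPEG: Data) -> [String: Any] {
    var contents: [[String: Any]] = []
    for turn in history {
        contents.append(["role": "user", "parts": [["text": turn.user]]])
        // Gemini uses "model", not "assistant", for AI turns.
        contents.append(["role": "model", "parts": [["text": turn.assistant]]])
    }
    contents.append([
        "role": "user",
        "parts": [
            ["inline_data": ["mime_type": "image/jpeg",
                             "data": imageJPEG.base64EncodedString()]],
            ["text": userPrompt]
        ]
    ])
    return [
        "systemInstruction": ["parts": [["text": systemPrompt]]],
        "contents": contents
    ]
}
```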
@@ -98,6 +108,7 @@ npm install

# Add secrets
npx wrangler secret put ANTHROPIC_API_KEY
npx wrangler secret put GEMINI_API_KEY
npx wrangler secret put ASSEMBLYAI_API_KEY
npx wrangler secret put ELEVENLABS_API_KEY

88 changes: 75 additions & 13 deletions leanring-buddy/CompanionManager.swift
@@ -73,9 +73,27 @@ final class CompanionManager: ObservableObject {
private static let workerBaseURL = "https://your-worker-name.your-subdomain.workers.dev"

private lazy var claudeAPI: ClaudeAPI = {
return ClaudeAPI(proxyURL: "\(Self.workerBaseURL)/chat", model: selectedModel)
// Default to Sonnet when the current selection is a Gemini model so
// the Claude client ships with a valid Anthropic model ID even when
// Gemini is the active provider.
let initialClaudeModel = Self.isGeminiModelID(selectedModel) ? "claude-sonnet-4-6" : selectedModel
return ClaudeAPI(proxyURL: "\(Self.workerBaseURL)/chat", model: initialClaudeModel)
}()

private lazy var geminiAPI: GeminiAPI = {
// Default to Flash when the current selection is a Claude model so
// the Gemini client ships with a valid Gemini model ID even when
// Claude is the active provider.
let initialGeminiModel = Self.isGeminiModelID(selectedModel) ? selectedModel : "gemini-2.5-flash"
return GeminiAPI(proxyURL: "\(Self.workerBaseURL)/chat-gemini", model: initialGeminiModel)
}()

/// Returns true when the given model ID belongs to the Gemini provider.
/// Used throughout the manager to route requests to the correct client.
static func isGeminiModelID(_ modelID: String) -> Bool {
return modelID.hasPrefix("gemini")
}

private lazy var elevenLabsTTSClient: ElevenLabsTTSClient = {
return ElevenLabsTTSClient(proxyURL: "\(Self.workerBaseURL)/tts")
}()
@@ -107,13 +125,23 @@ final class CompanionManager: ObservableObject {
/// Used by the panel to show accurate status text ("Active" vs "Ready").
@Published private(set) var isOverlayVisible: Bool = false

/// The Claude model used for voice responses. Persisted to UserDefaults.
/// The model used for voice responses. May be a Claude ID (e.g. "claude-sonnet-4-6")
/// or a Gemini ID (e.g. "gemini-2.5-flash"). Persisted to UserDefaults.
/// The UserDefaults key is still "selectedClaudeModel" for backwards compatibility
/// with installs from before Gemini support existed.
@Published var selectedModel: String = UserDefaults.standard.string(forKey: "selectedClaudeModel") ?? "claude-sonnet-4-6"

func setSelectedModel(_ model: String) {
selectedModel = model
UserDefaults.standard.set(model, forKey: "selectedClaudeModel")
claudeAPI.model = model
// Route the new model ID to whichever provider owns it. We leave the
// other provider's model untouched — the next time the user flips back,
// that provider still remembers its previously-selected model.
if Self.isGeminiModelID(model) {
geminiAPI.model = model
} else {
claudeAPI.model = model
}
}

/// User preference for whether the Clicky cursor should be shown.
@@ -179,9 +207,13 @@ final class CompanionManager: ObservableObject {
bindVoiceStateObservation()
bindAudioPowerLevel()
bindShortcutTransitions()
// Eagerly touch the Claude API so its TLS warmup handshake completes
// well before the onboarding demo fires at ~40s into the video.
_ = claudeAPI
// Eagerly touch the active AI provider so its TLS warmup handshake
// completes well before the onboarding demo fires at ~40s into the video.
if Self.isGeminiModelID(selectedModel) {
_ = geminiAPI
} else {
_ = claudeAPI
}

// If the user already completed onboarding AND all permissions are
// still granted, show the cursor overlay immediately. If permissions
@@ -578,11 +610,40 @@ final class CompanionManager: ObservableObject {

// MARK: - AI Response Pipeline

/// Captures a screenshot, sends it along with the transcript to Claude,
/// and plays the response aloud via ElevenLabs TTS. The cursor stays in
/// the spinner/processing state until TTS audio begins playing.
/// Claude's response may include a [POINT:x,y:label] tag which triggers
/// the buddy to fly to that element on screen.
/// Dispatches a streaming vision request to whichever provider owns the
/// currently selected model. Both Claude and Gemini expose an identical
/// streaming signature, so call sites don't need to care which one runs.
private func runStreamingVisionRequest(
images: [(data: Data, label: String)],
systemPrompt: String,
conversationHistory: [(userPlaceholder: String, assistantResponse: String)],
userPrompt: String,
onTextChunk: @MainActor @Sendable (String) -> Void
) async throws -> (text: String, duration: TimeInterval) {
if Self.isGeminiModelID(selectedModel) {
return try await geminiAPI.analyzeImageStreaming(
images: images,
systemPrompt: systemPrompt,
conversationHistory: conversationHistory,
userPrompt: userPrompt,
onTextChunk: onTextChunk
)
} else {
return try await claudeAPI.analyzeImageStreaming(
images: images,
systemPrompt: systemPrompt,
conversationHistory: conversationHistory,
userPrompt: userPrompt,
onTextChunk: onTextChunk
)
}
}

/// Captures a screenshot, sends it along with the transcript to the
/// selected AI provider (Claude or Gemini), and plays the response aloud
/// via ElevenLabs TTS. The cursor stays in the spinner/processing state
/// until TTS audio begins playing. The response may include a
/// [POINT:x,y:label] tag which triggers the buddy to fly to that element.
private func sendTranscriptToClaudeWithScreenshot(transcript: String) {
currentResponseTask?.cancel()
elevenLabsTTSClient.stopPlayback()
@@ -610,7 +671,7 @@ final class CompanionManager: ObservableObject {
(userPlaceholder: entry.userTranscript, assistantResponse: entry.assistantResponse)
}

let (fullResponseText, _) = try await claudeAPI.analyzeImageStreaming(
let (fullResponseText, _) = try await runStreamingVisionRequest(
images: labeledImages,
systemPrompt: Self.companionVoiceResponseSystemPrompt,
conversationHistory: historyForAPI,
@@ -982,9 +1043,10 @@ final class CompanionManager: ObservableObject {
let dimensionInfo = " (image dimensions: \(cursorScreenCapture.screenshotWidthInPixels)x\(cursorScreenCapture.screenshotHeightInPixels) pixels)"
let labeledImages = [(data: cursorScreenCapture.imageData, label: cursorScreenCapture.label + dimensionInfo)]

let (fullResponseText, _) = try await claudeAPI.analyzeImageStreaming(
let (fullResponseText, _) = try await runStreamingVisionRequest(
images: labeledImages,
systemPrompt: Self.onboardingDemoSystemPrompt,
conversationHistory: [],
userPrompt: "look around my screen and find something interesting to point at",
onTextChunk: { _ in }
)
33 changes: 29 additions & 4 deletions leanring-buddy/CompanionPanelView.swift
@@ -599,16 +599,42 @@ struct CompanionPanelView: View {
// MARK: - Model Picker

private var modelPickerRow: some View {
// Two provider rows stacked vertically — Claude and Gemini. Four buttons
// in a single row would be too cramped at the menu bar panel's width.
VStack(alignment: .leading, spacing: 8) {
modelProviderRow(
providerLabel: "Claude",
options: [
(displayLabel: "Sonnet", modelID: "claude-sonnet-4-6"),
(displayLabel: "Opus", modelID: "claude-opus-4-6")
]
)
modelProviderRow(
providerLabel: "Gemini",
options: [
(displayLabel: "Flash", modelID: "gemini-2.5-flash"),
(displayLabel: "Pro", modelID: "gemini-2.5-pro")
]
)
}
.padding(.vertical, 4)
}

private func modelProviderRow(
providerLabel: String,
options: [(displayLabel: String, modelID: String)]
) -> some View {
HStack {
Text("Model")
Text(providerLabel)
.font(.system(size: 13, weight: .medium))
.foregroundColor(DS.Colors.textSecondary)

Spacer()

HStack(spacing: 0) {
modelOptionButton(label: "Sonnet", modelID: "claude-sonnet-4-6")
modelOptionButton(label: "Opus", modelID: "claude-opus-4-6")
ForEach(options, id: \.modelID) { option in
modelOptionButton(label: option.displayLabel, modelID: option.modelID)
}
}
.background(
RoundedRectangle(cornerRadius: 6, style: .continuous)
Expand All @@ -619,7 +645,6 @@ struct CompanionPanelView: View {
.stroke(DS.Colors.borderSubtle, lineWidth: 0.5)
)
}
.padding(.vertical, 4)
}

private func modelOptionButton(label: String, modelID: String) -> some View {