diff --git a/.gitignore b/.gitignore
index 832e80a1..c7496055 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,14 @@
worker/node_modules/
worker/.dev.vars
+worker/.secrets.local
.DS_Store
*.xcuserstate
build/
releases/
.claude/
coding-plans/
+
+# Windows / .NET build output
+windows/**/bin/
+windows/**/obj/
+*.user
diff --git a/AGENTS.md b/AGENTS.md
index 6946d441..f0b695df 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -5,16 +5,24 @@
## Overview
-macOS menu bar companion app. Lives entirely in the macOS status bar (no dock icon, no main window). Clicking the menu bar icon opens a custom floating panel with companion voice controls. Uses push-to-talk (ctrl+option) to capture voice input, transcribes it via AssemblyAI streaming, and sends the transcript + a screenshot of the user's screen to Claude. Claude responds with text (streamed via SSE) and voice (ElevenLabs TTS). A blue cursor overlay can fly to and point at UI elements Claude references on any connected monitor.
+Cross-platform companion app. Lives entirely in the OS's system tray / menu bar (no dock icon, no main window). Clicking the tray icon opens a custom floating panel with companion voice controls. Uses push-to-talk (ctrl+option on macOS, ctrl+alt on Windows) to capture voice input, transcribes it via AssemblyAI streaming, and sends the transcript + a screenshot of the user's screen to the active AI provider (Claude or Gemini). The AI responds with text (streamed via SSE) and voice (ElevenLabs TTS). A blue cursor overlay can fly to and point at UI elements the AI references on any connected monitor.
-All API keys live on a Cloudflare Worker proxy — nothing sensitive ships in the app.
+All API keys live on a Cloudflare Worker proxy — nothing sensitive ships in either app.
+
+## Repository layout
+
+| Folder | Purpose |
+|--------|---------|
+| `leanring-buddy/` + `leanring-buddy.xcodeproj` | macOS menu bar app. Swift + SwiftUI + AppKit. Ships first, most features live here. |
+| `windows/` | Windows port. C# + WPF on .NET 8. Currently at Milestone 1 (foundation). See `windows/README.md` for milestone progress. |
+| `worker/` | Cloudflare Worker proxy. Shared by both apps unchanged — same routes, same secrets. |
## Architecture
- **App Type**: Menu bar-only (`LSUIElement=true`), no dock icon or main window
- **Framework**: SwiftUI (macOS native) with AppKit bridging for menu bar panel and cursor overlay
- **Pattern**: MVVM with `@StateObject` / `@Published` state management
-- **AI Chat**: Claude (Sonnet 4.6 default, Opus 4.6 optional) via Cloudflare Worker proxy with SSE streaming
+- **AI Chat**: User-selectable provider — Claude (Sonnet 4.6 default, Opus 4.6 optional) or Gemini (2.5 Flash, 2.5 Pro). Both route through the Cloudflare Worker proxy with SSE streaming.
- **Speech-to-Text**: AssemblyAI real-time streaming (`u3-rt-pro` model) via websocket, with OpenAI and Apple Speech as fallbacks
- **Text-to-Speech**: ElevenLabs (`eleven_flash_v2_5` model) via Cloudflare Worker proxy
- **Screen Capture**: ScreenCaptureKit (macOS 14.2+), multi-monitor support
@@ -30,10 +38,11 @@ The app never calls external APIs directly. All requests go through a Cloudflare
| Route | Upstream | Purpose |
|-------|----------|---------|
| `POST /chat` | `api.anthropic.com/v1/messages` | Claude vision + streaming chat |
+| `POST /chat-gemini` | `generativelanguage.googleapis.com/v1beta/models/{model}:streamGenerateContent` | Gemini vision + streaming chat. The `model` field in the request body is used to build the upstream URL path. |
| `POST /tts` | `api.elevenlabs.io/v1/text-to-speech/{voiceId}` | ElevenLabs TTS audio |
| `POST /transcribe-token` | `streaming.assemblyai.com/v3/token` | Fetches a short-lived (480s) AssemblyAI websocket token |
-Worker secrets: `ANTHROPIC_API_KEY`, `ASSEMBLYAI_API_KEY`, `ELEVENLABS_API_KEY`
+Worker secrets: `ANTHROPIC_API_KEY`, `GEMINI_API_KEY`, `ASSEMBLYAI_API_KEY`, `ELEVENLABS_API_KEY`
Worker vars: `ELEVENLABS_VOICE_ID`
### Key Architecture Decisions
@@ -67,6 +76,7 @@ Worker vars: `ELEVENLABS_VOICE_ID`
| `BuddyAudioConversionSupport.swift` | ~108 | Audio conversion helpers. Converts live mic buffers to PCM16 mono audio and builds WAV payloads for upload-based providers. |
| `GlobalPushToTalkShortcutMonitor.swift` | ~132 | System-wide push-to-talk monitor. Owns the listen-only `CGEvent` tap and publishes press/release transitions. |
| `ClaudeAPI.swift` | ~291 | Claude vision API client with streaming (SSE) and non-streaming modes. TLS warmup optimization, image MIME detection, conversation history support. |
+| `GeminiAPI.swift` | ~273 | Google Gemini vision API client. Mirrors `ClaudeAPI`'s public streaming signature so `CompanionManager` can swap providers transparently. Translates the Gemini-specific request shape (`contents`/`parts`/`inline_data`, `systemInstruction`, `role: "model"`) and parses Gemini's SSE events. Routes through the Worker `/chat-gemini` route — the model ID travels in the body and the Worker plugs it into the upstream URL path. |
| `OpenAIAPI.swift` | ~142 | OpenAI GPT vision API client. |
| `ElevenLabsTTSClient.swift` | ~81 | ElevenLabs TTS client. Sends text to the Worker proxy, plays back audio via `AVAudioPlayer`. Exposes `isPlaying` for transient cursor scheduling. |
| `ElementLocationDetector.swift` | ~335 | Detects UI element locations in screenshots for cursor pointing. |
@@ -98,6 +108,7 @@ npm install
# Add secrets
npx wrangler secret put ANTHROPIC_API_KEY
+npx wrangler secret put GEMINI_API_KEY
npx wrangler secret put ASSEMBLYAI_API_KEY
npx wrangler secret put ELEVENLABS_API_KEY
diff --git a/leanring-buddy/CompanionManager.swift b/leanring-buddy/CompanionManager.swift
index 0234cf19..ccdb0eda 100644
--- a/leanring-buddy/CompanionManager.swift
+++ b/leanring-buddy/CompanionManager.swift
@@ -73,9 +73,27 @@ final class CompanionManager: ObservableObject {
private static let workerBaseURL = "https://your-worker-name.your-subdomain.workers.dev"
private lazy var claudeAPI: ClaudeAPI = {
- return ClaudeAPI(proxyURL: "\(Self.workerBaseURL)/chat", model: selectedModel)
+ // Default to Sonnet when the current selection is a Gemini model so
+ // the Claude client ships with a valid Anthropic model ID even when
+ // Gemini is the active provider.
+ let initialClaudeModel = Self.isGeminiModelID(selectedModel) ? "claude-sonnet-4-6" : selectedModel
+ return ClaudeAPI(proxyURL: "\(Self.workerBaseURL)/chat", model: initialClaudeModel)
}()
+ private lazy var geminiAPI: GeminiAPI = {
+ // Default to Flash when the current selection is a Claude model so
+ // the Gemini client ships with a valid Gemini model ID even when
+ // Claude is the active provider.
+ let initialGeminiModel = Self.isGeminiModelID(selectedModel) ? selectedModel : "gemini-2.5-flash"
+ return GeminiAPI(proxyURL: "\(Self.workerBaseURL)/chat-gemini", model: initialGeminiModel)
+ }()
+
+ /// Returns true when the given model ID belongs to the Gemini provider.
+ /// Used throughout the manager to route requests to the correct client.
+ static func isGeminiModelID(_ modelID: String) -> Bool {
+ return modelID.hasPrefix("gemini")
+ }
+
private lazy var elevenLabsTTSClient: ElevenLabsTTSClient = {
return ElevenLabsTTSClient(proxyURL: "\(Self.workerBaseURL)/tts")
}()
@@ -107,13 +125,23 @@ final class CompanionManager: ObservableObject {
/// Used by the panel to show accurate status text ("Active" vs "Ready").
@Published private(set) var isOverlayVisible: Bool = false
- /// The Claude model used for voice responses. Persisted to UserDefaults.
+ /// The model used for voice responses. May be a Claude ID (e.g. "claude-sonnet-4-6")
+ /// or a Gemini ID (e.g. "gemini-2.5-flash"). Persisted to UserDefaults.
+ /// The UserDefaults key is still "selectedClaudeModel" for backwards compatibility
+ /// with installs from before Gemini support existed.
@Published var selectedModel: String = UserDefaults.standard.string(forKey: "selectedClaudeModel") ?? "claude-sonnet-4-6"
func setSelectedModel(_ model: String) {
selectedModel = model
UserDefaults.standard.set(model, forKey: "selectedClaudeModel")
- claudeAPI.model = model
+ // Route the new model ID to whichever provider owns it. We leave the
+ // other provider's model untouched — the next time the user flips back,
+ // that provider still remembers its previously-selected model.
+ if Self.isGeminiModelID(model) {
+ geminiAPI.model = model
+ } else {
+ claudeAPI.model = model
+ }
}
/// User preference for whether the Clicky cursor should be shown.
@@ -179,9 +207,13 @@ final class CompanionManager: ObservableObject {
bindVoiceStateObservation()
bindAudioPowerLevel()
bindShortcutTransitions()
- // Eagerly touch the Claude API so its TLS warmup handshake completes
- // well before the onboarding demo fires at ~40s into the video.
- _ = claudeAPI
+ // Eagerly touch the active AI provider so its TLS warmup handshake
+ // completes well before the onboarding demo fires at ~40s into the video.
+ if Self.isGeminiModelID(selectedModel) {
+ _ = geminiAPI
+ } else {
+ _ = claudeAPI
+ }
// If the user already completed onboarding AND all permissions are
// still granted, show the cursor overlay immediately. If permissions
@@ -578,11 +610,40 @@ final class CompanionManager: ObservableObject {
// MARK: - AI Response Pipeline
- /// Captures a screenshot, sends it along with the transcript to Claude,
- /// and plays the response aloud via ElevenLabs TTS. The cursor stays in
- /// the spinner/processing state until TTS audio begins playing.
- /// Claude's response may include a [POINT:x,y:label] tag which triggers
- /// the buddy to fly to that element on screen.
+ /// Dispatches a streaming vision request to whichever provider owns the
+ /// currently selected model. Both Claude and Gemini expose an identical
+ /// streaming signature, so call sites don't need to care which one runs.
+ private func runStreamingVisionRequest(
+ images: [(data: Data, label: String)],
+ systemPrompt: String,
+ conversationHistory: [(userPlaceholder: String, assistantResponse: String)],
+ userPrompt: String,
+ onTextChunk: @MainActor @Sendable (String) -> Void
+ ) async throws -> (text: String, duration: TimeInterval) {
+ if Self.isGeminiModelID(selectedModel) {
+ return try await geminiAPI.analyzeImageStreaming(
+ images: images,
+ systemPrompt: systemPrompt,
+ conversationHistory: conversationHistory,
+ userPrompt: userPrompt,
+ onTextChunk: onTextChunk
+ )
+ } else {
+ return try await claudeAPI.analyzeImageStreaming(
+ images: images,
+ systemPrompt: systemPrompt,
+ conversationHistory: conversationHistory,
+ userPrompt: userPrompt,
+ onTextChunk: onTextChunk
+ )
+ }
+ }
+
+ /// Captures a screenshot, sends it along with the transcript to the
+ /// selected AI provider (Claude or Gemini), and plays the response aloud
+ /// via ElevenLabs TTS. The cursor stays in the spinner/processing state
+ /// until TTS audio begins playing. The response may include a
+ /// [POINT:x,y:label] tag which triggers the buddy to fly to that element.
private func sendTranscriptToClaudeWithScreenshot(transcript: String) {
currentResponseTask?.cancel()
elevenLabsTTSClient.stopPlayback()
@@ -610,7 +671,7 @@ final class CompanionManager: ObservableObject {
(userPlaceholder: entry.userTranscript, assistantResponse: entry.assistantResponse)
}
- let (fullResponseText, _) = try await claudeAPI.analyzeImageStreaming(
+ let (fullResponseText, _) = try await runStreamingVisionRequest(
images: labeledImages,
systemPrompt: Self.companionVoiceResponseSystemPrompt,
conversationHistory: historyForAPI,
@@ -982,9 +1043,10 @@ final class CompanionManager: ObservableObject {
let dimensionInfo = " (image dimensions: \(cursorScreenCapture.screenshotWidthInPixels)x\(cursorScreenCapture.screenshotHeightInPixels) pixels)"
let labeledImages = [(data: cursorScreenCapture.imageData, label: cursorScreenCapture.label + dimensionInfo)]
- let (fullResponseText, _) = try await claudeAPI.analyzeImageStreaming(
+ let (fullResponseText, _) = try await runStreamingVisionRequest(
images: labeledImages,
systemPrompt: Self.onboardingDemoSystemPrompt,
+ conversationHistory: [],
userPrompt: "look around my screen and find something interesting to point at",
onTextChunk: { _ in }
)
diff --git a/leanring-buddy/CompanionPanelView.swift b/leanring-buddy/CompanionPanelView.swift
index 76789b4c..d1d54c4d 100644
--- a/leanring-buddy/CompanionPanelView.swift
+++ b/leanring-buddy/CompanionPanelView.swift
@@ -599,16 +599,42 @@ struct CompanionPanelView: View {
// MARK: - Model Picker
private var modelPickerRow: some View {
+ // Two provider rows stacked vertically — Claude and Gemini. Four buttons
+ // in a single row would be too cramped in the menu bar panel width.
+ VStack(alignment: .leading, spacing: 8) {
+ modelProviderRow(
+ providerLabel: "Claude",
+ options: [
+ (displayLabel: "Sonnet", modelID: "claude-sonnet-4-6"),
+ (displayLabel: "Opus", modelID: "claude-opus-4-6")
+ ]
+ )
+ modelProviderRow(
+ providerLabel: "Gemini",
+ options: [
+ (displayLabel: "Flash", modelID: "gemini-2.5-flash"),
+ (displayLabel: "Pro", modelID: "gemini-2.5-pro")
+ ]
+ )
+ }
+ .padding(.vertical, 4)
+ }
+
+ private func modelProviderRow(
+ providerLabel: String,
+ options: [(displayLabel: String, modelID: String)]
+ ) -> some View {
HStack {
- Text("Model")
+ Text(providerLabel)
.font(.system(size: 13, weight: .medium))
.foregroundColor(DS.Colors.textSecondary)
Spacer()
HStack(spacing: 0) {
- modelOptionButton(label: "Sonnet", modelID: "claude-sonnet-4-6")
- modelOptionButton(label: "Opus", modelID: "claude-opus-4-6")
+ ForEach(options, id: \.modelID) { option in
+ modelOptionButton(label: option.displayLabel, modelID: option.modelID)
+ }
}
.background(
RoundedRectangle(cornerRadius: 6, style: .continuous)
@@ -619,7 +645,6 @@ struct CompanionPanelView: View {
.stroke(DS.Colors.borderSubtle, lineWidth: 0.5)
)
}
- .padding(.vertical, 4)
}
private func modelOptionButton(label: String, modelID: String) -> some View {
diff --git a/leanring-buddy/GeminiAPI.swift b/leanring-buddy/GeminiAPI.swift
new file mode 100644
index 00000000..1dc5e92f
--- /dev/null
+++ b/leanring-buddy/GeminiAPI.swift
@@ -0,0 +1,273 @@
+//
+// GeminiAPI.swift
+// Google Gemini API Implementation with streaming support
+//
+// Mirrors ClaudeAPI's public interface so CompanionManager can route to
+// either provider without the caller caring which one is active. The
+// request/response translation layer is Gemini-specific (different field
+// names, different SSE event shape, different role vocabulary).
+//
+
+import Foundation
+
+/// Gemini API helper with streaming for progressive text display.
+/// Routes through the Cloudflare Worker proxy so the Gemini API key never
+/// ships in the app.
+class GeminiAPI {
+ private static let tlsWarmupLock = NSLock()
+ private static var hasStartedTLSWarmup = false
+
+ private let apiURL: URL
+ var model: String
+ private let session: URLSession
+
+ init(proxyURL: String, model: String = "gemini-2.5-flash") {
+ self.apiURL = URL(string: proxyURL)!
+ self.model = model
+
+ // Use .default instead of .ephemeral so TLS session tickets are cached.
+ // Ephemeral sessions do a full TLS handshake on every request, which causes
+ // transient -1200 (errSSLPeerHandshakeFail) errors with large image payloads.
+ // Disable URL/cookie caching to avoid storing responses or credentials on disk.
+ let config = URLSessionConfiguration.default
+ config.timeoutIntervalForRequest = 120
+ config.timeoutIntervalForResource = 300
+ config.waitsForConnectivity = true
+ config.urlCache = nil
+ config.httpCookieStorage = nil
+ self.session = URLSession(configuration: config)
+
+ // Fire a lightweight HEAD request in the background to pre-establish the TLS
+ // connection. This caches the TLS session ticket so the first real API call
+ // (which carries a large image payload) doesn't need a cold TLS handshake.
+ warmUpTLSConnectionIfNeeded()
+ }
+
+ private func makeAPIRequest() -> URLRequest {
+ var request = URLRequest(url: apiURL)
+ request.httpMethod = "POST"
+ request.timeoutInterval = 120
+ request.setValue("application/json", forHTTPHeaderField: "Content-Type")
+ return request
+ }
+
+ /// Detects the MIME type of image data by inspecting the first bytes.
+ /// Screen captures from ScreenCaptureKit are JPEG, but pasted images from the
+ /// clipboard are PNG. Gemini rejects requests where the declared mime_type
+ /// doesn't match the actual image format.
+ private func detectImageMediaType(for imageData: Data) -> String {
+ // PNG files start with the 8-byte signature: 89 50 4E 47 0D 0A 1A 0A
+ if imageData.count >= 4 {
+ let pngSignature: [UInt8] = [0x89, 0x50, 0x4E, 0x47]
+ let firstFourBytes = [UInt8](imageData.prefix(4))
+ if firstFourBytes == pngSignature {
+ return "image/png"
+ }
+ }
+ // Default to JPEG — screen captures use JPEG compression
+ return "image/jpeg"
+ }
+
+ /// Sends a no-op HEAD request to the Worker to establish and cache a TLS session.
+ /// Failures are silently ignored — this is purely an optimization.
+ private func warmUpTLSConnectionIfNeeded() {
+ Self.tlsWarmupLock.lock()
+ let shouldStartTLSWarmup = !Self.hasStartedTLSWarmup
+ if shouldStartTLSWarmup {
+ Self.hasStartedTLSWarmup = true
+ }
+ Self.tlsWarmupLock.unlock()
+
+ guard shouldStartTLSWarmup else { return }
+
+ guard var warmupURLComponents = URLComponents(url: apiURL, resolvingAgainstBaseURL: false) else {
+ return
+ }
+
+ warmupURLComponents.path = "/"
+ warmupURLComponents.query = nil
+ warmupURLComponents.fragment = nil
+
+ guard let warmupURL = warmupURLComponents.url else {
+ return
+ }
+
+ var warmupRequest = URLRequest(url: warmupURL)
+ warmupRequest.httpMethod = "HEAD"
+ warmupRequest.timeoutInterval = 10
+ session.dataTask(with: warmupRequest) { _, _, _ in
+ // Response doesn't matter — the TLS handshake is the goal
+ }.resume()
+ }
+
+ /// Builds the Gemini-shaped request body for a vision + streaming call.
+ /// Gemini uses `contents` with `parts` (text + inline_data), a separate
+ /// `systemInstruction` field, and "model" as the assistant role.
+ private func buildGeminiRequestBody(
+ images: [(data: Data, label: String)],
+ systemPrompt: String,
+ conversationHistory: [(userPlaceholder: String, assistantResponse: String)],
+ userPrompt: String,
+ maxOutputTokens: Int
+ ) -> [String: Any] {
+ var contents: [[String: Any]] = []
+
+ for (userPlaceholder, assistantResponse) in conversationHistory {
+ contents.append([
+ "role": "user",
+ "parts": [["text": userPlaceholder]]
+ ])
+ contents.append([
+ "role": "model",
+ "parts": [["text": assistantResponse]]
+ ])
+ }
+
+ // Build current turn with all labeled images + prompt
+ var currentTurnParts: [[String: Any]] = []
+ for image in images {
+ currentTurnParts.append([
+ "inline_data": [
+ "mime_type": detectImageMediaType(for: image.data),
+ "data": image.data.base64EncodedString()
+ ]
+ ])
+ currentTurnParts.append([
+ "text": image.label
+ ])
+ }
+ currentTurnParts.append([
+ "text": userPrompt
+ ])
+ contents.append([
+ "role": "user",
+ "parts": currentTurnParts
+ ])
+
+ // `model` is forwarded to the Worker, which pulls it out and plugs it
+ // into the upstream Gemini URL path — Gemini itself doesn't read it.
+ return [
+ "model": model,
+ "systemInstruction": [
+ "parts": [["text": systemPrompt]]
+ ],
+ "contents": contents,
+ "generationConfig": [
+ "maxOutputTokens": maxOutputTokens
+ ]
+ ]
+ }
+
+ /// Send a vision request to Gemini with streaming.
+    /// Calls `onTextChunk` on the main actor each time new text arrives, passing the full accumulated text so far (not just the delta), so the UI updates progressively.
+ /// Returns the full accumulated text and total duration when the stream completes.
+ func analyzeImageStreaming(
+ images: [(data: Data, label: String)],
+ systemPrompt: String,
+ conversationHistory: [(userPlaceholder: String, assistantResponse: String)] = [],
+ userPrompt: String,
+ onTextChunk: @MainActor @Sendable (String) -> Void
+ ) async throws -> (text: String, duration: TimeInterval) {
+ let startTime = Date()
+
+ var request = makeAPIRequest()
+
+ let body = buildGeminiRequestBody(
+ images: images,
+ systemPrompt: systemPrompt,
+ conversationHistory: conversationHistory,
+ userPrompt: userPrompt,
+ maxOutputTokens: 1024
+ )
+
+ let bodyData = try JSONSerialization.data(withJSONObject: body)
+ request.httpBody = bodyData
+ let payloadMB = Double(bodyData.count) / 1_048_576.0
+ print("🌐 Gemini streaming request (\(model)): \(String(format: "%.1f", payloadMB))MB, \(images.count) image(s)")
+
+ // Use bytes streaming for SSE (Server-Sent Events)
+ let (byteStream, response) = try await session.bytes(for: request)
+
+ guard let httpResponse = response as? HTTPURLResponse else {
+ throw NSError(
+ domain: "GeminiAPI",
+ code: -1,
+ userInfo: [NSLocalizedDescriptionKey: "Invalid HTTP response"]
+ )
+ }
+
+ // If non-2xx status, read the full body as error text
+ guard (200...299).contains(httpResponse.statusCode) else {
+ var errorBodyChunks: [String] = []
+ for try await line in byteStream.lines {
+ errorBodyChunks.append(line)
+ }
+ let errorBody = errorBodyChunks.joined(separator: "\n")
+ throw NSError(
+ domain: "GeminiAPI",
+ code: httpResponse.statusCode,
+ userInfo: [NSLocalizedDescriptionKey: "API Error (\(httpResponse.statusCode)): \(errorBody)"]
+ )
+ }
+
+ // Parse SSE stream — each event is "data: {json}\n\n".
+ // Gemini sends one event per chunk with shape:
+ // { "candidates": [ { "content": { "parts": [ {"text": "..."} ], "role": "model" } } ] }
+ var accumulatedResponseText = ""
+
+ for try await line in byteStream.lines {
+ guard line.hasPrefix("data: ") else { continue }
+ let jsonString = String(line.dropFirst(6))
+
+ // Gemini doesn't send an explicit [DONE] marker, but handle it defensively
+ guard jsonString != "[DONE]" else { break }
+
+ guard let jsonData = jsonString.data(using: .utf8),
+ let eventPayload = try? JSONSerialization.jsonObject(with: jsonData) as? [String: Any] else {
+ continue
+ }
+
+ // Extract text from candidates[0].content.parts[*].text
+ guard let candidates = eventPayload["candidates"] as? [[String: Any]],
+ let firstCandidate = candidates.first,
+ let content = firstCandidate["content"] as? [String: Any],
+ let parts = content["parts"] as? [[String: Any]] else {
+ continue
+ }
+
+ var chunkText = ""
+ for part in parts {
+ if let partText = part["text"] as? String {
+ chunkText += partText
+ }
+ }
+
+ if !chunkText.isEmpty {
+ accumulatedResponseText += chunkText
+ let currentAccumulatedText = accumulatedResponseText
+ await onTextChunk(currentAccumulatedText)
+ }
+ }
+
+ let duration = Date().timeIntervalSince(startTime)
+ return (text: accumulatedResponseText, duration: duration)
+ }
+
+ /// Non-streaming fallback for validation requests where we don't need progressive display.
+ /// Uses the same streaming endpoint internally — Gemini returns the full result via SSE
+ /// and we simply accumulate it before returning. This keeps the Worker route surface small.
+ func analyzeImage(
+ images: [(data: Data, label: String)],
+ systemPrompt: String,
+ conversationHistory: [(userPlaceholder: String, assistantResponse: String)] = [],
+ userPrompt: String
+ ) async throws -> (text: String, duration: TimeInterval) {
+ return try await analyzeImageStreaming(
+ images: images,
+ systemPrompt: systemPrompt,
+ conversationHistory: conversationHistory,
+ userPrompt: userPrompt,
+ onTextChunk: { _ in }
+ )
+ }
+}
diff --git a/windows/Clicky.sln b/windows/Clicky.sln
new file mode 100644
index 00000000..09a641ec
--- /dev/null
+++ b/windows/Clicky.sln
@@ -0,0 +1,21 @@
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 17
+VisualStudioVersion = 17.9.34622.214
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Clicky", "Clicky\Clicky.csproj", "{B1F9B6C0-5B7E-4D3A-8E4D-1A2B3C4D5E6F}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Any CPU = Debug|Any CPU
+ Release|Any CPU = Release|Any CPU
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {B1F9B6C0-5B7E-4D3A-8E4D-1A2B3C4D5E6F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {B1F9B6C0-5B7E-4D3A-8E4D-1A2B3C4D5E6F}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {B1F9B6C0-5B7E-4D3A-8E4D-1A2B3C4D5E6F}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {B1F9B6C0-5B7E-4D3A-8E4D-1A2B3C4D5E6F}.Release|Any CPU.Build.0 = Release|Any CPU
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
diff --git a/windows/Clicky/App.xaml b/windows/Clicky/App.xaml
new file mode 100644
index 00000000..2be35400
--- /dev/null
+++ b/windows/Clicky/App.xaml
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
diff --git a/windows/Clicky/App.xaml.cs b/windows/Clicky/App.xaml.cs
new file mode 100644
index 00000000..f2f07460
--- /dev/null
+++ b/windows/Clicky/App.xaml.cs
@@ -0,0 +1,251 @@
+using System.IO;
+using System.Windows;
+using System.Windows.Media;
+using System.Windows.Media.Imaging;
+using H.NotifyIcon;
+using Clicky.Interop;
+using Clicky.Services;
+using Clicky.ViewModels;
+using Clicky.Views;
+
+namespace Clicky;
+
+/// <summary>
+/// WPF application entry. Boots the tray icon, wires the popover panel,
+/// installs the global push-to-talk hotkey, and holds the root AppState
+/// for the app's lifetime.
+///
+/// This is the Windows analog of the macOS CompanionAppDelegate +
+/// MenuBarPanelManager combination (leanring_buddyApp.swift + MenuBarPanelManager.swift).
+/// </summary>
+public partial class App : Application
+{
+ // Keep singletons alive for the app's lifetime. No DI container in M1 —
+ // the dependency graph is small enough to thread manually.
+ private Mutex? _singleInstanceMutex;
+ private SettingsService? _settingsService;
+ private AppState? _appState;
+ private GlobalHotkeyService? _globalHotkeyService;
+ private TaskbarIcon? _trayIcon;
+ private TrayPanelWindow? _trayPanelWindow;
+ private TrayPanelViewModel? _trayPanelViewModel;
+ private VoicePipelineOrchestrator? _voicePipelineOrchestrator;
+ private OverlayWindowManager? _overlayWindowManager;
+
+ protected override void OnStartup(StartupEventArgs eventArgs)
+ {
+ base.OnStartup(eventArgs);
+
+ if (!TryAcquireSingleInstanceMutex())
+ {
+ // Another instance is already running. Exit quietly — no error
+ // dialog, so double-clicks from the Start menu are benign.
+ Shutdown();
+ return;
+ }
+
+ _settingsService = new SettingsService();
+ _appState = new AppState(_settingsService);
+ _trayPanelViewModel = new TrayPanelViewModel(_appState);
+ _trayPanelWindow = new TrayPanelWindow(_trayPanelViewModel);
+
+ // PostHog setup — idempotent, silent no-op until the write key in
+ // WorkerConfig.cs is replaced with a real project key. Fires
+ // app_opened on success.
+ ClickyAnalytics.Configure(_settingsService.AnalyticsDistinctId);
+
+ InstallTrayIcon();
+ InstallGlobalHotkey();
+
+ // The overlay windows are created after the tray is up so nothing
+ // flashes in an uninitialized state. Transparent + click-through, so
+ // their presence is invisible to the desktop beneath. The voice
+ // pipeline takes a reference so the [POINT:…] tag on each reply can
+ // fire an element-pointing flight before TTS speaks the text.
+ _overlayWindowManager = new OverlayWindowManager(_appState, Dispatcher);
+ _overlayWindowManager.Start();
+
+ _voicePipelineOrchestrator = new VoicePipelineOrchestrator(_appState, Dispatcher, _overlayWindowManager);
+
+ // First-run onboarding: if the user hasn't completed it, auto-open
+ // the panel on a centered position so the very first launch shows
+ // the welcome copy instead of a silent tray icon. Also probe the
+ // microphone so a disabled capture endpoint is surfaced before the
+ // first push-to-talk attempt.
+ ProbeMicrophoneAvailabilityAndUpdateState();
+ if (!_appState.HasCompletedOnboarding)
+ {
+ _trayPanelWindow.ShowPanelCenteredOnPrimaryScreen();
+ ClickyAnalytics.TrackOnboardingStarted();
+ }
+ }
+
+ private void ProbeMicrophoneAvailabilityAndUpdateState()
+ {
+ if (_appState is null) return;
+ var hasMic = MicrophonePermissionHelper.HasActiveCaptureDevice();
+ _appState.IsMicrophonePermissionIssue = !hasMic;
+ if (!hasMic)
+ {
+ _appState.LastStatusMessage =
+ "Microphone appears to be off or blocked. Open Windows privacy settings to enable it.";
+ ClickyAnalytics.TrackPermissionDenied("microphone");
+ }
+ else
+ {
+ ClickyAnalytics.TrackPermissionGranted("microphone");
+ }
+ }
+
+ protected override void OnExit(ExitEventArgs eventArgs)
+ {
+ _overlayWindowManager?.Dispose();
+ _globalHotkeyService?.Dispose();
+ _trayIcon?.Dispose();
+ // Orchestrator owns mic/websocket/TTS — dispose synchronously so
+ // their background threads are joined before the process exits.
+ if (_voicePipelineOrchestrator is not null)
+ {
+ _voicePipelineOrchestrator.DisposeAsync().AsTask().GetAwaiter().GetResult();
+ }
+ _singleInstanceMutex?.ReleaseMutex();
+ _singleInstanceMutex?.Dispose();
+ base.OnExit(eventArgs);
+ }
+
+ private bool TryAcquireSingleInstanceMutex()
+ {
+ // Per-user mutex — two different users on the same machine can each
+ // run their own Clicky instance without colliding.
+ var mutexName = $"Local\\Clicky.SingleInstance.{Environment.UserName}";
+ _singleInstanceMutex = new Mutex(initiallyOwned: true, name: mutexName, createdNew: out var createdNew);
+ return createdNew;
+ }
+
+ private void InstallTrayIcon()
+ {
+ // H.NotifyIcon's IconSource (ImageSource) path can't reliably consume
+ // a programmatically-rendered bitmap (it tries to round-trip via a
+ // BitmapImage.UriSource it never has). The Icon property accepts a
+ // System.Drawing.Icon directly and bypasses that whole conversion,
+ // so we generate or load a real Win32 icon instead.
+ _trayIcon = new TaskbarIcon
+ {
+ ToolTipText = "Clicky - hold Ctrl+Alt to talk",
+ Icon = LoadTrayIcon(),
+ // No built-in context menu - left- and right-click both open the
+ // custom popover. Quit lives inside the panel.
+ NoLeftClickDelay = true,
+ };
+
+ _trayIcon.TrayLeftMouseUp += (_, _) => ToggleTrayPanel();
+ _trayIcon.TrayRightMouseUp += (_, _) => ToggleTrayPanel();
+
+ _trayIcon.ForceCreate();
+ }
+
+ private void InstallGlobalHotkey()
+ {
+ _globalHotkeyService = new GlobalHotkeyService();
+ _globalHotkeyService.ShortcutPressed += OnPushToTalkPressed;
+ _globalHotkeyService.ShortcutReleased += OnPushToTalkReleased;
+ _globalHotkeyService.Start();
+ }
+
+ private void OnPushToTalkPressed(object? sender, EventArgs eventArgs)
+ {
+ // Panel shouldn't stay visible while the user is talking to the
+ // app — dismiss it if it happens to be open.
+ Dispatcher.BeginInvoke(() => _trayPanelWindow?.HidePanel());
+
+ // The orchestrator owns the state transitions (Listening / Processing
+ // / Responding / Idle) from here. Swallow exceptions — the
+ // orchestrator reports them via AppState.LastStatusMessage.
+ _ = _voicePipelineOrchestrator?.HandlePushToTalkPressedAsync();
+ }
+
+ private void OnPushToTalkReleased(object? sender, EventArgs eventArgs)
+ {
+ _ = _voicePipelineOrchestrator?.HandlePushToTalkReleasedAsync();
+ }
+
+ private void ToggleTrayPanel()
+ {
+ if (_trayPanelWindow is null) return;
+
+ if (_trayPanelWindow.IsVisible)
+ {
+ _trayPanelWindow.HidePanel();
+ return;
+ }
+
+ NativeMethods.GetCursorPos(out var cursorPositionDevicePixels);
+ _trayPanelWindow.ShowNearTrayCursor(
+ cursorPositionDevicePixels.X,
+ cursorPositionDevicePixels.Y);
+ }
+
+    /// <summary>
+    /// Loads the tray icon from the bundled resource. Falls back to a
+    /// generated blue-dot placeholder if the resource is missing so the app
+    /// is runnable before an artist drops a real .ico in.
+    /// </summary>
+ private static System.Drawing.Icon LoadTrayIcon()
+ {
+ try
+ {
+ var packIconUri = new Uri("pack://application:,,,/Resources/clicky-tray.ico", UriKind.Absolute);
+ var packResource = GetResourceStream(packIconUri);
+ if (packResource?.Stream is not null)
+ {
+ using var iconStream = packResource.Stream;
+ return new System.Drawing.Icon(iconStream);
+ }
+ }
+ catch
+ {
+ // Fall through to the generated placeholder.
+ }
+
+ return CreatePlaceholderBlueDotIcon();
+ }
+
+    /// <summary>
+    /// Builds a 32x32 transparent-background blue dot Icon using GDI so
+    /// H.NotifyIcon can take it directly. Used when no real clicky-tray.ico
+    /// resource has been bundled.
+    /// </summary>
+ private static System.Drawing.Icon CreatePlaceholderBlueDotIcon()
+ {
+ const int iconPixelSize = 32;
+ const int iconPadding = 6;
+
+ using var bitmap = new System.Drawing.Bitmap(
+ iconPixelSize,
+ iconPixelSize,
+ System.Drawing.Imaging.PixelFormat.Format32bppArgb);
+
+ using (var graphics = System.Drawing.Graphics.FromImage(bitmap))
+ {
+ graphics.SmoothingMode = System.Drawing.Drawing2D.SmoothingMode.AntiAlias;
+ graphics.Clear(System.Drawing.Color.Transparent);
+
+ using var overlayCursorBlueBrush = new System.Drawing.SolidBrush(
+ System.Drawing.Color.FromArgb(0xFF, 0x33, 0x80, 0xFF));
+
+ graphics.FillEllipse(
+ overlayCursorBlueBrush,
+ iconPadding,
+ iconPadding,
+ iconPixelSize - (iconPadding * 2),
+ iconPixelSize - (iconPadding * 2));
+ }
+
+ // GetHicon hands ownership of the HICON to us; FromHandle doesn't take
+ // ownership, so we'd normally have to clean it up. The TaskbarIcon
+ // keeps this Icon for the lifetime of the app, so the leak is bounded
+        // to a single 32x32 icon handle.
+ var hIcon = bitmap.GetHicon();
+ return (System.Drawing.Icon)System.Drawing.Icon.FromHandle(hIcon).Clone();
+ }
+}
diff --git a/windows/Clicky/AppState.cs b/windows/Clicky/AppState.cs
new file mode 100644
index 00000000..b7e58329
--- /dev/null
+++ b/windows/Clicky/AppState.cs
@@ -0,0 +1,110 @@
+using CommunityToolkit.Mvvm.ComponentModel;
+using Clicky.Services;
+
+namespace Clicky;
+
+/// <summary>
+/// Root observable state for the entire Windows app. The C# analog of the
+/// macOS CompanionManager (leanring-buddy/CompanionManager.swift). Milestone 1
+/// holds only the persisted preferences and the voice-state enum; later
+/// milestones attach the screen-capture, dictation, and AI-chat services.
+/// </summary>
+public sealed partial class AppState : ObservableObject
+{
+    private readonly SettingsService _settingsService;
+
+    public AppState(SettingsService settingsService)
+    {
+        _settingsService = settingsService;
+        _selectedModelId = settingsService.SelectedModelId;
+        _isClickyCursorEnabled = settingsService.IsClickyCursorEnabled;
+        _hasCompletedOnboarding = settingsService.HasCompletedOnboarding;
+    }
+
+    // ---- Voice pipeline state (populated by later milestones) ----
+
+    public enum VoiceState
+    {
+        Idle,
+        Listening,
+        Processing,
+        Responding,
+    }
+
+    [ObservableProperty]
+    private VoiceState _currentVoiceState = VoiceState.Idle;
+
+    /// <summary>
+    /// Live-updating transcript while the user holds push-to-talk. Shows
+    /// partials as they arrive from AssemblyAI and the finalized text once
+    /// the shortcut releases. Cleared at the start of each session.
+    /// </summary>
+    [ObservableProperty]
+    private string _liveTranscript = string.Empty;
+
+    /// <summary>
+    /// Streaming response text from the active AI provider. Appended to
+    /// as SSE chunks arrive so the panel can show the answer forming in
+    /// real time.
+    /// </summary>
+    [ObservableProperty]
+    private string _streamedResponseText = string.Empty;
+
+    /// <summary>
+    /// Latest error/status message surfaced from any pipeline component.
+    /// The panel shows it in the tertiary footer row when present.
+    /// </summary>
+    [ObservableProperty]
+    private string _lastStatusMessage = string.Empty;
+
+    /// <summary>
+    /// Set when microphone access is blocked or unavailable. The tray panel
+    /// shows a "Open privacy settings" shortcut when this is true so the
+    /// user can fix the permission in one click.
+    /// </summary>
+    [ObservableProperty]
+    private bool _isMicrophonePermissionIssue;
+
+    // ---- Persisted preferences ----
+
+    [ObservableProperty]
+    private string _selectedModelId;
+
+    partial void OnSelectedModelIdChanged(string value)
+    {
+        _settingsService.SelectedModelId = value;
+
+        // IsCurrentModelGemini derives from SelectedModelId but is a plain
+        // computed property, so it raises no change notification of its own;
+        // re-notify here so bindings on it don't go stale.
+        OnPropertyChanged(nameof(IsCurrentModelGemini));
+    }
+
+    [ObservableProperty]
+    private bool _isClickyCursorEnabled;
+
+    partial void OnIsClickyCursorEnabledChanged(bool value)
+    {
+        _settingsService.IsClickyCursorEnabled = value;
+    }
+
+    [ObservableProperty]
+    private bool _hasCompletedOnboarding;
+
+    partial void OnHasCompletedOnboardingChanged(bool value)
+    {
+        _settingsService.HasCompletedOnboarding = value;
+    }
+
+    // ---- Model routing helpers (mirror CompanionManager.isGeminiModelID) ----
+
+    /// <summary>
+    /// Returns true when the given model ID belongs to the Gemini provider.
+    /// Used by later milestones to route vision requests to the right client.
+    /// </summary>
+    public static bool IsGeminiModelId(string modelId) =>
+        modelId.StartsWith("gemini", StringComparison.OrdinalIgnoreCase);
+
+    /// <summary>True when the currently selected model routes to Gemini.</summary>
+    public bool IsCurrentModelGemini => IsGeminiModelId(SelectedModelId);
+}
diff --git a/windows/Clicky/Clicky.csproj b/windows/Clicky/Clicky.csproj
new file mode 100644
index 00000000..193648fd
--- /dev/null
+++ b/windows/Clicky/Clicky.csproj
@@ -0,0 +1,30 @@
+
+
+
+ WinExe
+
+ net8.0-windows10.0.19041.0
+ 10.0.17763.0
+ Clicky
+ Clicky
+ enable
+ enable
+ true
+ app.manifest
+
+ PerMonitorV2
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/windows/Clicky/Interop/NativeMethods.cs b/windows/Clicky/Interop/NativeMethods.cs
new file mode 100644
index 00000000..4970cb3c
--- /dev/null
+++ b/windows/Clicky/Interop/NativeMethods.cs
@@ -0,0 +1,230 @@
+using System.Runtime.InteropServices;
+using System.Windows;
+
+namespace Clicky.Interop;
+
+///
+/// Win32 P/Invoke surface. Grouped here so the rest of the app can stay
+/// managed-code-only. Each method is documented with the underlying Win32
+/// function it wraps.
+///
+internal static class NativeMethods
+{
+ // ---- Extended window style bits used by the panel + overlay ----
+ public const int GWL_EXSTYLE = -20;
+ public const int WS_EX_TRANSPARENT = 0x00000020;
+ public const int WS_EX_TOOLWINDOW = 0x00000080;
+ public const int WS_EX_LAYERED = 0x00080000;
+ public const int WS_EX_NOACTIVATE = 0x08000000;
+
+ // ---- SetWindowPos flags (used for non-activating positioning) ----
+ public static readonly IntPtr HWND_TOPMOST = new(-1);
+ public const uint SWP_NOSIZE = 0x0001;
+ public const uint SWP_NOMOVE = 0x0002;
+ public const uint SWP_NOACTIVATE = 0x0010;
+ public const uint SWP_SHOWWINDOW = 0x0040;
+
+ // ---- AppBar query for the Windows taskbar bounds ----
+ public const uint ABM_GETTASKBARPOS = 0x00000005;
+
+ [DllImport("user32.dll", SetLastError = true)]
+ [return: MarshalAs(UnmanagedType.Bool)]
+ public static extern bool SetWindowPos(
+ IntPtr hWnd,
+ IntPtr hWndInsertAfter,
+ int X,
+ int Y,
+ int cx,
+ int cy,
+ uint uFlags);
+
+ // 32-bit and 64-bit variants of GetWindowLong / SetWindowLong. The correct
+ // one is selected at runtime by GetExtendedStyle / SetExtendedStyle below.
+ [DllImport("user32.dll", EntryPoint = "GetWindowLong")]
+ private static extern int GetWindowLong32(IntPtr hWnd, int nIndex);
+
+ [DllImport("user32.dll", EntryPoint = "GetWindowLongPtr")]
+ private static extern IntPtr GetWindowLongPtr64(IntPtr hWnd, int nIndex);
+
+ [DllImport("user32.dll", EntryPoint = "SetWindowLong")]
+ private static extern int SetWindowLong32(IntPtr hWnd, int nIndex, int dwNewLong);
+
+ [DllImport("user32.dll", EntryPoint = "SetWindowLongPtr")]
+ private static extern IntPtr SetWindowLongPtr64(IntPtr hWnd, int nIndex, IntPtr dwNewLong);
+
+    public static int GetExtendedStyle(IntPtr hWnd)
+    {
+        // Pointer width tells us which user32 export is the real function
+        // on this process (the Ptr variant is only exported on 64-bit).
+        if (IntPtr.Size == 8)
+        {
+            return (int)GetWindowLongPtr64(hWnd, GWL_EXSTYLE);
+        }
+        return GetWindowLong32(hWnd, GWL_EXSTYLE);
+    }
+
+    public static void SetExtendedStyle(IntPtr hWnd, int newStyle)
+    {
+        // Mirror of GetExtendedStyle: pick the pointer-sized export on
+        // 64-bit processes, the 32-bit export otherwise.
+        if (IntPtr.Size == 8)
+        {
+            _ = SetWindowLongPtr64(hWnd, GWL_EXSTYLE, new IntPtr(newStyle));
+            return;
+        }
+        _ = SetWindowLong32(hWnd, GWL_EXSTYLE, newStyle);
+    }
+
+ // ---- Taskbar position (used to anchor the panel near the tray icon) ----
+
+ [StructLayout(LayoutKind.Sequential)]
+ public struct RECT
+ {
+ public int Left;
+ public int Top;
+ public int Right;
+ public int Bottom;
+
+ public int Width => Right - Left;
+ public int Height => Bottom - Top;
+ }
+
+    /// <summary>
+    /// Managed mirror of the Win32 APPBARDATA struct passed to
+    /// <see cref="SHAppBarMessage"/>. Callers must set <c>cbSize</c> to the
+    /// marshaled size of this struct before the call.
+    /// </summary>
+    [StructLayout(LayoutKind.Sequential)]
+    public struct APPBARDATA
+    {
+        public uint cbSize;
+        public IntPtr hWnd;
+        public uint uCallbackMessage;
+        public uint uEdge;
+        public RECT rc;
+        // LPARAM is pointer-sized in Win32. Declaring this field as `int`
+        // made the marshaled struct 4 bytes short on x64, so the cbSize we
+        // pass to SHAppBarMessage disagreed with the native layout.
+        public IntPtr lParam;
+    }
+
+ [DllImport("shell32.dll", CallingConvention = CallingConvention.StdCall)]
+ public static extern IntPtr SHAppBarMessage(uint dwMessage, ref APPBARDATA pData);
+
+ // ---- Low-level keyboard hook (push-to-talk hotkey detection) ----
+
+ public const int WH_KEYBOARD_LL = 13;
+ public const int WM_KEYDOWN = 0x0100;
+ public const int WM_KEYUP = 0x0101;
+ public const int WM_SYSKEYDOWN = 0x0104;
+ public const int WM_SYSKEYUP = 0x0105;
+
+ [StructLayout(LayoutKind.Sequential)]
+ public struct KBDLLHOOKSTRUCT
+ {
+ public uint vkCode;
+ public uint scanCode;
+ public uint flags;
+ public uint time;
+ public UIntPtr dwExtraInfo;
+ }
+
+ public delegate IntPtr LowLevelKeyboardProc(int nCode, IntPtr wParam, IntPtr lParam);
+
+ [DllImport("user32.dll", CharSet = CharSet.Auto, SetLastError = true)]
+ public static extern IntPtr SetWindowsHookEx(int idHook, LowLevelKeyboardProc lpfn, IntPtr hMod, uint dwThreadId);
+
+ [DllImport("user32.dll", CharSet = CharSet.Auto, SetLastError = true)]
+ [return: MarshalAs(UnmanagedType.Bool)]
+ public static extern bool UnhookWindowsHookEx(IntPtr hhk);
+
+ [DllImport("user32.dll", CharSet = CharSet.Auto, SetLastError = true)]
+ public static extern IntPtr CallNextHookEx(IntPtr hhk, int nCode, IntPtr wParam, IntPtr lParam);
+
+ [DllImport("kernel32.dll", CharSet = CharSet.Auto, SetLastError = true)]
+ public static extern IntPtr GetModuleHandle(string? lpModuleName);
+
+ // ---- Cursor position (used by the overlay cursor-follow logic) ----
+
+ [StructLayout(LayoutKind.Sequential)]
+ public struct POINT
+ {
+ public int X;
+ public int Y;
+ }
+
+ [DllImport("user32.dll")]
+ [return: MarshalAs(UnmanagedType.Bool)]
+ public static extern bool GetCursorPos(out POINT lpPoint);
+
+ // ---- Display enumeration + screen capture (BitBlt) ----
+ // Used by ScreenCaptureService to grab per-monitor JPEGs. PerMonitorV2
+ // DPI awareness (set in app.manifest) means GetMonitorInfo returns
+ // physical device pixels and BitBlt copies at the monitor's native
+ // resolution, which is what the AI needs to reason about coordinates.
+
+ public const int MONITOR_DEFAULTTONEAREST = 2;
+
+    /// <summary>
+    /// Managed mirror of the Win32 MONITORINFOEX struct. cbSize must be set
+    /// to the marshaled size of this struct before calling GetMonitorInfo,
+    /// or the call fails. Under PerMonitorV2 DPI awareness (see the comment
+    /// on the screen-capture group above) the rects are physical pixels.
+    /// </summary>
+    [StructLayout(LayoutKind.Sequential, CharSet = CharSet.Auto)]
+    public struct MONITORINFOEX
+    {
+        public int cbSize;      // must be pre-set to the struct size by the caller
+        public RECT rcMonitor;  // full monitor bounds
+        public RECT rcWork;     // monitor bounds minus the taskbar / app bars
+        public uint dwFlags;    // MONITORINFOF_PRIMARY when this is the primary display
+        [MarshalAs(UnmanagedType.ByValTStr, SizeConst = 32)] // 32 == CCHDEVICENAME
+        public string szDevice;
+    }
+
+ public const uint MONITORINFOF_PRIMARY = 1;
+
+ public delegate bool MonitorEnumProc(IntPtr hMonitor, IntPtr hdcMonitor, ref RECT lprcMonitor, IntPtr dwData);
+
+ [DllImport("user32.dll", SetLastError = true)]
+ [return: MarshalAs(UnmanagedType.Bool)]
+ public static extern bool EnumDisplayMonitors(IntPtr hdc, IntPtr lprcClip, MonitorEnumProc lpfnEnum, IntPtr dwData);
+
+ [DllImport("user32.dll", CharSet = CharSet.Auto, SetLastError = true)]
+ [return: MarshalAs(UnmanagedType.Bool)]
+ public static extern bool GetMonitorInfo(IntPtr hMonitor, ref MONITORINFOEX lpmi);
+
+ [DllImport("user32.dll", SetLastError = true)]
+ public static extern IntPtr MonitorFromPoint(POINT pt, uint dwFlags);
+
+ [DllImport("user32.dll", SetLastError = true)]
+ public static extern IntPtr GetDesktopWindow();
+
+ [DllImport("user32.dll", SetLastError = true)]
+ public static extern IntPtr GetDC(IntPtr hWnd);
+
+ [DllImport("user32.dll", SetLastError = true)]
+ public static extern int ReleaseDC(IntPtr hWnd, IntPtr hDC);
+
+ [DllImport("gdi32.dll", SetLastError = true)]
+ public static extern IntPtr CreateCompatibleDC(IntPtr hDC);
+
+ [DllImport("gdi32.dll", SetLastError = true)]
+ public static extern IntPtr CreateCompatibleBitmap(IntPtr hDC, int nWidth, int nHeight);
+
+ [DllImport("gdi32.dll", SetLastError = true)]
+ public static extern IntPtr SelectObject(IntPtr hDC, IntPtr hObject);
+
+ [DllImport("gdi32.dll", SetLastError = true)]
+ [return: MarshalAs(UnmanagedType.Bool)]
+ public static extern bool DeleteObject(IntPtr hObject);
+
+ [DllImport("gdi32.dll", SetLastError = true)]
+ [return: MarshalAs(UnmanagedType.Bool)]
+ public static extern bool DeleteDC(IntPtr hDC);
+
+ // BitBlt raster-operation codes.
+ public const int SRCCOPY = 0x00CC0020;
+ public const int CAPTUREBLT = 0x40000000; // Includes layered windows in the capture
+
+ [DllImport("gdi32.dll", SetLastError = true)]
+ [return: MarshalAs(UnmanagedType.Bool)]
+ public static extern bool BitBlt(
+ IntPtr hDCDest, int xDest, int yDest, int width, int height,
+ IntPtr hDCSource, int xSource, int ySource, int rop);
+
+ // ---- DPI helpers (used when positioning the panel in device-pixel coords) ----
+
+    /// <summary>
+    /// Returns the device-to-DIP scale factor for the window's monitor.
+    /// Multiply WPF DIPs by this to get device pixels, or divide device
+    /// pixels by it to get DIPs. Falls back to 1.0 when the window has no
+    /// presentation source yet (i.e. it hasn't been shown).
+    /// </summary>
+    public static double GetDpiScale(Window window)
+    {
+        var compositionTarget = PresentationSource.FromVisual(window)?.CompositionTarget;
+        return compositionTarget is null ? 1.0 : compositionTarget.TransformToDevice.M11;
+    }
+}
diff --git a/windows/Clicky/Resources/DesignSystem.xaml b/windows/Clicky/Resources/DesignSystem.xaml
new file mode 100644
index 00000000..d474f620
--- /dev/null
+++ b/windows/Clicky/Resources/DesignSystem.xaml
@@ -0,0 +1,94 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 6
+ 8
+ 10
+ 12
+
+ 6
+ 8
+ 10
+ 12
+
+
+ Segoe UI Variable, Segoe UI, Arial
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/windows/Clicky/Services/AssemblyAIStreamingClient.cs b/windows/Clicky/Services/AssemblyAIStreamingClient.cs
new file mode 100644
index 00000000..2413ccb8
--- /dev/null
+++ b/windows/Clicky/Services/AssemblyAIStreamingClient.cs
@@ -0,0 +1,276 @@
+using System.IO;
+using System.Net.Http;
+using System.Net.WebSockets;
+using System.Text;
+using System.Text.Json;
+using System.Threading.Channels;
+
+namespace Clicky.Services;
+
+///
+/// Streaming AssemblyAI realtime transcription over WebSocket (v3).
+/// Port of AssemblyAIStreamingTranscriptionProvider.swift.
+///
+/// Lifecycle:
+/// 1. — fetches a temporary token from the
+/// Worker, opens the websocket with the required query params, and
+/// spawns a background receive loop.
+/// 2. — caller pushes raw PCM16 little-endian
+/// 16-kHz mono frames; they're forwarded as binary websocket messages.
+/// 3. — sends ForceEndpoint
+/// to flush the partial into a final turn.
+/// 4. — sends Terminate, closes the socket.
+///
+/// The class raises two events on a worker thread. Marshal to the UI thread
+/// at the call site if needed.
+///
+public sealed class AssemblyAIStreamingClient : IAsyncDisposable
+{
+ private const int SampleRateHz = 16_000;
+ private const string SpeechModel = "u3-rt-pro";
+
+ private readonly HttpClient _tokenHttpClient = new() { Timeout = TimeSpan.FromSeconds(20) };
+ private ClientWebSocket? _webSocket;
+ private Task? _receiveLoopTask;
+ private Task? _sendLoopTask;
+ private CancellationTokenSource? _lifetimeCts;
+ private Channel>? _audioChannel;
+
+ /// Partial or final transcript text — fires on every Turn message.
+ public event EventHandler? TranscriptUpdated;
+
+ /// Fires once when AssemblyAI signals end-of-turn (final transcript).
+ public event EventHandler? FinalTranscriptReady;
+
+ /// Fires if the session errors out (network, upstream rejection).
+ public event EventHandler? SessionFaulted;
+
+ public bool IsRunning => _webSocket?.State == WebSocketState.Open;
+
+    /// <summary>
+    /// Fetches a temporary AssemblyAI token from the Worker, opens the v3
+    /// realtime websocket, and spawns the background send and receive loops.
+    /// Call once per push-to-talk session, before AppendAudio.
+    /// </summary>
+    public async Task StartAsync(CancellationToken cancellationToken)
+    {
+        var temporaryToken = await FetchTemporaryTokenAsync(cancellationToken).ConfigureAwait(false);
+        var websocketUri = BuildWebsocketUri(temporaryToken);
+
+        // Linked CTS so StopAsync can cancel the loops independently of the
+        // caller's token.
+        _lifetimeCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
+        _webSocket = new ClientWebSocket();
+        await _webSocket.ConnectAsync(websocketUri, _lifetimeCts.Token).ConfigureAwait(false);
+
+        // NOTE(review): the channel's generic type argument appears stripped
+        // by extraction in this view — confirm against the original file.
+        _audioChannel = Channel.CreateUnbounded>(new UnboundedChannelOptions
+        {
+            SingleReader = true,
+            SingleWriter = false,
+            AllowSynchronousContinuations = false,
+        });
+
+        _receiveLoopTask = Task.Run(() => RunReceiveLoopAsync(_lifetimeCts.Token));
+        _sendLoopTask = Task.Run(() => RunSendLoopAsync(_lifetimeCts.Token));
+    }
+
+    /// <summary>
+    /// Enqueue a PCM16 frame for transmission. Non-blocking — frames are
+    /// buffered in an unbounded channel and flushed by the background sender.
+    /// </summary>
+    public void AppendAudio(ReadOnlyMemory pcm16LittleEndianBytes)
+    {
+        // Null-conditional: frames arriving before StartAsync or after
+        // StopAsync are silently dropped rather than faulting.
+        _audioChannel?.Writer.TryWrite(pcm16LittleEndianBytes);
+    }
+
+    /// <summary>
+    /// Tells AssemblyAI to cut the current partial into a final turn without
+    /// waiting for natural silence. Used when the user releases push-to-talk.
+    /// No-op when the socket isn't open.
+    /// </summary>
+    public async Task RequestFinalTranscriptAsync(CancellationToken cancellationToken)
+    {
+        if (_webSocket?.State != WebSocketState.Open) return;
+        var forceEndpointJson = Encoding.UTF8.GetBytes("{\"type\":\"ForceEndpoint\"}");
+        await _webSocket.SendAsync(forceEndpointJson, WebSocketMessageType.Text, endOfMessage: true, cancellationToken)
+            .ConfigureAwait(false);
+    }
+
+    /// <summary>
+    /// Graceful shutdown: sends Terminate, half-closes the socket, cancels
+    /// the loop lifetime, awaits both background tasks, then disposes the
+    /// socket. Safe to call when no session is active.
+    /// </summary>
+    public async Task StopAsync(CancellationToken cancellationToken)
+    {
+        if (_webSocket is null) return;
+
+        try
+        {
+            if (_webSocket.State == WebSocketState.Open)
+            {
+                var terminateJson = Encoding.UTF8.GetBytes("{\"type\":\"Terminate\"}");
+                await _webSocket.SendAsync(terminateJson, WebSocketMessageType.Text, endOfMessage: true, cancellationToken)
+                    .ConfigureAwait(false);
+                await _webSocket.CloseOutputAsync(WebSocketCloseStatus.NormalClosure, "client-terminate", cancellationToken)
+                    .ConfigureAwait(false);
+            }
+        }
+        catch (WebSocketException) { /* socket already closed — ignore */ }
+        catch (OperationCanceledException) { /* shutdown during cancel — ignore */ }
+
+        // Cancel the loops and complete the channel AFTER the polite close
+        // above, so queued audio isn't cut off mid-send.
+        _lifetimeCts?.Cancel();
+        _audioChannel?.Writer.TryComplete();
+
+        try { if (_sendLoopTask is not null) await _sendLoopTask.ConfigureAwait(false); }
+        catch (OperationCanceledException) { /* expected */ }
+
+        try { if (_receiveLoopTask is not null) await _receiveLoopTask.ConfigureAwait(false); }
+        catch (OperationCanceledException) { /* expected */ }
+
+        _webSocket.Dispose();
+        _webSocket = null;
+    }
+
+    /// <summary>
+    /// POSTs to the Worker's transcribe-token route and extracts the
+    /// temporary AssemblyAI token from the JSON response. Throws on HTTP
+    /// failure or on a missing/empty "token" field.
+    /// </summary>
+    private async Task FetchTemporaryTokenAsync(CancellationToken cancellationToken)
+    {
+        using var tokenRequest = new HttpRequestMessage(HttpMethod.Post, WorkerConfig.TranscribeTokenUrl);
+        using var tokenResponse = await _tokenHttpClient.SendAsync(tokenRequest, cancellationToken).ConfigureAwait(false);
+        tokenResponse.EnsureSuccessStatusCode();
+
+        var responseBody = await tokenResponse.Content.ReadAsStringAsync(cancellationToken).ConfigureAwait(false);
+        using var parsedDocument = JsonDocument.Parse(responseBody);
+        if (!parsedDocument.RootElement.TryGetProperty("token", out var tokenProperty))
+        {
+            throw new InvalidOperationException($"Token proxy response missing 'token' field: {responseBody}");
+        }
+
+        var tokenValue = tokenProperty.GetString();
+        if (string.IsNullOrWhiteSpace(tokenValue))
+        {
+            throw new InvalidOperationException("Token proxy returned an empty token.");
+        }
+        return tokenValue;
+    }
+
+    /// <summary>
+    /// Assembles the AssemblyAI v3 realtime websocket URI with the query
+    /// parameters the endpoint requires, escaping the temporary token.
+    /// </summary>
+    private static Uri BuildWebsocketUri(string temporaryToken)
+    {
+        var escapedToken = Uri.EscapeDataString(temporaryToken);
+        var websocketUriText =
+            "wss://streaming.assemblyai.com/v3/ws"
+            + $"?sample_rate={SampleRateHz}"
+            + "&encoding=pcm_s16le"
+            + "&format_turns=true"
+            + $"&speech_model={SpeechModel}"
+            + $"&token={escapedToken}";
+        return new Uri(websocketUriText);
+    }
+
+    /// <summary>
+    /// Background pump: drains queued PCM frames from the channel and sends
+    /// each as one binary websocket message. Exits when the channel
+    /// completes, the socket leaves the Open state, or the token cancels.
+    /// </summary>
+    private async Task RunSendLoopAsync(CancellationToken cancellationToken)
+    {
+        if (_audioChannel is null || _webSocket is null) return;
+        var channelReader = _audioChannel.Reader;
+
+        try
+        {
+            while (await channelReader.WaitToReadAsync(cancellationToken).ConfigureAwait(false))
+            {
+                while (channelReader.TryRead(out var pcmFrame))
+                {
+                    // Bail quietly if the socket closed mid-session; StopAsync
+                    // owns the formal shutdown.
+                    if (_webSocket.State != WebSocketState.Open) return;
+                    await _webSocket.SendAsync(pcmFrame, WebSocketMessageType.Binary, endOfMessage: true, cancellationToken)
+                        .ConfigureAwait(false);
+                }
+            }
+        }
+        catch (OperationCanceledException) { /* shutdown — ignore */ }
+        catch (WebSocketException webSocketException)
+        {
+            SessionFaulted?.Invoke(this, webSocketException);
+        }
+    }
+
+    /// <summary>
+    /// Background pump: reassembles fragmented websocket frames into whole
+    /// messages and hands each text message to HandleIncomingMessage.
+    /// Returns on a Close frame, cancellation, or socket close.
+    /// </summary>
+    private async Task RunReceiveLoopAsync(CancellationToken cancellationToken)
+    {
+        if (_webSocket is null) return;
+        var receiveBuffer = new byte[16 * 1024];
+        var messageBuffer = new MemoryStream();
+
+        try
+        {
+            while (_webSocket.State == WebSocketState.Open && !cancellationToken.IsCancellationRequested)
+            {
+                // Reuse one MemoryStream across messages; SetLength(0) keeps
+                // its backing buffer allocated.
+                messageBuffer.SetLength(0);
+                WebSocketReceiveResult receiveResult;
+                do
+                {
+                    // NOTE(review): the generic argument of ArraySegment
+                    // appears stripped by extraction here — confirm against
+                    // the original file.
+                    receiveResult = await _webSocket
+                        .ReceiveAsync(new ArraySegment(receiveBuffer), cancellationToken)
+                        .ConfigureAwait(false);
+
+                    if (receiveResult.MessageType == WebSocketMessageType.Close)
+                    {
+                        return;
+                    }
+
+                    messageBuffer.Write(receiveBuffer, 0, receiveResult.Count);
+                } while (!receiveResult.EndOfMessage);
+
+                if (receiveResult.MessageType != WebSocketMessageType.Text) continue;
+
+                var messageText = Encoding.UTF8.GetString(messageBuffer.GetBuffer(), 0, (int)messageBuffer.Length);
+                HandleIncomingMessage(messageText);
+            }
+        }
+        catch (OperationCanceledException) { /* shutdown — ignore */ }
+        catch (WebSocketException webSocketException)
+        {
+            SessionFaulted?.Invoke(this, webSocketException);
+        }
+    }
+
+    /// <summary>
+    /// Parses an AssemblyAI v3 realtime message. Only Turn messages are
+    /// acted on; session lifecycle messages (Begin, Termination) need no
+    /// caller notification here.
+    /// </summary>
+    private void HandleIncomingMessage(string messageText)
+    {
+        try
+        {
+            using var document = JsonDocument.Parse(messageText);
+            var root = document.RootElement;
+
+            if (!root.TryGetProperty("type", out var typeElement)
+                || typeElement.GetString() != "Turn")
+            {
+                return;
+            }
+
+            var transcript = string.Empty;
+            if (root.TryGetProperty("transcript", out var transcriptElement))
+            {
+                transcript = transcriptElement.GetString() ?? string.Empty;
+            }
+
+            var endOfTurn = root.TryGetProperty("end_of_turn", out var endElement)
+                && endElement.ValueKind == JsonValueKind.True;
+            var turnFormatted = root.TryGetProperty("turn_is_formatted", out var formattedElement)
+                && formattedElement.ValueKind == JsonValueKind.True;
+
+            var turnArgs = new TranscriptEventArgs(transcript, endOfTurn || turnFormatted);
+            TranscriptUpdated?.Invoke(this, turnArgs);
+            if (turnArgs.IsFinal)
+            {
+                FinalTranscriptReady?.Invoke(this, turnArgs);
+            }
+        }
+        catch (JsonException)
+        {
+            // Malformed message — ignore rather than fault the session;
+            // AssemblyAI occasionally emits empty keepalive frames.
+        }
+    }
+
+    /// <summary>
+    /// Gracefully terminates any active session, then disposes the lifetime
+    /// CTS and the token HttpClient. Dispose once — the HttpClient used for
+    /// token fetches is torn down here.
+    /// </summary>
+    public async ValueTask DisposeAsync()
+    {
+        await StopAsync(CancellationToken.None).ConfigureAwait(false);
+        _lifetimeCts?.Dispose();
+        _tokenHttpClient.Dispose();
+    }
+}
+
+/// <summary>
+/// Immutable payload for transcript events: the current text and whether
+/// AssemblyAI has finalized the turn.
+/// </summary>
+public sealed class TranscriptEventArgs : EventArgs
+{
+    /// <summary>Partial or finalized transcript text for this turn.</summary>
+    public string Transcript { get; }
+
+    /// <summary>True once the turn is final (end-of-turn or formatted).</summary>
+    public bool IsFinal { get; }
+
+    public TranscriptEventArgs(string transcript, bool isFinal) =>
+        (Transcript, IsFinal) = (transcript, isFinal);
+}
diff --git a/windows/Clicky/Services/ClaudeClient.cs b/windows/Clicky/Services/ClaudeClient.cs
new file mode 100644
index 00000000..8e970422
--- /dev/null
+++ b/windows/Clicky/Services/ClaudeClient.cs
@@ -0,0 +1,185 @@
+using System.Diagnostics;
+using System.IO;
+using System.Net.Http;
+using System.Net.Http.Headers;
+using System.Text;
+using System.Text.Json;
+
+namespace Clicky.Services;
+
+///
+/// Streaming Anthropic Messages client. Talks to the Cloudflare Worker's
+/// /chat route — the Worker injects the API key and forwards the
+/// SSE stream unchanged. Port of ClaudeAPI.swift.
+///
+public sealed class ClaudeClient : IChatClient, IDisposable
+{
+ public const string DefaultModel = "claude-sonnet-4-6";
+ private const int MaxOutputTokens = 1024;
+
+ private readonly HttpClient _httpClient;
+ private readonly bool _ownsHttpClient;
+
+ public string Model { get; set; }
+
+    /// <summary>
+    /// Creates a client for the given model. When no HttpClient is supplied,
+    /// an infinite-timeout one is created and owned (disposed) by this
+    /// instance; a caller-supplied client is never disposed here.
+    /// </summary>
+    public ClaudeClient(string model = DefaultModel, HttpClient? httpClient = null)
+    {
+        Model = model;
+        _ownsHttpClient = httpClient is null;
+        _httpClient = httpClient ?? new HttpClient { Timeout = Timeout.InfiniteTimeSpan };
+    }
+
+    /// <summary>
+    /// Sends one chat turn to the Worker's Claude route and streams the SSE
+    /// response, invoking onTextChunk for each text delta as it arrives.
+    /// Returns the full accumulated text plus elapsed wall time.
+    /// NOTE(review): several generic type arguments in this signature appear
+    /// stripped by extraction in this view — confirm against the original.
+    /// </summary>
+    public async Task StreamChatAsync(
+        string systemPrompt,
+        IReadOnlyList conversationHistory,
+        string userPrompt,
+        IReadOnlyList images,
+        Action onTextChunk,
+        CancellationToken cancellationToken)
+    {
+        var stopwatch = Stopwatch.StartNew();
+
+        var requestPayload = BuildRequestPayload(systemPrompt, conversationHistory, userPrompt, images);
+        using var requestMessage = new HttpRequestMessage(HttpMethod.Post, WorkerConfig.ChatClaudeUrl)
+        {
+            Content = new StringContent(requestPayload, Encoding.UTF8, "application/json"),
+        };
+        requestMessage.Headers.Accept.Add(new MediaTypeWithQualityHeaderValue("text/event-stream"));
+
+        // ResponseHeadersRead lets us start consuming SSE frames immediately
+        // instead of buffering the (open-ended) body.
+        using var responseMessage = await _httpClient
+            .SendAsync(requestMessage, HttpCompletionOption.ResponseHeadersRead, cancellationToken)
+            .ConfigureAwait(false);
+
+        if (!responseMessage.IsSuccessStatusCode)
+        {
+            var errorBody = await responseMessage.Content.ReadAsStringAsync(cancellationToken).ConfigureAwait(false);
+            throw new HttpRequestException(
+                $"Claude proxy returned {(int)responseMessage.StatusCode}: {errorBody}");
+        }
+
+        await using var responseStream = await responseMessage.Content
+            .ReadAsStreamAsync(cancellationToken)
+            .ConfigureAwait(false);
+        using var streamReader = new StreamReader(responseStream, Encoding.UTF8);
+
+        var accumulatedText = new StringBuilder();
+
+        // SSE frames are separated by blank lines. Within a frame we care
+        // about the `data:` lines; Anthropic also emits `event:` lines but
+        // the JSON payload carries its own `type` so we don't need them.
+        string? currentLine;
+        while ((currentLine = await streamReader.ReadLineAsync(cancellationToken).ConfigureAwait(false)) is not null)
+        {
+            if (currentLine.Length == 0) continue;
+            if (!currentLine.StartsWith("data:", StringComparison.Ordinal)) continue;
+
+            var jsonPayload = currentLine.AsSpan(5).TrimStart().ToString();
+            // NOTE(review): "[DONE]" is the OpenAI-style stream terminator;
+            // harmless guard here, but confirm the Worker actually emits it.
+            if (jsonPayload == "[DONE]") break;
+            if (jsonPayload.Length == 0) continue;
+
+            var chunk = ParseTextDelta(jsonPayload);
+            if (chunk.Length > 0)
+            {
+                accumulatedText.Append(chunk);
+                onTextChunk(chunk);
+            }
+        }
+
+        stopwatch.Stop();
+        return new ChatStreamResult(accumulatedText.ToString(), stopwatch.Elapsed);
+    }
+
+    /// <summary>
+    /// Extracts the delta.text string from an Anthropic streaming payload,
+    /// or returns empty if this event type doesn't carry text. Anthropic
+    /// emits many event types (message_start, content_block_start, ping,
+    /// message_delta, etc.) — we only act on content_block_delta with a
+    /// text_delta payload, which mirrors the macOS client.
+    /// </summary>
+    private static string ParseTextDelta(string jsonPayload)
+    {
+        try
+        {
+            using var document = JsonDocument.Parse(jsonPayload);
+            var root = document.RootElement;
+
+            var isTextDelta =
+                root.TryGetProperty("type", out var eventType)
+                && eventType.GetString() == "content_block_delta"
+                && root.TryGetProperty("delta", out _)
+                && root.GetProperty("delta").TryGetProperty("type", out var deltaType)
+                && deltaType.GetString() == "text_delta";
+
+            if (!isTextDelta) return string.Empty;
+
+            // "delta" is guaranteed present here by the checks above.
+            return root.GetProperty("delta").TryGetProperty("text", out var textElement)
+                ? textElement.GetString() ?? string.Empty
+                : string.Empty;
+        }
+        catch (JsonException)
+        {
+            // Not valid JSON — treat as a non-text event, same as the
+            // original implementation.
+            return string.Empty;
+        }
+    }
+
+ private string BuildRequestPayload(
+ string systemPrompt,
+ IReadOnlyList conversationHistory,
+ string userPrompt,
+ IReadOnlyList images)
+ {
+ // Anthropic accepts either a plain string or an array of content
+ // parts. We use the array form for the latest user turn so we can
+ // include images; historical turns have no images and can stay
+ // as plain strings to keep the payload compact.
+ var messageArray = new List