From 98dd149da80e9e19be3f2ea003912e8e3becd1fb Mon Sep 17 00:00:00 2001 From: PsychoSatsujin Date: Tue, 21 Apr 2026 11:02:35 -0700 Subject: [PATCH 01/11] Add Gemini provider and scaffold Windows port foundation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two steps toward a multi-provider, cross-platform Clicky. Gemini alongside Claude - Worker gains POST /chat-gemini. Model ID travels in the request body; the Worker plugs it into the upstream Gemini URL path. - GeminiAPI.swift mirrors ClaudeAPI's streaming signature so the call sites don't care which provider is active. CompanionManager gets a runStreamingVisionRequest dispatcher and an isGeminiModelID helper; setSelectedModel updates whichever provider owns the new ID. - Panel picker gains a Gemini row (Flash default, Pro option). Flash is the default because the motivation here is reducing credit spend. Windows port (Milestone 1 of 6) - New windows/ folder with a C# + WPF solution on .NET 8. Single-instance guarded, tray icon via H.NotifyIcon.Wpf, borderless non-activating popover panel matching the macOS design system 1:1 (colors and radii ported verbatim from DesignSystem.swift), global Ctrl+Alt push-to-talk via low-level keyboard hook, settings persisted to %APPDATA%\Clicky. - No voice pipeline yet — pressing Ctrl+Alt flips AppState so the hook is verifiable. M2 (mic + AssemblyAI + AI + TTS), M3 (screen capture), M4 (cursor overlay), M5 (element pointing), and M6 (onboarding) land in follow-up PRs. 
Co-Authored-By: Claude Opus 4.7 --- AGENTS.md | 19 +- leanring-buddy/CompanionManager.swift | 88 +++++- leanring-buddy/CompanionPanelView.swift | 33 ++- leanring-buddy/GeminiAPI.swift | 273 ++++++++++++++++++ windows/Clicky.sln | 21 ++ windows/Clicky/App.xaml | 17 ++ windows/Clicky/App.xaml.cs | 202 +++++++++++++ windows/Clicky/AppState.cs | 72 +++++ windows/Clicky/Clicky.csproj | 27 ++ windows/Clicky/Interop/NativeMethods.cs | 160 ++++++++++ windows/Clicky/Resources/DesignSystem.xaml | 94 ++++++ .../Clicky/Services/GlobalHotkeyService.cs | 145 ++++++++++ windows/Clicky/Services/SettingsService.cs | 117 ++++++++ .../Clicky/ViewModels/TrayPanelViewModel.cs | 94 ++++++ windows/Clicky/Views/TrayPanelWindow.xaml | 237 +++++++++++++++ windows/Clicky/Views/TrayPanelWindow.xaml.cs | 132 +++++++++ windows/Clicky/app.manifest | 28 ++ windows/README.md | 81 ++++++ worker/src/index.ts | 70 ++++- 19 files changed, 1886 insertions(+), 24 deletions(-) create mode 100644 leanring-buddy/GeminiAPI.swift create mode 100644 windows/Clicky.sln create mode 100644 windows/Clicky/App.xaml create mode 100644 windows/Clicky/App.xaml.cs create mode 100644 windows/Clicky/AppState.cs create mode 100644 windows/Clicky/Clicky.csproj create mode 100644 windows/Clicky/Interop/NativeMethods.cs create mode 100644 windows/Clicky/Resources/DesignSystem.xaml create mode 100644 windows/Clicky/Services/GlobalHotkeyService.cs create mode 100644 windows/Clicky/Services/SettingsService.cs create mode 100644 windows/Clicky/ViewModels/TrayPanelViewModel.cs create mode 100644 windows/Clicky/Views/TrayPanelWindow.xaml create mode 100644 windows/Clicky/Views/TrayPanelWindow.xaml.cs create mode 100644 windows/Clicky/app.manifest create mode 100644 windows/README.md diff --git a/AGENTS.md b/AGENTS.md index 6946d441..f0b695df 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -5,16 +5,24 @@ ## Overview -macOS menu bar companion app. Lives entirely in the macOS status bar (no dock icon, no main window). 
Clicking the menu bar icon opens a custom floating panel with companion voice controls. Uses push-to-talk (ctrl+option) to capture voice input, transcribes it via AssemblyAI streaming, and sends the transcript + a screenshot of the user's screen to Claude. Claude responds with text (streamed via SSE) and voice (ElevenLabs TTS). A blue cursor overlay can fly to and point at UI elements Claude references on any connected monitor. +Cross-platform companion app. Lives entirely in the OS's system tray / menu bar (no dock icon, no main window). Clicking the tray icon opens a custom floating panel with companion voice controls. Uses push-to-talk (ctrl+option on macOS, ctrl+alt on Windows) to capture voice input, transcribes it via AssemblyAI streaming, and sends the transcript + a screenshot of the user's screen to the active AI provider (Claude or Gemini). The AI responds with text (streamed via SSE) and voice (ElevenLabs TTS). A blue cursor overlay can fly to and point at UI elements the AI references on any connected monitor. -All API keys live on a Cloudflare Worker proxy — nothing sensitive ships in the app. +All API keys live on a Cloudflare Worker proxy — nothing sensitive ships in either app. + +## Repository layout + +| Folder | Purpose | +|--------|---------| +| `leanring-buddy/` + `leanring-buddy.xcodeproj` | macOS menu bar app. Swift + SwiftUI + AppKit. Ships first, most features live here. | +| `windows/` | Windows port. C# + WPF on .NET 8. Currently at Milestone 1 (foundation). See `windows/README.md` for milestone progress. | +| `worker/` | Cloudflare Worker proxy. Shared by both apps unchanged — same routes, same secrets. 
| ## Architecture - **App Type**: Menu bar-only (`LSUIElement=true`), no dock icon or main window - **Framework**: SwiftUI (macOS native) with AppKit bridging for menu bar panel and cursor overlay - **Pattern**: MVVM with `@StateObject` / `@Published` state management -- **AI Chat**: Claude (Sonnet 4.6 default, Opus 4.6 optional) via Cloudflare Worker proxy with SSE streaming +- **AI Chat**: User-selectable provider — Claude (Sonnet 4.6 default, Opus 4.6 optional) or Gemini (2.5 Flash, 2.5 Pro). Both route through the Cloudflare Worker proxy with SSE streaming. - **Speech-to-Text**: AssemblyAI real-time streaming (`u3-rt-pro` model) via websocket, with OpenAI and Apple Speech as fallbacks - **Text-to-Speech**: ElevenLabs (`eleven_flash_v2_5` model) via Cloudflare Worker proxy - **Screen Capture**: ScreenCaptureKit (macOS 14.2+), multi-monitor support @@ -30,10 +38,11 @@ The app never calls external APIs directly. All requests go through a Cloudflare | Route | Upstream | Purpose | |-------|----------|---------| | `POST /chat` | `api.anthropic.com/v1/messages` | Claude vision + streaming chat | +| `POST /chat-gemini` | `generativelanguage.googleapis.com/v1beta/models/{model}:streamGenerateContent` | Gemini vision + streaming chat. The `model` field in the request body is used to build the upstream URL path. | | `POST /tts` | `api.elevenlabs.io/v1/text-to-speech/{voiceId}` | ElevenLabs TTS audio | | `POST /transcribe-token` | `streaming.assemblyai.com/v3/token` | Fetches a short-lived (480s) AssemblyAI websocket token | -Worker secrets: `ANTHROPIC_API_KEY`, `ASSEMBLYAI_API_KEY`, `ELEVENLABS_API_KEY` +Worker secrets: `ANTHROPIC_API_KEY`, `GEMINI_API_KEY`, `ASSEMBLYAI_API_KEY`, `ELEVENLABS_API_KEY` Worker vars: `ELEVENLABS_VOICE_ID` ### Key Architecture Decisions @@ -67,6 +76,7 @@ Worker vars: `ELEVENLABS_VOICE_ID` | `BuddyAudioConversionSupport.swift` | ~108 | Audio conversion helpers. 
Converts live mic buffers to PCM16 mono audio and builds WAV payloads for upload-based providers. | | `GlobalPushToTalkShortcutMonitor.swift` | ~132 | System-wide push-to-talk monitor. Owns the listen-only `CGEvent` tap and publishes press/release transitions. | | `ClaudeAPI.swift` | ~291 | Claude vision API client with streaming (SSE) and non-streaming modes. TLS warmup optimization, image MIME detection, conversation history support. | +| `GeminiAPI.swift` | ~273 | Google Gemini vision API client. Mirrors `ClaudeAPI`'s public streaming signature so `CompanionManager` can swap providers transparently. Translates the Gemini-specific request shape (`contents`/`parts`/`inline_data`, `systemInstruction`, `role: "model"`) and parses Gemini's SSE events. Routes through the Worker `/chat-gemini` route — the model ID travels in the body and the Worker plugs it into the upstream URL path. | | `OpenAIAPI.swift` | ~142 | OpenAI GPT vision API client. | | `ElevenLabsTTSClient.swift` | ~81 | ElevenLabs TTS client. Sends text to the Worker proxy, plays back audio via `AVAudioPlayer`. Exposes `isPlaying` for transient cursor scheduling. | | `ElementLocationDetector.swift` | ~335 | Detects UI element locations in screenshots for cursor pointing. 
| @@ -98,6 +108,7 @@ npm install # Add secrets npx wrangler secret put ANTHROPIC_API_KEY +npx wrangler secret put GEMINI_API_KEY npx wrangler secret put ASSEMBLYAI_API_KEY npx wrangler secret put ELEVENLABS_API_KEY diff --git a/leanring-buddy/CompanionManager.swift b/leanring-buddy/CompanionManager.swift index 0234cf19..ccdb0eda 100644 --- a/leanring-buddy/CompanionManager.swift +++ b/leanring-buddy/CompanionManager.swift @@ -73,9 +73,27 @@ final class CompanionManager: ObservableObject { private static let workerBaseURL = "https://your-worker-name.your-subdomain.workers.dev" private lazy var claudeAPI: ClaudeAPI = { - return ClaudeAPI(proxyURL: "\(Self.workerBaseURL)/chat", model: selectedModel) + // Default to Sonnet when the current selection is a Gemini model so + // the Claude client ships with a valid Anthropic model ID even when + // Gemini is the active provider. + let initialClaudeModel = Self.isGeminiModelID(selectedModel) ? "claude-sonnet-4-6" : selectedModel + return ClaudeAPI(proxyURL: "\(Self.workerBaseURL)/chat", model: initialClaudeModel) }() + private lazy var geminiAPI: GeminiAPI = { + // Default to Flash when the current selection is a Claude model so + // the Gemini client ships with a valid Gemini model ID even when + // Claude is the active provider. + let initialGeminiModel = Self.isGeminiModelID(selectedModel) ? selectedModel : "gemini-2.5-flash" + return GeminiAPI(proxyURL: "\(Self.workerBaseURL)/chat-gemini", model: initialGeminiModel) + }() + + /// Returns true when the given model ID belongs to the Gemini provider. + /// Used throughout the manager to route requests to the correct client. 
+ static func isGeminiModelID(_ modelID: String) -> Bool { + return modelID.hasPrefix("gemini") + } + private lazy var elevenLabsTTSClient: ElevenLabsTTSClient = { return ElevenLabsTTSClient(proxyURL: "\(Self.workerBaseURL)/tts") }() @@ -107,13 +125,23 @@ final class CompanionManager: ObservableObject { /// Used by the panel to show accurate status text ("Active" vs "Ready"). @Published private(set) var isOverlayVisible: Bool = false - /// The Claude model used for voice responses. Persisted to UserDefaults. + /// The model used for voice responses. May be a Claude ID (e.g. "claude-sonnet-4-6") + /// or a Gemini ID (e.g. "gemini-2.5-flash"). Persisted to UserDefaults. + /// The UserDefaults key is still "selectedClaudeModel" for backwards compatibility + /// with installs from before Gemini support existed. @Published var selectedModel: String = UserDefaults.standard.string(forKey: "selectedClaudeModel") ?? "claude-sonnet-4-6" func setSelectedModel(_ model: String) { selectedModel = model UserDefaults.standard.set(model, forKey: "selectedClaudeModel") - claudeAPI.model = model + // Route the new model ID to whichever provider owns it. We leave the + // other provider's model untouched — the next time the user flips back, + // that provider still remembers its previously-selected model. + if Self.isGeminiModelID(model) { + geminiAPI.model = model + } else { + claudeAPI.model = model + } } /// User preference for whether the Clicky cursor should be shown. @@ -179,9 +207,13 @@ final class CompanionManager: ObservableObject { bindVoiceStateObservation() bindAudioPowerLevel() bindShortcutTransitions() - // Eagerly touch the Claude API so its TLS warmup handshake completes - // well before the onboarding demo fires at ~40s into the video. - _ = claudeAPI + // Eagerly touch the active AI provider so its TLS warmup handshake + // completes well before the onboarding demo fires at ~40s into the video. 
+ if Self.isGeminiModelID(selectedModel) { + _ = geminiAPI + } else { + _ = claudeAPI + } // If the user already completed onboarding AND all permissions are // still granted, show the cursor overlay immediately. If permissions @@ -578,11 +610,40 @@ final class CompanionManager: ObservableObject { // MARK: - AI Response Pipeline - /// Captures a screenshot, sends it along with the transcript to Claude, - /// and plays the response aloud via ElevenLabs TTS. The cursor stays in - /// the spinner/processing state until TTS audio begins playing. - /// Claude's response may include a [POINT:x,y:label] tag which triggers - /// the buddy to fly to that element on screen. + /// Dispatches a streaming vision request to whichever provider owns the + /// currently selected model. Both Claude and Gemini expose an identical + /// streaming signature, so call sites don't need to care which one runs. + private func runStreamingVisionRequest( + images: [(data: Data, label: String)], + systemPrompt: String, + conversationHistory: [(userPlaceholder: String, assistantResponse: String)], + userPrompt: String, + onTextChunk: @MainActor @Sendable (String) -> Void + ) async throws -> (text: String, duration: TimeInterval) { + if Self.isGeminiModelID(selectedModel) { + return try await geminiAPI.analyzeImageStreaming( + images: images, + systemPrompt: systemPrompt, + conversationHistory: conversationHistory, + userPrompt: userPrompt, + onTextChunk: onTextChunk + ) + } else { + return try await claudeAPI.analyzeImageStreaming( + images: images, + systemPrompt: systemPrompt, + conversationHistory: conversationHistory, + userPrompt: userPrompt, + onTextChunk: onTextChunk + ) + } + } + + /// Captures a screenshot, sends it along with the transcript to the + /// selected AI provider (Claude or Gemini), and plays the response aloud + /// via ElevenLabs TTS. The cursor stays in the spinner/processing state + /// until TTS audio begins playing. 
The response may include a + /// [POINT:x,y:label] tag which triggers the buddy to fly to that element. private func sendTranscriptToClaudeWithScreenshot(transcript: String) { currentResponseTask?.cancel() elevenLabsTTSClient.stopPlayback() @@ -610,7 +671,7 @@ final class CompanionManager: ObservableObject { (userPlaceholder: entry.userTranscript, assistantResponse: entry.assistantResponse) } - let (fullResponseText, _) = try await claudeAPI.analyzeImageStreaming( + let (fullResponseText, _) = try await runStreamingVisionRequest( images: labeledImages, systemPrompt: Self.companionVoiceResponseSystemPrompt, conversationHistory: historyForAPI, @@ -982,9 +1043,10 @@ final class CompanionManager: ObservableObject { let dimensionInfo = " (image dimensions: \(cursorScreenCapture.screenshotWidthInPixels)x\(cursorScreenCapture.screenshotHeightInPixels) pixels)" let labeledImages = [(data: cursorScreenCapture.imageData, label: cursorScreenCapture.label + dimensionInfo)] - let (fullResponseText, _) = try await claudeAPI.analyzeImageStreaming( + let (fullResponseText, _) = try await runStreamingVisionRequest( images: labeledImages, systemPrompt: Self.onboardingDemoSystemPrompt, + conversationHistory: [], userPrompt: "look around my screen and find something interesting to point at", onTextChunk: { _ in } ) diff --git a/leanring-buddy/CompanionPanelView.swift b/leanring-buddy/CompanionPanelView.swift index 76789b4c..d1d54c4d 100644 --- a/leanring-buddy/CompanionPanelView.swift +++ b/leanring-buddy/CompanionPanelView.swift @@ -599,16 +599,42 @@ struct CompanionPanelView: View { // MARK: - Model Picker private var modelPickerRow: some View { + // Two provider rows stacked vertically — Claude and Gemini. Four buttons + // in a single row would be too cramped in the menu bar panel width. 
+ VStack(alignment: .leading, spacing: 8) { + modelProviderRow( + providerLabel: "Claude", + options: [ + (displayLabel: "Sonnet", modelID: "claude-sonnet-4-6"), + (displayLabel: "Opus", modelID: "claude-opus-4-6") + ] + ) + modelProviderRow( + providerLabel: "Gemini", + options: [ + (displayLabel: "Flash", modelID: "gemini-2.5-flash"), + (displayLabel: "Pro", modelID: "gemini-2.5-pro") + ] + ) + } + .padding(.vertical, 4) + } + + private func modelProviderRow( + providerLabel: String, + options: [(displayLabel: String, modelID: String)] + ) -> some View { HStack { - Text("Model") + Text(providerLabel) .font(.system(size: 13, weight: .medium)) .foregroundColor(DS.Colors.textSecondary) Spacer() HStack(spacing: 0) { - modelOptionButton(label: "Sonnet", modelID: "claude-sonnet-4-6") - modelOptionButton(label: "Opus", modelID: "claude-opus-4-6") + ForEach(options, id: \.modelID) { option in + modelOptionButton(label: option.displayLabel, modelID: option.modelID) + } } .background( RoundedRectangle(cornerRadius: 6, style: .continuous) @@ -619,7 +645,6 @@ struct CompanionPanelView: View { .stroke(DS.Colors.borderSubtle, lineWidth: 0.5) ) } - .padding(.vertical, 4) } private func modelOptionButton(label: String, modelID: String) -> some View { diff --git a/leanring-buddy/GeminiAPI.swift b/leanring-buddy/GeminiAPI.swift new file mode 100644 index 00000000..1dc5e92f --- /dev/null +++ b/leanring-buddy/GeminiAPI.swift @@ -0,0 +1,273 @@ +// +// GeminiAPI.swift +// Google Gemini API Implementation with streaming support +// +// Mirrors ClaudeAPI's public interface so CompanionManager can route to +// either provider without the caller caring which one is active. The +// request/response translation layer is Gemini-specific (different field +// names, different SSE event shape, different role vocabulary). +// + +import Foundation + +/// Gemini API helper with streaming for progressive text display. 
+/// Routes through the Cloudflare Worker proxy so the Gemini API key never +/// ships in the app. +class GeminiAPI { + private static let tlsWarmupLock = NSLock() + private static var hasStartedTLSWarmup = false + + private let apiURL: URL + var model: String + private let session: URLSession + + init(proxyURL: String, model: String = "gemini-2.5-flash") { + self.apiURL = URL(string: proxyURL)! + self.model = model + + // Use .default instead of .ephemeral so TLS session tickets are cached. + // Ephemeral sessions do a full TLS handshake on every request, which causes + // transient -1200 (errSSLPeerHandshakeFail) errors with large image payloads. + // Disable URL/cookie caching to avoid storing responses or credentials on disk. + let config = URLSessionConfiguration.default + config.timeoutIntervalForRequest = 120 + config.timeoutIntervalForResource = 300 + config.waitsForConnectivity = true + config.urlCache = nil + config.httpCookieStorage = nil + self.session = URLSession(configuration: config) + + // Fire a lightweight HEAD request in the background to pre-establish the TLS + // connection. This caches the TLS session ticket so the first real API call + // (which carries a large image payload) doesn't need a cold TLS handshake. + warmUpTLSConnectionIfNeeded() + } + + private func makeAPIRequest() -> URLRequest { + var request = URLRequest(url: apiURL) + request.httpMethod = "POST" + request.timeoutInterval = 120 + request.setValue("application/json", forHTTPHeaderField: "Content-Type") + return request + } + + /// Detects the MIME type of image data by inspecting the first bytes. + /// Screen captures from ScreenCaptureKit are JPEG, but pasted images from the + /// clipboard are PNG. Gemini rejects requests where the declared mime_type + /// doesn't match the actual image format. 
+ private func detectImageMediaType(for imageData: Data) -> String { + // PNG files start with the 8-byte signature: 89 50 4E 47 0D 0A 1A 0A + if imageData.count >= 4 { + let pngSignature: [UInt8] = [0x89, 0x50, 0x4E, 0x47] + let firstFourBytes = [UInt8](imageData.prefix(4)) + if firstFourBytes == pngSignature { + return "image/png" + } + } + // Default to JPEG — screen captures use JPEG compression + return "image/jpeg" + } + + /// Sends a no-op HEAD request to the Worker to establish and cache a TLS session. + /// Failures are silently ignored — this is purely an optimization. + private func warmUpTLSConnectionIfNeeded() { + Self.tlsWarmupLock.lock() + let shouldStartTLSWarmup = !Self.hasStartedTLSWarmup + if shouldStartTLSWarmup { + Self.hasStartedTLSWarmup = true + } + Self.tlsWarmupLock.unlock() + + guard shouldStartTLSWarmup else { return } + + guard var warmupURLComponents = URLComponents(url: apiURL, resolvingAgainstBaseURL: false) else { + return + } + + warmupURLComponents.path = "/" + warmupURLComponents.query = nil + warmupURLComponents.fragment = nil + + guard let warmupURL = warmupURLComponents.url else { + return + } + + var warmupRequest = URLRequest(url: warmupURL) + warmupRequest.httpMethod = "HEAD" + warmupRequest.timeoutInterval = 10 + session.dataTask(with: warmupRequest) { _, _, _ in + // Response doesn't matter — the TLS handshake is the goal + }.resume() + } + + /// Builds the Gemini-shaped request body for a vision + streaming call. + /// Gemini uses `contents` with `parts` (text + inline_data), a separate + /// `systemInstruction` field, and "model" as the assistant role. 
+ private func buildGeminiRequestBody( + images: [(data: Data, label: String)], + systemPrompt: String, + conversationHistory: [(userPlaceholder: String, assistantResponse: String)], + userPrompt: String, + maxOutputTokens: Int + ) -> [String: Any] { + var contents: [[String: Any]] = [] + + for (userPlaceholder, assistantResponse) in conversationHistory { + contents.append([ + "role": "user", + "parts": [["text": userPlaceholder]] + ]) + contents.append([ + "role": "model", + "parts": [["text": assistantResponse]] + ]) + } + + // Build current turn with all labeled images + prompt + var currentTurnParts: [[String: Any]] = [] + for image in images { + currentTurnParts.append([ + "inline_data": [ + "mime_type": detectImageMediaType(for: image.data), + "data": image.data.base64EncodedString() + ] + ]) + currentTurnParts.append([ + "text": image.label + ]) + } + currentTurnParts.append([ + "text": userPrompt + ]) + contents.append([ + "role": "user", + "parts": currentTurnParts + ]) + + // `model` is forwarded to the Worker, which pulls it out and plugs it + // into the upstream Gemini URL path — Gemini itself doesn't read it. + return [ + "model": model, + "systemInstruction": [ + "parts": [["text": systemPrompt]] + ], + "contents": contents, + "generationConfig": [ + "maxOutputTokens": maxOutputTokens + ] + ] + } + + /// Send a vision request to Gemini with streaming. + /// Calls `onTextChunk` on the main actor each time new text arrives so the UI updates progressively. + /// Returns the full accumulated text and total duration when the stream completes. 
+ func analyzeImageStreaming( + images: [(data: Data, label: String)], + systemPrompt: String, + conversationHistory: [(userPlaceholder: String, assistantResponse: String)] = [], + userPrompt: String, + onTextChunk: @MainActor @Sendable (String) -> Void + ) async throws -> (text: String, duration: TimeInterval) { + let startTime = Date() + + var request = makeAPIRequest() + + let body = buildGeminiRequestBody( + images: images, + systemPrompt: systemPrompt, + conversationHistory: conversationHistory, + userPrompt: userPrompt, + maxOutputTokens: 1024 + ) + + let bodyData = try JSONSerialization.data(withJSONObject: body) + request.httpBody = bodyData + let payloadMB = Double(bodyData.count) / 1_048_576.0 + print("🌐 Gemini streaming request (\(model)): \(String(format: "%.1f", payloadMB))MB, \(images.count) image(s)") + + // Use bytes streaming for SSE (Server-Sent Events) + let (byteStream, response) = try await session.bytes(for: request) + + guard let httpResponse = response as? HTTPURLResponse else { + throw NSError( + domain: "GeminiAPI", + code: -1, + userInfo: [NSLocalizedDescriptionKey: "Invalid HTTP response"] + ) + } + + // If non-2xx status, read the full body as error text + guard (200...299).contains(httpResponse.statusCode) else { + var errorBodyChunks: [String] = [] + for try await line in byteStream.lines { + errorBodyChunks.append(line) + } + let errorBody = errorBodyChunks.joined(separator: "\n") + throw NSError( + domain: "GeminiAPI", + code: httpResponse.statusCode, + userInfo: [NSLocalizedDescriptionKey: "API Error (\(httpResponse.statusCode)): \(errorBody)"] + ) + } + + // Parse SSE stream — each event is "data: {json}\n\n". 
+ // Gemini sends one event per chunk with shape: + // { "candidates": [ { "content": { "parts": [ {"text": "..."} ], "role": "model" } } ] } + var accumulatedResponseText = "" + + for try await line in byteStream.lines { + guard line.hasPrefix("data: ") else { continue } + let jsonString = String(line.dropFirst(6)) + + // Gemini doesn't send an explicit [DONE] marker, but handle it defensively + guard jsonString != "[DONE]" else { break } + + guard let jsonData = jsonString.data(using: .utf8), + let eventPayload = try? JSONSerialization.jsonObject(with: jsonData) as? [String: Any] else { + continue + } + + // Extract text from candidates[0].content.parts[*].text + guard let candidates = eventPayload["candidates"] as? [[String: Any]], + let firstCandidate = candidates.first, + let content = firstCandidate["content"] as? [String: Any], + let parts = content["parts"] as? [[String: Any]] else { + continue + } + + var chunkText = "" + for part in parts { + if let partText = part["text"] as? String { + chunkText += partText + } + } + + if !chunkText.isEmpty { + accumulatedResponseText += chunkText + let currentAccumulatedText = accumulatedResponseText + await onTextChunk(currentAccumulatedText) + } + } + + let duration = Date().timeIntervalSince(startTime) + return (text: accumulatedResponseText, duration: duration) + } + + /// Non-streaming fallback for validation requests where we don't need progressive display. + /// Uses the same streaming endpoint internally — Gemini returns the full result via SSE + /// and we simply accumulate it before returning. This keeps the Worker route surface small. 
+ func analyzeImage( + images: [(data: Data, label: String)], + systemPrompt: String, + conversationHistory: [(userPlaceholder: String, assistantResponse: String)] = [], + userPrompt: String + ) async throws -> (text: String, duration: TimeInterval) { + return try await analyzeImageStreaming( + images: images, + systemPrompt: systemPrompt, + conversationHistory: conversationHistory, + userPrompt: userPrompt, + onTextChunk: { _ in } + ) + } +} diff --git a/windows/Clicky.sln b/windows/Clicky.sln new file mode 100644 index 00000000..09a641ec --- /dev/null +++ b/windows/Clicky.sln @@ -0,0 +1,21 @@ +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.9.34622.214 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Clicky", "Clicky\Clicky.csproj", "{B1F9B6C0-5B7E-4D3A-8E4D-1A2B3C4D5E6F}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {B1F9B6C0-5B7E-4D3A-8E4D-1A2B3C4D5E6F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {B1F9B6C0-5B7E-4D3A-8E4D-1A2B3C4D5E6F}.Debug|Any CPU.Build.0 = Debug|Any CPU + {B1F9B6C0-5B7E-4D3A-8E4D-1A2B3C4D5E6F}.Release|Any CPU.ActiveCfg = Release|Any CPU + {B1F9B6C0-5B7E-4D3A-8E4D-1A2B3C4D5E6F}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/windows/Clicky/App.xaml b/windows/Clicky/App.xaml new file mode 100644 index 00000000..2be35400 --- /dev/null +++ b/windows/Clicky/App.xaml @@ -0,0 +1,17 @@ + + + + + + + + + + diff --git a/windows/Clicky/App.xaml.cs b/windows/Clicky/App.xaml.cs new file mode 100644 index 00000000..f344bb4f --- /dev/null +++ b/windows/Clicky/App.xaml.cs @@ -0,0 +1,202 @@ +using System.Windows; +using 
System.Windows.Media; +using System.Windows.Media.Imaging; +using H.NotifyIcon; +using Clicky.Interop; +using Clicky.Services; +using Clicky.ViewModels; +using Clicky.Views; + +namespace Clicky; + +/// +/// WPF application entry. Boots the tray icon, wires the popover panel, +/// installs the global push-to-talk hotkey, and holds the root AppState +/// for the app's lifetime. +/// +/// This is the Windows analog of the macOS CompanionAppDelegate + +/// MenuBarPanelManager combination (leanring_buddyApp.swift + MenuBarPanelManager.swift). +/// +public partial class App : Application +{ + // Keep singletons alive for the app's lifetime. No DI container in M1 — + // the dependency graph is small enough to thread manually. + private Mutex? _singleInstanceMutex; + private SettingsService? _settingsService; + private AppState? _appState; + private GlobalHotkeyService? _globalHotkeyService; + private TaskbarIcon? _trayIcon; + private TrayPanelWindow? _trayPanelWindow; + private TrayPanelViewModel? _trayPanelViewModel; + + protected override void OnStartup(StartupEventArgs eventArgs) + { + base.OnStartup(eventArgs); + + if (!TryAcquireSingleInstanceMutex()) + { + // Another instance is already running. Exit quietly — no error + // dialog, so double-clicks from the Start menu are benign. + // + // Drop our handle before shutting down: this thread does not own + // the mutex (the first instance does), and Shutdown() runs OnExit, + // where ReleaseMutex() on an unowned mutex would throw + // ApplicationException and turn the quiet exit into a crash. + _singleInstanceMutex?.Dispose(); + _singleInstanceMutex = null; + Shutdown(); + return; + } + + _settingsService = new SettingsService(); + _appState = new AppState(_settingsService); + _trayPanelViewModel = new TrayPanelViewModel(_appState); + _trayPanelWindow = new TrayPanelWindow(_trayPanelViewModel); + + InstallTrayIcon(); + InstallGlobalHotkey(); + } + + protected override void OnExit(ExitEventArgs eventArgs) + { + _globalHotkeyService?.Dispose(); + _trayIcon?.Dispose(); + // _singleInstanceMutex is non-null here only when this instance + // acquired it in OnStartup, so releasing it is always legal. + _singleInstanceMutex?.ReleaseMutex(); + _singleInstanceMutex?.Dispose(); + base.OnExit(eventArgs); + } + + private bool TryAcquireSingleInstanceMutex() + { + // Per-user mutex — two different users on the same machine can each + // run their own Clicky instance without colliding. 
+ var mutexName = $"Local\\Clicky.SingleInstance.{Environment.UserName}"; + _singleInstanceMutex = new Mutex(initiallyOwned: true, name: mutexName, createdNew: out var createdNew); + return createdNew; + } + + private void InstallTrayIcon() + { + _trayIcon = new TaskbarIcon + { + ToolTipText = "Clicky — hold Ctrl+Alt to talk", + IconSource = LoadTrayIconSource(), + // No built-in context menu — left- and right-click both open the + // custom popover. Quit lives inside the panel. + NoLeftClickDelay = true, + }; + + _trayIcon.TrayLeftMouseUp += (_, _) => ToggleTrayPanel(); + _trayIcon.TrayRightMouseUp += (_, _) => ToggleTrayPanel(); + + _trayIcon.ForceCreate(); + } + + private void InstallGlobalHotkey() + { + _globalHotkeyService = new GlobalHotkeyService(); + _globalHotkeyService.ShortcutPressed += OnPushToTalkPressed; + _globalHotkeyService.ShortcutReleased += OnPushToTalkReleased; + _globalHotkeyService.Start(); + } + + private void OnPushToTalkPressed(object? sender, EventArgs eventArgs) + { + // Milestone 2 wires this to the dictation pipeline. For now we just + // flip the state so the panel can reflect it and we can verify the + // hook is detecting the combo. + Dispatcher.BeginInvoke(() => + { + if (_appState is not null) + { + _appState.CurrentVoiceState = AppState.VoiceState.Listening; + } + // Panel shouldn't stay visible while the user is talking to the + // app — dismiss it if it happens to be open. + _trayPanelWindow?.HidePanel(); + }); + } + + private void OnPushToTalkReleased(object? 
sender, EventArgs eventArgs) + { + Dispatcher.BeginInvoke(() => + { + if (_appState is not null) + { + _appState.CurrentVoiceState = AppState.VoiceState.Idle; + } + }); + } + + private void ToggleTrayPanel() + { + if (_trayPanelWindow is null) return; + + if (_trayPanelWindow.IsVisible) + { + _trayPanelWindow.HidePanel(); + return; + } + + NativeMethods.GetCursorPos(out var cursorPositionDevicePixels); + _trayPanelWindow.ShowNearTrayCursor( + cursorPositionDevicePixels.X, + cursorPositionDevicePixels.Y); + } + + /// + /// Loads the tray icon from the bundled resource. Falls back to a + /// generated blue-dot placeholder if the resource is missing so the app + /// is runnable before an artist drops a real .ico in. + /// + private static ImageSource LoadTrayIconSource() + { + try + { + var packIconUri = new Uri("pack://application:,,,/Resources/clicky-tray.ico", UriKind.Absolute); + var packResource = GetResourceStream(packIconUri); + if (packResource?.Stream is not null) + { + var bundledIconBitmap = new BitmapImage(); + bundledIconBitmap.BeginInit(); + bundledIconBitmap.CacheOption = BitmapCacheOption.OnLoad; + bundledIconBitmap.StreamSource = packResource.Stream; + bundledIconBitmap.EndInit(); + bundledIconBitmap.Freeze(); + return bundledIconBitmap; + } + } + catch + { + // Fall through to the generated placeholder. 
+ } + + return CreatePlaceholderBlueDotBitmap(); + } + + private static BitmapSource CreatePlaceholderBlueDotBitmap() + { + const int iconPixelSize = 32; + const int iconPadding = 6; + + var drawingVisual = new DrawingVisual(); + using (var drawingContext = drawingVisual.RenderOpen()) + { + var overlayCursorBlue = new SolidColorBrush(Color.FromRgb(0x33, 0x80, 0xFF)); + overlayCursorBlue.Freeze(); + + var circleCenter = new System.Windows.Point(iconPixelSize / 2.0, iconPixelSize / 2.0); + var circleRadius = (iconPixelSize - (iconPadding * 2)) / 2.0; + + drawingContext.DrawEllipse( + brush: overlayCursorBlue, + pen: null, + center: circleCenter, + radiusX: circleRadius, + radiusY: circleRadius); + } + + var renderTarget = new RenderTargetBitmap( + pixelWidth: iconPixelSize, + pixelHeight: iconPixelSize, + dpiX: 96, + dpiY: 96, + pixelFormat: PixelFormats.Pbgra32); + renderTarget.Render(drawingVisual); + renderTarget.Freeze(); + return renderTarget; + } +} diff --git a/windows/Clicky/AppState.cs b/windows/Clicky/AppState.cs new file mode 100644 index 00000000..e4cee426 --- /dev/null +++ b/windows/Clicky/AppState.cs @@ -0,0 +1,72 @@ +using CommunityToolkit.Mvvm.ComponentModel; +using Clicky.Services; + +namespace Clicky; + +/// +/// Root observable state for the entire Windows app. The C# analog of the +/// macOS CompanionManager (leanring-buddy/CompanionManager.swift). Milestone 1 +/// holds only the persisted preferences and the voice-state enum; later +/// milestones attach the screen-capture, dictation, and AI-chat services. 
+/// +public sealed partial class AppState : ObservableObject +{ + private readonly SettingsService _settingsService; + + public AppState(SettingsService settingsService) + { + _settingsService = settingsService; + _selectedModelId = settingsService.SelectedModelId; + _isClickyCursorEnabled = settingsService.IsClickyCursorEnabled; + _hasCompletedOnboarding = settingsService.HasCompletedOnboarding; + } + + // ---- Voice pipeline state (populated by later milestones) ---- + + public enum VoiceState + { + Idle, + Listening, + Processing, + Responding, + } + + [ObservableProperty] + private VoiceState _currentVoiceState = VoiceState.Idle; + + // ---- Persisted preferences ---- + + [ObservableProperty] + private string _selectedModelId; + + partial void OnSelectedModelIdChanged(string value) + { + _settingsService.SelectedModelId = value; + } + + [ObservableProperty] + private bool _isClickyCursorEnabled; + + partial void OnIsClickyCursorEnabledChanged(bool value) + { + _settingsService.IsClickyCursorEnabled = value; + } + + [ObservableProperty] + private bool _hasCompletedOnboarding; + + partial void OnHasCompletedOnboardingChanged(bool value) + { + _settingsService.HasCompletedOnboarding = value; + } + + // ---- Model routing helpers (mirror CompanionManager.isGeminiModelID) ---- + + /// + /// Returns true when the given model ID belongs to the Gemini provider. + /// Used by later milestones to route vision requests to the right client. 
+ /// + public static bool IsGeminiModelId(string modelId) => modelId.StartsWith("gemini", StringComparison.OrdinalIgnoreCase); + + public bool IsCurrentModelGemini => IsGeminiModelId(SelectedModelId); +} diff --git a/windows/Clicky/Clicky.csproj b/windows/Clicky/Clicky.csproj new file mode 100644 index 00000000..71a445b8 --- /dev/null +++ b/windows/Clicky/Clicky.csproj @@ -0,0 +1,27 @@ + + + + WinExe + + net8.0-windows10.0.19041.0 + 10.0.17763.0 + Clicky + Clicky + enable + enable + true + app.manifest + + PerMonitorV2 + + + + + + + + + + diff --git a/windows/Clicky/Interop/NativeMethods.cs b/windows/Clicky/Interop/NativeMethods.cs new file mode 100644 index 00000000..16923e46 --- /dev/null +++ b/windows/Clicky/Interop/NativeMethods.cs @@ -0,0 +1,160 @@ +using System.Runtime.InteropServices; +using System.Windows; + +namespace Clicky.Interop; + +/// +/// Win32 P/Invoke surface. Grouped here so the rest of the app can stay +/// managed-code-only. Each method is documented with the underlying Win32 +/// function it wraps. 
+/// +internal static class NativeMethods +{ + // ---- Extended window style bits used by the panel + overlay ---- + public const int GWL_EXSTYLE = -20; + public const int WS_EX_TRANSPARENT = 0x00000020; + public const int WS_EX_TOOLWINDOW = 0x00000080; + public const int WS_EX_LAYERED = 0x00080000; + public const int WS_EX_NOACTIVATE = 0x08000000; + + // ---- SetWindowPos flags (used for non-activating positioning) ---- + public static readonly IntPtr HWND_TOPMOST = new(-1); + public const uint SWP_NOSIZE = 0x0001; + public const uint SWP_NOMOVE = 0x0002; + public const uint SWP_NOACTIVATE = 0x0010; + public const uint SWP_SHOWWINDOW = 0x0040; + + // ---- AppBar query for the Windows taskbar bounds ---- + public const uint ABM_GETTASKBARPOS = 0x00000005; + + [DllImport("user32.dll", SetLastError = true)] + [return: MarshalAs(UnmanagedType.Bool)] + public static extern bool SetWindowPos( + IntPtr hWnd, + IntPtr hWndInsertAfter, + int X, + int Y, + int cx, + int cy, + uint uFlags); + + // 32-bit and 64-bit variants of GetWindowLong / SetWindowLong. The correct + // one is selected at runtime by GetExtendedStyle / SetExtendedStyle below. + [DllImport("user32.dll", EntryPoint = "GetWindowLong")] + private static extern int GetWindowLong32(IntPtr hWnd, int nIndex); + + [DllImport("user32.dll", EntryPoint = "GetWindowLongPtr")] + private static extern IntPtr GetWindowLongPtr64(IntPtr hWnd, int nIndex); + + [DllImport("user32.dll", EntryPoint = "SetWindowLong")] + private static extern int SetWindowLong32(IntPtr hWnd, int nIndex, int dwNewLong); + + [DllImport("user32.dll", EntryPoint = "SetWindowLongPtr")] + private static extern IntPtr SetWindowLongPtr64(IntPtr hWnd, int nIndex, IntPtr dwNewLong); + + public static int GetExtendedStyle(IntPtr hWnd) + { + return IntPtr.Size == 8 + ? 
(int)GetWindowLongPtr64(hWnd, GWL_EXSTYLE) + : GetWindowLong32(hWnd, GWL_EXSTYLE); + } + + public static void SetExtendedStyle(IntPtr hWnd, int newStyle) + { + if (IntPtr.Size == 8) + { + SetWindowLongPtr64(hWnd, GWL_EXSTYLE, new IntPtr(newStyle)); + } + else + { + SetWindowLong32(hWnd, GWL_EXSTYLE, newStyle); + } + } + + // ---- Taskbar position (used to anchor the panel near the tray icon) ---- + + [StructLayout(LayoutKind.Sequential)] + public struct RECT + { + public int Left; + public int Top; + public int Right; + public int Bottom; + + public int Width => Right - Left; + public int Height => Bottom - Top; + } + + [StructLayout(LayoutKind.Sequential)] + public struct APPBARDATA + { + public uint cbSize; + public IntPtr hWnd; + public uint uCallbackMessage; + public uint uEdge; + public RECT rc; + public int lParam; + } + + [DllImport("shell32.dll", CallingConvention = CallingConvention.StdCall)] + public static extern IntPtr SHAppBarMessage(uint dwMessage, ref APPBARDATA pData); + + // ---- Low-level keyboard hook (push-to-talk hotkey detection) ---- + + public const int WH_KEYBOARD_LL = 13; + public const int WM_KEYDOWN = 0x0100; + public const int WM_KEYUP = 0x0101; + public const int WM_SYSKEYDOWN = 0x0104; + public const int WM_SYSKEYUP = 0x0105; + + [StructLayout(LayoutKind.Sequential)] + public struct KBDLLHOOKSTRUCT + { + public uint vkCode; + public uint scanCode; + public uint flags; + public uint time; + public UIntPtr dwExtraInfo; + } + + public delegate IntPtr LowLevelKeyboardProc(int nCode, IntPtr wParam, IntPtr lParam); + + [DllImport("user32.dll", CharSet = CharSet.Auto, SetLastError = true)] + public static extern IntPtr SetWindowsHookEx(int idHook, LowLevelKeyboardProc lpfn, IntPtr hMod, uint dwThreadId); + + [DllImport("user32.dll", CharSet = CharSet.Auto, SetLastError = true)] + [return: MarshalAs(UnmanagedType.Bool)] + public static extern bool UnhookWindowsHookEx(IntPtr hhk); + + [DllImport("user32.dll", CharSet = CharSet.Auto, 
SetLastError = true)] + public static extern IntPtr CallNextHookEx(IntPtr hhk, int nCode, IntPtr wParam, IntPtr lParam); + + [DllImport("kernel32.dll", CharSet = CharSet.Auto, SetLastError = true)] + public static extern IntPtr GetModuleHandle(string? lpModuleName); + + // ---- Cursor position (used by the overlay cursor-follow logic) ---- + + [StructLayout(LayoutKind.Sequential)] + public struct POINT + { + public int X; + public int Y; + } + + [DllImport("user32.dll")] + [return: MarshalAs(UnmanagedType.Bool)] + public static extern bool GetCursorPos(out POINT lpPoint); + + // ---- DPI helpers (used when positioning the panel in device-pixel coords) ---- + + /// + /// Returns the device-to-DIP scale for the window's monitor. Multiply a + /// device-pixel coord by the reciprocal to get WPF DIPs, or pass WPF DIPs + /// in and multiply by this to get device pixels. + /// + public static double GetDpiScale(Window window) + { + var source = PresentationSource.FromVisual(window); + return source?.CompositionTarget?.TransformToDevice.M11 ?? 1.0; + } +} diff --git a/windows/Clicky/Resources/DesignSystem.xaml b/windows/Clicky/Resources/DesignSystem.xaml new file mode 100644 index 00000000..d474f620 --- /dev/null +++ b/windows/Clicky/Resources/DesignSystem.xaml @@ -0,0 +1,94 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 6 + 8 + 10 + 12 + + 6 + 8 + 10 + 12 + + + Segoe UI Variable, Segoe UI, Arial + + + + + + + + + + + + + + diff --git a/windows/Clicky/Services/GlobalHotkeyService.cs b/windows/Clicky/Services/GlobalHotkeyService.cs new file mode 100644 index 00000000..0b9de627 --- /dev/null +++ b/windows/Clicky/Services/GlobalHotkeyService.cs @@ -0,0 +1,145 @@ +using System.Diagnostics; +using System.Runtime.InteropServices; +using System.Windows.Input; +using Clicky.Interop; + +namespace Clicky.Services; + +/// +/// Detects the push-to-talk shortcut (Ctrl+Alt by default) system-wide via a +/// low-level keyboard hook. 
This is the Windows analog of the macOS CGEvent +/// tap used in GlobalPushToTalkShortcutMonitor.swift. +/// +/// The hook is listen-only — we do NOT swallow the keys, so Ctrl+Alt combos +/// still reach other apps normally. Users can hold Ctrl+Alt to talk to Clicky +/// without breaking their current app's keyboard handling. +/// +/// Events are raised on the thread that installed the hook (the UI thread). +/// Subscribers should keep handlers short to avoid stalling global keyboard +/// delivery — dispatch heavy work off-thread immediately. +/// +public sealed class GlobalHotkeyService : IDisposable +{ + private IntPtr _hookHandle = IntPtr.Zero; + + // Held as a field so the GC doesn't collect the delegate while the hook + // is installed — that would cause a nasty access violation in user32.dll. + private NativeMethods.LowLevelKeyboardProc? _hookCallback; + + private bool _isCtrlHeld; + private bool _isAltHeld; + private bool _isShortcutActive; + + /// + /// Raised when the push-to-talk combination transitions to held. + /// Subscribers should begin recording immediately. + /// + public event EventHandler? ShortcutPressed; + + /// + /// Raised when either modifier in the push-to-talk combination is released. + /// Subscribers should finalize the recording and submit the transcript. + /// + public event EventHandler? ShortcutReleased; + + public void Start() + { + if (_hookHandle != IntPtr.Zero) + { + return; + } + + _hookCallback = HookCallback; + using var process = Process.GetCurrentProcess(); + using var module = process.MainModule + ?? 
throw new InvalidOperationException("Cannot read main module for hook installation."); + var moduleHandle = NativeMethods.GetModuleHandle(module.ModuleName); + + _hookHandle = NativeMethods.SetWindowsHookEx( + NativeMethods.WH_KEYBOARD_LL, + _hookCallback, + moduleHandle, + 0); + + if (_hookHandle == IntPtr.Zero) + { + throw new InvalidOperationException( + $"Failed to install low-level keyboard hook (GetLastError={Marshal.GetLastWin32Error()})."); + } + } + + public void Stop() + { + if (_hookHandle == IntPtr.Zero) + { + return; + } + + NativeMethods.UnhookWindowsHookEx(_hookHandle); + _hookHandle = IntPtr.Zero; + _hookCallback = null; + _isCtrlHeld = false; + _isAltHeld = false; + _isShortcutActive = false; + } + + public void Dispose() => Stop(); + + private IntPtr HookCallback(int nCode, IntPtr wParam, IntPtr lParam) + { + if (nCode < 0) + { + return NativeMethods.CallNextHookEx(_hookHandle, nCode, wParam, lParam); + } + + var hookStruct = Marshal.PtrToStructure<NativeMethods.KBDLLHOOKSTRUCT>(lParam); + var virtualKey = (Key)KeyInterop.KeyFromVirtualKey((int)hookStruct.vkCode); + var messageCode = wParam.ToInt32(); + + var isKeyDown = messageCode == NativeMethods.WM_KEYDOWN || messageCode == NativeMethods.WM_SYSKEYDOWN; + var isKeyUp = messageCode == NativeMethods.WM_KEYUP || messageCode == NativeMethods.WM_SYSKEYUP; + + // Track only Ctrl and Alt — left and right variants both map to the + // same push-to-talk action (matches macOS left/right option behavior). + var isCtrlKey = virtualKey is Key.LeftCtrl or Key.RightCtrl; + var isAltKey = virtualKey is Key.LeftAlt or Key.RightAlt; + + if (isCtrlKey) + { + if (isKeyDown) _isCtrlHeld = true; + else if (isKeyUp) _isCtrlHeld = false; + } + else if (isAltKey) + { + if (isKeyDown) _isAltHeld = true; + else if (isKeyUp) _isAltHeld = false; + } + else + { + // Any non-modifier keystroke cancels the shortcut. Without this, + // "Ctrl+Alt+T" (or any typing combo) would fire push-to-talk.
+ if (_isShortcutActive) + { + _isShortcutActive = false; + ShortcutReleased?.Invoke(this, EventArgs.Empty); + } + + return NativeMethods.CallNextHookEx(_hookHandle, nCode, wParam, lParam); + } + + var shouldBeActive = _isCtrlHeld && _isAltHeld; + + if (shouldBeActive && !_isShortcutActive) + { + _isShortcutActive = true; + ShortcutPressed?.Invoke(this, EventArgs.Empty); + } + else if (!shouldBeActive && _isShortcutActive) + { + _isShortcutActive = false; + ShortcutReleased?.Invoke(this, EventArgs.Empty); + } + + return NativeMethods.CallNextHookEx(_hookHandle, nCode, wParam, lParam); + } +} diff --git a/windows/Clicky/Services/SettingsService.cs b/windows/Clicky/Services/SettingsService.cs new file mode 100644 index 00000000..0927e6f1 --- /dev/null +++ b/windows/Clicky/Services/SettingsService.cs @@ -0,0 +1,117 @@ +using System.IO; +using System.Text.Json; +using System.Text.Json.Serialization; + +namespace Clicky.Services; + +/// <summary> +/// Persists user preferences to %APPDATA%\Clicky\settings.json. +/// Equivalent of the macOS app's UserDefaults usage in CompanionManager.swift. +/// Reads are served from the in-memory copy; every property set rewrites the +/// whole file synchronously under a lock (writes are not debounced). +/// </summary> +public sealed class SettingsService +{ + private static readonly string SettingsDirectory = Path.Combine( + Environment.GetFolderPath(Environment.SpecialFolder.ApplicationData), + "Clicky"); + + private static readonly string SettingsFilePath = Path.Combine(SettingsDirectory, "settings.json"); + + private static readonly JsonSerializerOptions SerializerOptions = new() + { + WriteIndented = true, + DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull, + }; + + private PersistedSettings _currentSettings; + private readonly object _writeLock = new(); + + public SettingsService() + { + _currentSettings = LoadFromDiskOrDefault(); + } + + public string SelectedModelId + { + get => _currentSettings.SelectedModelId ??
DefaultModelId; + set + { + _currentSettings.SelectedModelId = value; + PersistToDisk(); + } + } + + public bool IsClickyCursorEnabled + { + get => _currentSettings.IsClickyCursorEnabled ?? true; + set + { + _currentSettings.IsClickyCursorEnabled = value; + PersistToDisk(); + } + } + + public bool HasCompletedOnboarding + { + get => _currentSettings.HasCompletedOnboarding ?? false; + set + { + _currentSettings.HasCompletedOnboarding = value; + PersistToDisk(); + } + } + + /// <summary> + /// Model ID used when no selection has been persisted; currently Claude Sonnet. + /// NOTE(review): an earlier note said to default to Gemini Flash to cut credit + /// cost, which contradicts the value below — confirm which default is intended. + /// </summary> + public const string DefaultModelId = "claude-sonnet-4-6"; + + private PersistedSettings LoadFromDiskOrDefault() + { + try + { + if (!File.Exists(SettingsFilePath)) + { + return new PersistedSettings(); + } + + var fileContents = File.ReadAllText(SettingsFilePath); + var deserialized = JsonSerializer.Deserialize<PersistedSettings>(fileContents, SerializerOptions); + return deserialized ?? new PersistedSettings(); + } + catch (Exception ex) + { + // Corrupt or unreadable settings file — fall back to defaults so + // the app still starts. We don't surface this to the user. + System.Diagnostics.Debug.WriteLine($"[SettingsService] Failed to load settings: {ex.Message}"); + return new PersistedSettings(); + } + } + + private void PersistToDisk() + { + lock (_writeLock) + { + try + { + Directory.CreateDirectory(SettingsDirectory); + var serialized = JsonSerializer.Serialize(_currentSettings, SerializerOptions); + File.WriteAllText(SettingsFilePath, serialized); + } + catch (Exception ex) + { + System.Diagnostics.Debug.WriteLine($"[SettingsService] Failed to save settings: {ex.Message}"); + } + } + } + + private sealed class PersistedSettings + { + public string? SelectedModelId { get; set; } + public bool? IsClickyCursorEnabled { get; set; } + public bool?
HasCompletedOnboarding { get; set; } + } +} diff --git a/windows/Clicky/ViewModels/TrayPanelViewModel.cs b/windows/Clicky/ViewModels/TrayPanelViewModel.cs new file mode 100644 index 00000000..de487a6b --- /dev/null +++ b/windows/Clicky/ViewModels/TrayPanelViewModel.cs @@ -0,0 +1,94 @@ +using CommunityToolkit.Mvvm.ComponentModel; +using CommunityToolkit.Mvvm.Input; +using System.Collections.ObjectModel; + +namespace Clicky.ViewModels; + +/// <summary> +/// View-model for the borderless tray popover. Binds the model picker rows +/// (Claude: Sonnet/Opus, Gemini: Flash/Pro) to <see cref="AppState.SelectedModelId"/> +/// and exposes a Quit command for the app. +/// </summary> +public sealed partial class TrayPanelViewModel : ObservableObject +{ + private readonly AppState _appState; + + public TrayPanelViewModel(AppState appState) + { + _appState = appState; + _appState.PropertyChanged += (_, args) => + { + if (args.PropertyName == nameof(AppState.SelectedModelId)) + { + // Refresh the IsSelected flag on every model option so the + // segmented-control highlight follows the active choice.
+ foreach (var option in ClaudeOptions) option.RefreshSelection(_appState.SelectedModelId); + foreach (var option in GeminiOptions) option.RefreshSelection(_appState.SelectedModelId); + } + }; + + ClaudeOptions = new ObservableCollection<ModelOption> + { + CreateOption("Sonnet", "claude-sonnet-4-6"), + CreateOption("Opus", "claude-opus-4-6"), + }; + + GeminiOptions = new ObservableCollection<ModelOption> + { + CreateOption("Flash", "gemini-2.5-flash"), + CreateOption("Pro", "gemini-2.5-pro"), + }; + } + + public ObservableCollection<ModelOption> ClaudeOptions { get; } + public ObservableCollection<ModelOption> GeminiOptions { get; } + + [RelayCommand] + private void SelectModel(string modelId) + { + if (!string.IsNullOrEmpty(modelId)) + { + _appState.SelectedModelId = modelId; + } + } + + [RelayCommand] + private void Quit() + { + System.Windows.Application.Current.Shutdown(); + } + + private ModelOption CreateOption(string displayLabel, string modelId) + { + var option = new ModelOption(displayLabel, modelId, SelectModelCommand); + option.RefreshSelection(_appState.SelectedModelId); + return option; + } +} + +/// +/// A single button within a model-picker segmented control. Exposes a +/// pre-bound <see cref="SelectCommand"/> so the XAML ItemsControl can wire +/// each button without needing ancestor-lookup gymnastics.
+/// +public sealed partial class ModelOption : ObservableObject +{ + public ModelOption(string displayLabel, string modelId, IRelayCommand selectCommand) + { + DisplayLabel = displayLabel; + ModelId = modelId; + SelectCommand = selectCommand; + } + + public string DisplayLabel { get; } + public string ModelId { get; } + public IRelayCommand SelectCommand { get; } + + [ObservableProperty] + private bool _isSelected; + + public void RefreshSelection(string currentModelId) + { + IsSelected = string.Equals(currentModelId, ModelId, StringComparison.OrdinalIgnoreCase); + } +} diff --git a/windows/Clicky/Views/TrayPanelWindow.xaml b/windows/Clicky/Views/TrayPanelWindow.xaml new file mode 100644 index 00000000..62688636 --- /dev/null +++ b/windows/Clicky/Views/TrayPanelWindow.xaml @@ -0,0 +1,237 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Clicky sees your screen and speaks back. Your mic stays off until you hold the shortcut. + + + + + + + + + + + + + + + + + + + + + + + +