diff --git a/.gitignore b/.gitignore index 832e80a1..c7496055 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,14 @@ worker/node_modules/ worker/.dev.vars +worker/.secrets.local .DS_Store *.xcuserstate build/ releases/ .claude/ coding-plans/ + +# Windows / .NET build output +windows/**/bin/ +windows/**/obj/ +*.user diff --git a/AGENTS.md b/AGENTS.md index 6946d441..f0b695df 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -5,16 +5,24 @@ ## Overview -macOS menu bar companion app. Lives entirely in the macOS status bar (no dock icon, no main window). Clicking the menu bar icon opens a custom floating panel with companion voice controls. Uses push-to-talk (ctrl+option) to capture voice input, transcribes it via AssemblyAI streaming, and sends the transcript + a screenshot of the user's screen to Claude. Claude responds with text (streamed via SSE) and voice (ElevenLabs TTS). A blue cursor overlay can fly to and point at UI elements Claude references on any connected monitor. +Cross-platform companion app. Lives entirely in the OS's system tray / menu bar (no dock icon, no main window). Clicking the tray icon opens a custom floating panel with companion voice controls. Uses push-to-talk (ctrl+option on macOS, ctrl+alt on Windows) to capture voice input, transcribes it via AssemblyAI streaming, and sends the transcript + a screenshot of the user's screen to the active AI provider (Claude or Gemini). The AI responds with text (streamed via SSE) and voice (ElevenLabs TTS). A blue cursor overlay can fly to and point at UI elements the AI references on any connected monitor. -All API keys live on a Cloudflare Worker proxy — nothing sensitive ships in the app. +All API keys live on a Cloudflare Worker proxy — nothing sensitive ships in either app. + +## Repository layout + +| Folder | Purpose | +|--------|---------| +| `leanring-buddy/` + `leanring-buddy.xcodeproj` | macOS menu bar app. Swift + SwiftUI + AppKit. Ships first, most features live here. 
| +| `windows/` | Windows port. C# + WPF on .NET 8. Currently at Milestone 1 (foundation). See `windows/README.md` for milestone progress. | +| `worker/` | Cloudflare Worker proxy. Shared by both apps unchanged — same routes, same secrets. | ## Architecture - **App Type**: Menu bar-only (`LSUIElement=true`), no dock icon or main window - **Framework**: SwiftUI (macOS native) with AppKit bridging for menu bar panel and cursor overlay - **Pattern**: MVVM with `@StateObject` / `@Published` state management -- **AI Chat**: Claude (Sonnet 4.6 default, Opus 4.6 optional) via Cloudflare Worker proxy with SSE streaming +- **AI Chat**: User-selectable provider — Claude (Sonnet 4.6 default, Opus 4.6 optional) or Gemini (2.5 Flash, 2.5 Pro). Both route through the Cloudflare Worker proxy with SSE streaming. - **Speech-to-Text**: AssemblyAI real-time streaming (`u3-rt-pro` model) via websocket, with OpenAI and Apple Speech as fallbacks - **Text-to-Speech**: ElevenLabs (`eleven_flash_v2_5` model) via Cloudflare Worker proxy - **Screen Capture**: ScreenCaptureKit (macOS 14.2+), multi-monitor support @@ -30,10 +38,11 @@ The app never calls external APIs directly. All requests go through a Cloudflare | Route | Upstream | Purpose | |-------|----------|---------| | `POST /chat` | `api.anthropic.com/v1/messages` | Claude vision + streaming chat | +| `POST /chat-gemini` | `generativelanguage.googleapis.com/v1beta/models/{model}:streamGenerateContent` | Gemini vision + streaming chat. The `model` field in the request body is used to build the upstream URL path. 
| | `POST /tts` | `api.elevenlabs.io/v1/text-to-speech/{voiceId}` | ElevenLabs TTS audio | | `POST /transcribe-token` | `streaming.assemblyai.com/v3/token` | Fetches a short-lived (480s) AssemblyAI websocket token | -Worker secrets: `ANTHROPIC_API_KEY`, `ASSEMBLYAI_API_KEY`, `ELEVENLABS_API_KEY` +Worker secrets: `ANTHROPIC_API_KEY`, `GEMINI_API_KEY`, `ASSEMBLYAI_API_KEY`, `ELEVENLABS_API_KEY` Worker vars: `ELEVENLABS_VOICE_ID` ### Key Architecture Decisions @@ -67,6 +76,7 @@ Worker vars: `ELEVENLABS_VOICE_ID` | `BuddyAudioConversionSupport.swift` | ~108 | Audio conversion helpers. Converts live mic buffers to PCM16 mono audio and builds WAV payloads for upload-based providers. | | `GlobalPushToTalkShortcutMonitor.swift` | ~132 | System-wide push-to-talk monitor. Owns the listen-only `CGEvent` tap and publishes press/release transitions. | | `ClaudeAPI.swift` | ~291 | Claude vision API client with streaming (SSE) and non-streaming modes. TLS warmup optimization, image MIME detection, conversation history support. | +| `GeminiAPI.swift` | ~240 | Google Gemini vision API client. Mirrors `ClaudeAPI`'s public streaming signature so `CompanionManager` can swap providers transparently. Translates the Gemini-specific request shape (`contents`/`parts`/`inline_data`, `systemInstruction`, `role: "model"`) and parses Gemini's SSE events. Routes through the Worker `/chat-gemini` route — the model ID travels in the body and the Worker plugs it into the upstream URL path. | | `OpenAIAPI.swift` | ~142 | OpenAI GPT vision API client. | | `ElevenLabsTTSClient.swift` | ~81 | ElevenLabs TTS client. Sends text to the Worker proxy, plays back audio via `AVAudioPlayer`. Exposes `isPlaying` for transient cursor scheduling. | | `ElementLocationDetector.swift` | ~335 | Detects UI element locations in screenshots for cursor pointing. 
| @@ -98,6 +108,7 @@ npm install # Add secrets npx wrangler secret put ANTHROPIC_API_KEY +npx wrangler secret put GEMINI_API_KEY npx wrangler secret put ASSEMBLYAI_API_KEY npx wrangler secret put ELEVENLABS_API_KEY diff --git a/leanring-buddy/CompanionManager.swift b/leanring-buddy/CompanionManager.swift index 0234cf19..ccdb0eda 100644 --- a/leanring-buddy/CompanionManager.swift +++ b/leanring-buddy/CompanionManager.swift @@ -73,9 +73,27 @@ final class CompanionManager: ObservableObject { private static let workerBaseURL = "https://your-worker-name.your-subdomain.workers.dev" private lazy var claudeAPI: ClaudeAPI = { - return ClaudeAPI(proxyURL: "\(Self.workerBaseURL)/chat", model: selectedModel) + // Default to Sonnet when the current selection is a Gemini model so + // the Claude client ships with a valid Anthropic model ID even when + // Gemini is the active provider. + let initialClaudeModel = Self.isGeminiModelID(selectedModel) ? "claude-sonnet-4-6" : selectedModel + return ClaudeAPI(proxyURL: "\(Self.workerBaseURL)/chat", model: initialClaudeModel) }() + private lazy var geminiAPI: GeminiAPI = { + // Default to Flash when the current selection is a Claude model so + // the Gemini client ships with a valid Gemini model ID even when + // Claude is the active provider. + let initialGeminiModel = Self.isGeminiModelID(selectedModel) ? selectedModel : "gemini-2.5-flash" + return GeminiAPI(proxyURL: "\(Self.workerBaseURL)/chat-gemini", model: initialGeminiModel) + }() + + /// Returns true when the given model ID belongs to the Gemini provider. + /// Used throughout the manager to route requests to the correct client. 
+ static func isGeminiModelID(_ modelID: String) -> Bool { + return modelID.hasPrefix("gemini") + } + private lazy var elevenLabsTTSClient: ElevenLabsTTSClient = { return ElevenLabsTTSClient(proxyURL: "\(Self.workerBaseURL)/tts") }() @@ -107,13 +125,23 @@ final class CompanionManager: ObservableObject { /// Used by the panel to show accurate status text ("Active" vs "Ready"). @Published private(set) var isOverlayVisible: Bool = false - /// The Claude model used for voice responses. Persisted to UserDefaults. + /// The model used for voice responses. May be a Claude ID (e.g. "claude-sonnet-4-6") + /// or a Gemini ID (e.g. "gemini-2.5-flash"). Persisted to UserDefaults. + /// The UserDefaults key is still "selectedClaudeModel" for backwards compatibility + /// with installs from before Gemini support existed. @Published var selectedModel: String = UserDefaults.standard.string(forKey: "selectedClaudeModel") ?? "claude-sonnet-4-6" func setSelectedModel(_ model: String) { selectedModel = model UserDefaults.standard.set(model, forKey: "selectedClaudeModel") - claudeAPI.model = model + // Route the new model ID to whichever provider owns it. We leave the + // other provider's model untouched — the next time the user flips back, + // that provider still remembers its previously-selected model. + if Self.isGeminiModelID(model) { + geminiAPI.model = model + } else { + claudeAPI.model = model + } } /// User preference for whether the Clicky cursor should be shown. @@ -179,9 +207,13 @@ final class CompanionManager: ObservableObject { bindVoiceStateObservation() bindAudioPowerLevel() bindShortcutTransitions() - // Eagerly touch the Claude API so its TLS warmup handshake completes - // well before the onboarding demo fires at ~40s into the video. - _ = claudeAPI + // Eagerly touch the active AI provider so its TLS warmup handshake + // completes well before the onboarding demo fires at ~40s into the video. 
+ if Self.isGeminiModelID(selectedModel) { + _ = geminiAPI + } else { + _ = claudeAPI + } // If the user already completed onboarding AND all permissions are // still granted, show the cursor overlay immediately. If permissions @@ -578,11 +610,40 @@ final class CompanionManager: ObservableObject { // MARK: - AI Response Pipeline - /// Captures a screenshot, sends it along with the transcript to Claude, - /// and plays the response aloud via ElevenLabs TTS. The cursor stays in - /// the spinner/processing state until TTS audio begins playing. - /// Claude's response may include a [POINT:x,y:label] tag which triggers - /// the buddy to fly to that element on screen. + /// Dispatches a streaming vision request to whichever provider owns the + /// currently selected model. Both Claude and Gemini expose an identical + /// streaming signature, so call sites don't need to care which one runs. + private func runStreamingVisionRequest( + images: [(data: Data, label: String)], + systemPrompt: String, + conversationHistory: [(userPlaceholder: String, assistantResponse: String)], + userPrompt: String, + onTextChunk: @MainActor @Sendable (String) -> Void + ) async throws -> (text: String, duration: TimeInterval) { + if Self.isGeminiModelID(selectedModel) { + return try await geminiAPI.analyzeImageStreaming( + images: images, + systemPrompt: systemPrompt, + conversationHistory: conversationHistory, + userPrompt: userPrompt, + onTextChunk: onTextChunk + ) + } else { + return try await claudeAPI.analyzeImageStreaming( + images: images, + systemPrompt: systemPrompt, + conversationHistory: conversationHistory, + userPrompt: userPrompt, + onTextChunk: onTextChunk + ) + } + } + + /// Captures a screenshot, sends it along with the transcript to the + /// selected AI provider (Claude or Gemini), and plays the response aloud + /// via ElevenLabs TTS. The cursor stays in the spinner/processing state + /// until TTS audio begins playing. 
The response may include a + /// [POINT:x,y:label] tag which triggers the buddy to fly to that element. private func sendTranscriptToClaudeWithScreenshot(transcript: String) { currentResponseTask?.cancel() elevenLabsTTSClient.stopPlayback() @@ -610,7 +671,7 @@ final class CompanionManager: ObservableObject { (userPlaceholder: entry.userTranscript, assistantResponse: entry.assistantResponse) } - let (fullResponseText, _) = try await claudeAPI.analyzeImageStreaming( + let (fullResponseText, _) = try await runStreamingVisionRequest( images: labeledImages, systemPrompt: Self.companionVoiceResponseSystemPrompt, conversationHistory: historyForAPI, @@ -982,9 +1043,10 @@ final class CompanionManager: ObservableObject { let dimensionInfo = " (image dimensions: \(cursorScreenCapture.screenshotWidthInPixels)x\(cursorScreenCapture.screenshotHeightInPixels) pixels)" let labeledImages = [(data: cursorScreenCapture.imageData, label: cursorScreenCapture.label + dimensionInfo)] - let (fullResponseText, _) = try await claudeAPI.analyzeImageStreaming( + let (fullResponseText, _) = try await runStreamingVisionRequest( images: labeledImages, systemPrompt: Self.onboardingDemoSystemPrompt, + conversationHistory: [], userPrompt: "look around my screen and find something interesting to point at", onTextChunk: { _ in } ) diff --git a/leanring-buddy/CompanionPanelView.swift b/leanring-buddy/CompanionPanelView.swift index 76789b4c..d1d54c4d 100644 --- a/leanring-buddy/CompanionPanelView.swift +++ b/leanring-buddy/CompanionPanelView.swift @@ -599,16 +599,42 @@ struct CompanionPanelView: View { // MARK: - Model Picker private var modelPickerRow: some View { + // Two provider rows stacked vertically — Claude and Gemini. Four buttons + // in a single row would be too cramped in the menu bar panel width. 
+ VStack(alignment: .leading, spacing: 8) { + modelProviderRow( + providerLabel: "Claude", + options: [ + (displayLabel: "Sonnet", modelID: "claude-sonnet-4-6"), + (displayLabel: "Opus", modelID: "claude-opus-4-6") + ] + ) + modelProviderRow( + providerLabel: "Gemini", + options: [ + (displayLabel: "Flash", modelID: "gemini-2.5-flash"), + (displayLabel: "Pro", modelID: "gemini-2.5-pro") + ] + ) + } + .padding(.vertical, 4) + } + + private func modelProviderRow( + providerLabel: String, + options: [(displayLabel: String, modelID: String)] + ) -> some View { HStack { - Text("Model") + Text(providerLabel) .font(.system(size: 13, weight: .medium)) .foregroundColor(DS.Colors.textSecondary) Spacer() HStack(spacing: 0) { - modelOptionButton(label: "Sonnet", modelID: "claude-sonnet-4-6") - modelOptionButton(label: "Opus", modelID: "claude-opus-4-6") + ForEach(options, id: \.modelID) { option in + modelOptionButton(label: option.displayLabel, modelID: option.modelID) + } } .background( RoundedRectangle(cornerRadius: 6, style: .continuous) @@ -619,7 +645,6 @@ struct CompanionPanelView: View { .stroke(DS.Colors.borderSubtle, lineWidth: 0.5) ) } - .padding(.vertical, 4) } private func modelOptionButton(label: String, modelID: String) -> some View { diff --git a/leanring-buddy/GeminiAPI.swift b/leanring-buddy/GeminiAPI.swift new file mode 100644 index 00000000..1dc5e92f --- /dev/null +++ b/leanring-buddy/GeminiAPI.swift @@ -0,0 +1,273 @@ +// +// GeminiAPI.swift +// Google Gemini API Implementation with streaming support +// +// Mirrors ClaudeAPI's public interface so CompanionManager can route to +// either provider without the caller caring which one is active. The +// request/response translation layer is Gemini-specific (different field +// names, different SSE event shape, different role vocabulary). +// + +import Foundation + +/// Gemini API helper with streaming for progressive text display. 
+/// Routes through the Cloudflare Worker proxy so the Gemini API key never +/// ships in the app. +class GeminiAPI { + private static let tlsWarmupLock = NSLock() + private static var hasStartedTLSWarmup = false + + private let apiURL: URL + var model: String + private let session: URLSession + + init(proxyURL: String, model: String = "gemini-2.5-flash") { + self.apiURL = URL(string: proxyURL)! + self.model = model + + // Use .default instead of .ephemeral so TLS session tickets are cached. + // Ephemeral sessions do a full TLS handshake on every request, which causes + // transient -1200 (errSSLPeerHandshakeFail) errors with large image payloads. + // Disable URL/cookie caching to avoid storing responses or credentials on disk. + let config = URLSessionConfiguration.default + config.timeoutIntervalForRequest = 120 + config.timeoutIntervalForResource = 300 + config.waitsForConnectivity = true + config.urlCache = nil + config.httpCookieStorage = nil + self.session = URLSession(configuration: config) + + // Fire a lightweight HEAD request in the background to pre-establish the TLS + // connection. This caches the TLS session ticket so the first real API call + // (which carries a large image payload) doesn't need a cold TLS handshake. + warmUpTLSConnectionIfNeeded() + } + + private func makeAPIRequest() -> URLRequest { + var request = URLRequest(url: apiURL) + request.httpMethod = "POST" + request.timeoutInterval = 120 + request.setValue("application/json", forHTTPHeaderField: "Content-Type") + return request + } + + /// Detects the MIME type of image data by inspecting the first bytes. + /// Screen captures from ScreenCaptureKit are JPEG, but pasted images from the + /// clipboard are PNG. Gemini rejects requests where the declared mime_type + /// doesn't match the actual image format. 
+ private func detectImageMediaType(for imageData: Data) -> String { + // PNG files start with the 8-byte signature: 89 50 4E 47 0D 0A 1A 0A + if imageData.count >= 4 { + let pngSignature: [UInt8] = [0x89, 0x50, 0x4E, 0x47] + let firstFourBytes = [UInt8](imageData.prefix(4)) + if firstFourBytes == pngSignature { + return "image/png" + } + } + // Default to JPEG — screen captures use JPEG compression + return "image/jpeg" + } + + /// Sends a no-op HEAD request to the Worker to establish and cache a TLS session. + /// Failures are silently ignored — this is purely an optimization. + private func warmUpTLSConnectionIfNeeded() { + Self.tlsWarmupLock.lock() + let shouldStartTLSWarmup = !Self.hasStartedTLSWarmup + if shouldStartTLSWarmup { + Self.hasStartedTLSWarmup = true + } + Self.tlsWarmupLock.unlock() + + guard shouldStartTLSWarmup else { return } + + guard var warmupURLComponents = URLComponents(url: apiURL, resolvingAgainstBaseURL: false) else { + return + } + + warmupURLComponents.path = "/" + warmupURLComponents.query = nil + warmupURLComponents.fragment = nil + + guard let warmupURL = warmupURLComponents.url else { + return + } + + var warmupRequest = URLRequest(url: warmupURL) + warmupRequest.httpMethod = "HEAD" + warmupRequest.timeoutInterval = 10 + session.dataTask(with: warmupRequest) { _, _, _ in + // Response doesn't matter — the TLS handshake is the goal + }.resume() + } + + /// Builds the Gemini-shaped request body for a vision + streaming call. + /// Gemini uses `contents` with `parts` (text + inline_data), a separate + /// `systemInstruction` field, and "model" as the assistant role. 
+ private func buildGeminiRequestBody( + images: [(data: Data, label: String)], + systemPrompt: String, + conversationHistory: [(userPlaceholder: String, assistantResponse: String)], + userPrompt: String, + maxOutputTokens: Int + ) -> [String: Any] { + var contents: [[String: Any]] = [] + + for (userPlaceholder, assistantResponse) in conversationHistory { + contents.append([ + "role": "user", + "parts": [["text": userPlaceholder]] + ]) + contents.append([ + "role": "model", + "parts": [["text": assistantResponse]] + ]) + } + + // Build current turn with all labeled images + prompt + var currentTurnParts: [[String: Any]] = [] + for image in images { + currentTurnParts.append([ + "inline_data": [ + "mime_type": detectImageMediaType(for: image.data), + "data": image.data.base64EncodedString() + ] + ]) + currentTurnParts.append([ + "text": image.label + ]) + } + currentTurnParts.append([ + "text": userPrompt + ]) + contents.append([ + "role": "user", + "parts": currentTurnParts + ]) + + // `model` is forwarded to the Worker, which pulls it out and plugs it + // into the upstream Gemini URL path — Gemini itself doesn't read it. + return [ + "model": model, + "systemInstruction": [ + "parts": [["text": systemPrompt]] + ], + "contents": contents, + "generationConfig": [ + "maxOutputTokens": maxOutputTokens + ] + ] + } + + /// Send a vision request to Gemini with streaming. + /// Calls `onTextChunk` on the main actor each time new text arrives so the UI updates progressively. + /// Returns the full accumulated text and total duration when the stream completes. 
+ func analyzeImageStreaming( + images: [(data: Data, label: String)], + systemPrompt: String, + conversationHistory: [(userPlaceholder: String, assistantResponse: String)] = [], + userPrompt: String, + onTextChunk: @MainActor @Sendable (String) -> Void + ) async throws -> (text: String, duration: TimeInterval) { + let startTime = Date() + + var request = makeAPIRequest() + + let body = buildGeminiRequestBody( + images: images, + systemPrompt: systemPrompt, + conversationHistory: conversationHistory, + userPrompt: userPrompt, + maxOutputTokens: 1024 + ) + + let bodyData = try JSONSerialization.data(withJSONObject: body) + request.httpBody = bodyData + let payloadMB = Double(bodyData.count) / 1_048_576.0 + print("🌐 Gemini streaming request (\(model)): \(String(format: "%.1f", payloadMB))MB, \(images.count) image(s)") + + // Use bytes streaming for SSE (Server-Sent Events) + let (byteStream, response) = try await session.bytes(for: request) + + guard let httpResponse = response as? HTTPURLResponse else { + throw NSError( + domain: "GeminiAPI", + code: -1, + userInfo: [NSLocalizedDescriptionKey: "Invalid HTTP response"] + ) + } + + // If non-2xx status, read the full body as error text + guard (200...299).contains(httpResponse.statusCode) else { + var errorBodyChunks: [String] = [] + for try await line in byteStream.lines { + errorBodyChunks.append(line) + } + let errorBody = errorBodyChunks.joined(separator: "\n") + throw NSError( + domain: "GeminiAPI", + code: httpResponse.statusCode, + userInfo: [NSLocalizedDescriptionKey: "API Error (\(httpResponse.statusCode)): \(errorBody)"] + ) + } + + // Parse SSE stream — each event is "data: {json}\n\n". 
+ // Gemini sends one event per chunk with shape: + // { "candidates": [ { "content": { "parts": [ {"text": "..."} ], "role": "model" } } ] } + var accumulatedResponseText = "" + + for try await line in byteStream.lines { + guard line.hasPrefix("data: ") else { continue } + let jsonString = String(line.dropFirst(6)) + + // Gemini doesn't send an explicit [DONE] marker, but handle it defensively + guard jsonString != "[DONE]" else { break } + + guard let jsonData = jsonString.data(using: .utf8), + let eventPayload = try? JSONSerialization.jsonObject(with: jsonData) as? [String: Any] else { + continue + } + + // Extract text from candidates[0].content.parts[*].text + guard let candidates = eventPayload["candidates"] as? [[String: Any]], + let firstCandidate = candidates.first, + let content = firstCandidate["content"] as? [String: Any], + let parts = content["parts"] as? [[String: Any]] else { + continue + } + + var chunkText = "" + for part in parts { + if let partText = part["text"] as? String { + chunkText += partText + } + } + + if !chunkText.isEmpty { + accumulatedResponseText += chunkText + let currentAccumulatedText = accumulatedResponseText + await onTextChunk(currentAccumulatedText) + } + } + + let duration = Date().timeIntervalSince(startTime) + return (text: accumulatedResponseText, duration: duration) + } + + /// Non-streaming fallback for validation requests where we don't need progressive display. + /// Uses the same streaming endpoint internally — Gemini returns the full result via SSE + /// and we simply accumulate it before returning. This keeps the Worker route surface small. 
+ func analyzeImage( + images: [(data: Data, label: String)], + systemPrompt: String, + conversationHistory: [(userPlaceholder: String, assistantResponse: String)] = [], + userPrompt: String + ) async throws -> (text: String, duration: TimeInterval) { + return try await analyzeImageStreaming( + images: images, + systemPrompt: systemPrompt, + conversationHistory: conversationHistory, + userPrompt: userPrompt, + onTextChunk: { _ in } + ) + } +} diff --git a/windows/Clicky.sln b/windows/Clicky.sln new file mode 100644 index 00000000..09a641ec --- /dev/null +++ b/windows/Clicky.sln @@ -0,0 +1,21 @@ +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.9.34622.214 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Clicky", "Clicky\Clicky.csproj", "{B1F9B6C0-5B7E-4D3A-8E4D-1A2B3C4D5E6F}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {B1F9B6C0-5B7E-4D3A-8E4D-1A2B3C4D5E6F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {B1F9B6C0-5B7E-4D3A-8E4D-1A2B3C4D5E6F}.Debug|Any CPU.Build.0 = Debug|Any CPU + {B1F9B6C0-5B7E-4D3A-8E4D-1A2B3C4D5E6F}.Release|Any CPU.ActiveCfg = Release|Any CPU + {B1F9B6C0-5B7E-4D3A-8E4D-1A2B3C4D5E6F}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/windows/Clicky/App.xaml b/windows/Clicky/App.xaml new file mode 100644 index 00000000..2be35400 --- /dev/null +++ b/windows/Clicky/App.xaml @@ -0,0 +1,17 @@ + + + + + + + + + + diff --git a/windows/Clicky/App.xaml.cs b/windows/Clicky/App.xaml.cs new file mode 100644 index 00000000..f2f07460 --- /dev/null +++ b/windows/Clicky/App.xaml.cs @@ -0,0 +1,251 @@ +using System.IO; +using System.Windows; 
+using System.Windows.Media; +using System.Windows.Media.Imaging; +using H.NotifyIcon; +using Clicky.Interop; +using Clicky.Services; +using Clicky.ViewModels; +using Clicky.Views; + +namespace Clicky; + +/// +/// WPF application entry. Boots the tray icon, wires the popover panel, +/// installs the global push-to-talk hotkey, and holds the root AppState +/// for the app's lifetime. +/// +/// This is the Windows analog of the macOS CompanionAppDelegate + +/// MenuBarPanelManager combination (leanring_buddyApp.swift + MenuBarPanelManager.swift). +/// +public partial class App : Application +{ + // Keep singletons alive for the app's lifetime. No DI container in M1 — + // the dependency graph is small enough to thread manually. + private Mutex? _singleInstanceMutex; + private SettingsService? _settingsService; + private AppState? _appState; + private GlobalHotkeyService? _globalHotkeyService; + private TaskbarIcon? _trayIcon; + private TrayPanelWindow? _trayPanelWindow; + private TrayPanelViewModel? _trayPanelViewModel; + private VoicePipelineOrchestrator? _voicePipelineOrchestrator; + private OverlayWindowManager? _overlayWindowManager; + + protected override void OnStartup(StartupEventArgs eventArgs) + { + base.OnStartup(eventArgs); + + if (!TryAcquireSingleInstanceMutex()) + { + // Another instance is already running. Exit quietly — no error + // dialog, so double-clicks from the Start menu are benign. + Shutdown(); + return; + } + + _settingsService = new SettingsService(); + _appState = new AppState(_settingsService); + _trayPanelViewModel = new TrayPanelViewModel(_appState); + _trayPanelWindow = new TrayPanelWindow(_trayPanelViewModel); + + // PostHog setup — idempotent, silent no-op until the write key in + // WorkerConfig.cs is replaced with a real project key. Fires + // app_opened on success. 
+ ClickyAnalytics.Configure(_settingsService.AnalyticsDistinctId); + + InstallTrayIcon(); + InstallGlobalHotkey(); + + // The overlay windows are created after the tray is up so nothing + // flashes in an uninitialized state. Transparent + click-through, so + // their presence is invisible to the desktop beneath. The voice + // pipeline takes a reference so the [POINT:…] tag on each reply can + // fire an element-pointing flight before TTS speaks the text. + _overlayWindowManager = new OverlayWindowManager(_appState, Dispatcher); + _overlayWindowManager.Start(); + + _voicePipelineOrchestrator = new VoicePipelineOrchestrator(_appState, Dispatcher, _overlayWindowManager); + + // First-run onboarding: if the user hasn't completed it, auto-open + // the panel on a centered position so the very first launch shows + // the welcome copy instead of a silent tray icon. Also probe the + // microphone so a disabled capture endpoint is surfaced before the + // first push-to-talk attempt. + ProbeMicrophoneAvailabilityAndUpdateState(); + if (!_appState.HasCompletedOnboarding) + { + _trayPanelWindow.ShowPanelCenteredOnPrimaryScreen(); + ClickyAnalytics.TrackOnboardingStarted(); + } + } + + private void ProbeMicrophoneAvailabilityAndUpdateState() + { + if (_appState is null) return; + var hasMic = MicrophonePermissionHelper.HasActiveCaptureDevice(); + _appState.IsMicrophonePermissionIssue = !hasMic; + if (!hasMic) + { + _appState.LastStatusMessage = + "Microphone appears to be off or blocked. Open Windows privacy settings to enable it."; + ClickyAnalytics.TrackPermissionDenied("microphone"); + } + else + { + ClickyAnalytics.TrackPermissionGranted("microphone"); + } + } + + protected override void OnExit(ExitEventArgs eventArgs) + { + _overlayWindowManager?.Dispose(); + _globalHotkeyService?.Dispose(); + _trayIcon?.Dispose(); + // Orchestrator owns mic/websocket/TTS — dispose synchronously so + // their background threads are joined before the process exits. 
+ if (_voicePipelineOrchestrator is not null) + { + _voicePipelineOrchestrator.DisposeAsync().AsTask().GetAwaiter().GetResult(); + } + _singleInstanceMutex?.ReleaseMutex(); + _singleInstanceMutex?.Dispose(); + base.OnExit(eventArgs); + } + + private bool TryAcquireSingleInstanceMutex() + { + // Per-user mutex — two different users on the same machine can each + // run their own Clicky instance without colliding. + var mutexName = $"Local\\Clicky.SingleInstance.{Environment.UserName}"; + _singleInstanceMutex = new Mutex(initiallyOwned: true, name: mutexName, createdNew: out var createdNew); + return createdNew; + } + + private void InstallTrayIcon() + { + // H.NotifyIcon's IconSource (ImageSource) path can't reliably consume + // a programmatically-rendered bitmap (it tries to round-trip via a + // BitmapImage.UriSource it never has). The Icon property accepts a + // System.Drawing.Icon directly and bypasses that whole conversion, + // so we generate or load a real Win32 icon instead. + _trayIcon = new TaskbarIcon + { + ToolTipText = "Clicky - hold Ctrl+Alt to talk", + Icon = LoadTrayIcon(), + // No built-in context menu - left- and right-click both open the + // custom popover. Quit lives inside the panel. + NoLeftClickDelay = true, + }; + + _trayIcon.TrayLeftMouseUp += (_, _) => ToggleTrayPanel(); + _trayIcon.TrayRightMouseUp += (_, _) => ToggleTrayPanel(); + + _trayIcon.ForceCreate(); + } + + private void InstallGlobalHotkey() + { + _globalHotkeyService = new GlobalHotkeyService(); + _globalHotkeyService.ShortcutPressed += OnPushToTalkPressed; + _globalHotkeyService.ShortcutReleased += OnPushToTalkReleased; + _globalHotkeyService.Start(); + } + + private void OnPushToTalkPressed(object? sender, EventArgs eventArgs) + { + // Panel shouldn't stay visible while the user is talking to the + // app — dismiss it if it happens to be open. 
+ Dispatcher.BeginInvoke(() => _trayPanelWindow?.HidePanel()); + + // The orchestrator owns the state transitions (Listening / Processing + // / Responding / Idle) from here. Swallow exceptions — the + // orchestrator reports them via AppState.LastStatusMessage. + _ = _voicePipelineOrchestrator?.HandlePushToTalkPressedAsync(); + } + + private void OnPushToTalkReleased(object? sender, EventArgs eventArgs) + { + _ = _voicePipelineOrchestrator?.HandlePushToTalkReleasedAsync(); + } + + private void ToggleTrayPanel() + { + if (_trayPanelWindow is null) return; + + if (_trayPanelWindow.IsVisible) + { + _trayPanelWindow.HidePanel(); + return; + } + + NativeMethods.GetCursorPos(out var cursorPositionDevicePixels); + _trayPanelWindow.ShowNearTrayCursor( + cursorPositionDevicePixels.X, + cursorPositionDevicePixels.Y); + } + + /// + /// Loads the tray icon from the bundled resource. Falls back to a + /// generated blue-dot placeholder if the resource is missing so the app + /// is runnable before an artist drops a real .ico in. + /// + private static System.Drawing.Icon LoadTrayIcon() + { + try + { + var packIconUri = new Uri("pack://application:,,,/Resources/clicky-tray.ico", UriKind.Absolute); + var packResource = GetResourceStream(packIconUri); + if (packResource?.Stream is not null) + { + using var iconStream = packResource.Stream; + return new System.Drawing.Icon(iconStream); + } + } + catch + { + // Fall through to the generated placeholder. + } + + return CreatePlaceholderBlueDotIcon(); + } + + /// + /// Builds a 32x32 transparent-background blue dot Icon using GDI so + /// H.NotifyIcon can take it directly. Used when no real clicky-tray.ico + /// resource has been bundled. 
+ /// + private static System.Drawing.Icon CreatePlaceholderBlueDotIcon() + { + const int iconPixelSize = 32; + const int iconPadding = 6; + + using var bitmap = new System.Drawing.Bitmap( + iconPixelSize, + iconPixelSize, + System.Drawing.Imaging.PixelFormat.Format32bppArgb); + + using (var graphics = System.Drawing.Graphics.FromImage(bitmap)) + { + graphics.SmoothingMode = System.Drawing.Drawing2D.SmoothingMode.AntiAlias; + graphics.Clear(System.Drawing.Color.Transparent); + + using var overlayCursorBlueBrush = new System.Drawing.SolidBrush( + System.Drawing.Color.FromArgb(0xFF, 0x33, 0x80, 0xFF)); + + graphics.FillEllipse( + overlayCursorBlueBrush, + iconPadding, + iconPadding, + iconPixelSize - (iconPadding * 2), + iconPixelSize - (iconPadding * 2)); + } + + // GetHicon hands ownership of the HICON to us; FromHandle doesn't take + // ownership, so we'd normally have to clean it up. The TaskbarIcon + // keeps this Icon for the lifetime of the app, so the leak is bounded + // to a single 32x32 cursor handle. + var hIcon = bitmap.GetHicon(); + return (System.Drawing.Icon)System.Drawing.Icon.FromHandle(hIcon).Clone(); + } +} diff --git a/windows/Clicky/AppState.cs b/windows/Clicky/AppState.cs new file mode 100644 index 00000000..b7e58329 --- /dev/null +++ b/windows/Clicky/AppState.cs @@ -0,0 +1,103 @@ +using CommunityToolkit.Mvvm.ComponentModel; +using Clicky.Services; + +namespace Clicky; + +/// +/// Root observable state for the entire Windows app. The C# analog of the +/// macOS CompanionManager (leanring-buddy/CompanionManager.swift). Milestone 1 +/// holds only the persisted preferences and the voice-state enum; later +/// milestones attach the screen-capture, dictation, and AI-chat services. 
+/// +public sealed partial class AppState : ObservableObject +{ + private readonly SettingsService _settingsService; + + public AppState(SettingsService settingsService) + { + _settingsService = settingsService; + _selectedModelId = settingsService.SelectedModelId; + _isClickyCursorEnabled = settingsService.IsClickyCursorEnabled; + _hasCompletedOnboarding = settingsService.HasCompletedOnboarding; + } + + // ---- Voice pipeline state (populated by later milestones) ---- + + public enum VoiceState + { + Idle, + Listening, + Processing, + Responding, + } + + [ObservableProperty] + private VoiceState _currentVoiceState = VoiceState.Idle; + + /// + /// Live-updating transcript while the user holds push-to-talk. Shows + /// partials as they arrive from AssemblyAI and the finalized text once + /// the shortcut releases. Cleared at the start of each session. + /// + [ObservableProperty] + private string _liveTranscript = string.Empty; + + /// + /// Streaming response text from the active AI provider. Appended to + /// as SSE chunks arrive so the panel can show the answer forming in + /// real time. + /// + [ObservableProperty] + private string _streamedResponseText = string.Empty; + + /// + /// Latest error/status message surfaced from any pipeline component. + /// The panel shows it in the tertiary footer row when present. + /// + [ObservableProperty] + private string _lastStatusMessage = string.Empty; + + /// + /// Set when microphone access is blocked or unavailable. The tray panel + /// shows a "Open privacy settings" shortcut when this is true so the + /// user can fix the permission in one click. 
+ /// + [ObservableProperty] + private bool _isMicrophonePermissionIssue; + + // ---- Persisted preferences ---- + + [ObservableProperty] + private string _selectedModelId; + + partial void OnSelectedModelIdChanged(string value) + { + _settingsService.SelectedModelId = value; + } + + [ObservableProperty] + private bool _isClickyCursorEnabled; + + partial void OnIsClickyCursorEnabledChanged(bool value) + { + _settingsService.IsClickyCursorEnabled = value; + } + + [ObservableProperty] + private bool _hasCompletedOnboarding; + + partial void OnHasCompletedOnboardingChanged(bool value) + { + _settingsService.HasCompletedOnboarding = value; + } + + // ---- Model routing helpers (mirror CompanionManager.isGeminiModelID) ---- + + /// + /// Returns true when the given model ID belongs to the Gemini provider. + /// Used by later milestones to route vision requests to the right client. + /// + public static bool IsGeminiModelId(string modelId) => modelId.StartsWith("gemini", StringComparison.OrdinalIgnoreCase); + + public bool IsCurrentModelGemini => IsGeminiModelId(SelectedModelId); +} diff --git a/windows/Clicky/Clicky.csproj b/windows/Clicky/Clicky.csproj new file mode 100644 index 00000000..193648fd --- /dev/null +++ b/windows/Clicky/Clicky.csproj @@ -0,0 +1,30 @@ + + + + WinExe + + net8.0-windows10.0.19041.0 + 10.0.17763.0 + Clicky + Clicky + enable + enable + true + app.manifest + + PerMonitorV2 + + + + + + + + + + + + diff --git a/windows/Clicky/Interop/NativeMethods.cs b/windows/Clicky/Interop/NativeMethods.cs new file mode 100644 index 00000000..4970cb3c --- /dev/null +++ b/windows/Clicky/Interop/NativeMethods.cs @@ -0,0 +1,230 @@ +using System.Runtime.InteropServices; +using System.Windows; + +namespace Clicky.Interop; + +/// +/// Win32 P/Invoke surface. Grouped here so the rest of the app can stay +/// managed-code-only. Each method is documented with the underlying Win32 +/// function it wraps. 
+/// +internal static class NativeMethods +{ + // ---- Extended window style bits used by the panel + overlay ---- + public const int GWL_EXSTYLE = -20; + public const int WS_EX_TRANSPARENT = 0x00000020; + public const int WS_EX_TOOLWINDOW = 0x00000080; + public const int WS_EX_LAYERED = 0x00080000; + public const int WS_EX_NOACTIVATE = 0x08000000; + + // ---- SetWindowPos flags (used for non-activating positioning) ---- + public static readonly IntPtr HWND_TOPMOST = new(-1); + public const uint SWP_NOSIZE = 0x0001; + public const uint SWP_NOMOVE = 0x0002; + public const uint SWP_NOACTIVATE = 0x0010; + public const uint SWP_SHOWWINDOW = 0x0040; + + // ---- AppBar query for the Windows taskbar bounds ---- + public const uint ABM_GETTASKBARPOS = 0x00000005; + + [DllImport("user32.dll", SetLastError = true)] + [return: MarshalAs(UnmanagedType.Bool)] + public static extern bool SetWindowPos( + IntPtr hWnd, + IntPtr hWndInsertAfter, + int X, + int Y, + int cx, + int cy, + uint uFlags); + + // 32-bit and 64-bit variants of GetWindowLong / SetWindowLong. The correct + // one is selected at runtime by GetExtendedStyle / SetExtendedStyle below. + [DllImport("user32.dll", EntryPoint = "GetWindowLong")] + private static extern int GetWindowLong32(IntPtr hWnd, int nIndex); + + [DllImport("user32.dll", EntryPoint = "GetWindowLongPtr")] + private static extern IntPtr GetWindowLongPtr64(IntPtr hWnd, int nIndex); + + [DllImport("user32.dll", EntryPoint = "SetWindowLong")] + private static extern int SetWindowLong32(IntPtr hWnd, int nIndex, int dwNewLong); + + [DllImport("user32.dll", EntryPoint = "SetWindowLongPtr")] + private static extern IntPtr SetWindowLongPtr64(IntPtr hWnd, int nIndex, IntPtr dwNewLong); + + public static int GetExtendedStyle(IntPtr hWnd) + { + return IntPtr.Size == 8 + ? 
(int)GetWindowLongPtr64(hWnd, GWL_EXSTYLE) + : GetWindowLong32(hWnd, GWL_EXSTYLE); + } + + public static void SetExtendedStyle(IntPtr hWnd, int newStyle) + { + if (IntPtr.Size == 8) + { + SetWindowLongPtr64(hWnd, GWL_EXSTYLE, new IntPtr(newStyle)); + } + else + { + SetWindowLong32(hWnd, GWL_EXSTYLE, newStyle); + } + } + + // ---- Taskbar position (used to anchor the panel near the tray icon) ---- + + [StructLayout(LayoutKind.Sequential)] + public struct RECT + { + public int Left; + public int Top; + public int Right; + public int Bottom; + + public int Width => Right - Left; + public int Height => Bottom - Top; + } + + [StructLayout(LayoutKind.Sequential)] + public struct APPBARDATA + { + public uint cbSize; + public IntPtr hWnd; + public uint uCallbackMessage; + public uint uEdge; + public RECT rc; + public int lParam; + } + + [DllImport("shell32.dll", CallingConvention = CallingConvention.StdCall)] + public static extern IntPtr SHAppBarMessage(uint dwMessage, ref APPBARDATA pData); + + // ---- Low-level keyboard hook (push-to-talk hotkey detection) ---- + + public const int WH_KEYBOARD_LL = 13; + public const int WM_KEYDOWN = 0x0100; + public const int WM_KEYUP = 0x0101; + public const int WM_SYSKEYDOWN = 0x0104; + public const int WM_SYSKEYUP = 0x0105; + + [StructLayout(LayoutKind.Sequential)] + public struct KBDLLHOOKSTRUCT + { + public uint vkCode; + public uint scanCode; + public uint flags; + public uint time; + public UIntPtr dwExtraInfo; + } + + public delegate IntPtr LowLevelKeyboardProc(int nCode, IntPtr wParam, IntPtr lParam); + + [DllImport("user32.dll", CharSet = CharSet.Auto, SetLastError = true)] + public static extern IntPtr SetWindowsHookEx(int idHook, LowLevelKeyboardProc lpfn, IntPtr hMod, uint dwThreadId); + + [DllImport("user32.dll", CharSet = CharSet.Auto, SetLastError = true)] + [return: MarshalAs(UnmanagedType.Bool)] + public static extern bool UnhookWindowsHookEx(IntPtr hhk); + + [DllImport("user32.dll", CharSet = CharSet.Auto, 
SetLastError = true)] + public static extern IntPtr CallNextHookEx(IntPtr hhk, int nCode, IntPtr wParam, IntPtr lParam); + + [DllImport("kernel32.dll", CharSet = CharSet.Auto, SetLastError = true)] + public static extern IntPtr GetModuleHandle(string? lpModuleName); + + // ---- Cursor position (used by the overlay cursor-follow logic) ---- + + [StructLayout(LayoutKind.Sequential)] + public struct POINT + { + public int X; + public int Y; + } + + [DllImport("user32.dll")] + [return: MarshalAs(UnmanagedType.Bool)] + public static extern bool GetCursorPos(out POINT lpPoint); + + // ---- Display enumeration + screen capture (BitBlt) ---- + // Used by ScreenCaptureService to grab per-monitor JPEGs. PerMonitorV2 + // DPI awareness (set in app.manifest) means GetMonitorInfo returns + // physical device pixels and BitBlt copies at the monitor's native + // resolution, which is what the AI needs to reason about coordinates. + + public const int MONITOR_DEFAULTTONEAREST = 2; + + [StructLayout(LayoutKind.Sequential, CharSet = CharSet.Auto)] + public struct MONITORINFOEX + { + public int cbSize; + public RECT rcMonitor; + public RECT rcWork; + public uint dwFlags; + [MarshalAs(UnmanagedType.ByValTStr, SizeConst = 32)] + public string szDevice; + } + + public const uint MONITORINFOF_PRIMARY = 1; + + public delegate bool MonitorEnumProc(IntPtr hMonitor, IntPtr hdcMonitor, ref RECT lprcMonitor, IntPtr dwData); + + [DllImport("user32.dll", SetLastError = true)] + [return: MarshalAs(UnmanagedType.Bool)] + public static extern bool EnumDisplayMonitors(IntPtr hdc, IntPtr lprcClip, MonitorEnumProc lpfnEnum, IntPtr dwData); + + [DllImport("user32.dll", CharSet = CharSet.Auto, SetLastError = true)] + [return: MarshalAs(UnmanagedType.Bool)] + public static extern bool GetMonitorInfo(IntPtr hMonitor, ref MONITORINFOEX lpmi); + + [DllImport("user32.dll", SetLastError = true)] + public static extern IntPtr MonitorFromPoint(POINT pt, uint dwFlags); + + [DllImport("user32.dll", SetLastError = 
true)] + public static extern IntPtr GetDesktopWindow(); + + [DllImport("user32.dll", SetLastError = true)] + public static extern IntPtr GetDC(IntPtr hWnd); + + [DllImport("user32.dll", SetLastError = true)] + public static extern int ReleaseDC(IntPtr hWnd, IntPtr hDC); + + [DllImport("gdi32.dll", SetLastError = true)] + public static extern IntPtr CreateCompatibleDC(IntPtr hDC); + + [DllImport("gdi32.dll", SetLastError = true)] + public static extern IntPtr CreateCompatibleBitmap(IntPtr hDC, int nWidth, int nHeight); + + [DllImport("gdi32.dll", SetLastError = true)] + public static extern IntPtr SelectObject(IntPtr hDC, IntPtr hObject); + + [DllImport("gdi32.dll", SetLastError = true)] + [return: MarshalAs(UnmanagedType.Bool)] + public static extern bool DeleteObject(IntPtr hObject); + + [DllImport("gdi32.dll", SetLastError = true)] + [return: MarshalAs(UnmanagedType.Bool)] + public static extern bool DeleteDC(IntPtr hDC); + + // BitBlt raster-operation codes. + public const int SRCCOPY = 0x00CC0020; + public const int CAPTUREBLT = 0x40000000; // Includes layered windows in the capture + + [DllImport("gdi32.dll", SetLastError = true)] + [return: MarshalAs(UnmanagedType.Bool)] + public static extern bool BitBlt( + IntPtr hDCDest, int xDest, int yDest, int width, int height, + IntPtr hDCSource, int xSource, int ySource, int rop); + + // ---- DPI helpers (used when positioning the panel in device-pixel coords) ---- + + /// + /// Returns the device-to-DIP scale for the window's monitor. Multiply a + /// device-pixel coord by the reciprocal to get WPF DIPs, or pass WPF DIPs + /// in and multiply by this to get device pixels. + /// + public static double GetDpiScale(Window window) + { + var source = PresentationSource.FromVisual(window); + return source?.CompositionTarget?.TransformToDevice.M11 ?? 
1.0; + } +} diff --git a/windows/Clicky/Resources/DesignSystem.xaml b/windows/Clicky/Resources/DesignSystem.xaml new file mode 100644 index 00000000..d474f620 --- /dev/null +++ b/windows/Clicky/Resources/DesignSystem.xaml @@ -0,0 +1,94 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 6 + 8 + 10 + 12 + + 6 + 8 + 10 + 12 + + + Segoe UI Variable, Segoe UI, Arial + + + + + + + + + + + + + + diff --git a/windows/Clicky/Services/AssemblyAIStreamingClient.cs b/windows/Clicky/Services/AssemblyAIStreamingClient.cs new file mode 100644 index 00000000..2413ccb8 --- /dev/null +++ b/windows/Clicky/Services/AssemblyAIStreamingClient.cs @@ -0,0 +1,276 @@ +using System.IO; +using System.Net.Http; +using System.Net.WebSockets; +using System.Text; +using System.Text.Json; +using System.Threading.Channels; + +namespace Clicky.Services; + +/// +/// Streaming AssemblyAI realtime transcription over WebSocket (v3). +/// Port of AssemblyAIStreamingTranscriptionProvider.swift. +/// +/// Lifecycle: +/// 1. — fetches a temporary token from the +/// Worker, opens the websocket with the required query params, and +/// spawns a background receive loop. +/// 2. — caller pushes raw PCM16 little-endian +/// 16-kHz mono frames; they're forwarded as binary websocket messages. +/// 3. — sends ForceEndpoint +/// to flush the partial into a final turn. +/// 4. — sends Terminate, closes the socket. +/// +/// The class raises two events on a worker thread. Marshal to the UI thread +/// at the call site if needed. +/// +public sealed class AssemblyAIStreamingClient : IAsyncDisposable +{ + private const int SampleRateHz = 16_000; + private const string SpeechModel = "u3-rt-pro"; + + private readonly HttpClient _tokenHttpClient = new() { Timeout = TimeSpan.FromSeconds(20) }; + private ClientWebSocket? _webSocket; + private Task? _receiveLoopTask; + private Task? _sendLoopTask; + private CancellationTokenSource? _lifetimeCts; + private Channel>? 
_audioChannel; + + /// Partial or final transcript text — fires on every Turn message. + public event EventHandler? TranscriptUpdated; + + /// Fires once when AssemblyAI signals end-of-turn (final transcript). + public event EventHandler? FinalTranscriptReady; + + /// Fires if the session errors out (network, upstream rejection). + public event EventHandler? SessionFaulted; + + public bool IsRunning => _webSocket?.State == WebSocketState.Open; + + public async Task StartAsync(CancellationToken cancellationToken) + { + var temporaryToken = await FetchTemporaryTokenAsync(cancellationToken).ConfigureAwait(false); + var websocketUri = BuildWebsocketUri(temporaryToken); + + _lifetimeCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); + _webSocket = new ClientWebSocket(); + await _webSocket.ConnectAsync(websocketUri, _lifetimeCts.Token).ConfigureAwait(false); + + _audioChannel = Channel.CreateUnbounded>(new UnboundedChannelOptions + { + SingleReader = true, + SingleWriter = false, + AllowSynchronousContinuations = false, + }); + + _receiveLoopTask = Task.Run(() => RunReceiveLoopAsync(_lifetimeCts.Token)); + _sendLoopTask = Task.Run(() => RunSendLoopAsync(_lifetimeCts.Token)); + } + + /// + /// Enqueue a PCM16 frame for transmission. Non-blocking — frames are + /// buffered in an unbounded channel and flushed by the background sender. + /// + public void AppendAudio(ReadOnlyMemory pcm16LittleEndianBytes) + { + _audioChannel?.Writer.TryWrite(pcm16LittleEndianBytes); + } + + /// + /// Tells AssemblyAI to cut the current partial into a final turn without + /// waiting for natural silence. Used when the user releases push-to-talk. 
+ /// + public async Task RequestFinalTranscriptAsync(CancellationToken cancellationToken) + { + if (_webSocket?.State != WebSocketState.Open) return; + var forceEndpointJson = Encoding.UTF8.GetBytes("{\"type\":\"ForceEndpoint\"}"); + await _webSocket.SendAsync(forceEndpointJson, WebSocketMessageType.Text, endOfMessage: true, cancellationToken) + .ConfigureAwait(false); + } + + public async Task StopAsync(CancellationToken cancellationToken) + { + if (_webSocket is null) return; + + try + { + if (_webSocket.State == WebSocketState.Open) + { + var terminateJson = Encoding.UTF8.GetBytes("{\"type\":\"Terminate\"}"); + await _webSocket.SendAsync(terminateJson, WebSocketMessageType.Text, endOfMessage: true, cancellationToken) + .ConfigureAwait(false); + await _webSocket.CloseOutputAsync(WebSocketCloseStatus.NormalClosure, "client-terminate", cancellationToken) + .ConfigureAwait(false); + } + } + catch (WebSocketException) { /* socket already closed — ignore */ } + catch (OperationCanceledException) { /* shutdown during cancel — ignore */ } + + _lifetimeCts?.Cancel(); + _audioChannel?.Writer.TryComplete(); + + try { if (_sendLoopTask is not null) await _sendLoopTask.ConfigureAwait(false); } + catch (OperationCanceledException) { /* expected */ } + + try { if (_receiveLoopTask is not null) await _receiveLoopTask.ConfigureAwait(false); } + catch (OperationCanceledException) { /* expected */ } + + _webSocket.Dispose(); + _webSocket = null; + } + + private async Task FetchTemporaryTokenAsync(CancellationToken cancellationToken) + { + using var tokenRequest = new HttpRequestMessage(HttpMethod.Post, WorkerConfig.TranscribeTokenUrl); + using var tokenResponse = await _tokenHttpClient.SendAsync(tokenRequest, cancellationToken).ConfigureAwait(false); + tokenResponse.EnsureSuccessStatusCode(); + + var responseBody = await tokenResponse.Content.ReadAsStringAsync(cancellationToken).ConfigureAwait(false); + using var parsedDocument = JsonDocument.Parse(responseBody); + if 
(!parsedDocument.RootElement.TryGetProperty("token", out var tokenProperty)) + { + throw new InvalidOperationException($"Token proxy response missing 'token' field: {responseBody}"); + } + + var tokenValue = tokenProperty.GetString(); + if (string.IsNullOrWhiteSpace(tokenValue)) + { + throw new InvalidOperationException("Token proxy returned an empty token."); + } + return tokenValue; + } + + private static Uri BuildWebsocketUri(string temporaryToken) + { + var encodedToken = Uri.EscapeDataString(temporaryToken); + var queryString = + $"sample_rate={SampleRateHz}" + + $"&encoding=pcm_s16le" + + $"&format_turns=true" + + $"&speech_model={SpeechModel}" + + $"&token={encodedToken}"; + return new Uri($"wss://streaming.assemblyai.com/v3/ws?{queryString}"); + } + + private async Task RunSendLoopAsync(CancellationToken cancellationToken) + { + if (_audioChannel is null || _webSocket is null) return; + var channelReader = _audioChannel.Reader; + + try + { + while (await channelReader.WaitToReadAsync(cancellationToken).ConfigureAwait(false)) + { + while (channelReader.TryRead(out var pcmFrame)) + { + if (_webSocket.State != WebSocketState.Open) return; + await _webSocket.SendAsync(pcmFrame, WebSocketMessageType.Binary, endOfMessage: true, cancellationToken) + .ConfigureAwait(false); + } + } + } + catch (OperationCanceledException) { /* shutdown — ignore */ } + catch (WebSocketException webSocketException) + { + SessionFaulted?.Invoke(this, webSocketException); + } + } + + private async Task RunReceiveLoopAsync(CancellationToken cancellationToken) + { + if (_webSocket is null) return; + var receiveBuffer = new byte[16 * 1024]; + var messageBuffer = new MemoryStream(); + + try + { + while (_webSocket.State == WebSocketState.Open && !cancellationToken.IsCancellationRequested) + { + messageBuffer.SetLength(0); + WebSocketReceiveResult receiveResult; + do + { + receiveResult = await _webSocket + .ReceiveAsync(new ArraySegment(receiveBuffer), cancellationToken) + 
.ConfigureAwait(false); + + if (receiveResult.MessageType == WebSocketMessageType.Close) + { + return; + } + + messageBuffer.Write(receiveBuffer, 0, receiveResult.Count); + } while (!receiveResult.EndOfMessage); + + if (receiveResult.MessageType != WebSocketMessageType.Text) continue; + + var messageText = Encoding.UTF8.GetString(messageBuffer.GetBuffer(), 0, (int)messageBuffer.Length); + HandleIncomingMessage(messageText); + } + } + catch (OperationCanceledException) { /* shutdown — ignore */ } + catch (WebSocketException webSocketException) + { + SessionFaulted?.Invoke(this, webSocketException); + } + } + + /// + /// Parses an AssemblyAI v3 realtime message. We only act on + /// Turn messages; session lifecycle (Begin, + /// Termination) doesn't need caller notification here. + /// + private void HandleIncomingMessage(string messageText) + { + try + { + using var parsedDocument = JsonDocument.Parse(messageText); + var rootObject = parsedDocument.RootElement; + if (!rootObject.TryGetProperty("type", out var typeProperty)) return; + + var messageType = typeProperty.GetString(); + if (messageType != "Turn") return; + + var transcriptText = rootObject.TryGetProperty("transcript", out var transcriptProperty) + ? transcriptProperty.GetString() ?? string.Empty + : string.Empty; + + var isEndOfTurn = rootObject.TryGetProperty("end_of_turn", out var endOfTurnProperty) + && endOfTurnProperty.ValueKind == JsonValueKind.True; + var isFormatted = rootObject.TryGetProperty("turn_is_formatted", out var formattedProperty) + && formattedProperty.ValueKind == JsonValueKind.True; + var isFinal = isEndOfTurn || isFormatted; + + var eventArgs = new TranscriptEventArgs(transcriptText, isFinal); + TranscriptUpdated?.Invoke(this, eventArgs); + if (isFinal) + { + FinalTranscriptReady?.Invoke(this, eventArgs); + } + } + catch (JsonException) + { + // Malformed message — ignore rather than fault the session; + // AssemblyAI occasionally emits empty keepalive frames. 
+ } + } + + public async ValueTask DisposeAsync() + { + await StopAsync(CancellationToken.None).ConfigureAwait(false); + _lifetimeCts?.Dispose(); + _tokenHttpClient.Dispose(); + } +} + +public sealed class TranscriptEventArgs : EventArgs +{ + public TranscriptEventArgs(string transcript, bool isFinal) + { + Transcript = transcript; + IsFinal = isFinal; + } + + public string Transcript { get; } + public bool IsFinal { get; } +} diff --git a/windows/Clicky/Services/ClaudeClient.cs b/windows/Clicky/Services/ClaudeClient.cs new file mode 100644 index 00000000..8e970422 --- /dev/null +++ b/windows/Clicky/Services/ClaudeClient.cs @@ -0,0 +1,185 @@ +using System.Diagnostics; +using System.IO; +using System.Net.Http; +using System.Net.Http.Headers; +using System.Text; +using System.Text.Json; + +namespace Clicky.Services; + +/// +/// Streaming Anthropic Messages client. Talks to the Cloudflare Worker's +/// /chat route — the Worker injects the API key and forwards the +/// SSE stream unchanged. Port of ClaudeAPI.swift. +/// +public sealed class ClaudeClient : IChatClient, IDisposable +{ + public const string DefaultModel = "claude-sonnet-4-6"; + private const int MaxOutputTokens = 1024; + + private readonly HttpClient _httpClient; + private readonly bool _ownsHttpClient; + + public string Model { get; set; } + + public ClaudeClient(string model = DefaultModel, HttpClient? 
httpClient = null) + { + Model = model; + if (httpClient is null) + { + _httpClient = new HttpClient { Timeout = Timeout.InfiniteTimeSpan }; + _ownsHttpClient = true; + } + else + { + _httpClient = httpClient; + _ownsHttpClient = false; + } + } + + public async Task StreamChatAsync( + string systemPrompt, + IReadOnlyList conversationHistory, + string userPrompt, + IReadOnlyList images, + Action onTextChunk, + CancellationToken cancellationToken) + { + var stopwatch = Stopwatch.StartNew(); + + var requestPayload = BuildRequestPayload(systemPrompt, conversationHistory, userPrompt, images); + using var requestMessage = new HttpRequestMessage(HttpMethod.Post, WorkerConfig.ChatClaudeUrl) + { + Content = new StringContent(requestPayload, Encoding.UTF8, "application/json"), + }; + requestMessage.Headers.Accept.Add(new MediaTypeWithQualityHeaderValue("text/event-stream")); + + using var responseMessage = await _httpClient + .SendAsync(requestMessage, HttpCompletionOption.ResponseHeadersRead, cancellationToken) + .ConfigureAwait(false); + + if (!responseMessage.IsSuccessStatusCode) + { + var errorBody = await responseMessage.Content.ReadAsStringAsync(cancellationToken).ConfigureAwait(false); + throw new HttpRequestException( + $"Claude proxy returned {(int)responseMessage.StatusCode}: {errorBody}"); + } + + await using var responseStream = await responseMessage.Content + .ReadAsStreamAsync(cancellationToken) + .ConfigureAwait(false); + using var streamReader = new StreamReader(responseStream, Encoding.UTF8); + + var accumulatedText = new StringBuilder(); + + // SSE frames are separated by blank lines. Within a frame we care + // about the `data:` lines; Anthropic also emits `event:` lines but + // the JSON payload carries its own `type` so we don't need them. + string? 
currentLine; + while ((currentLine = await streamReader.ReadLineAsync(cancellationToken).ConfigureAwait(false)) is not null) + { + if (currentLine.Length == 0) continue; + if (!currentLine.StartsWith("data:", StringComparison.Ordinal)) continue; + + var jsonPayload = currentLine.AsSpan(5).TrimStart().ToString(); + if (jsonPayload == "[DONE]") break; + if (jsonPayload.Length == 0) continue; + + var chunk = ParseTextDelta(jsonPayload); + if (chunk.Length > 0) + { + accumulatedText.Append(chunk); + onTextChunk(chunk); + } + } + + stopwatch.Stop(); + return new ChatStreamResult(accumulatedText.ToString(), stopwatch.Elapsed); + } + + /// + /// Extracts the delta.text string from an Anthropic streaming + /// payload, or returns empty if this event type doesn't carry text. + /// Anthropic emits many event types (message_start, + /// content_block_start, ping, message_delta, etc.) + /// — we only act on content_block_delta with a + /// text_delta payload, which mirrors the macOS client. + /// + private static string ParseTextDelta(string jsonPayload) + { + try + { + using var parsedDocument = JsonDocument.Parse(jsonPayload); + var rootObject = parsedDocument.RootElement; + if (!rootObject.TryGetProperty("type", out var typeProperty)) return string.Empty; + if (typeProperty.GetString() != "content_block_delta") return string.Empty; + if (!rootObject.TryGetProperty("delta", out var deltaProperty)) return string.Empty; + if (!deltaProperty.TryGetProperty("type", out var deltaTypeProperty)) return string.Empty; + if (deltaTypeProperty.GetString() != "text_delta") return string.Empty; + if (!deltaProperty.TryGetProperty("text", out var textProperty)) return string.Empty; + return textProperty.GetString() ?? 
string.Empty; + } + catch (JsonException) + { + return string.Empty; + } + } + + private string BuildRequestPayload( + string systemPrompt, + IReadOnlyList conversationHistory, + string userPrompt, + IReadOnlyList images) + { + // Anthropic accepts either a plain string or an array of content + // parts. We use the array form for the latest user turn so we can + // include images; historical turns have no images and can stay + // as plain strings to keep the payload compact. + var messageArray = new List(conversationHistory.Count * 2 + 1); + foreach (var historicalTurn in conversationHistory) + { + messageArray.Add(new { role = "user", content = historicalTurn.UserMessage }); + messageArray.Add(new { role = "assistant", content = historicalTurn.AssistantMessage }); + } + + // Each image is followed by a text part carrying its label so the + // model can distinguish multiple screens (e.g. "screen 1 of 2 — …"). + // Mirrors the macOS ClaudeAPI.analyzeImageStreaming payload shape. + var latestUserContentParts = new List(images.Count * 2 + 1); + foreach (var image in images) + { + latestUserContentParts.Add(new + { + type = "image", + source = new + { + type = "base64", + media_type = image.MimeType, + data = Convert.ToBase64String(image.Data), + }, + }); + if (!string.IsNullOrEmpty(image.Label)) + { + latestUserContentParts.Add(new { type = "text", text = image.Label }); + } + } + latestUserContentParts.Add(new { type = "text", text = userPrompt }); + messageArray.Add(new { role = "user", content = latestUserContentParts }); + + var requestObject = new + { + model = Model, + max_tokens = MaxOutputTokens, + stream = true, + system = systemPrompt, + messages = messageArray, + }; + + return JsonSerializer.Serialize(requestObject); + } + + public void Dispose() + { + if (_ownsHttpClient) _httpClient.Dispose(); + } +} diff --git a/windows/Clicky/Services/ClickyAnalytics.cs b/windows/Clicky/Services/ClickyAnalytics.cs new file mode 100644 index 00000000..04d843c4 --- 
/dev/null +++ b/windows/Clicky/Services/ClickyAnalytics.cs @@ -0,0 +1,150 @@ +using System.Diagnostics; +using System.Net.Http; +using System.Reflection; +using System.Text; +using System.Text.Json; + +namespace Clicky.Services; + +/// +/// Fire-and-forget PostHog client. Mirrors the event surface of the macOS +/// ClickyAnalytics.swift so the two clients show up side-by-side in +/// the same PostHog project. +/// +/// Calls POST directly to /capture/ — one small HTTP request per +/// event, no batching — which is plenty for the event volume a single +/// desktop app produces. The whole class is a no-op unless +/// has been called with a real write key; swap the +/// placeholder in to turn on. +/// +/// Thread-safety: every method is safe to call from any thread. Failures +/// never propagate — analytics must never break the app. +/// +public static class ClickyAnalytics +{ + private const string PlaceholderWriteKey = "phc_YOUR_POSTHOG_WRITE_KEY_HERE"; + + private static readonly JsonSerializerOptions JsonOptions = new() + { + DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingNull, + }; + + private static HttpClient? _httpClient; + private static string? _distinctId; + private static string? _appVersion; + private static bool _isEnabled; + + /// + /// Wires the PostHog client with the persisted distinct-id. Idempotent — + /// safe to call more than once. Fires app_opened as soon as the + /// first call succeeds. + /// + public static void Configure(string distinctId) + { + if (_httpClient is not null) return; + + if (string.IsNullOrWhiteSpace(WorkerConfig.PostHogWriteKey) + || string.Equals(WorkerConfig.PostHogWriteKey, PlaceholderWriteKey, StringComparison.Ordinal)) + { + _isEnabled = false; + return; + } + + _httpClient = new HttpClient + { + Timeout = TimeSpan.FromSeconds(10), + }; + _distinctId = distinctId; + _appVersion = Assembly.GetExecutingAssembly().GetName().Version?.ToString() ?? 
"unknown"; + _isEnabled = true; + + TrackAppOpened(); + } + + // ---- Event helpers (one per macOS ClickyAnalytics case) ---- + + public static void TrackAppOpened() => Track("app_opened"); + public static void TrackOnboardingStarted() => Track("onboarding_started"); + public static void TrackOnboardingReplayed() => Track("onboarding_replayed"); + public static void TrackOnboardingCompleted() => Track("onboarding_completed"); + + public static void TrackPermissionGranted(string permissionName) => + Track("permission_granted", ("permission", permissionName)); + + public static void TrackAllPermissionsGranted() => Track("all_permissions_granted"); + + public static void TrackPermissionDenied(string permissionName) => + Track("permission_denied", ("permission", permissionName)); + + public static void TrackPushToTalkStarted() => Track("push_to_talk_started"); + public static void TrackPushToTalkReleased() => Track("push_to_talk_released"); + + public static void TrackUserMessageSent(string transcript) => + Track("user_message_sent", + ("transcript", transcript), + ("character_count", transcript.Length)); + + public static void TrackAiResponseReceived(string responseText, string modelId) => + Track("ai_response_received", + ("response_text", responseText), + ("character_count", responseText.Length), + ("model", modelId)); + + public static void TrackElementPointed(string? elementLabel, int? screenNumber) => + Track("element_pointed", + ("element_label", elementLabel ?? string.Empty), + ("screen_number", screenNumber ?? 1)); + + public static void TrackResponseError(string errorMessage) => + Track("response_error", ("error", errorMessage)); + + public static void TrackTtsError(string errorMessage) => + Track("tts_error", ("error", errorMessage)); + + // ---- Core capture ---- + + private static void Track(string eventName, params (string Key, object? 
namespace Clicky.Services;

/// <summary>
/// Bridges MicrophoneCaptureService to AssemblyAIStreamingClient and
/// exposes a simple start/finalize/stop API for the orchestrator.
/// Equivalent role to the macOS BuddyDictationManager — minus the
/// Apple-speech fallback since Windows only ships with AssemblyAI in M2.
/// </summary>
public sealed class DictationSession : IAsyncDisposable
{
    /// <summary>Fallback window — if AssemblyAI hasn't emitted a final
    /// transcript within this time after <see cref="RequestFinalTranscriptAsync"/>,
    /// the session resolves with whatever partial it last saw. Matches the
    /// 2.8 s fallback in the macOS provider.</summary>
    private static readonly TimeSpan FinalTranscriptFallback = TimeSpan.FromSeconds(2.8);

    private readonly MicrophoneCaptureService _microphoneCaptureService;
    private readonly AssemblyAIStreamingClient _assemblyAIStreamingClient;

    private string _latestPartialTranscript = string.Empty;
    private TaskCompletionSource<string>? _finalTranscriptCompletionSource;
    private CancellationTokenSource? _sessionLifetimeCts;

    public event EventHandler<string>? PartialTranscriptUpdated;
    public event EventHandler<Exception>? SessionFaulted;

    public bool IsActive { get; private set; }

    public DictationSession()
    {
        _microphoneCaptureService = new MicrophoneCaptureService();
        _assemblyAIStreamingClient = new AssemblyAIStreamingClient();
    }

    public async Task StartAsync(CancellationToken cancellationToken)
    {
        if (IsActive) return;

        _sessionLifetimeCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
        _latestPartialTranscript = string.Empty;
        _finalTranscriptCompletionSource = null;

        SubscribeUpstreamHandlers();

        try
        {
            // Open the websocket first — if this throws, the mic never starts.
            await _assemblyAIStreamingClient.StartAsync(_sessionLifetimeCts.Token).ConfigureAwait(false);
        }
        catch
        {
            // BUG FIX: the handlers used to stay subscribed after a failed
            // open, so the next StartAsync attempt double-subscribed and
            // every transcript event fired twice. Roll back before rethrowing.
            UnsubscribeUpstreamHandlers();
            _sessionLifetimeCts.Dispose();
            _sessionLifetimeCts = null;
            throw;
        }

        _microphoneCaptureService.Start();
        IsActive = true;
    }

    /// <summary>
    /// Called on push-to-talk release. Stops the mic (so no more audio is
    /// sent), asks AssemblyAI to finalize the current turn, and awaits the
    /// final transcript — with a fallback to the last partial if the
    /// websocket doesn't echo a final within the grace window.
    /// </summary>
    public async Task<string> RequestFinalTranscriptAsync(CancellationToken cancellationToken)
    {
        if (!IsActive) return string.Empty;

        _microphoneCaptureService.Stop();

        _finalTranscriptCompletionSource = new TaskCompletionSource<string>(
            TaskCreationOptions.RunContinuationsAsynchronously);

        try
        {
            await _assemblyAIStreamingClient.RequestFinalTranscriptAsync(cancellationToken).ConfigureAwait(false);
        }
        catch (Exception requestException)
        {
            SessionFaulted?.Invoke(this, requestException);
            return _latestPartialTranscript;
        }

        // Wait for FinalTranscriptReady, the fallback timer, or caller
        // cancellation. BUG FIX: the fallback CTS is now linked to the
        // caller's token — previously a cancelled caller still waited out
        // the full 2.8 s grace window before getting the partial back.
        using var fallbackCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
        fallbackCts.CancelAfter(FinalTranscriptFallback);
        using var registration = fallbackCts.Token.Register(() =>
            _finalTranscriptCompletionSource?.TrySetResult(_latestPartialTranscript));

        return await _finalTranscriptCompletionSource.Task.ConfigureAwait(false);
    }

    public async Task StopAsync(CancellationToken cancellationToken)
    {
        if (!IsActive) return;
        IsActive = false;

        _microphoneCaptureService.Stop();
        await _assemblyAIStreamingClient.StopAsync(cancellationToken).ConfigureAwait(false);

        UnsubscribeUpstreamHandlers();

        _sessionLifetimeCts?.Cancel();
        _sessionLifetimeCts?.Dispose();
        _sessionLifetimeCts = null;
    }

    // Attach all five upstream handlers. Kept symmetric with
    // UnsubscribeUpstreamHandlers so StartAsync/StopAsync cannot drift.
    private void SubscribeUpstreamHandlers()
    {
        _assemblyAIStreamingClient.TranscriptUpdated += OnAssemblyAITranscriptUpdated;
        _assemblyAIStreamingClient.FinalTranscriptReady += OnAssemblyAIFinalTranscriptReady;
        _assemblyAIStreamingClient.SessionFaulted += OnUpstreamSessionFaulted;
        _microphoneCaptureService.AudioFrameCaptured += OnMicrophoneAudioFrameCaptured;
        _microphoneCaptureService.CaptureFaulted += OnUpstreamSessionFaulted;
    }

    private void UnsubscribeUpstreamHandlers()
    {
        _assemblyAIStreamingClient.TranscriptUpdated -= OnAssemblyAITranscriptUpdated;
        _assemblyAIStreamingClient.FinalTranscriptReady -= OnAssemblyAIFinalTranscriptReady;
        _assemblyAIStreamingClient.SessionFaulted -= OnUpstreamSessionFaulted;
        _microphoneCaptureService.AudioFrameCaptured -= OnMicrophoneAudioFrameCaptured;
        _microphoneCaptureService.CaptureFaulted -= OnUpstreamSessionFaulted;
    }

    private void OnMicrophoneAudioFrameCaptured(object? sender, ReadOnlyMemory<byte> frameBytes)
    {
        _assemblyAIStreamingClient.AppendAudio(frameBytes);
    }

    private void OnAssemblyAITranscriptUpdated(object? sender, TranscriptEventArgs args)
    {
        if (args.Transcript.Length > 0)
        {
            _latestPartialTranscript = args.Transcript;
        }
        PartialTranscriptUpdated?.Invoke(this, _latestPartialTranscript);
    }

    private void OnAssemblyAIFinalTranscriptReady(object? sender, TranscriptEventArgs args)
    {
        // An empty final (AssemblyAI heard nothing after the last partial)
        // falls back to the best partial we saw.
        var finalText = args.Transcript.Length > 0 ? args.Transcript : _latestPartialTranscript;
        _finalTranscriptCompletionSource?.TrySetResult(finalText);
    }

    private void OnUpstreamSessionFaulted(object? sender, Exception exception)
    {
        SessionFaulted?.Invoke(this, exception);
        // Unblock any pending finalize with the best transcript we have.
        _finalTranscriptCompletionSource?.TrySetResult(_latestPartialTranscript);
    }

    public async ValueTask DisposeAsync()
    {
        await StopAsync(CancellationToken.None).ConfigureAwait(false);
        _microphoneCaptureService.Dispose();
        await _assemblyAIStreamingClient.DisposeAsync().ConfigureAwait(false);
    }
}
using System.IO;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using NAudio.Wave;

namespace Clicky.Services;

/// <summary>
/// Port of ElevenLabsTTSClient.swift. Posts the caption text to the
/// Worker's /tts route, receives an MP3 stream, and plays it through
/// the default output device via NAudio. Only one utterance plays at a
/// time — a new call cancels the previous playback.
/// </summary>
public sealed class ElevenLabsTtsClient : IDisposable
{
    private const string ElevenLabsModel = "eleven_flash_v2_5";
    private const double VoiceStability = 0.5;
    private const double VoiceSimilarityBoost = 0.75;

    private readonly HttpClient _httpClient;
    private readonly bool _ownsHttpClient;

    // Guards all playback fields; NAudio raises PlaybackStopped on its own
    // callback thread while StartPlayback/StopPlayback run on the caller's.
    private readonly object _playbackLock = new();
    private WaveOutEvent? _activeWaveOut;
    private Mp3FileReader? _activeMp3Reader;
    private MemoryStream? _activeMp3Stream;

    public event EventHandler? PlaybackFinished;

    public bool IsPlaying
    {
        get
        {
            lock (_playbackLock)
            {
                return _activeWaveOut?.PlaybackState == PlaybackState.Playing;
            }
        }
    }

    public ElevenLabsTtsClient(HttpClient? httpClient = null)
    {
        if (httpClient is null)
        {
            _httpClient = new HttpClient { Timeout = TimeSpan.FromSeconds(30) };
            _ownsHttpClient = true;
        }
        else
        {
            _httpClient = httpClient;
            _ownsHttpClient = false;
        }
    }

    /// <summary>
    /// Synthesizes <paramref name="caption"/> via the Worker's /tts route
    /// and starts playback. Throws HttpRequestException on a non-2xx proxy
    /// response; throws OperationCanceledException if cancelled before
    /// playback begins.
    /// </summary>
    public async Task SpeakAsync(string caption, CancellationToken cancellationToken)
    {
        if (string.IsNullOrWhiteSpace(caption)) return;

        var requestPayload = JsonSerializer.Serialize(new
        {
            text = caption,
            model_id = ElevenLabsModel,
            voice_settings = new
            {
                stability = VoiceStability,
                similarity_boost = VoiceSimilarityBoost,
            },
        });

        using var requestMessage = new HttpRequestMessage(HttpMethod.Post, WorkerConfig.TtsUrl)
        {
            Content = new StringContent(requestPayload, Encoding.UTF8, "application/json"),
        };

        using var responseMessage = await _httpClient
            .SendAsync(requestMessage, cancellationToken)
            .ConfigureAwait(false);

        if (!responseMessage.IsSuccessStatusCode)
        {
            var errorBody = await responseMessage.Content.ReadAsStringAsync(cancellationToken).ConfigureAwait(false);
            throw new HttpRequestException(
                $"ElevenLabs proxy returned {(int)responseMessage.StatusCode}: {errorBody}");
        }

        var mp3Bytes = await responseMessage.Content.ReadAsByteArrayAsync(cancellationToken).ConfigureAwait(false);

        // BUG FIX: honor cancellation that arrived while the MP3 was
        // downloading — previously a cancelled SpeakAsync still started
        // audible playback after the download completed.
        cancellationToken.ThrowIfCancellationRequested();
        StartPlayback(mp3Bytes);
    }

    public void StopPlayback()
    {
        lock (_playbackLock)
        {
            TeardownCurrentPlaybackLocked();
        }
    }

    private void StartPlayback(byte[] mp3Bytes)
    {
        lock (_playbackLock)
        {
            TeardownCurrentPlaybackLocked();

            // Ownership of these disposables transfers to the fields until
            // PlaybackStopped fires — the stream must outlive the reader.
            var mp3Stream = new MemoryStream(mp3Bytes, writable: false);
            var mp3Reader = new Mp3FileReader(mp3Stream);
            var waveOut = new WaveOutEvent();
            waveOut.Init(mp3Reader);

            waveOut.PlaybackStopped += OnWaveOutPlaybackStopped;

            _activeMp3Stream = mp3Stream;
            _activeMp3Reader = mp3Reader;
            _activeWaveOut = waveOut;

            waveOut.Play();
        }
    }

    private void OnWaveOutPlaybackStopped(object? sender, StoppedEventArgs stoppedEventArgs)
    {
        lock (_playbackLock)
        {
            TeardownCurrentPlaybackLocked();
        }
        // Raised outside the lock so subscribers can call StopPlayback /
        // SpeakAsync without deadlocking.
        PlaybackFinished?.Invoke(this, EventArgs.Empty);
    }

    // Must be called while holding _playbackLock. Unsubscribes first so the
    // Stop() below cannot re-enter OnWaveOutPlaybackStopped.
    private void TeardownCurrentPlaybackLocked()
    {
        if (_activeWaveOut is not null)
        {
            _activeWaveOut.PlaybackStopped -= OnWaveOutPlaybackStopped;
            try { _activeWaveOut.Stop(); } catch { /* already stopped */ }
            _activeWaveOut.Dispose();
            _activeWaveOut = null;
        }
        _activeMp3Reader?.Dispose();
        _activeMp3Reader = null;
        _activeMp3Stream?.Dispose();
        _activeMp3Stream = null;
    }

    public void Dispose()
    {
        StopPlayback();
        if (_ownsHttpClient) _httpClient.Dispose();
    }
}
using System.Diagnostics;
using System.IO;
using System.Net.Http;
using System.Net.Http.Headers;
using System.Text;
using System.Text.Json;

namespace Clicky.Services;

/// <summary>
/// Streaming Google Gemini client. Talks to the Cloudflare Worker's
/// /chat-gemini route. The Worker extracts the model field from
/// the body (Gemini requires it in the URL path) and forwards the rest.
/// Port of GeminiAPI.swift.
/// </summary>
public sealed class GeminiClient : IChatClient, IDisposable
{
    public const string DefaultModel = "gemini-2.5-flash";
    private const int MaxOutputTokens = 1024;

    private readonly HttpClient _httpClient;
    private readonly bool _ownsHttpClient;

    /// <summary>Model identifier forwarded to the Worker; mutable so the
    /// tray panel can switch models mid-session.</summary>
    public string Model { get; set; }

    public GeminiClient(string model = DefaultModel, HttpClient? httpClient = null)
    {
        Model = model;
        // Streaming responses have no natural deadline, so an owned client
        // gets an infinite timeout; a caller-supplied client is used as-is.
        _ownsHttpClient = httpClient is null;
        _httpClient = httpClient ?? new HttpClient { Timeout = Timeout.InfiniteTimeSpan };
    }

    /// <summary>
    /// POSTs the assembled Gemini payload to the Worker and replays each
    /// SSE text delta through <paramref name="onTextChunk"/>. Resolves with
    /// the full accumulated text plus wall-clock duration once the stream
    /// closes. Throws HttpRequestException on a non-2xx proxy response.
    /// </summary>
    public async Task<ChatStreamResult> StreamChatAsync(
        string systemPrompt,
        IReadOnlyList<ConversationTurn> conversationHistory,
        string userPrompt,
        IReadOnlyList<InlineImage> images,
        Action<string> onTextChunk,
        CancellationToken cancellationToken)
    {
        var wallClock = Stopwatch.StartNew();

        var serializedBody = BuildRequestPayload(systemPrompt, conversationHistory, userPrompt, images);
        using var request = new HttpRequestMessage(HttpMethod.Post, WorkerConfig.ChatGeminiUrl)
        {
            Content = new StringContent(serializedBody, Encoding.UTF8, "application/json"),
        };
        request.Headers.Accept.Add(new MediaTypeWithQualityHeaderValue("text/event-stream"));

        using var upstreamResponse = await _httpClient
            .SendAsync(request, HttpCompletionOption.ResponseHeadersRead, cancellationToken)
            .ConfigureAwait(false);

        if (!upstreamResponse.IsSuccessStatusCode)
        {
            var errorBody = await upstreamResponse.Content.ReadAsStringAsync(cancellationToken).ConfigureAwait(false);
            throw new HttpRequestException(
                $"Gemini proxy returned {(int)upstreamResponse.StatusCode}: {errorBody}");
        }

        await using var bodyStream = await upstreamResponse.Content
            .ReadAsStreamAsync(cancellationToken)
            .ConfigureAwait(false);
        using var reader = new StreamReader(bodyStream, Encoding.UTF8);

        var fullResponse = new StringBuilder();

        // SSE framing: only non-empty "data:" lines carry JSON chunks.
        while (await reader.ReadLineAsync(cancellationToken).ConfigureAwait(false) is { } sseLine)
        {
            if (sseLine.Length == 0) continue;
            if (!sseLine.StartsWith("data:", StringComparison.Ordinal)) continue;

            var eventJson = sseLine.AsSpan(5).TrimStart().ToString();
            if (eventJson.Length == 0) continue;

            var textDelta = ExtractTextFromGeminiChunk(eventJson);
            if (textDelta.Length > 0)
            {
                fullResponse.Append(textDelta);
                onTextChunk(textDelta);
            }
        }

        wallClock.Stop();
        return new ChatStreamResult(fullResponse.ToString(), wallClock.Elapsed);
    }

    /// <summary>
    /// Reads candidates[0].content.parts[*].text from a Gemini SSE
    /// chunk and concatenates all text parts. Gemini may split a single
    /// emission across multiple parts. Malformed JSON yields an empty
    /// string rather than throwing.
    /// </summary>
    private static string ExtractTextFromGeminiChunk(string eventJson)
    {
        try
        {
            using var document = JsonDocument.Parse(eventJson);
            if (!document.RootElement.TryGetProperty("candidates", out var candidates)
                || candidates.ValueKind != JsonValueKind.Array
                || candidates.GetArrayLength() == 0
                || !candidates[0].TryGetProperty("content", out var content)
                || !content.TryGetProperty("parts", out var parts)
                || parts.ValueKind != JsonValueKind.Array)
            {
                return string.Empty;
            }

            var collected = new StringBuilder();
            foreach (var part in parts.EnumerateArray())
            {
                if (part.TryGetProperty("text", out var text) && text.ValueKind == JsonValueKind.String)
                {
                    collected.Append(text.GetString());
                }
            }
            return collected.ToString();
        }
        catch (JsonException)
        {
            return string.Empty;
        }
    }

    /// <summary>
    /// Builds the Gemini request body. Gemini's roles are "user" and
    /// "model" (not "assistant"); system instructions ride in a separate
    /// top-level field; inline images use inline_data with snake_case
    /// field names. Each image is followed by a text part carrying its
    /// label, matching the macOS GeminiAPI.analyzeImageStreaming payload.
    /// </summary>
    private string BuildRequestPayload(
        string systemPrompt,
        IReadOnlyList<ConversationTurn> conversationHistory,
        string userPrompt,
        IReadOnlyList<InlineImage> images)
    {
        var contents = new List<object>(conversationHistory.Count * 2 + 1);
        foreach (var turn in conversationHistory)
        {
            contents.Add(new
            {
                role = "user",
                parts = new object[] { new { text = turn.UserMessage } },
            });
            contents.Add(new
            {
                role = "model",
                parts = new object[] { new { text = turn.AssistantMessage } },
            });
        }

        var currentTurnParts = new List<object>(images.Count * 2 + 1);
        foreach (var attachedImage in images)
        {
            currentTurnParts.Add(new
            {
                inline_data = new
                {
                    mime_type = attachedImage.MimeType,
                    data = Convert.ToBase64String(attachedImage.Data),
                },
            });
            if (!string.IsNullOrEmpty(attachedImage.Label))
            {
                currentTurnParts.Add(new { text = attachedImage.Label });
            }
        }
        currentTurnParts.Add(new { text = userPrompt });
        contents.Add(new { role = "user", parts = currentTurnParts });

        return JsonSerializer.Serialize(new
        {
            model = Model,
            systemInstruction = new
            {
                parts = new object[] { new { text = systemPrompt } },
            },
            contents,
            generationConfig = new
            {
                maxOutputTokens = MaxOutputTokens,
            },
        });
    }

    public void Dispose()
    {
        if (_ownsHttpClient) _httpClient.Dispose();
    }
}
using System.Diagnostics;
using System.Runtime.InteropServices;
using System.Windows.Input;
using Clicky.Interop;

namespace Clicky.Services;

/// <summary>
/// Detects the push-to-talk shortcut (Ctrl+Alt by default) system-wide via a
/// low-level keyboard hook. This is the Windows analog of the macOS CGEvent
/// tap used in GlobalPushToTalkShortcutMonitor.swift.
///
/// The hook is listen-only — we do NOT swallow the keys, so Ctrl+Alt combos
/// still reach other apps normally. Users can hold Ctrl+Alt to talk to Clicky
/// without breaking their current app's keyboard handling.
///
/// Events are raised on the thread that installed the hook (the UI thread).
/// Subscribers should keep handlers short to avoid stalling global keyboard
/// delivery — dispatch heavy work off-thread immediately.
/// </summary>
public sealed class GlobalHotkeyService : IDisposable
{
    private IntPtr _hookHandle = IntPtr.Zero;

    // Held as a field so the GC doesn't collect the delegate while the hook
    // is installed — that would cause a nasty access violation in user32.dll.
    private NativeMethods.LowLevelKeyboardProc? _hookCallback;

    private bool _isCtrlHeld;
    private bool _isAltHeld;
    private bool _isShortcutActive;

    /// <summary>
    /// Raised when the push-to-talk combination transitions to held.
    /// Subscribers should begin recording immediately.
    /// </summary>
    public event EventHandler? ShortcutPressed;

    /// <summary>
    /// Raised when either modifier in the push-to-talk combination is released.
    /// Subscribers should finalize the recording and submit the transcript.
    /// </summary>
    public event EventHandler? ShortcutReleased;

    /// <summary>
    /// Installs the WH_KEYBOARD_LL hook. Idempotent. Throws
    /// InvalidOperationException if installation fails.
    /// </summary>
    public void Start()
    {
        if (_hookHandle != IntPtr.Zero)
        {
            return;
        }

        _hookCallback = HookCallback;
        using var process = Process.GetCurrentProcess();
        using var module = process.MainModule
            ?? throw new InvalidOperationException("Cannot read main module for hook installation.");
        var moduleHandle = NativeMethods.GetModuleHandle(module.ModuleName);

        _hookHandle = NativeMethods.SetWindowsHookEx(
            NativeMethods.WH_KEYBOARD_LL,
            _hookCallback,
            moduleHandle,
            0);

        if (_hookHandle == IntPtr.Zero)
        {
            throw new InvalidOperationException(
                $"Failed to install low-level keyboard hook (GetLastError={Marshal.GetLastWin32Error()}).");
        }
    }

    /// <summary>Removes the hook and resets all modifier state. Idempotent.</summary>
    public void Stop()
    {
        if (_hookHandle == IntPtr.Zero)
        {
            return;
        }

        NativeMethods.UnhookWindowsHookEx(_hookHandle);
        _hookHandle = IntPtr.Zero;
        _hookCallback = null;
        _isCtrlHeld = false;
        _isAltHeld = false;
        _isShortcutActive = false;
    }

    public void Dispose() => Stop();

    private IntPtr HookCallback(int nCode, IntPtr wParam, IntPtr lParam)
    {
        if (nCode < 0)
        {
            return NativeMethods.CallNextHookEx(_hookHandle, nCode, wParam, lParam);
        }

        var hookStruct = Marshal.PtrToStructure<NativeMethods.KBDLLHOOKSTRUCT>(lParam);
        var virtualKey = KeyInterop.KeyFromVirtualKey((int)hookStruct.vkCode);
        var messageCode = wParam.ToInt32();

        var isKeyDown = messageCode == NativeMethods.WM_KEYDOWN || messageCode == NativeMethods.WM_SYSKEYDOWN;
        var isKeyUp = messageCode == NativeMethods.WM_KEYUP || messageCode == NativeMethods.WM_SYSKEYUP;

        // Track only Ctrl and Alt — left and right variants both map to the
        // same push-to-talk action (matches macOS left/right option behavior).
        var isCtrlKey = virtualKey is Key.LeftCtrl or Key.RightCtrl;
        var isAltKey = virtualKey is Key.LeftAlt or Key.RightAlt;

        if (isCtrlKey)
        {
            if (isKeyDown) _isCtrlHeld = true;
            else if (isKeyUp) _isCtrlHeld = false;
        }
        else if (isAltKey)
        {
            if (isKeyDown) _isAltHeld = true;
            else if (isKeyUp) _isAltHeld = false;
        }
        else
        {
            // Any non-modifier key-DOWN cancels the shortcut. Without this,
            // "Ctrl+Alt+T" (or any typing combo) would fire push-to-talk.
            // BUG FIX: this used to trigger on key-UP as well, so releasing
            // an unrelated key that was held before the user pressed
            // Ctrl+Alt aborted an active push-to-talk mid-sentence.
            if (isKeyDown && _isShortcutActive)
            {
                _isShortcutActive = false;
                ShortcutReleased?.Invoke(this, EventArgs.Empty);
            }

            return NativeMethods.CallNextHookEx(_hookHandle, nCode, wParam, lParam);
        }

        var shouldBeActive = _isCtrlHeld && _isAltHeld;

        if (shouldBeActive && !_isShortcutActive)
        {
            _isShortcutActive = true;
            ShortcutPressed?.Invoke(this, EventArgs.Empty);
        }
        else if (!shouldBeActive && _isShortcutActive)
        {
            _isShortcutActive = false;
            ShortcutReleased?.Invoke(this, EventArgs.Empty);
        }

        return NativeMethods.CallNextHookEx(_hookHandle, nCode, wParam, lParam);
    }
}
namespace Clicky.Services;

/// <summary>
/// Provider-agnostic streaming chat interface. Both the Claude and Gemini
/// clients implement it so the orchestrator can swap providers based on
/// the user's model selection without caring which one is running.
/// Mirrors the shared shape that ClaudeAPI.swift and GeminiAPI.swift
/// expose on macOS.
/// </summary>
public interface IChatClient
{
    /// <summary>
    /// Model identifier sent to the provider. Setter is used when the user
    /// changes the selection in the tray panel mid-session.
    /// </summary>
    string Model { get; set; }

    /// <summary>
    /// Streams a response for <paramref name="userPrompt"/> given the
    /// accumulated <paramref name="conversationHistory"/> and the optional
    /// <paramref name="images"/> (inline base64 parts on the wire).
    /// <paramref name="onTextChunk"/> fires on the calling thread for every
    /// incremental text delta; the returned task resolves with the full
    /// accumulated text once the stream closes.
    /// </summary>
    Task<ChatStreamResult> StreamChatAsync(
        string systemPrompt,
        IReadOnlyList<ConversationTurn> conversationHistory,
        string userPrompt,
        IReadOnlyList<InlineImage> images,
        Action<string> onTextChunk,
        CancellationToken cancellationToken);
}

/// <summary>A completed (user, assistant) pair in the rolling history.</summary>
public sealed record ConversationTurn(string UserMessage, string AssistantMessage);

/// <summary>
/// Inline image for vision calls — raw bytes plus IANA media type, and an
/// optional <paramref name="Label"/>. When Label is non-null the chat
/// clients emit it as a text part immediately before the image so the model
/// knows which screen it's looking at (e.g. "screen 1 of 2 — cursor is on
/// this screen (primary focus) (image dimensions: 1280x800 pixels)"). This
/// matches the macOS analyzeImageStreaming contract.
/// </summary>
public sealed record InlineImage(byte[] Data, string MimeType, string? Label = null);

/// <summary>Result of a streaming chat call — full accumulated text plus
/// wall-clock duration of the stream.</summary>
public sealed record ChatStreamResult(string FullText, TimeSpan Duration);
using NAudio.Wave;

namespace Clicky.Services;

/// <summary>
/// Captures the default input device as 16-kHz, 16-bit, mono PCM — the
/// exact format AssemblyAI's realtime endpoint expects. This is the
/// Windows equivalent of AVAudioEngine.inputNode.installTap in the
/// macOS BuddyDictationManager.
///
/// Uses WaveInEvent (winmm wrapper) because it can ask the Windows audio
/// engine for a specific format directly — shared-mode conversion to
/// 16 kHz mono happens in the mixer so we don't need to pull in
/// MediaFoundation for resampling.
/// </summary>
public sealed class MicrophoneCaptureService : IDisposable
{
    private const int TargetSampleRateHz = 16_000;
    private const int TargetBitsPerSample = 16;
    private const int TargetChannelCount = 1;

    // 100 ms of 16-kHz 16-bit mono = 3,200 bytes per buffer. AssemblyAI
    // accepts 50–1000 ms frames; 100 ms gives snappy partials without
    // drowning the websocket in tiny frames.
    private const int BufferMilliseconds = 100;

    // Guards _waveInDevice: NAudio raises RecordingStopped on its own
    // callback thread while Stop()/Dispose() run on the caller's thread.
    private readonly object _deviceLock = new();

    private WaveInEvent? _waveInDevice;

    public event EventHandler<ReadOnlyMemory<byte>>? AudioFrameCaptured;
    public event EventHandler<Exception>? CaptureFaulted;

    public bool IsRunning { get; private set; }

    public void Start()
    {
        if (IsRunning) return;

        _waveInDevice = new WaveInEvent
        {
            WaveFormat = new WaveFormat(TargetSampleRateHz, TargetBitsPerSample, TargetChannelCount),
            BufferMilliseconds = BufferMilliseconds,
            // Three queued buffers keeps the capture pipeline saturated
            // without introducing perceptible latency.
            NumberOfBuffers = 3,
        };

        _waveInDevice.DataAvailable += OnMicrophoneDataAvailable;
        _waveInDevice.RecordingStopped += OnMicrophoneRecordingStopped;

        _waveInDevice.StartRecording();
        IsRunning = true;
    }

    public void Stop()
    {
        if (!IsRunning) return;
        IsRunning = false;

        try
        {
            _waveInDevice?.StopRecording();
        }
        catch
        {
            // StopRecording throws if the device was already released —
            // swallow; the consumer has no action to take.
        }
    }

    private void OnMicrophoneDataAvailable(object? sender, WaveInEventArgs waveInEventArgs)
    {
        if (waveInEventArgs.BytesRecorded <= 0) return;
        // Copy into a fresh buffer — NAudio reuses the internal one across
        // events, so consumers (channels, async sends) must own their slice.
        var frameCopy = new byte[waveInEventArgs.BytesRecorded];
        Buffer.BlockCopy(waveInEventArgs.Buffer, 0, frameCopy, 0, waveInEventArgs.BytesRecorded);
        AudioFrameCaptured?.Invoke(this, frameCopy);
    }

    private void OnMicrophoneRecordingStopped(object? sender, StoppedEventArgs stoppedEventArgs)
    {
        if (stoppedEventArgs.Exception is not null)
        {
            CaptureFaulted?.Invoke(this, stoppedEventArgs.Exception);
        }

        ReleaseDevice();
    }

    // Detaches handlers and disposes the device exactly once, whichever of
    // RecordingStopped / Dispose gets there first.
    private void ReleaseDevice()
    {
        lock (_deviceLock)
        {
            if (_waveInDevice is null) return;
            _waveInDevice.DataAvailable -= OnMicrophoneDataAvailable;
            _waveInDevice.RecordingStopped -= OnMicrophoneRecordingStopped;
            _waveInDevice.Dispose();
            _waveInDevice = null;
        }
    }

    public void Dispose()
    {
        Stop();
        // BUG FIX: Dispose used to rely solely on the RecordingStopped
        // callback to release the device. If that callback never fired
        // (StopRecording threw and was swallowed, or recording never
        // started cleanly) the WaveInEvent leaked. Release directly as a
        // backstop; ReleaseDevice is idempotent under _deviceLock.
        ReleaseDevice();
    }
}
using System.Diagnostics;
using NAudio.CoreAudioApi;

namespace Clicky.Services;

/// <summary>
/// Windows-only equivalent of the TCC microphone prompt in the macOS
/// BuddyDictationManager.requestInitialPushToTalkPermissionsIfNeeded.
///
/// There is no first-party Win32 API to prompt for microphone access on
/// unpackaged desktop apps — the privacy toggle lives in
/// ms-settings:privacy-microphone. The best we can do is:
///   1. probe for an active capture endpoint at startup so a disabled mic
///      is surfaced before the user tries to talk, and
///   2. offer a one-click shortcut to the relevant Settings page when a
///      capture attempt fails.
/// </summary>
public static class MicrophonePermissionHelper
{
    /// <summary>
    /// Returns true iff Windows has at least one Active capture
    /// endpoint — i.e. a microphone is present and the privacy/device toggle
    /// isn't blocking it. Privacy-blocked microphones move to the
    /// Disabled state and are excluded here.
    /// </summary>
    public static bool HasActiveCaptureDevice()
    {
        try
        {
            using var endpointEnumerator = new MMDeviceEnumerator();
            var activeMicrophones = endpointEnumerator.EnumerateAudioEndPoints(
                DataFlow.Capture,
                DeviceState.Active);
            return activeMicrophones.Count > 0;
        }
        catch
        {
            // MMDeviceEnumerator throwing usually means the audio service
            // is down or we're running in a very unusual environment — treat
            // as "no mic" so the UI nudges the user to check settings.
            return false;
        }
    }

    /// <summary>
    /// Opens the Windows 10/11 "Microphone" privacy page in Settings. Uses
    /// the ms-settings: protocol so the user lands one click away
    /// from the per-app toggle.
    /// </summary>
    public static void OpenWindowsMicrophonePrivacySettings()
    {
        var settingsLaunch = new ProcessStartInfo
        {
            FileName = "ms-settings:privacy-microphone",
            UseShellExecute = true,
        };

        try
        {
            Process.Start(settingsLaunch);
        }
        catch
        {
            // Settings URI handler missing — nothing useful to show the
            // user, they can open Settings manually.
        }
    }
}
using System.ComponentModel;
using System.Windows.Threading;
using Clicky.Interop;
using Clicky.Views;

namespace Clicky.Services;

/// <summary>
/// Owns the per-monitor OverlayWindow instances and drives the 60 fps
/// cursor tracker that moves the blue triangle. Mirrors the macOS
/// OverlayWindowManager in OverlayWindow.swift.
///
/// Lifecycle:
///   1. Start — enumerates monitors, spawns one overlay per display, and
///      starts the dispatcher timer.
///   2. Tracker tick — reads GetCursorPos, finds the monitor containing
///      the cursor, asks every overlay to re-render its triangle (visible
///      on the cursor's monitor, hidden elsewhere).
///   3. Dispose — stops the timer and closes every overlay.
///
/// Visibility follows AppState.VoiceState so the triangle appears only
/// during Idle / Responding, matching the macOS contract (during
/// Listening the waveform replaces it, during Processing the spinner
/// does — both are M5/M6 work; for now the triangle simply hides and the
/// system cursor stays).
/// </summary>
public sealed class OverlayWindowManager : IDisposable
{
    // 60 fps — matches the macOS Timer(withTimeInterval: 0.016). Feels
    // smooth and keeps CPU well below 1% of a modern core.
    private static readonly TimeSpan CursorTrackerInterval = TimeSpan.FromMilliseconds(16);

    private readonly AppState _appState;
    private readonly Dispatcher _uiDispatcher;
    private readonly List<MountedOverlay> _mountedOverlays = new();
    private DispatcherTimer? _cursorTrackerTimer;
    private bool _isDisposed;

    public OverlayWindowManager(AppState appState, Dispatcher uiDispatcher)
    {
        _appState = appState;
        _uiDispatcher = uiDispatcher;
    }

    /// <summary>
    /// Boots every overlay and starts the tracking timer. Must be called
    /// on the UI thread during app startup. Idempotent.
    /// </summary>
    public void Start()
    {
        // BUG FIX: Start() had no lifecycle guard — a second call spawned a
        // duplicate overlay per monitor plus a second 60 fps timer, and
        // Start() after Dispose() resurrected a timer on a dead manager.
        if (_isDisposed || _cursorTrackerTimer is not null) return;

        foreach (var enumeratedMonitor in EnumerateMonitors())
        {
            var overlay = new OverlayWindow(
                monitorBoundsLeftDevicePixels: enumeratedMonitor.BoundsLeft,
                monitorBoundsTopDevicePixels: enumeratedMonitor.BoundsTop,
                monitorWidthDevicePixels: enumeratedMonitor.PhysicalWidthPixels,
                monitorHeightDevicePixels: enumeratedMonitor.PhysicalHeightPixels);
            overlay.ShowOnMonitor();
            _mountedOverlays.Add(new MountedOverlay(enumeratedMonitor, overlay));
        }

        _cursorTrackerTimer = new DispatcherTimer(DispatcherPriority.Render, _uiDispatcher)
        {
            Interval = CursorTrackerInterval,
        };
        _cursorTrackerTimer.Tick += OnCursorTrackerTick;
        _cursorTrackerTimer.Start();

        _appState.PropertyChanged += OnAppStatePropertyChanged;
    }

    private void OnCursorTrackerTick(object? sender, EventArgs eventArgs)
    {
        if (_mountedOverlays.Count == 0) return;
        if (!NativeMethods.GetCursorPos(out var cursorPositionDevicePixels)) return;

        var triangleShouldBeVisible = TriangleVisibleForVoiceState(_appState.CurrentVoiceState);
        var anyOverlayIsFlying = _mountedOverlays.Any(mounted => mounted.Window.IsFlightInProgress);

        foreach (var mountedOverlay in _mountedOverlays)
        {
            var cursorIsOnThisMonitor = mountedOverlay.Monitor.ContainsDevicePoint(
                cursorPositionDevicePixels.X,
                cursorPositionDevicePixels.Y);

            // While any overlay is running a pointing flight, suppress the
            // cursor-following triangle everywhere else — only one buddy at a
            // time, matching the macOS single-active-overlay contract.
            var triangleVisibleOnThisOverlay = triangleShouldBeVisible
                && (!anyOverlayIsFlying || mountedOverlay.Window.IsFlightInProgress);

            mountedOverlay.Window.UpdateCursorState(
                cursorGlobalDeviceX: cursorPositionDevicePixels.X,
                cursorGlobalDeviceY: cursorPositionDevicePixels.Y,
                cursorIsOnThisMonitor: cursorIsOnThisMonitor,
                triangleShouldBeVisible: triangleVisibleOnThisOverlay);
        }
    }

    /// <summary>
    /// Kicks off the element-pointing flight on whichever overlay owns the
    /// given monitor bounds. The caller passes display-local device pixels
    /// (from ScreenCaptureService space), so the overlay can scale to DIPs
    /// with its own per-monitor DPI.
    ///
    /// No-ops if the target monitor is no longer mounted (e.g. unplugged
    /// between capture and reply) or if a flight is already in progress —
    /// the AI would have to emit a second [POINT:…] mid-flight for that to
    /// happen, which in practice doesn't occur during a single TTS turn.
    /// </summary>
    public void FlyToElement(
        int targetMonitorBoundsLeftDevicePixels,
        int targetMonitorBoundsTopDevicePixels,
        double targetDisplayLocalDeviceX,
        double targetDisplayLocalDeviceY,
        string bubblePhrase)
    {
        _uiDispatcher.BeginInvoke(() =>
        {
            if (_isDisposed) return;
            if (_mountedOverlays.Any(mounted => mounted.Window.IsFlightInProgress)) return;

            var targetOverlay = _mountedOverlays.FirstOrDefault(mounted =>
                mounted.Monitor.BoundsLeft == targetMonitorBoundsLeftDevicePixels
                && mounted.Monitor.BoundsTop == targetMonitorBoundsTopDevicePixels);
            if (targetOverlay is null) return;

            targetOverlay.Window.BeginElementPointingFlight(
                targetDisplayLocalDeviceX: targetDisplayLocalDeviceX,
                targetDisplayLocalDeviceY: targetDisplayLocalDeviceY,
                bubblePhrase: bubblePhrase);
        });
    }

    /// <summary>
    /// Triangle is visible while the user is passively present (Idle) or
    /// hearing the response back (Responding). During Listening /
    /// Processing the macOS overlay swaps in a waveform / spinner — those
    /// are M5/M6 work; for now we just hide the triangle so the user
    /// sees the system cursor only.
    /// </summary>
    private static bool TriangleVisibleForVoiceState(AppState.VoiceState voiceState)
    {
        return voiceState == AppState.VoiceState.Idle
            || voiceState == AppState.VoiceState.Responding;
    }

    private void OnAppStatePropertyChanged(object? sender, PropertyChangedEventArgs args)
    {
        // Redraw on the next tick; no extra work required here — we only
        // subscribe so future state-based extras (e.g. waveform for
        // Listening in M5) have a hook to attach to.
    }

    // ---- Monitor enumeration ----
    // Duplicated from ScreenCaptureService rather than shared so each
    // feature owns a narrow, local view of the monitor topology. If a
    // third caller shows up we can extract a MonitorEnumerator.

    private static List<EnumeratedOverlayMonitor> EnumerateMonitors()
    {
        var enumeratedList = new List<EnumeratedOverlayMonitor>();

        bool MonitorEnumCallback(IntPtr hMonitor, IntPtr hdcMonitor, ref NativeMethods.RECT lprcMonitor, IntPtr dwData)
        {
            var monitorInfo = new NativeMethods.MONITORINFOEX
            {
                cbSize = System.Runtime.InteropServices.Marshal.SizeOf<NativeMethods.MONITORINFOEX>(),
            };
            if (!NativeMethods.GetMonitorInfo(hMonitor, ref monitorInfo))
            {
                // Skip unreadable monitors but keep enumerating the rest.
                return true;
            }

            enumeratedList.Add(new EnumeratedOverlayMonitor(
                Handle: hMonitor,
                BoundsLeft: monitorInfo.rcMonitor.Left,
                BoundsTop: monitorInfo.rcMonitor.Top,
                PhysicalWidthPixels: monitorInfo.rcMonitor.Width,
                PhysicalHeightPixels: monitorInfo.rcMonitor.Height));
            return true;
        }

        NativeMethods.EnumDisplayMonitors(IntPtr.Zero, IntPtr.Zero, MonitorEnumCallback, IntPtr.Zero);
        return enumeratedList;
    }

    public void Dispose()
    {
        if (_isDisposed) return;
        _isDisposed = true;

        _appState.PropertyChanged -= OnAppStatePropertyChanged;

        if (_cursorTrackerTimer is not null)
        {
            _cursorTrackerTimer.Stop();
            _cursorTrackerTimer.Tick -= OnCursorTrackerTick;
            _cursorTrackerTimer = null;
        }

        foreach (var mountedOverlay in _mountedOverlays)
        {
            try { mountedOverlay.Window.Close(); }
            catch { /* window already torn down during app shutdown — ignore */ }
        }
        _mountedOverlays.Clear();
    }

    private sealed record EnumeratedOverlayMonitor(
        IntPtr Handle,
        int BoundsLeft,
        int BoundsTop,
        int PhysicalWidthPixels,
        int PhysicalHeightPixels)
    {
        public bool ContainsDevicePoint(int globalDeviceX, int globalDeviceY)
        {
            return globalDeviceX >= BoundsLeft
                && globalDeviceX < BoundsLeft + PhysicalWidthPixels
                && globalDeviceY >= BoundsTop
                && globalDeviceY < BoundsTop + PhysicalHeightPixels;
        }
    }

    private sealed record MountedOverlay(EnumeratedOverlayMonitor Monitor, OverlayWindow Window);
}
// (diff metadata: index 00000000..d2530dbc, --- /dev/null,
//  +++ b/windows/Clicky/Services/PointingTagParser.cs, @@ -0,0 +1,77 @@)

using System.Text.RegularExpressions;

namespace Clicky.Services;

/// <summary>
/// Parses the trailing [POINT:x,y:label:screenN] / [POINT:none]
/// tag the AI appends to voice responses. Port of
/// CompanionManager.parsePointingCoordinates.
///
/// The orchestrator calls this after the stream completes to split the
/// reply into "spoken text" (TTS input) and the optional pointing target
/// (coordinate + screen + human-readable label).
/// </summary>
public static class PointingTagParser
{
    // Same regex the Swift app uses — groups:
    // 1 = x (integer pixels), 2 = y, 3 = label (optional), 4 = screen index (optional, 1-based)
    private static readonly Regex TrailingPointTagRegex = new(
        @"\[POINT:(?:none|(\d+)\s*,\s*(\d+)(?::([^\]:\s][^\]:]*?))?(?::screen(\d+))?)\]\s*$",
        RegexOptions.Compiled | RegexOptions.CultureInvariant);

    /// <summary>
    /// Splits <paramref name="responseText"/> into the text to speak and an
    /// optional pointing target. Never throws: a missing tag returns the
    /// input unchanged, and a matched-but-unparseable coordinate strips the
    /// tag and skips the flight.
    /// </summary>
    public static PointingParseResult Parse(string responseText)
    {
        if (string.IsNullOrEmpty(responseText))
        {
            return new PointingParseResult(string.Empty, null, null, null);
        }

        var match = TrailingPointTagRegex.Match(responseText);
        if (!match.Success)
        {
            return new PointingParseResult(responseText, null, null, null);
        }

        // The "spoken text" is everything before the tag, with trailing
        // whitespace trimmed — TTS should read the reply, not the tag.
        var spokenText = responseText.Substring(0, match.Index).TrimEnd();

        var hasCoordinate = match.Groups[1].Success && match.Groups[2].Success;
        if (!hasCoordinate)
        {
            // [POINT:none] — spoken text only, no flight. ElementLabel carries
            // the sentinel "none" so callers can tell an explicit opt-out
            // apart from a reply that had no tag at all.
            return new PointingParseResult(spokenText, null, "none", null);
        }

        // FIX: use TryParse instead of Parse. The digits matched the regex,
        // but an absurdly long run (e.g. "[POINT:99999999999999,3]") would
        // overflow Int32 and int.Parse would throw OverflowException in the
        // middle of the voice pipeline. Fall back to "tag stripped, no
        // flight" instead, mirroring the group-4 handling below.
        if (!int.TryParse(match.Groups[1].Value, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out var pointX)
            || !int.TryParse(match.Groups[2].Value, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out var pointY))
        {
            return new PointingParseResult(spokenText, null, null, null);
        }

        string? elementLabel = null;
        if (match.Groups[3].Success)
        {
            elementLabel = match.Groups[3].Value.Trim();
            if (elementLabel.Length == 0) elementLabel = null;
        }

        int? screenNumber = null;
        if (match.Groups[4].Success
            && int.TryParse(match.Groups[4].Value, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out var parsedScreenNumber))
        {
            screenNumber = parsedScreenNumber;
        }

        return new PointingParseResult(spokenText, (pointX, pointY), elementLabel, screenNumber);
    }
}

/// <summary>
/// Result of parsing a [POINT:…] tag. Coordinate is
/// null when the AI emitted [POINT:none] (or no tag at all);
/// ScreenNumber is 1-based and references the cursor-first
/// capture list the AI saw, or null to default to the cursor screen.
/// </summary>
public sealed record PointingParseResult(
    string SpokenText,
    (int X, int Y)? Coordinate,
    string? ElementLabel,
    int? ScreenNumber);

// --- diff: windows/Clicky/Services/ScreenCaptureService.cs (new file mode 100644, index 00000000..fb2ae6ab) ---

using System.Globalization;
using System.IO;
using System.Windows;
using System.Windows.Interop;
using System.Windows.Media;
using System.Windows.Media.Imaging;
using Clicky.Interop;

namespace Clicky.Services;

///
/// Grabs a JPEG of every attached display and returns them ordered with
/// the cursor's display first. Port of the macOS
/// CompanionScreenCaptureUtility.captureAllScreensAsJPEG().
///
/// Uses GDI BitBlt against the desktop DC — simple, per-monitor-DPI-aware
/// (thanks to PerMonitorV2 in app.manifest), and doesn't require the
/// WinRT Windows.Graphics.Capture picker flow. Acceptable for
/// static snapshots; if we ever need continuous capture we can swap in
/// GraphicsCaptureItem later.
///
public sealed class ScreenCaptureService
{
    /// <summary>JPEG encoder quality, matches the macOS client (0.8 → 80%).</summary>
    private const int JpegQualityPercent = 80;

    /// <summary>
    /// Longest-side pixel budget. Anything larger is downscaled so
    /// the API request stays well under Anthropic/Gemini inline image
    /// size limits and keeps uploads fast. Matches macOS (1280 points).
    /// </summary>
    private const int MaxLongestSidePixels = 1280;

    /// <summary>
    /// Captures every monitor synchronously. Returns a list ordered with
    /// the cursor's monitor first (flagged "primary focus" in the label)
    /// and the rest in enumeration order.
    /// </summary>
    public IReadOnlyList<MonitorCapture> CaptureAllMonitors()
    {
        var enumeratedMonitors = EnumerateMonitors();
        if (enumeratedMonitors.Count == 0)
        {
            return Array.Empty<MonitorCapture>();
        }

        // FIX: resolve the cursor's monitor exactly once per capture pass.
        // The reviewed code re-queried FindCursorMonitorHandle() inside
        // CaptureSingleMonitor for every monitor, which is redundant and can
        // disagree with the ordering/label if the cursor moves mid-capture.
        var cursorMonitorHandle = FindCursorMonitorHandle();
        var orderedMonitors = OrderCursorFirst(enumeratedMonitors, cursorMonitorHandle);

        var capturedList = new List<MonitorCapture>(orderedMonitors.Count);
        for (var orderedIndex = 0; orderedIndex < orderedMonitors.Count; orderedIndex++)
        {
            var monitor = orderedMonitors[orderedIndex];
            var isCursorMonitor = monitor.HandleEquals(cursorMonitorHandle);
            var humanReadableLabel = BuildMonitorLabel(
                orderedIndex: orderedIndex,
                totalCount: orderedMonitors.Count,
                isCursorMonitor: isCursorMonitor,
                isPrimaryMonitor: (monitor.Flags & NativeMethods.MONITORINFOF_PRIMARY) != 0);

            var capture = CaptureSingleMonitor(monitor, humanReadableLabel, isCursorMonitor);
            capturedList.Add(capture);
        }
        return capturedList;
    }

    private MonitorCapture CaptureSingleMonitor(EnumeratedMonitor monitor, string humanReadableLabel, bool isCursorMonitor)
    {
        // 1. Capture raw pixels via GDI BitBlt.
        var sourceBitmap = BitBltMonitorToBitmapSource(monitor);

        // 2. Downscale so the largest side fits MaxLongestSidePixels.
        var downscaledBitmap = DownscaleIfLarger(sourceBitmap, MaxLongestSidePixels);

        // 3. Encode as JPEG at the configured quality.
        var jpegBytes = EncodeAsJpeg(downscaledBitmap, JpegQualityPercent);

        return new MonitorCapture(
            JpegData: jpegBytes,
            MimeType: "image/jpeg",
            Label: humanReadableLabel,
            IsCursorMonitor: isCursorMonitor,
            DisplayWidthPixels: monitor.PhysicalWidthPixels,
            DisplayHeightPixels: monitor.PhysicalHeightPixels,
            ScreenshotWidthPixels: downscaledBitmap.PixelWidth,
            ScreenshotHeightPixels: downscaledBitmap.PixelHeight,
            DisplayBoundsDevicePixels: new Int32Rect(
                monitor.BoundsLeft, monitor.BoundsTop,
                monitor.PhysicalWidthPixels, monitor.PhysicalHeightPixels));
    }

    /// <summary>
    /// Walks every monitor via EnumDisplayMonitors. The callback gives us
    /// an HMONITOR per display; we turn each into a MONITORINFOEX for the
    /// bounds + device name. Returned in system enumeration order.
    /// </summary>
    private static List<EnumeratedMonitor> EnumerateMonitors()
    {
        var enumeratedList = new List<EnumeratedMonitor>();

        bool MonitorEnumCallback(IntPtr hMonitor, IntPtr hdcMonitor, ref NativeMethods.RECT lprcMonitor, IntPtr dwData)
        {
            var monitorInfo = new NativeMethods.MONITORINFOEX
            {
                // NOTE(review): generic argument restored — the reviewed
                // text had the non-compiling "Marshal.SizeOf()".
                cbSize = System.Runtime.InteropServices.Marshal.SizeOf<NativeMethods.MONITORINFOEX>(),
            };
            if (!NativeMethods.GetMonitorInfo(hMonitor, ref monitorInfo))
            {
                // Skip monitors we couldn't query — shouldn't happen in practice.
                return true;
            }

            enumeratedList.Add(new EnumeratedMonitor(
                Handle: hMonitor,
                BoundsLeft: monitorInfo.rcMonitor.Left,
                BoundsTop: monitorInfo.rcMonitor.Top,
                PhysicalWidthPixels: monitorInfo.rcMonitor.Width,
                PhysicalHeightPixels: monitorInfo.rcMonitor.Height,
                Flags: monitorInfo.dwFlags,
                DeviceName: monitorInfo.szDevice ?? string.Empty));
            return true;
        }

        NativeMethods.EnumDisplayMonitors(IntPtr.Zero, IntPtr.Zero, MonitorEnumCallback, IntPtr.Zero);
        return enumeratedList;
    }

    /// <summary>Monitor under the cursor, or IntPtr.Zero if the cursor
    /// position can't be read.</summary>
    private static IntPtr FindCursorMonitorHandle()
    {
        if (!NativeMethods.GetCursorPos(out var cursorPosition))
        {
            return IntPtr.Zero;
        }
        return NativeMethods.MonitorFromPoint(cursorPosition, NativeMethods.MONITOR_DEFAULTTONEAREST);
    }

    /// <summary>Moves the cursor monitor to index 0; preserves the rest in
    /// original order. Mirrors the macOS ordering so the AI prompt places
    /// the "primary focus" screen first.</summary>
    private static List<EnumeratedMonitor> OrderCursorFirst(
        IReadOnlyList<EnumeratedMonitor> sourceMonitors,
        IntPtr cursorMonitorHandle)
    {
        var orderedList = new List<EnumeratedMonitor>(sourceMonitors.Count);
        EnumeratedMonitor? cursorMonitor = null;
        foreach (var monitor in sourceMonitors)
        {
            if (monitor.HandleEquals(cursorMonitorHandle)) { cursorMonitor = monitor; }
            else { orderedList.Add(monitor); }
        }
        if (cursorMonitor is not null)
        {
            orderedList.Insert(0, cursorMonitor);
        }
        return orderedList;
    }

    /// <summary>
    /// Copies a monitor's pixels into a WPF-consumable BitmapSource via
    /// GDI BitBlt. We operate on the desktop DC so the coordinates are
    /// virtual-screen coordinates (matches what GetMonitorInfo returns
    /// under PerMonitorV2 DPI awareness).
    /// </summary>
    private static BitmapSource BitBltMonitorToBitmapSource(EnumeratedMonitor monitor)
    {
        var desktopDC = NativeMethods.GetDC(IntPtr.Zero);
        if (desktopDC == IntPtr.Zero)
        {
            throw new InvalidOperationException("GetDC(desktop) returned NULL.");
        }

        IntPtr memoryDC = IntPtr.Zero;
        IntPtr compatibleBitmap = IntPtr.Zero;
        IntPtr previousBitmap = IntPtr.Zero;

        try
        {
            memoryDC = NativeMethods.CreateCompatibleDC(desktopDC);
            if (memoryDC == IntPtr.Zero)
            {
                throw new InvalidOperationException("CreateCompatibleDC failed.");
            }

            compatibleBitmap = NativeMethods.CreateCompatibleBitmap(
                desktopDC,
                monitor.PhysicalWidthPixels,
                monitor.PhysicalHeightPixels);
            if (compatibleBitmap == IntPtr.Zero)
            {
                throw new InvalidOperationException("CreateCompatibleBitmap failed.");
            }

            previousBitmap = NativeMethods.SelectObject(memoryDC, compatibleBitmap);

            var bitBltSucceeded = NativeMethods.BitBlt(
                hDCDest: memoryDC,
                xDest: 0, yDest: 0,
                width: monitor.PhysicalWidthPixels,
                height: monitor.PhysicalHeightPixels,
                hDCSource: desktopDC,
                xSource: monitor.BoundsLeft,
                ySource: monitor.BoundsTop,
                rop: NativeMethods.SRCCOPY | NativeMethods.CAPTUREBLT);

            if (!bitBltSucceeded)
            {
                var lastError = System.Runtime.InteropServices.Marshal.GetLastWin32Error();
                throw new InvalidOperationException($"BitBlt failed (Win32 error {lastError}).");
            }

            // Snapshot into a WPF BitmapSource. CreateBitmapSourceFromHBitmap
            // copies the pixels into managed memory — safe to free the
            // HBITMAP immediately after.
            var bitmapSource = Imaging.CreateBitmapSourceFromHBitmap(
                compatibleBitmap,
                IntPtr.Zero,
                Int32Rect.Empty,
                BitmapSizeOptions.FromEmptyOptions());
            bitmapSource.Freeze();
            return bitmapSource;
        }
        finally
        {
            if (previousBitmap != IntPtr.Zero && memoryDC != IntPtr.Zero)
            {
                NativeMethods.SelectObject(memoryDC, previousBitmap);
            }
            if (compatibleBitmap != IntPtr.Zero) NativeMethods.DeleteObject(compatibleBitmap);
            if (memoryDC != IntPtr.Zero) NativeMethods.DeleteDC(memoryDC);
            NativeMethods.ReleaseDC(IntPtr.Zero, desktopDC);
        }
    }

    /// <summary>Scales the bitmap down so its longest side fits
    /// <paramref name="maxLongestSide"/>; returns the input untouched when
    /// it already fits.</summary>
    private static BitmapSource DownscaleIfLarger(BitmapSource sourceBitmap, int maxLongestSide)
    {
        var longestSide = Math.Max(sourceBitmap.PixelWidth, sourceBitmap.PixelHeight);
        if (longestSide <= maxLongestSide) return sourceBitmap;

        var scaleFactor = (double)maxLongestSide / longestSide;
        var scaledTransform = new ScaleTransform(scaleFactor, scaleFactor);
        var transformedBitmap = new TransformedBitmap(sourceBitmap, scaledTransform);
        transformedBitmap.Freeze();
        return transformedBitmap;
    }

    /// <summary>Encodes the bitmap as JPEG at the given quality (0–100).</summary>
    private static byte[] EncodeAsJpeg(BitmapSource bitmap, int qualityPercent)
    {
        var encoder = new JpegBitmapEncoder { QualityLevel = qualityPercent };
        encoder.Frames.Add(BitmapFrame.Create(bitmap));
        using var memoryStream = new MemoryStream();
        encoder.Save(memoryStream);
        return memoryStream.ToArray();
    }

    /// <summary>
    /// Builds the per-monitor label the AI sees in the prompt. Matches the
    /// macOS format so prompt-engineering tweaks there translate directly.
    /// </summary>
    private static string BuildMonitorLabel(int orderedIndex, int totalCount, bool isCursorMonitor, bool isPrimaryMonitor)
    {
        var labelBuilder = new System.Text.StringBuilder();
        labelBuilder.Append("screen ").Append((orderedIndex + 1).ToString(CultureInfo.InvariantCulture));
        labelBuilder.Append(" of ").Append(totalCount.ToString(CultureInfo.InvariantCulture));

        if (isCursorMonitor)
        {
            labelBuilder.Append(" — cursor is on this screen (primary focus)");
        }
        else if (isPrimaryMonitor)
        {
            labelBuilder.Append(" — primary display");
        }
        return labelBuilder.ToString();
    }

    private sealed record EnumeratedMonitor(
        IntPtr Handle,
        int BoundsLeft,
        int BoundsTop,
        int PhysicalWidthPixels,
        int PhysicalHeightPixels,
        uint Flags,
        string DeviceName)
    {
        // IntPtr.Zero means "cursor monitor unknown" — never treat it as a match.
        public bool HandleEquals(IntPtr otherHandle) => Handle == otherHandle && Handle != IntPtr.Zero;
    }
}

/// <summary>
/// A single captured monitor ready to ship to an AI provider. Mirrors the
/// macOS CompanionScreenCapture struct — field names adjusted for
/// C# conventions.
/// </summary>
public sealed record MonitorCapture(
    byte[] JpegData,
    string MimeType,
    string Label,
    bool IsCursorMonitor,
    int DisplayWidthPixels,
    int DisplayHeightPixels,
    int ScreenshotWidthPixels,
    int ScreenshotHeightPixels,
    Int32Rect DisplayBoundsDevicePixels);

// --- diff: windows/Clicky/Services/SettingsService.cs (new file mode 100644, index 00000000..3b9c42dd) ---

using System.IO;
using System.Text.Json;
using System.Text.Json.Serialization;

namespace Clicky.Services;

///
/// Persists user preferences to %APPDATA%\Clicky\settings.json.
/// Equivalent of the macOS app's UserDefaults usage in CompanionManager.swift.
/// Reads are synchronous and cheap; writes are also synchronous, serialized
/// under a lock — settings change rarely enough that debouncing hasn't been
/// needed. (The previous wording claimed writes debounce; the code does not.)
///
public sealed class SettingsService
{
    private static readonly string SettingsDirectory = Path.Combine(
        Environment.GetFolderPath(Environment.SpecialFolder.ApplicationData),
        "Clicky");

    private static readonly string SettingsFilePath = Path.Combine(SettingsDirectory, "settings.json");

    private static readonly JsonSerializerOptions SerializerOptions = new()
    {
        WriteIndented = true,
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };

    // In-memory snapshot of the settings file. NOTE(review): setters mutate
    // this outside _writeLock — fine if, as it appears, all setters run on
    // the UI thread; confirm before calling from background threads.
    private PersistedSettings _currentSettings;
    private readonly object _writeLock = new();

    public SettingsService()
    {
        _currentSettings = LoadFromDiskOrDefault();
    }

    public string SelectedModelId
    {
        get => _currentSettings.SelectedModelId ?? DefaultModelId;
        set
        {
            _currentSettings.SelectedModelId = value;
            PersistToDisk();
        }
    }

    public bool IsClickyCursorEnabled
    {
        get => _currentSettings.IsClickyCursorEnabled ?? true;
        set
        {
            _currentSettings.IsClickyCursorEnabled = value;
            PersistToDisk();
        }
    }

    public bool HasCompletedOnboarding
    {
        get => _currentSettings.HasCompletedOnboarding ?? false;
        set
        {
            _currentSettings.HasCompletedOnboarding = value;
            PersistToDisk();
        }
    }

    /// <summary>
    /// Stable, anonymous per-install identifier used as PostHog's
    /// distinct_id. Generated lazily on first access so events can be
    /// correlated across launches without ever linking to an identity.
    /// </summary>
    public string AnalyticsDistinctId
    {
        get
        {
            if (!string.IsNullOrEmpty(_currentSettings.AnalyticsDistinctId))
            {
                return _currentSettings.AnalyticsDistinctId;
            }
            _currentSettings.AnalyticsDistinctId = Guid.NewGuid().ToString("N");
            PersistToDisk();
            return _currentSettings.AnalyticsDistinctId;
        }
    }

    /// <summary>
    /// Model used until the user picks one — Claude Sonnet, matching the
    /// macOS default. NOTE(review): the previous comment here claimed the
    /// default was Gemini Flash for cost reasons, contradicting the constant
    /// below. If Flash is the intended default, change the value, not just
    /// the comment.
    /// </summary>
    public const string DefaultModelId = "claude-sonnet-4-6";

    /// <summary>Loads settings.json, falling back to defaults on any
    /// failure so the app always starts.</summary>
    private PersistedSettings LoadFromDiskOrDefault()
    {
        try
        {
            if (!File.Exists(SettingsFilePath))
            {
                return new PersistedSettings();
            }

            var fileContents = File.ReadAllText(SettingsFilePath);
            // NOTE(review): generic argument restored — the reviewed text had
            // the non-compiling "JsonSerializer.Deserialize(fileContents, …)".
            var deserialized = JsonSerializer.Deserialize<PersistedSettings>(fileContents, SerializerOptions);
            return deserialized ?? new PersistedSettings();
        }
        catch (Exception ex)
        {
            // Corrupt or unreadable settings file — fall back to defaults so
            // the app still starts. We don't surface this to the user.
            System.Diagnostics.Debug.WriteLine($"[SettingsService] Failed to load settings: {ex.Message}");
            return new PersistedSettings();
        }
    }

    /// <summary>Writes the current snapshot to disk synchronously; failures
    /// are logged and swallowed (settings loss is non-fatal).</summary>
    private void PersistToDisk()
    {
        lock (_writeLock)
        {
            try
            {
                Directory.CreateDirectory(SettingsDirectory);
                var serialized = JsonSerializer.Serialize(_currentSettings, SerializerOptions);
                File.WriteAllText(SettingsFilePath, serialized);
            }
            catch (Exception ex)
            {
                System.Diagnostics.Debug.WriteLine($"[SettingsService] Failed to save settings: {ex.Message}");
            }
        }
    }

    // Nullable backing fields so "never set" is distinguishable from an
    // explicit value; WhenWritingNull keeps unset keys out of the file.
    private sealed class PersistedSettings
    {
        public string? SelectedModelId { get; set; }
        public bool? IsClickyCursorEnabled { get; set; }
        public bool? HasCompletedOnboarding { get; set; }
        public string? AnalyticsDistinctId { get; set; }
    }
}

// --- diff: windows/Clicky/Services/VoicePipelineOrchestrator.cs (new file mode 100644, index 00000000..ab4d0846) ---

using System.Globalization;
using System.Windows.Threading;

namespace Clicky.Services;

///
/// End-to-end voice pipeline.
Drives the push-to-talk flow: +/// press → mic + AssemblyAI start, AppState.VoiceState.Listening +/// release → finalize transcript, dispatch to Claude/Gemini streaming +/// (VoiceState.Processing), stream response to the +/// panel, hand the final caption to ElevenLabs for playback +/// (VoiceState.Responding), return to +/// VoiceState.Idle when audio stops. +/// +/// The macOS equivalent is the transcript→AI→TTS pipeline embedded in +/// CompanionManager.swift. We pulled it into its own class on +/// Windows so App.xaml.cs and AppState stay small. +/// +public sealed class VoicePipelineOrchestrator : IAsyncDisposable +{ + // Verbatim port of CompanionManager.companionVoiceResponseSystemPrompt. + // Kept in sync with the macOS version so prompt-engineering tweaks there + // translate directly. The trailing [POINT:...] tag is stripped before + // TTS / display; the M4 overlay will start consuming it. + private const string VoiceSystemPrompt = + "you're clicky, a friendly always-on companion that lives in the user's menu bar. the user just spoke to you via push-to-talk and you can see their screen(s). your reply will be spoken aloud via text-to-speech, so write the way you'd actually talk. this is an ongoing conversation — you remember everything they've said before.\n" + + "\n" + + "rules:\n" + + "- default to one or two sentences. be direct and dense. BUT if the user asks you to explain more, go deeper, or elaborate, then go all out — give a thorough, detailed explanation with no length limit.\n" + + "- all lowercase, casual, warm. no emojis.\n" + + "- write for the ear, not the eye. short sentences. no lists, bullet points, markdown, or formatting — just natural speech.\n" + + "- don't use abbreviations or symbols that sound weird read aloud. 
write \"for example\" not \"e.g.\", spell out small numbers.\n" + + "- if the user's question relates to what's on their screen, reference specific things you see.\n" + + "- if the screenshot doesn't seem relevant to their question, just answer the question directly.\n" + + "- you can help with anything — coding, writing, general knowledge, brainstorming.\n" + + "- never say \"simply\" or \"just\".\n" + + "- don't read out code verbatim. describe what the code does or what needs to change conversationally.\n" + + "- focus on giving a thorough, useful explanation. don't end with simple yes/no questions like \"want me to explain more?\" or \"should i show you?\" — those are dead ends that force the user to just say yes.\n" + + "- instead, when it fits naturally, end by planting a seed — mention something bigger or more ambitious they could try, a related concept that goes deeper, or a next-level technique that builds on what you just explained. make it something worth coming back for, not a question they'd just nod to. it's okay to not end with anything extra if the answer is complete on its own.\n" + + "- if you receive multiple screen images, the one labeled \"primary focus\" is where the cursor is — prioritize that one but reference others if relevant.\n" + + "\n" + + "element pointing:\n" + + "you have a small blue triangle cursor that can fly to and point at things on screen. use it whenever pointing would genuinely help the user — if they're asking how to do something, looking for a menu, trying to find a button, or need help navigating an app, point at the relevant element. err on the side of pointing rather than not pointing, because it makes your help way more useful and concrete.\n" + + "\n" + + "don't point at things when it would be pointless — like if the user asks a general knowledge question, or the conversation has nothing to do with what's on screen, or you'd just be pointing at something obvious they're already looking at. 
but if there's a specific UI element, menu, button, or area on screen that's relevant to what you're helping with, point at it.\n" + + "\n" + + "when you point, append a coordinate tag at the very end of your response, AFTER your spoken text. the screenshot images are labeled with their pixel dimensions. use those dimensions as the coordinate space. the origin (0,0) is the top-left corner of the image. x increases rightward, y increases downward.\n" + + "\n" + + "format: [POINT:x,y:label] where x,y are integer pixel coordinates in the screenshot's coordinate space, and label is a short 1-3 word description of the element (like \"search bar\" or \"save button\"). if the element is on the cursor's screen you can omit the screen number. if the element is on a DIFFERENT screen, append :screenN where N is the screen number from the image label (e.g. :screen2). this is important — without the screen number, the cursor will point at the wrong place.\n" + + "\n" + + "if pointing wouldn't help, append [POINT:none].\n" + + "\n" + + "examples:\n" + + "- user asks how to color grade in final cut: \"you'll want to open the color inspector — it's right up in the top right area of the toolbar. click that and you'll get all the color wheels and curves. [POINT:1100,42:color inspector]\"\n" + + "- user asks what html is: \"html stands for hypertext markup language, it's basically the skeleton of every web page. curious how it connects to the css you're looking at? [POINT:none]\"\n" + + "- user asks how to commit in xcode: \"see that source control menu up top? click that and hit commit, or you can use command option c as a shortcut. [POINT:285,11:source control]\"\n" + + "- element is on screen 2 (not where cursor is): \"that's over on your other monitor — see the terminal window? [POINT:400,300:terminal:screen2]\""; + + // Short "here!" phrases picked at random for the speech bubble the + // triangle shows once it reaches the element. 
Mirrors the macOS list + // in OverlayWindow.navigationBubblePhrases. + private static readonly string[] PointerBubblePhrases = + { + "right here!", + "this one!", + "over here!", + "click this!", + "here it is!", + "found it!", + }; + + private const int ConversationHistoryMaxTurns = 10; + + private readonly AppState _appState; + private readonly Dispatcher _uiDispatcher; + private readonly ClaudeClient _claudeClient; + private readonly GeminiClient _geminiClient; + private readonly ElevenLabsTtsClient _elevenLabsTtsClient; + private readonly ScreenCaptureService _screenCaptureService; + private readonly OverlayWindowManager? _overlayWindowManager; + + private DictationSession? _activeDictationSession; + private readonly List _conversationHistory = new(); + + private CancellationTokenSource? _currentRequestCts; + + public VoicePipelineOrchestrator( + AppState appState, + Dispatcher uiDispatcher, + OverlayWindowManager? overlayWindowManager = null) + { + _appState = appState; + _uiDispatcher = uiDispatcher; + _overlayWindowManager = overlayWindowManager; + + _claudeClient = new ClaudeClient(model: InferInitialClaudeModel(appState.SelectedModelId)); + _geminiClient = new GeminiClient(model: InferInitialGeminiModel(appState.SelectedModelId)); + _elevenLabsTtsClient = new ElevenLabsTtsClient(); + _elevenLabsTtsClient.PlaybackFinished += OnTtsPlaybackFinished; + _screenCaptureService = new ScreenCaptureService(); + + _appState.PropertyChanged += OnAppStatePropertyChanged; + } + + public async Task HandlePushToTalkPressedAsync() + { + ClickyAnalytics.TrackPushToTalkStarted(); + + // Talking over the previous reply → cancel in-flight AI request and + // stop TTS so the user isn't competing with the assistant's voice. 
// NOTE(review): this chunk opens mid-method — it is the tail of the
// push-to-talk *pressed* handler whose signature sits above this excerpt.
// Reconstructed from a whitespace-collapsed diff: formatting and stripped
// XML-doc tokens restored, no behavior changed.
    _currentRequestCts?.Cancel();
    _elevenLabsTtsClient.StopPlayback();

    SetVoiceStateOnUi(AppState.VoiceState.Listening);
    SetLiveTranscriptOnUi(string.Empty);
    SetStreamedResponseOnUi(string.Empty);

    try
    {
        var newSession = new DictationSession();
        newSession.PartialTranscriptUpdated += OnPartialTranscriptUpdated;
        newSession.SessionFaulted += OnDictationFaulted;
        await newSession.StartAsync(CancellationToken.None).ConfigureAwait(false);
        _activeDictationSession = newSession;

        // Successful capture start is the clearest signal that the mic
        // privacy toggle is granted — clear any stale "blocked" state.
        SetMicrophonePermissionIssueOnUi(false);
    }
    catch (Exception startException)
    {
        SetVoiceStateOnUi(AppState.VoiceState.Idle);
        SetMicrophonePermissionIssueOnUi(true);
        ReportFailureOnUi(
            "Couldn't start microphone. Check Windows privacy settings. " +
            $"({startException.Message})");
        ClickyAnalytics.TrackPermissionDenied("microphone");
        ClickyAnalytics.TrackResponseError(startException.Message);
    }
}

/// <summary>
/// Push-to-talk released: requests the final transcript from the active
/// dictation session, tears the session down, and — for a non-blank
/// transcript — hands off to the AI dispatch pipeline. A release with no
/// active session (or a blank transcript) resets the UI to Idle.
/// </summary>
public async Task HandlePushToTalkReleasedAsync()
{
    ClickyAnalytics.TrackPushToTalkReleased();

    var releasedSession = _activeDictationSession;
    if (releasedSession is null)
    {
        SetVoiceStateOnUi(AppState.VoiceState.Idle);
        return;
    }

    SetVoiceStateOnUi(AppState.VoiceState.Processing);

    string finalTranscript;
    try
    {
        finalTranscript = await releasedSession.RequestFinalTranscriptAsync(CancellationToken.None)
            .ConfigureAwait(false);
    }
    catch (Exception finalizeException)
    {
        SetVoiceStateOnUi(AppState.VoiceState.Idle);
        ReportFailureOnUi($"Transcription ended unexpectedly: {finalizeException.Message}");
        await TeardownDictationSessionAsync(releasedSession).ConfigureAwait(false);
        return;
    }

    await TeardownDictationSessionAsync(releasedSession).ConfigureAwait(false);

    if (string.IsNullOrWhiteSpace(finalTranscript))
    {
        SetVoiceStateOnUi(AppState.VoiceState.Idle);
        return;
    }

    SetLiveTranscriptOnUi(finalTranscript);
    ClickyAnalytics.TrackUserMessageSent(finalTranscript);
    await DispatchToAiAndSpeakAsync(finalTranscript).ConfigureAwait(false);
}

/// <summary>
/// Captures all monitors, streams the prompt + screenshots to the selected
/// AI provider, then splits the reply into spoken text (sent to TTS) and an
/// optional pointing target (sent to the overlay). Cancellation via
/// _currentRequestCts silently abandons the turn.
/// </summary>
private async Task DispatchToAiAndSpeakAsync(string userPrompt)
{
    _currentRequestCts?.Dispose();
    _currentRequestCts = new CancellationTokenSource();
    var cancellationToken = _currentRequestCts.Token;

    var selectedClient = ResolveClientForCurrentModel();

    try
    {
        // Grab every monitor JPEG before contacting the model. BitBlt is
        // synchronous and runs on a thread pool thread so the UI doesn't
        // freeze while the capture happens. We also keep the raw capture
        // list so [POINT:…] coordinates can be mapped back to the matching
        // monitor's bounds once the reply is parsed.
        var capturedMonitors = await Task.Run(
            () => _screenCaptureService.CaptureAllMonitors(),
            cancellationToken).ConfigureAwait(false);
        var inlineImages = BuildInlineImagesFromCaptures(capturedMonitors);

        if (cancellationToken.IsCancellationRequested) return;

        var streamResult = await selectedClient.StreamChatAsync(
            systemPrompt: VoiceSystemPrompt,
            conversationHistory: _conversationHistory,
            userPrompt: userPrompt,
            images: inlineImages,
            onTextChunk: AppendToStreamedResponseOnUi,
            cancellationToken: cancellationToken).ConfigureAwait(false);

        if (cancellationToken.IsCancellationRequested) return;

        // Split the reply into spoken text + optional pointing target.
        // TTS speaks the spoken text; the flight fires before playback
        // starts so the triangle is already en route when the user
        // hears Clicky start talking.
        var pointingParseResult = PointingTagParser.Parse(streamResult.FullText);
        var spokenText = pointingParseResult.SpokenText;
        SetStreamedResponseOnUi(spokenText);

        AppendTurnToHistory(userPrompt, spokenText);

        ClickyAnalytics.TrackAiResponseReceived(spokenText, _appState.SelectedModelId);

        TriggerPointingFlightIfRequested(pointingParseResult, capturedMonitors);

        if (spokenText.Length == 0)
        {
            SetVoiceStateOnUi(AppState.VoiceState.Idle);
            return;
        }

        SetVoiceStateOnUi(AppState.VoiceState.Responding);
        try
        {
            await _elevenLabsTtsClient.SpeakAsync(spokenText, cancellationToken).ConfigureAwait(false);
        }
        catch (OperationCanceledException) { throw; }
        catch (Exception ttsException)
        {
            // NOTE(review): this rethrow is caught by the outer generic
            // handler below, so a TTS failure is surfaced to the user as
            // "AI request failed" — consider a distinct message.
            ClickyAnalytics.TrackTtsError(ttsException.Message);
            throw;
        }
    }
    catch (OperationCanceledException)
    {
        // User cut us off — leave the state reset to the next handler.
    }
    catch (Exception aiException)
    {
        SetVoiceStateOnUi(AppState.VoiceState.Idle);
        ReportFailureOnUi($"AI request failed: {aiException.Message}");
        ClickyAnalytics.TrackResponseError(aiException.Message);
    }
}

/// <summary>
/// Wraps each monitor capture into an <see cref="InlineImage"/> whose
/// label matches the macOS format — "screen N of M — cursor is on this
/// screen (primary focus) (image dimensions: WxH pixels)" — so the
/// model's coordinate space maps to the pixels it actually sees.
/// </summary>
// NOTE(review): generic type arguments on these signatures were stripped by
// the diff extraction; "MonitorCapture" is reconstructed from usage — confirm
// against the element type returned by ScreenCaptureService.CaptureAllMonitors().
private static IReadOnlyList<InlineImage> BuildInlineImagesFromCaptures(IReadOnlyList<MonitorCapture> capturedMonitors)
{
    if (capturedMonitors.Count == 0) return Array.Empty<InlineImage>();

    var inlineImages = new List<InlineImage>(capturedMonitors.Count);
    foreach (var monitorCapture in capturedMonitors)
    {
        var dimensionSuffix = string.Format(
            CultureInfo.InvariantCulture,
            " (image dimensions: {0}x{1} pixels)",
            monitorCapture.ScreenshotWidthPixels,
            monitorCapture.ScreenshotHeightPixels);

        inlineImages.Add(new InlineImage(
            Data: monitorCapture.JpegData,
            MimeType: monitorCapture.MimeType,
            Label: monitorCapture.Label + dimensionSuffix));
    }
    return inlineImages;
}

/// <summary>
/// Maps a parsed [POINT:x,y:label:screenN] tag back to a concrete
/// overlay flight: picks the target monitor (by screen index, defaulting
/// to the cursor's monitor i.e. index 0), rescales screenshot pixels to
/// the monitor's native device pixels, clamps into bounds, and tells the
/// <see cref="OverlayWindowManager"/> to fly the triangle there.
/// </summary>
private void TriggerPointingFlightIfRequested(
    PointingParseResult parseResult,
    IReadOnlyList<MonitorCapture> capturedMonitors)
{
    if (_overlayWindowManager is null) return;
    if (parseResult.Coordinate is not (int pointX, int pointY)) return;
    if (capturedMonitors.Count == 0) return;

    // screenNumber is 1-based and indexes into the cursor-first capture
    // list. Out-of-range values fall back to the cursor's screen so a
    // sloppy AI reply still lands somewhere sensible.
    var targetCaptureIndex = 0;
    if (parseResult.ScreenNumber is int screenNumber)
    {
        var candidateIndex = screenNumber - 1;
        if (candidateIndex >= 0 && candidateIndex < capturedMonitors.Count)
        {
            targetCaptureIndex = candidateIndex;
        }
    }

    var targetCapture = capturedMonitors[targetCaptureIndex];
    if (targetCapture.ScreenshotWidthPixels <= 0 || targetCapture.ScreenshotHeightPixels <= 0) return;

    // Screenshot coords → display-local device pixels. The JPEG may be
    // downscaled (MaxLongestSidePixels in ScreenCaptureService), so we
    // rescale to the monitor's native resolution before handing off.
    var screenshotScaleX = (double)targetCapture.DisplayWidthPixels / targetCapture.ScreenshotWidthPixels;
    var screenshotScaleY = (double)targetCapture.DisplayHeightPixels / targetCapture.ScreenshotHeightPixels;
    var displayLocalDeviceX = Math.Clamp(pointX * screenshotScaleX, 0, targetCapture.DisplayWidthPixels - 1);
    var displayLocalDeviceY = Math.Clamp(pointY * screenshotScaleY, 0, targetCapture.DisplayHeightPixels - 1);

    var bubblePhrase = PointerBubblePhrases[Random.Shared.Next(PointerBubblePhrases.Length)];

    _overlayWindowManager.FlyToElement(
        targetMonitorBoundsLeftDevicePixels: targetCapture.DisplayBoundsDevicePixels.X,
        targetMonitorBoundsTopDevicePixels: targetCapture.DisplayBoundsDevicePixels.Y,
        targetDisplayLocalDeviceX: displayLocalDeviceX,
        targetDisplayLocalDeviceY: displayLocalDeviceY,
        bubblePhrase: bubblePhrase);

    ClickyAnalytics.TrackElementPointed(parseResult.ElementLabel, targetCaptureIndex + 1);
}

/// <summary>
/// Returns the chat client matching the user's current model selection,
/// pushing the selected model id into that client as a side effect.
/// </summary>
private IChatClient ResolveClientForCurrentModel()
{
    var currentModelId = _appState.SelectedModelId;
    if (AppState.IsGeminiModelId(currentModelId))
    {
        _geminiClient.Model = currentModelId;
        return _geminiClient;
    }
    _claudeClient.Model = currentModelId;
    return _claudeClient;
}

/// <summary>
/// Records one user/assistant exchange, trimming the oldest turns so the
/// history never exceeds ConversationHistoryMaxTurns.
/// </summary>
private void AppendTurnToHistory(string userPrompt, string assistantReply)
{
    _conversationHistory.Add(new ConversationTurn(userPrompt, assistantReply));
    while (_conversationHistory.Count > ConversationHistoryMaxTurns)
    {
        _conversationHistory.RemoveAt(0);
    }
}

/// <summary>
/// Unhooks events, stops, and disposes a dictation session; clears the
/// active-session field only if it still refers to this session.
/// </summary>
private async Task TeardownDictationSessionAsync(DictationSession session)
{
    session.PartialTranscriptUpdated -= OnPartialTranscriptUpdated;
    session.SessionFaulted -= OnDictationFaulted;
    try
    {
        await session.StopAsync(CancellationToken.None).ConfigureAwait(false);
    }
    catch { /* tearing down — best effort */ }
    await session.DisposeAsync().ConfigureAwait(false);
    if (ReferenceEquals(_activeDictationSession, session))
    {
        _activeDictationSession = null;
    }
}

private void OnPartialTranscriptUpdated(object? sender, string partialTranscript)
{
    SetLiveTranscriptOnUi(partialTranscript);
}

private void OnDictationFaulted(object? sender, Exception exception)
{
    ReportFailureOnUi($"Dictation error: {exception.Message}");
    SetVoiceStateOnUi(AppState.VoiceState.Idle);
}

private void OnTtsPlaybackFinished(object? sender, EventArgs eventArgs)
{
    SetVoiceStateOnUi(AppState.VoiceState.Idle);
}

private void OnAppStatePropertyChanged(object? sender, System.ComponentModel.PropertyChangedEventArgs args)
{
    // Keep the chat clients' models in sync with the user's selection
    // so a mid-conversation switch takes effect on the next turn.
    if (args.PropertyName == nameof(AppState.SelectedModelId))
    {
        ResolveClientForCurrentModel();
    }
}

// ---- UI marshaling helpers ----
// All AppState mutations are funneled through BeginInvoke so callers may
// run on any thread (dictation callbacks, thread-pool continuations).

private void SetVoiceStateOnUi(AppState.VoiceState newState)
{
    _uiDispatcher.BeginInvoke(() => _appState.CurrentVoiceState = newState);
}

private void SetLiveTranscriptOnUi(string transcript)
{
    _uiDispatcher.BeginInvoke(() => _appState.LiveTranscript = transcript);
}

private void SetStreamedResponseOnUi(string newText)
{
    _uiDispatcher.BeginInvoke(() => _appState.StreamedResponseText = newText);
}

private void AppendToStreamedResponseOnUi(string textChunk)
{
    _uiDispatcher.BeginInvoke(() => _appState.StreamedResponseText += textChunk);
}

private void ReportFailureOnUi(string failureMessage)
{
    _uiDispatcher.BeginInvoke(() => _appState.LastStatusMessage = failureMessage);
}

private void SetMicrophonePermissionIssueOnUi(bool hasIssue)
{
    _uiDispatcher.BeginInvoke(() => _appState.IsMicrophonePermissionIssue = hasIssue);
}

// ---- Model defaults ----

private static string InferInitialClaudeModel(string
selectedModelId) + { + return AppState.IsGeminiModelId(selectedModelId) ? ClaudeClient.DefaultModel : selectedModelId; + } + + private static string InferInitialGeminiModel(string selectedModelId) + { + return AppState.IsGeminiModelId(selectedModelId) ? selectedModelId : GeminiClient.DefaultModel; + } + + public async ValueTask DisposeAsync() + { + _appState.PropertyChanged -= OnAppStatePropertyChanged; + _currentRequestCts?.Cancel(); + _currentRequestCts?.Dispose(); + + if (_activeDictationSession is not null) + { + await TeardownDictationSessionAsync(_activeDictationSession).ConfigureAwait(false); + } + + _elevenLabsTtsClient.PlaybackFinished -= OnTtsPlaybackFinished; + _elevenLabsTtsClient.Dispose(); + _claudeClient.Dispose(); + _geminiClient.Dispose(); + } +} diff --git a/windows/Clicky/Services/WorkerConfig.cs b/windows/Clicky/Services/WorkerConfig.cs new file mode 100644 index 00000000..ef3c8d75 --- /dev/null +++ b/windows/Clicky/Services/WorkerConfig.cs @@ -0,0 +1,39 @@ +namespace Clicky.Services; + +/// +/// Cloudflare Worker proxy endpoints. Mirrors the single workerBaseURL +/// constant in the macOS CompanionManager.swift. All provider secrets +/// (Anthropic, Gemini, AssemblyAI, ElevenLabs) live on the Worker — the +/// desktop app ships with zero embedded keys and reaches the upstream APIs +/// only through these routes. +/// +/// Swap for your own Worker deployment. Everything +/// else is derived from it. +/// +public static class WorkerConfig +{ + /// + /// Base URL of the Cloudflare Worker deployment. Matches the placeholder + /// used in the macOS source — replace with your own Worker subdomain + /// before shipping. 
+ /// + public const string BaseUrl = "https://your-worker-name.your-subdomain.workers.dev"; + + public static string ChatClaudeUrl => $"{BaseUrl}/chat"; + public static string ChatGeminiUrl => $"{BaseUrl}/chat-gemini"; + public static string TranscribeTokenUrl => $"{BaseUrl}/transcribe-token"; + public static string TtsUrl => $"{BaseUrl}/tts"; + + /// + /// PostHog project write-only key. PostHog keys are designed to ship in + /// client apps — they can only post events, not read data — so it's safe + /// to bundle one here. Swap the placeholder for your own project key to + /// enable analytics; leave it unset and + /// silently drops every event. + /// + public const string PostHogWriteKey = "phc_YOUR_POSTHOG_WRITE_KEY_HERE"; + + /// PostHog capture endpoint (US region). Matches the macOS + /// ClickyAnalytics.swift host. + public const string PostHogCaptureUrl = "https://us.i.posthog.com/capture/"; +} diff --git a/windows/Clicky/ViewModels/TrayPanelViewModel.cs b/windows/Clicky/ViewModels/TrayPanelViewModel.cs new file mode 100644 index 00000000..5b06ad15 --- /dev/null +++ b/windows/Clicky/ViewModels/TrayPanelViewModel.cs @@ -0,0 +1,140 @@ +using CommunityToolkit.Mvvm.ComponentModel; +using CommunityToolkit.Mvvm.Input; +using System.Collections.ObjectModel; +using Clicky.Services; + +namespace Clicky.ViewModels; + +/// +/// View-model for the borderless tray popover. Binds the model picker rows +/// (Claude: Sonnet/Opus, Gemini: Flash/Pro) to +/// and exposes Quit, onboarding, and privacy-settings commands for the app. +/// +public sealed partial class TrayPanelViewModel : ObservableObject +{ + private readonly AppState _appState; + + public TrayPanelViewModel(AppState appState) + { + _appState = appState; + _appState.PropertyChanged += (_, args) => + { + if (args.PropertyName == nameof(AppState.SelectedModelId)) + { + // Refresh the IsSelected flag on every model option so the + // segmented-control highlight follows the active choice. 
// NOTE(review): continuation of the TrayPanelViewModel constructor's
// PropertyChanged lambda, begun above this chunk (reformatted from the
// collapsed diff; stripped generic arguments and XML-doc tags restored).
                foreach (var option in ClaudeOptions) option.RefreshSelection(_appState.SelectedModelId);
                foreach (var option in GeminiOptions) option.RefreshSelection(_appState.SelectedModelId);
            }
            else if (args.PropertyName == nameof(AppState.HasCompletedOnboarding))
            {
                OnPropertyChanged(nameof(IsOnboardingVisible));
                OnPropertyChanged(nameof(IsMainContentVisible));
            }
        };

        ClaudeOptions = new ObservableCollection<ModelOption>
        {
            CreateOption("Sonnet", "claude-sonnet-4-6"),
            CreateOption("Opus", "claude-opus-4-6"),
        };

        GeminiOptions = new ObservableCollection<ModelOption>
        {
            CreateOption("Flash", "gemini-2.5-flash"),
            CreateOption("Pro", "gemini-2.5-pro"),
        };
    }

    /// <summary>
    /// Shows the welcome/onboarding block (Get started button + intro copy)
    /// until the user completes it once. After that it stays hidden and the
    /// regular panel body takes over.
    /// </summary>
    public bool IsOnboardingVisible => !_appState.HasCompletedOnboarding;

    /// <summary>The main panel body (transcript, model picker, footer) —
    /// shown once onboarding is done.</summary>
    public bool IsMainContentVisible => _appState.HasCompletedOnboarding;

    [RelayCommand]
    private void CompleteOnboarding()
    {
        _appState.HasCompletedOnboarding = true;
        ClickyAnalytics.TrackOnboardingCompleted();
    }

    [RelayCommand]
    private void ReplayOnboarding()
    {
        _appState.HasCompletedOnboarding = false;
        ClickyAnalytics.TrackOnboardingReplayed();
    }

    [RelayCommand]
    private void OpenMicrophonePrivacySettings()
    {
        MicrophonePermissionHelper.OpenWindowsMicrophonePrivacySettings();
    }

    public ObservableCollection<ModelOption> ClaudeOptions { get; }
    public ObservableCollection<ModelOption> GeminiOptions { get; }

    /// <summary>
    /// Exposed so the panel can bind directly to
    /// <see cref="AppState.LiveTranscript"/>,
    /// <see cref="AppState.StreamedResponseText"/>, and
    /// <see cref="AppState.CurrentVoiceState"/> without the view-model
    /// having to re-publish them.
    /// </summary>
    // NOTE(review): the three cref targets above were stripped by the diff
    // extraction and are reconstructed from AppState usage elsewhere in this
    // patch — confirm against the original file.
    public AppState AppState => _appState;

    [RelayCommand]
    private void SelectModel(string modelId)
    {
        if (!string.IsNullOrEmpty(modelId))
        {
            _appState.SelectedModelId = modelId;
        }
    }

    [RelayCommand]
    private void Quit()
    {
        System.Windows.Application.Current.Shutdown();
    }

    /// <summary>Builds one picker row and seeds its highlight from the
    /// currently selected model.</summary>
    private ModelOption CreateOption(string displayLabel, string modelId)
    {
        var option = new ModelOption(displayLabel, modelId, SelectModelCommand);
        option.RefreshSelection(_appState.SelectedModelId);
        return option;
    }
}

/// <summary>
/// A single button within a model-picker segmented control. Exposes a
/// pre-bound <see cref="SelectCommand"/> so the XAML ItemsControl can wire
/// each button without needing ancestor-lookup gymnastics.
/// </summary>
public sealed partial class ModelOption : ObservableObject
{
    public ModelOption(string displayLabel, string modelId, IRelayCommand selectCommand)
    {
        DisplayLabel = displayLabel;
        ModelId = modelId;
        SelectCommand = selectCommand;
    }

    public string DisplayLabel { get; }
    public string ModelId { get; }
    public IRelayCommand SelectCommand { get; }

    [ObservableProperty]
    private bool _isSelected;

    /// <summary>Case-insensitively compares this option's id against the
    /// active selection and updates the highlight flag.</summary>
    public void RefreshSelection(string currentModelId)
    {
        IsSelected = string.Equals(currentModelId, ModelId, StringComparison.OrdinalIgnoreCase);
    }
}

// ==== new file: windows/Clicky/Views/BooleanToVisibilityConverter.cs ====

using System.Globalization;
using System.Windows;
using System.Windows.Data;

namespace Clicky.Views;

/// <summary>
/// Maps a bound boolean to <see cref="Visibility"/>. True → Visible,
/// False → Collapsed by default; pass "Invert" as the converter
/// parameter to flip the mapping (used by the tray panel to show the
/// welcome block while the main panel is collapsed, and vice-versa).
/// </summary>
[ValueConversion(typeof(bool), typeof(Visibility))]
public sealed class BooleanToVisibilityConverter : IValueConverter
{
    public object Convert(object? value, Type targetType, object? parameter, CultureInfo culture)
    {
        var boolValue = value is bool b && b;
        if (string.Equals(parameter as string, "Invert", StringComparison.OrdinalIgnoreCase))
        {
            boolValue = !boolValue;
        }
        return boolValue ? Visibility.Visible : Visibility.Collapsed;
    }

    public object ConvertBack(object? value, Type targetType, object? parameter, CultureInfo culture)
    {
        // One-way binding only — the panel never writes Visibility back.
        throw new NotSupportedException();
    }
}

// ==== new file: windows/Clicky/Views/OverlayWindow.xaml ====
// NOTE(review): the XAML markup for this file was destroyed by the diff
// extraction (all angle-bracket content was stripped, leaving only blank
// added lines) and cannot be reconstructed from this excerpt — restore it
// from the original patch. The code-behind below references named elements
// BlueTriangle, BlueTriangleRotation, BlueTriangleScale, PointerBubble,
// PointerBubbleText, and PointerBubbleScale, which that markup must declare.

// ==== new file: windows/Clicky/Views/OverlayWindow.xaml.cs ====

using System.Windows;
using System.Windows.Controls;
using System.Windows.Interop;
using System.Windows.Threading;
using Clicky.Interop;

namespace Clicky.Views;

/// <summary>
/// Transparent, click-through, always-on-top overlay covering a single
/// monitor. Renders the blue triangle cursor that follows the system mouse
/// and, during element pointing (M5), animates along a bezier arc to a
/// target location and shows a speech bubble.
///
/// One instance is created per connected display; the
/// <see cref="OverlayWindowManager"/> owns their lifecycle, the
/// cursor-tracking timer, and the per-element flight requests.
///
/// Mirrors the macOS OverlayWindow in OverlayWindow.swift.
/// </summary>
public partial class OverlayWindow : Window
{
    // Cursor-to-triangle offset matches the macOS overlay (35 px right,
    // 25 px down) so the triangle sits beside the system cursor rather
    // than on top of it. Interpreted in DIPs.
    private const double CursorOffsetDipX = 35;
    private const double CursorOffsetDipY = 25;

    // Element-target offset: the triangle lands *next to* the element,
    // not on top of it — 8 DIPs right and 12 DIPs below, matching
    // macOS OverlayWindow.startNavigatingToElement.
    private const double ElementOffsetDipX = 8;
    private const double ElementOffsetDipY = 12;

    // Speech bubble sits to the lower-right of the triangle tip (same
    // relative position as macOS: x + 10, y + 18 in DIPs).
    private const double BubbleOffsetDipX = 10;
    private const double BubbleOffsetDipY = 18;

    // Triangle bounding box (16 × 13.856 DIPs). RenderTransformOrigin is
    // (0.5, 1/3) so the rotation/scale pivot is the centroid. When we move
    // the triangle we place that centroid at the target position — these
    // constants shift Canvas.Left/Top from centroid-coord back to bounding-
    // box coord.
    private const double TriangleBoundingBoxDipWidth = 16.0;
    private const double TriangleBoundingBoxDipHeight = 13.856;
    private const double TriangleCentroidOffsetDipX = TriangleBoundingBoxDipWidth / 2.0;
    private const double TriangleCentroidOffsetDipY = TriangleBoundingBoxDipHeight / 3.0;

    // Default "resting" rotation for the triangle (matches the macOS -35°).
    private const double RestingRotationDegrees = -35.0;

    private const int AnimationFramesPerSecond = 60;
    private static readonly TimeSpan AnimationFrameInterval =
        TimeSpan.FromSeconds(1.0 / AnimationFramesPerSecond);

    // Bezier flight clamps — duration scales linearly with distance / 800 DIPs,
    // clamped to [0.6s, 1.4s] so tiny hops still feel purposeful and cross-
    // monitor flights don't drag on forever.
    private const double FlightMinDurationSeconds = 0.6;
    private const double FlightMaxDurationSeconds = 1.4;
    private const double FlightDurationDistanceDivisor = 800.0;

    // Bubble hold before flying back, matches macOS (3s pill + 0.5s fade).
    private static readonly TimeSpan BubbleHoldDuration = TimeSpan.FromSeconds(3.0);
    private static readonly TimeSpan BubbleFadeDuration = TimeSpan.FromMilliseconds(500);

    private readonly int _monitorBoundsLeftDevicePixels;
    private readonly int _monitorBoundsTopDevicePixels;
    private readonly int _monitorWidthDevicePixels;
    private readonly int _monitorHeightDevicePixels;

    private bool _hasBeenPositioned;

    // Flight state. When _isFlightActive is true, the cursor-follow path
    // in UpdateCursorState is skipped — the flight animation drives the
    // triangle position directly.
    private bool _isFlightActive;
    private DispatcherTimer? _flightFrameTimer;

    // Position in DIPs where the triangle currently sits (as last rendered).
    // Flights start from this point so successive calls chain smoothly.
    private double _triangleCurrentDipX;
    private double _triangleCurrentDipY;

    public OverlayWindow(
        int monitorBoundsLeftDevicePixels,
        int monitorBoundsTopDevicePixels,
        int monitorWidthDevicePixels,
        int monitorHeightDevicePixels)
    {
        _monitorBoundsLeftDevicePixels = monitorBoundsLeftDevicePixels;
        _monitorBoundsTopDevicePixels = monitorBoundsTopDevicePixels;
        _monitorWidthDevicePixels = monitorWidthDevicePixels;
        _monitorHeightDevicePixels = monitorHeightDevicePixels;

        InitializeComponent();
        SourceInitialized += ApplyClickThroughExtendedStyles;
    }

    /// <summary>
    /// Called once during startup by <see cref="OverlayWindowManager"/>.
    /// Applies click-through window styles and positions the overlay over
    /// the monitor in device-pixel coordinates.
/// </summary>
public void ShowOnMonitor()
{
    var windowHandle = new WindowInteropHelper(this).EnsureHandle();

    NativeMethods.SetWindowPos(
        windowHandle,
        NativeMethods.HWND_TOPMOST,
        _monitorBoundsLeftDevicePixels,
        _monitorBoundsTopDevicePixels,
        _monitorWidthDevicePixels,
        _monitorHeightDevicePixels,
        NativeMethods.SWP_NOACTIVATE | NativeMethods.SWP_SHOWWINDOW);

    Visibility = Visibility.Visible;
    _hasBeenPositioned = true;
}

/// <summary>True while a flight/point/return sequence is running on
/// this overlay. The manager consults this to suppress cursor updates
/// here and on other overlays.</summary>
public bool IsFlightInProgress => _isFlightActive;

/// <summary>
/// Updates the overlay's triangle for a single cursor tracker tick.
/// Ignored while a flight is active — the flight animation owns the
/// triangle's position and rotation for its duration.
/// </summary>
public void UpdateCursorState(
    int cursorGlobalDeviceX,
    int cursorGlobalDeviceY,
    bool cursorIsOnThisMonitor,
    bool triangleShouldBeVisible)
{
    if (!_hasBeenPositioned) return;
    if (_isFlightActive) return;

    if (!cursorIsOnThisMonitor || !triangleShouldBeVisible)
    {
        if (BlueTriangle.Visibility != Visibility.Collapsed)
        {
            BlueTriangle.Visibility = Visibility.Collapsed;
        }
        return;
    }

    var (localDipX, localDipY) = ConvertGlobalDeviceToLocalDip(cursorGlobalDeviceX, cursorGlobalDeviceY);
    var triangleDipX = localDipX + CursorOffsetDipX;
    var triangleDipY = localDipY + CursorOffsetDipY;
    PositionTriangle(triangleDipX, triangleDipY);

    // Keep the resting pose while cursor-following.
    BlueTriangleRotation.Angle = RestingRotationDegrees;
    BlueTriangleScale.ScaleX = 1.0;
    BlueTriangleScale.ScaleY = 1.0;

    if (BlueTriangle.Visibility != Visibility.Visible)
    {
        BlueTriangle.Visibility = Visibility.Visible;
    }
}

/// <summary>
/// Starts the full element-pointing sequence on this overlay:
/// bezier flight out → speech bubble hold → bezier flight back to the
/// current cursor. All UI updates happen on the caller's Dispatcher
/// (expected to be the UI thread).
/// </summary>
/// <param name="targetDisplayLocalDeviceX">Target X in device pixels, local to this monitor's top-left.</param>
/// <param name="targetDisplayLocalDeviceY">Target Y in device pixels, local to this monitor's top-left.</param>
/// <param name="bubblePhrase">Text for the speech bubble. Streamed character-by-character with jittered delays.</param>
public void BeginElementPointingFlight(
    double targetDisplayLocalDeviceX,
    double targetDisplayLocalDeviceY,
    string bubblePhrase)
{
    if (!_hasBeenPositioned) return;

    var dpiScale = NativeMethods.GetDpiScale(this);
    if (dpiScale <= 0) dpiScale = 1.0;

    var targetDipX = targetDisplayLocalDeviceX / dpiScale;
    var targetDipY = targetDisplayLocalDeviceY / dpiScale;

    // Offset so the triangle lands beside the element. Clamp inside the
    // overlay bounds with a small margin so the triangle never clips off
    // the edge of the monitor.
    var monitorWidthDip = _monitorWidthDevicePixels / dpiScale;
    var monitorHeightDip = _monitorHeightDevicePixels / dpiScale;
    var destinationDipX = Math.Clamp(targetDipX + ElementOffsetDipX, 20, Math.Max(20, monitorWidthDip - 20));
    var destinationDipY = Math.Clamp(targetDipY + ElementOffsetDipY, 20, Math.Max(20, monitorHeightDip - 20));

    // Starting point = current triangle position (if we don't have one,
    // fall back to the system cursor's local position so the first
    // flight of the app still looks natural).
    EnsureCurrentTrianglePositionInitialized();

    _isFlightActive = true;
    BlueTriangle.Visibility = Visibility.Visible;

    FlyTriangleAlongBezier(
        startDipX: _triangleCurrentDipX,
        startDipY: _triangleCurrentDipY,
        endDipX: destinationDipX,
        endDipY: destinationDipY,
        onFlightComplete: () => ShowPointerBubbleAndScheduleReturn(bubblePhrase));
}

private void ShowPointerBubbleAndScheduleReturn(string bubblePhrase)
{
    // Triangle has arrived — reset rotation and scale, then bounce the
    // bubble in with streaming text.
    BlueTriangleRotation.Angle = RestingRotationDegrees;
    BlueTriangleScale.ScaleX = 1.0;
    BlueTriangleScale.ScaleY = 1.0;

    PositionPointerBubble();
    PointerBubbleText.Text = string.Empty;
    PointerBubble.Opacity = 1.0;
    PointerBubbleScale.ScaleX = 0.5;
    PointerBubbleScale.ScaleY = 0.5;
    PointerBubble.Visibility = Visibility.Visible;

    // Spring-bounce the bubble to 1.0x — WPF has no spring easing, so a
    // short ease-out gives a close-enough bounce for this size.
    AnimateBubbleScaleTo(targetScale: 1.0, durationMs: 260);

    StreamBubbleCharacters(bubblePhrase, characterIndex: 0, onStreamComplete: () =>
    {
        // Hold for 3s, fade out 0.5s, then fly back.
        var holdTimer = new DispatcherTimer { Interval = BubbleHoldDuration };
        holdTimer.Tick += (_, _) =>
        {
            holdTimer.Stop();
            FadeOutBubbleThenFlyBack();
        };
        holdTimer.Start();
    });
}

private void FadeOutBubbleThenFlyBack()
{
    var fadeTimer = new DispatcherTimer { Interval = AnimationFrameInterval };
    var fadeStartTime = DateTime.UtcNow;

    fadeTimer.Tick += (_, _) =>
    {
        var elapsed = DateTime.UtcNow - fadeStartTime;
        var progress = Math.Clamp(elapsed.TotalMilliseconds / BubbleFadeDuration.TotalMilliseconds, 0.0, 1.0);
        PointerBubble.Opacity = 1.0 - progress;

        if (progress >= 1.0)
        {
            fadeTimer.Stop();
            PointerBubble.Visibility = Visibility.Collapsed;
            PointerBubbleText.Text = string.Empty;
            FlyTriangleBackToCursor();
        }
    };
    fadeTimer.Start();
}

private void FlyTriangleBackToCursor()
{
    // Return target = system cursor + follow-offset, in local DIPs.
    if (!NativeMethods.GetCursorPos(out var cursorDevicePixels))
    {
        EndFlight();
        return;
    }

    var (cursorLocalDipX, cursorLocalDipY) = ConvertGlobalDeviceToLocalDip(cursorDevicePixels.X, cursorDevicePixels.Y);
    var returnDipX = cursorLocalDipX + CursorOffsetDipX;
    var returnDipY = cursorLocalDipY + CursorOffsetDipY;

    FlyTriangleAlongBezier(
        startDipX: _triangleCurrentDipX,
        startDipY: _triangleCurrentDipY,
        endDipX: returnDipX,
        endDipY: returnDipY,
        onFlightComplete: EndFlight);
}

private void EndFlight()
{
    _isFlightActive = false;
    BlueTriangleRotation.Angle = RestingRotationDegrees;
    BlueTriangleScale.ScaleX = 1.0;
    BlueTriangleScale.ScaleY = 1.0;
}

/// <summary>
/// Bezier flight with smoothstep easing, tangent-based rotation and a
/// scale pulse peaking at the midpoint — straight port of the macOS
/// animateBezierFlightArc.
/// </summary>
private void FlyTriangleAlongBezier(
    double startDipX,
    double startDipY,
    double endDipX,
    double endDipY,
    Action onFlightComplete)
{
    _flightFrameTimer?.Stop();

    var deltaX = endDipX - startDipX;
    var deltaY = endDipY - startDipY;
    var distance = Math.Sqrt(deltaX * deltaX + deltaY * deltaY);

    var flightDurationSeconds = Math.Clamp(
        distance / FlightDurationDistanceDivisor,
        FlightMinDurationSeconds,
        FlightMaxDurationSeconds);
    var totalFrames = Math.Max(1, (int)(flightDurationSeconds * AnimationFramesPerSecond));

    // Arc control point — lifted upward (negative Y in screen coords)
    // so the triangle swoops. Height capped at 80 DIPs like macOS.
    var midpointDipX = (startDipX + endDipX) / 2.0;
    var midpointDipY = (startDipY + endDipY) / 2.0;
    var arcHeight = Math.Min(distance * 0.2, 80.0);
    var controlPointDipX = midpointDipX;
    var controlPointDipY = midpointDipY - arcHeight;

    var currentFrame = 0;
    _flightFrameTimer = new DispatcherTimer(DispatcherPriority.Render, Dispatcher)
    {
        Interval = AnimationFrameInterval,
    };
    _flightFrameTimer.Tick += (_, _) =>
    {
        currentFrame++;

        if (currentFrame > totalFrames)
        {
            _flightFrameTimer.Stop();
            _flightFrameTimer = null;
            PositionTriangle(endDipX, endDipY);
            BlueTriangleScale.ScaleX = 1.0;
            BlueTriangleScale.ScaleY = 1.0;
            _triangleCurrentDipX = endDipX;
            _triangleCurrentDipY = endDipY;
            onFlightComplete();
            return;
        }

        var linearProgress = (double)currentFrame / totalFrames;
        // Smoothstep easeInOut: 3t² - 2t³
        var t = linearProgress * linearProgress * (3.0 - 2.0 * linearProgress);
        var oneMinusT = 1.0 - t;

        // Quadratic bezier B(t)
        var bezierDipX = oneMinusT * oneMinusT * startDipX
            + 2.0 * oneMinusT * t * controlPointDipX
            + t * t * endDipX;
        var bezierDipY = oneMinusT * oneMinusT * startDipY
            + 2.0 * oneMinusT * t * controlPointDipY
            + t * t * endDipY;
        PositionTriangle(bezierDipX, bezierDipY);

        // Rotation along the curve tangent B'(t). The +90° offset aligns
        // the triangle's tip (which points up at 0°) with the direction
        // of travel returned by atan2.
        var tangentX = 2.0 * oneMinusT * (controlPointDipX - startDipX)
            + 2.0 * t * (endDipX - controlPointDipX);
        var tangentY = 2.0 * oneMinusT * (controlPointDipY - startDipY)
            + 2.0 * t * (endDipY - controlPointDipY);
        BlueTriangleRotation.Angle = Math.Atan2(tangentY, tangentX) * (180.0 / Math.PI) + 90.0;

        // Scale pulse — sin curve, peaks at 1.3× at mid-flight.
        var scalePulse = 1.0 + Math.Sin(linearProgress * Math.PI) * 0.3;
        BlueTriangleScale.ScaleX = scalePulse;
        BlueTriangleScale.ScaleY = scalePulse;
    };
    _flightFrameTimer.Start();
}

/// <summary>
/// Appends one character of the bubble phrase per tick with a jittered
/// 30–60 ms delay, recursing via a one-shot DispatcherTimer until the
/// phrase is complete or the flight is cancelled.
/// </summary>
private void StreamBubbleCharacters(string phrase, int characterIndex, Action onStreamComplete)
{
    if (!_isFlightActive)
    {
        // Flight was cancelled / interrupted — stop streaming.
        return;
    }

    if (characterIndex >= phrase.Length)
    {
        onStreamComplete();
        return;
    }

    PointerBubbleText.Text += phrase[characterIndex];
    PositionPointerBubble();

    var characterDelayMs = 30 + Random.Shared.Next(31); // 30..60 ms
    var characterTimer = new DispatcherTimer { Interval = TimeSpan.FromMilliseconds(characterDelayMs) };
    characterTimer.Tick += (_, _) =>
    {
        characterTimer.Stop();
        StreamBubbleCharacters(phrase, characterIndex + 1, onStreamComplete);
    };
    characterTimer.Start();
}

/// <summary>Frame-timer scale animation for the bubble's bounce-in.</summary>
private void AnimateBubbleScaleTo(double targetScale, int durationMs)
{
    var startingScale = PointerBubbleScale.ScaleX;
    var startTime = DateTime.UtcNow;
    var scaleTimer = new DispatcherTimer(DispatcherPriority.Render, Dispatcher)
    {
        Interval = AnimationFrameInterval,
    };
    scaleTimer.Tick += (_, _) =>
    {
        var elapsed = (DateTime.UtcNow - startTime).TotalMilliseconds;
        var progress = Math.Clamp(elapsed / durationMs, 0.0, 1.0);
        // Ease-out cubic for a gentle overshoot-free bounce.
        var eased = 1.0 - Math.Pow(1.0 - progress, 3.0);
        var currentScale = startingScale + (targetScale - startingScale) * eased;
        PointerBubbleScale.ScaleX = currentScale;
        PointerBubbleScale.ScaleY = currentScale;

        if (progress >= 1.0)
        {
            scaleTimer.Stop();
        }
    };
    scaleTimer.Start();
}

private void EnsureCurrentTrianglePositionInitialized()
{
    // NOTE(review): (0, 0) doubles as the "never positioned" sentinel, so a
    // triangle legitimately resting at the monitor origin would be re-seeded
    // from the cursor — harmless in practice, but worth confirming.
    if (_triangleCurrentDipX != 0 || _triangleCurrentDipY != 0) return;

    // No prior position recorded — seed from the current system cursor.
    if (!NativeMethods.GetCursorPos(out var cursorDevicePixels)) return;
    var (cursorLocalDipX, cursorLocalDipY) = ConvertGlobalDeviceToLocalDip(cursorDevicePixels.X, cursorDevicePixels.Y);
    _triangleCurrentDipX = cursorLocalDipX + CursorOffsetDipX;
    _triangleCurrentDipY = cursorLocalDipY + CursorOffsetDipY;
    PositionTriangle(_triangleCurrentDipX, _triangleCurrentDipY);
}

private void PositionTriangle(double centroidDipX, double centroidDipY)
{
    Canvas.SetLeft(BlueTriangle, centroidDipX - TriangleCentroidOffsetDipX);
    Canvas.SetTop(BlueTriangle, centroidDipY - TriangleCentroidOffsetDipY);
    _triangleCurrentDipX = centroidDipX;
    _triangleCurrentDipY = centroidDipY;
    if (PointerBubble.Visibility == Visibility.Visible)
    {
        PositionPointerBubble();
    }
}

private void PositionPointerBubble()
{
    // Measure the bubble so we can center it around (triangle + offset).
// NOTE(review): continuation of OverlayWindow.PositionPointerBubble, whose
// opening lines sit above this chunk (reformatted from the collapsed diff).
    PointerBubble.Measure(new Size(double.PositiveInfinity, double.PositiveInfinity));
    var bubbleDesired = PointerBubble.DesiredSize;

    var anchorDipX = _triangleCurrentDipX + BubbleOffsetDipX;
    var anchorDipY = _triangleCurrentDipY + BubbleOffsetDipY;
    Canvas.SetLeft(PointerBubble, anchorDipX - bubbleDesired.Width / 2.0);
    Canvas.SetTop(PointerBubble, anchorDipY - bubbleDesired.Height / 2.0);
}

/// <summary>Converts a global (virtual-desktop) device-pixel coordinate to
/// this monitor's local DIP coordinate space.</summary>
private (double LocalDipX, double LocalDipY) ConvertGlobalDeviceToLocalDip(int globalDeviceX, int globalDeviceY)
{
    var dpiScale = NativeMethods.GetDpiScale(this);
    if (dpiScale <= 0) dpiScale = 1.0;

    var localDeviceX = globalDeviceX - _monitorBoundsLeftDevicePixels;
    var localDeviceY = globalDeviceY - _monitorBoundsTopDevicePixels;
    return (localDeviceX / dpiScale, localDeviceY / dpiScale);
}

private static void ApplyClickThroughExtendedStyles(object? sender, EventArgs eventArgs)
{
    if (sender is not OverlayWindow overlayWindow) return;

    var windowHandle = new WindowInteropHelper(overlayWindow).Handle;
    var currentExtendedStyle = NativeMethods.GetExtendedStyle(windowHandle);
    // WS_EX_TRANSPARENT — forwards mouse events to the window beneath
    // WS_EX_LAYERED — required for WS_EX_TRANSPARENT on a non-child
    // WS_EX_NOACTIVATE — clicking the overlay never steals focus
    // WS_EX_TOOLWINDOW — never appears in Alt+Tab / taskbar
    var desiredExtendedStyle = currentExtendedStyle
        | NativeMethods.WS_EX_TRANSPARENT
        | NativeMethods.WS_EX_LAYERED
        | NativeMethods.WS_EX_NOACTIVATE
        | NativeMethods.WS_EX_TOOLWINDOW;
    NativeMethods.SetExtendedStyle(windowHandle, desiredExtendedStyle);
}
}

// ==== new file: windows/Clicky/Views/StringToVisibilityConverter.cs ====

using System.Globalization;
using System.Windows;
using System.Windows.Data;

namespace Clicky.Views;

/// <summary>
/// Collapses a UI element when its bound string is null, empty, or
/// whitespace; shows it otherwise. Used by the tray panel to hide the
/// transcript and response rows until they have content.
/// </summary>
[ValueConversion(typeof(string), typeof(Visibility))]
public sealed class StringToVisibilityConverter : IValueConverter
{
    public object Convert(object? value, Type targetType, object? parameter, CultureInfo culture)
    {
        return string.IsNullOrWhiteSpace(value as string) ? Visibility.Collapsed : Visibility.Visible;
    }

    public object ConvertBack(object? value, Type targetType, object? parameter, CultureInfo culture)
    {
        // One-way binding only.
        throw new NotSupportedException();
    }
}

// ==== new file: windows/Clicky/Views/TrayPanelWindow.xaml ====
// NOTE(review): the XAML markup for this file was destroyed by the diff
// extraction (all angle-bracket content stripped) and cannot be
// reconstructed from this excerpt — restore it from the original patch.
// The only surviving fragment is this user-facing text, preserved verbatim:
//   Your mic stays off until you hold the shortcut. Responses are spoken back through your speakers.