From 7e9853df10be1f2bdf52bc6a00d151fc51226731 Mon Sep 17 00:00:00 2001 From: Hari Kesavan Date: Sat, 18 Apr 2026 12:09:55 +0200 Subject: [PATCH 1/6] Switch from Claude to OpenAI and organize project into domain directories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace Claude API with OpenAI GPT-4o for the main chat pipeline: - Worker /chat route now proxies to api.openai.com - OpenAIAPI.swift rewritten with SSE streaming via Worker proxy - CompanionManager swapped from ClaudeAPI to OpenAIAPI - Model picker updated to GPT-4o / GPT-4o mini - ClaudeAPI.swift kept as unused reference Organize flat 22-file structure into App/, Voice/, AI/, UI/, Utilities/, Resources/ subdirectories. Xcode auto-syncs via PBXFileSystemSynchronizedRootGroup — no manual pbxproj edits. Add .dev.vars and IDE directories to .gitignore. --- .gitignore | 6 + AGENTS.md | 56 ++-- leanring-buddy.xcodeproj/project.pbxproj | 12 +- leanring-buddy/AGENTS.md | 112 ++++++-- leanring-buddy/{ => AI}/ClaudeAPI.swift | 0 .../{ => AI}/ElementLocationDetector.swift | 0 .../{ => AI}/ElevenLabsTTSClient.swift | 0 leanring-buddy/AI/OpenAIAPI.swift | 253 ++++++++++++++++++ .../{ => App}/AppBundleConfiguration.swift | 0 .../{ => App}/CompanionManager.swift | 28 +- .../{ => App}/leanring_buddyApp.swift | 0 leanring-buddy/OpenAIAPI.swift | 142 ---------- .../AccentColor.colorset/Contents.json | 0 .../AppIcon.appiconset/1024-mac.png | Bin .../AppIcon.appiconset/128-mac.png | Bin .../AppIcon.appiconset/16-mac.png | Bin .../AppIcon.appiconset/256-mac.png | Bin .../AppIcon.appiconset/32-mac.png | Bin .../AppIcon.appiconset/512-mac.png | Bin .../AppIcon.appiconset/64-mac.png | Bin .../AppIcon.appiconset/Contents.json | 0 .../Assets.xcassets/Contents.json | 0 .../Contents.json | 0 .../Image.png | Bin .../Contents.json | 0 .../add-a-project-in-codex.png | Bin .../Contents.json | 0 .../codex-app-screenshot.jpg | Bin .../codex-home-screen.imageset/Contents.json | 0 .../codex-home-screen.png | Bin .../codex-permissions.imageset/Contents.json | 0 .../codex-permissions.png | Bin .../discord-logo.imageset/Contents.json | 0 .../discord-logo.imageset/discord-logo.svg | 0 .../git-tools-prompt.imageset/Contents.json | 0 .../git-tools-prompt.png | Bin .../google-logo.imageset/Contents.json | 0 .../google-logo.imageset/google-logo.svg | 0 .../Contents.json | 0 ...nside-the-makesomething-project-folder.png | Bin .../Contents.json | 0 ...esomething-project-folder-in-downloads.png | Bin .../steve.imageset/Contents.json | 0 .../Assets.xcassets/steve.imageset/steve.jpg | Bin .../{ => Resources}/Assets.xcassets/steve.jpg | Bin .../{ => Resources}/codex-add-project.png | Bin leanring-buddy/{ => Resources}/enter.mp3 | Bin leanring-buddy/{ => Resources}/eshop.mp3 | Bin leanring-buddy/{ => Resources}/ff.mp3 | Bin leanring-buddy/{ => Resources}/steve.jpg | Bin .../{ => UI}/CompanionPanelView.swift | 4 +- .../{ => UI}/CompanionResponseOverlay.swift | 0 leanring-buddy/{ => UI}/DesignSystem.swift | 0 .../{ => UI}/MenuBarPanelManager.swift | 0 leanring-buddy/{ => UI}/OverlayWindow.swift | 0 .../{ => Utilities}/ClickyAnalytics.swift | 0 .../CompanionScreenCaptureUtility.swift | 0 .../WindowPositionManager.swift | 0 .../AppleSpeechTranscriptionProvider.swift | 0 ...mblyAIStreamingTranscriptionProvider.swift | 0 .../BuddyAudioConversionSupport.swift | 0 .../{ => Voice}/BuddyDictationManager.swift | 0 .../BuddyTranscriptionProvider.swift | 0 .../GlobalPushToTalkShortcutMonitor.swift | 0 
.../OpenAIAudioTranscriptionProvider.swift | 0 worker/src/index.ts | 14 +- 66 files changed, 400 insertions(+), 227 deletions(-) rename leanring-buddy/{ => AI}/ClaudeAPI.swift (100%) rename leanring-buddy/{ => AI}/ElementLocationDetector.swift (100%) rename leanring-buddy/{ => AI}/ElevenLabsTTSClient.swift (100%) create mode 100644 leanring-buddy/AI/OpenAIAPI.swift rename leanring-buddy/{ => App}/AppBundleConfiguration.swift (100%) rename leanring-buddy/{ => App}/CompanionManager.swift (98%) rename leanring-buddy/{ => App}/leanring_buddyApp.swift (100%) delete mode 100644 leanring-buddy/OpenAIAPI.swift rename leanring-buddy/{ => Resources}/Assets.xcassets/AccentColor.colorset/Contents.json (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/AppIcon.appiconset/1024-mac.png (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/AppIcon.appiconset/128-mac.png (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/AppIcon.appiconset/16-mac.png (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/AppIcon.appiconset/256-mac.png (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/AppIcon.appiconset/32-mac.png (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/AppIcon.appiconset/512-mac.png (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/AppIcon.appiconset/64-mac.png (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/AppIcon.appiconset/Contents.json (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/Contents.json (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/add-a-project-in-codex-from-sidebar.imageset/Contents.json (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/add-a-project-in-codex-from-sidebar.imageset/Image.png (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/add-a-project-in-codex.imageset/Contents.json (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/add-a-project-in-codex.imageset/add-a-project-in-codex.png (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/codex-app-screenshot.imageset/Contents.json (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/codex-app-screenshot.imageset/codex-app-screenshot.jpg (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/codex-home-screen.imageset/Contents.json (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/codex-home-screen.imageset/codex-home-screen.png (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/codex-permissions.imageset/Contents.json (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/codex-permissions.imageset/codex-permissions.png (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/discord-logo.imageset/Contents.json (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/discord-logo.imageset/discord-logo.svg (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/git-tools-prompt.imageset/Contents.json (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/git-tools-prompt.imageset/git-tools-prompt.png (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/google-logo.imageset/Contents.json (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/google-logo.imageset/google-logo.svg (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/inside-the-makesomething-project-folder.imageset/Contents.json (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/inside-the-makesomething-project-folder.imageset/inside-the-makesomething-project-folder.png (100%) rename leanring-buddy/{ => 
Resources}/Assets.xcassets/makesomething-project-folder-in-downloads.imageset/Contents.json (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/makesomething-project-folder-in-downloads.imageset/makesomething-project-folder-in-downloads.png (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/steve.imageset/Contents.json (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/steve.imageset/steve.jpg (100%) rename leanring-buddy/{ => Resources}/Assets.xcassets/steve.jpg (100%) rename leanring-buddy/{ => Resources}/codex-add-project.png (100%) rename leanring-buddy/{ => Resources}/enter.mp3 (100%) rename leanring-buddy/{ => Resources}/eshop.mp3 (100%) rename leanring-buddy/{ => Resources}/ff.mp3 (100%) rename leanring-buddy/{ => Resources}/steve.jpg (100%) rename leanring-buddy/{ => UI}/CompanionPanelView.swift (99%) rename leanring-buddy/{ => UI}/CompanionResponseOverlay.swift (100%) rename leanring-buddy/{ => UI}/DesignSystem.swift (100%) rename leanring-buddy/{ => UI}/MenuBarPanelManager.swift (100%) rename leanring-buddy/{ => UI}/OverlayWindow.swift (100%) rename leanring-buddy/{ => Utilities}/ClickyAnalytics.swift (100%) rename leanring-buddy/{ => Utilities}/CompanionScreenCaptureUtility.swift (100%) rename leanring-buddy/{ => Utilities}/WindowPositionManager.swift (100%) rename leanring-buddy/{ => Voice}/AppleSpeechTranscriptionProvider.swift (100%) rename leanring-buddy/{ => Voice}/AssemblyAIStreamingTranscriptionProvider.swift (100%) rename leanring-buddy/{ => Voice}/BuddyAudioConversionSupport.swift (100%) rename leanring-buddy/{ => Voice}/BuddyDictationManager.swift (100%) rename leanring-buddy/{ => Voice}/BuddyTranscriptionProvider.swift (100%) rename leanring-buddy/{ => Voice}/GlobalPushToTalkShortcutMonitor.swift (100%) rename leanring-buddy/{ => Voice}/OpenAIAudioTranscriptionProvider.swift (100%) diff --git a/.gitignore b/.gitignore index 832e80a1..3f7848cc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,14 @@ worker/node_modules/ worker/.dev.vars +worker/.wrangler/ +.dev.vars .DS_Store *.xcuserstate +*.xcuserdatad +xcuserdata/ build/ releases/ .claude/ +.sisyphus/ +.idea/ coding-plans/ diff --git a/AGENTS.md b/AGENTS.md index 6946d441..f1768b57 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -14,12 +14,12 @@ All API keys live on a Cloudflare Worker proxy — nothing sensitive ships in th - **App Type**: Menu bar-only (`LSUIElement=true`), no dock icon or main window - **Framework**: SwiftUI (macOS native) with AppKit bridging for menu bar panel and cursor overlay - **Pattern**: MVVM with `@StateObject` / `@Published` state management -- **AI Chat**: Claude (Sonnet 4.6 default, Opus 4.6 optional) via Cloudflare Worker proxy with SSE streaming +- **AI Chat**: OpenAI (GPT-4o default, GPT-4o mini optional) via Cloudflare Worker proxy with SSE streaming - **Speech-to-Text**: AssemblyAI real-time streaming (`u3-rt-pro` model) via websocket, with OpenAI and Apple Speech as fallbacks - **Text-to-Speech**: ElevenLabs (`eleven_flash_v2_5` model) via Cloudflare Worker proxy - **Screen Capture**: ScreenCaptureKit (macOS 14.2+), multi-monitor support - **Voice Input**: Push-to-talk via `AVAudioEngine` + pluggable transcription-provider layer. System-wide keyboard shortcut via listen-only CGEvent tap. -- **Element Pointing**: Claude embeds `[POINT:x,y:label:screenN]` tags in responses. The overlay parses these, maps coordinates to the correct monitor, and animates the blue cursor along a bezier arc to the target. 
+- **Element Pointing**: The AI embeds `[POINT:x,y:label:screenN]` tags in responses. The overlay parses these, maps coordinates to the correct monitor, and animates the blue cursor along a bezier arc to the target. - **Concurrency**: `@MainActor` isolation, async/await throughout - **Analytics**: PostHog via `ClickyAnalytics.swift` @@ -29,11 +29,11 @@ The app never calls external APIs directly. All requests go through a Cloudflare | Route | Upstream | Purpose | |-------|----------|---------| -| `POST /chat` | `api.anthropic.com/v1/messages` | Claude vision + streaming chat | +| `POST /chat` | `api.openai.com/v1/chat/completions` | OpenAI vision + streaming chat | | `POST /tts` | `api.elevenlabs.io/v1/text-to-speech/{voiceId}` | ElevenLabs TTS audio | | `POST /transcribe-token` | `streaming.assemblyai.com/v3/token` | Fetches a short-lived (480s) AssemblyAI websocket token | -Worker secrets: `ANTHROPIC_API_KEY`, `ASSEMBLYAI_API_KEY`, `ELEVENLABS_API_KEY` +Worker secrets: `OPENAI_API_KEY`, `ASSEMBLYAI_API_KEY`, `ELEVENLABS_API_KEY` Worker vars: `ELEVENLABS_VOICE_ID` ### Key Architecture Decisions @@ -52,29 +52,29 @@ Worker vars: `ELEVENLABS_VOICE_ID` | File | Lines | Purpose | |------|-------|---------| -| `leanring_buddyApp.swift` | ~89 | Menu bar app entry point. Uses `@NSApplicationDelegateAdaptor` with `CompanionAppDelegate` which creates `MenuBarPanelManager` and starts `CompanionManager`. No main window — the app lives entirely in the status bar. | -| `CompanionManager.swift` | ~1026 | Central state machine. Owns dictation, shortcut monitoring, screen capture, Claude API, ElevenLabs TTS, and overlay management. Tracks voice state (idle/listening/processing/responding), conversation history, model selection, and cursor visibility. Coordinates the full push-to-talk → screenshot → Claude → TTS → pointing pipeline. | -| `MenuBarPanelManager.swift` | ~243 | NSStatusItem + custom NSPanel lifecycle. Creates the menu bar icon, manages the floating companion panel (show/hide/position), installs click-outside-to-dismiss monitor. | -| `CompanionPanelView.swift` | ~761 | SwiftUI panel content for the menu bar dropdown. Shows companion status, push-to-talk instructions, model picker (Sonnet/Opus), permissions UI, DM feedback button, and quit button. Dark aesthetic using `DS` design system. | -| `OverlayWindow.swift` | ~881 | Full-screen transparent overlay hosting the blue cursor, response text, waveform, and spinner. Handles cursor animation, element pointing with bezier arcs, multi-monitor coordinate mapping, and fade-out transitions. | -| `CompanionResponseOverlay.swift` | ~217 | SwiftUI view for the response text bubble and waveform displayed next to the cursor in the overlay. | -| `CompanionScreenCaptureUtility.swift` | ~132 | Multi-monitor screenshot capture using ScreenCaptureKit. Returns labeled image data for each connected display. | -| `BuddyDictationManager.swift` | ~866 | Push-to-talk voice pipeline. Handles microphone capture via `AVAudioEngine`, provider-aware permission checks, keyboard/button dictation sessions, transcript finalization, shortcut parsing, contextual keyterms, and live audio-level reporting for waveform feedback. | -| `BuddyTranscriptionProvider.swift` | ~100 | Protocol surface and provider factory for voice transcription backends. Resolves provider based on `VoiceTranscriptionProvider` in Info.plist — AssemblyAI, OpenAI, or Apple Speech. | -| `AssemblyAIStreamingTranscriptionProvider.swift` | ~478 | Streaming transcription provider. 
Fetches temp tokens from the Cloudflare Worker, opens an AssemblyAI v3 websocket, streams PCM16 audio, tracks turn-based transcripts, and delivers finalized text on key-up. Shares a single URLSession across all sessions. | -| `OpenAIAudioTranscriptionProvider.swift` | ~317 | Upload-based transcription provider. Buffers push-to-talk audio locally, uploads as WAV on release, returns finalized transcript. | -| `AppleSpeechTranscriptionProvider.swift` | ~147 | Local fallback transcription provider backed by Apple's Speech framework. | -| `BuddyAudioConversionSupport.swift` | ~108 | Audio conversion helpers. Converts live mic buffers to PCM16 mono audio and builds WAV payloads for upload-based providers. | -| `GlobalPushToTalkShortcutMonitor.swift` | ~132 | System-wide push-to-talk monitor. Owns the listen-only `CGEvent` tap and publishes press/release transitions. | -| `ClaudeAPI.swift` | ~291 | Claude vision API client with streaming (SSE) and non-streaming modes. TLS warmup optimization, image MIME detection, conversation history support. | -| `OpenAIAPI.swift` | ~142 | OpenAI GPT vision API client. | -| `ElevenLabsTTSClient.swift` | ~81 | ElevenLabs TTS client. Sends text to the Worker proxy, plays back audio via `AVAudioPlayer`. Exposes `isPlaying` for transient cursor scheduling. | -| `ElementLocationDetector.swift` | ~335 | Detects UI element locations in screenshots for cursor pointing. | -| `DesignSystem.swift` | ~880 | Design system tokens — colors, corner radii, shared styles. All UI references `DS.Colors`, `DS.CornerRadius`, etc. | -| `ClickyAnalytics.swift` | ~121 | PostHog analytics integration for usage tracking. | -| `WindowPositionManager.swift` | ~262 | Window placement logic, Screen Recording permission flow, and accessibility permission helpers. | -| `AppBundleConfiguration.swift` | ~28 | Runtime configuration reader for keys stored in the app bundle Info.plist. | -| `worker/src/index.ts` | ~142 | Cloudflare Worker proxy. Three routes: `/chat` (Claude), `/tts` (ElevenLabs), `/transcribe-token` (AssemblyAI temp token). | +| `App/leanring_buddyApp.swift` | ~89 | Menu bar app entry point. Uses `@NSApplicationDelegateAdaptor` with `CompanionAppDelegate` which creates `MenuBarPanelManager` and starts `CompanionManager`. No main window — the app lives entirely in the status bar. | +| `App/CompanionManager.swift` | ~1026 | Central state machine. Owns dictation, shortcut monitoring, screen capture, OpenAI API, ElevenLabs TTS, and overlay management. Tracks voice state (idle/listening/processing/responding), conversation history, model selection, and cursor visibility. Coordinates the full push-to-talk → screenshot → OpenAI → TTS → pointing pipeline. | +| `App/AppBundleConfiguration.swift` | ~28 | Runtime configuration reader for keys stored in the app bundle Info.plist. | +| `UI/MenuBarPanelManager.swift` | ~243 | NSStatusItem + custom NSPanel lifecycle. Creates the menu bar icon, manages the floating companion panel (show/hide/position), installs click-outside-to-dismiss monitor. | +| `UI/CompanionPanelView.swift` | ~761 | SwiftUI panel content for the menu bar dropdown. Shows companion status, push-to-talk instructions, model picker (GPT-4o/GPT-4o mini), permissions UI, DM feedback button, and quit button. Dark aesthetic using `DS` design system. | +| `UI/OverlayWindow.swift` | ~881 | Full-screen transparent overlay hosting the blue cursor, response text, waveform, and spinner. 
Handles cursor animation, element pointing with bezier arcs, multi-monitor coordinate mapping, and fade-out transitions. |
+| `UI/CompanionResponseOverlay.swift` | ~217 | SwiftUI view for the response text bubble and waveform displayed next to the cursor in the overlay. |
+| `UI/DesignSystem.swift` | ~880 | Design system tokens — colors, corner radii, shared styles. All UI references `DS.Colors`, `DS.CornerRadius`, etc. |
+| `Voice/BuddyDictationManager.swift` | ~866 | Push-to-talk voice pipeline. Handles microphone capture via `AVAudioEngine`, provider-aware permission checks, keyboard/button dictation sessions, transcript finalization, shortcut parsing, contextual keyterms, and live audio-level reporting for waveform feedback. |
+| `Voice/GlobalPushToTalkShortcutMonitor.swift` | ~132 | System-wide push-to-talk monitor. Owns the listen-only `CGEvent` tap and publishes press/release transitions. |
+| `Voice/BuddyTranscriptionProvider.swift` | ~100 | Protocol surface and provider factory for voice transcription backends. Resolves provider based on `VoiceTranscriptionProvider` in Info.plist — AssemblyAI, OpenAI, or Apple Speech. |
+| `Voice/AssemblyAIStreamingTranscriptionProvider.swift` | ~478 | Streaming transcription provider. Fetches temp tokens from the Cloudflare Worker, opens an AssemblyAI v3 websocket, streams PCM16 audio, tracks turn-based transcripts, and delivers finalized text on key-up. Shares a single URLSession across all sessions. |
+| `Voice/OpenAIAudioTranscriptionProvider.swift` | ~317 | Upload-based transcription provider. Buffers push-to-talk audio locally, uploads as WAV on release, returns finalized transcript. |
+| `Voice/AppleSpeechTranscriptionProvider.swift` | ~147 | Local fallback transcription provider backed by Apple's Speech framework. |
+| `Voice/BuddyAudioConversionSupport.swift` | ~108 | Audio conversion helpers. Converts live mic buffers to PCM16 mono audio and builds WAV payloads for upload-based providers. |
+| `AI/OpenAIAPI.swift` | ~253 | OpenAI GPT-4o vision API client with SSE streaming. Routes through Cloudflare Worker proxy. |
+| `AI/ClaudeAPI.swift` | ~291 | Claude vision API client with streaming (SSE) and non-streaming modes. Currently unused — kept as reference. |
+| `AI/ElevenLabsTTSClient.swift` | ~81 | ElevenLabs TTS client. Sends text to the Worker proxy, plays back audio via `AVAudioPlayer`. Exposes `isPlaying` for transient cursor scheduling. |
+| `AI/ElementLocationDetector.swift` | ~335 | Detects UI element locations in screenshots using Claude Computer Use API for cursor pointing. |
+| `Utilities/CompanionScreenCaptureUtility.swift` | ~132 | Multi-monitor screenshot capture using ScreenCaptureKit. Returns labeled image data for each connected display. |
+| `Utilities/WindowPositionManager.swift` | ~262 | Window placement logic, Screen Recording permission flow, and accessibility permission helpers. |
+| `Utilities/ClickyAnalytics.swift` | ~121 | PostHog analytics integration for usage tracking. |
+| `worker/src/index.ts` | ~141 | Cloudflare Worker proxy. Three routes: `/chat` (OpenAI), `/tts` (ElevenLabs), `/transcribe-token` (AssemblyAI temp token).
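+
+For orientation, here is a minimal sketch of how a `[POINT:x,y:label:screenN]` tag can be parsed out of a model response. The shipped parser lives in `CompanionManager` (see `PointingParseResult`); the struct, function name, and regex below are illustrative assumptions, not the actual implementation:
+
+```swift
+import Foundation
+
+// Hypothetical sketch only: the real parsing lives in CompanionManager
+// (PointingParseResult). Names and the regex here are assumptions.
+struct PointTag {
+    let x: Double
+    let y: Double
+    let label: String
+    let screenIndex: Int
+}
+
+func parsePointTags(from response: String) -> (cleanText: String, tags: [PointTag]) {
+    // Matches e.g. [POINT:420,133:Submit button:screen1]
+    let pattern = #"\[POINT:(\d+),(\d+):([^:\]]+):screen(\d+)\]"#
+    guard let regex = try? NSRegularExpression(pattern: pattern) else { return (response, []) }
+    let fullRange = NSRange(response.startIndex..., in: response)
+
+    var tags: [PointTag] = []
+    for match in regex.matches(in: response, range: fullRange) {
+        guard let xRange = Range(match.range(at: 1), in: response),
+              let yRange = Range(match.range(at: 2), in: response),
+              let labelRange = Range(match.range(at: 3), in: response),
+              let screenRange = Range(match.range(at: 4), in: response),
+              let x = Double(response[xRange]),
+              let y = Double(response[yRange]),
+              let screen = Int(response[screenRange]) else { continue }
+        tags.append(PointTag(x: x, y: y, label: String(response[labelRange]), screenIndex: screen))
+    }
+
+    // Strip the tags so TTS never reads them aloud.
+    let cleanText = regex
+        .stringByReplacingMatches(in: response, range: fullRange, withTemplate: "")
+        .trimmingCharacters(in: .whitespacesAndNewlines)
+    return (cleanText, tags)
+}
+```
+
+`CompanionManager` then maps the tag's coordinates to the matching screen and hands them to the overlay for the bezier-arc flight.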
| ## Build & Run @@ -97,7 +97,7 @@ cd worker npm install # Add secrets -npx wrangler secret put ANTHROPIC_API_KEY +npx wrangler secret put OPENAI_API_KEY npx wrangler secret put ASSEMBLYAI_API_KEY npx wrangler secret put ELEVENLABS_API_KEY diff --git a/leanring-buddy.xcodeproj/project.pbxproj b/leanring-buddy.xcodeproj/project.pbxproj index 75e57261..39eacb4c 100644 --- a/leanring-buddy.xcodeproj/project.pbxproj +++ b/leanring-buddy.xcodeproj/project.pbxproj @@ -411,7 +411,7 @@ CODE_SIGN_STYLE = Automatic; COMBINE_HIDPI_IMAGES = YES; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 2UDAY4J48G; + DEVELOPMENT_TEAM = 6B89JTKXCY; ENABLE_APP_SANDBOX = NO; ENABLE_HARDENED_RUNTIME = YES; ENABLE_OUTGOING_NETWORK_CONNECTIONS = YES; @@ -449,7 +449,7 @@ CODE_SIGN_STYLE = Automatic; COMBINE_HIDPI_IMAGES = YES; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 2UDAY4J48G; + DEVELOPMENT_TEAM = 6B89JTKXCY; ENABLE_APP_SANDBOX = NO; ENABLE_HARDENED_RUNTIME = YES; ENABLE_OUTGOING_NETWORK_CONNECTIONS = YES; @@ -484,7 +484,7 @@ BUNDLE_LOADER = "$(TEST_HOST)"; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 6D7X9GGZAW; + DEVELOPMENT_TEAM = 6B89JTKXCY; GENERATE_INFOPLIST_FILE = YES; MACOSX_DEPLOYMENT_TARGET = 14.2; MARKETING_VERSION = 1.0; @@ -505,7 +505,7 @@ BUNDLE_LOADER = "$(TEST_HOST)"; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 6D7X9GGZAW; + DEVELOPMENT_TEAM = 6B89JTKXCY; GENERATE_INFOPLIST_FILE = YES; MACOSX_DEPLOYMENT_TARGET = 14.2; MARKETING_VERSION = 1.0; @@ -525,7 +525,7 @@ buildSettings = { CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 6D7X9GGZAW; + DEVELOPMENT_TEAM = 6B89JTKXCY; GENERATE_INFOPLIST_FILE = YES; MARKETING_VERSION = 1.0; PRODUCT_BUNDLE_IDENTIFIER = "com.yourcompany.leanring-buddyUITests"; @@ -544,7 +544,7 @@ buildSettings = { CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 6D7X9GGZAW; + DEVELOPMENT_TEAM = 6B89JTKXCY; GENERATE_INFOPLIST_FILE = YES; MARKETING_VERSION = 1.0; PRODUCT_BUNDLE_IDENTIFIER = "com.yourcompany.leanring-buddyUITests"; diff --git a/leanring-buddy/AGENTS.md b/leanring-buddy/AGENTS.md index b59cb091..bde23b56 100644 --- a/leanring-buddy/AGENTS.md +++ b/leanring-buddy/AGENTS.md @@ -1,28 +1,84 @@ -# AGENTS.md - leanring-buddy (Main App Target) - -## Source Files - -### FloatingSessionButton.swift -- `FloatingSessionButtonManager` — `@MainActor` class managing the `NSPanel` lifecycle - - `showFloatingButton()` — Creates/shows the panel in top-right of primary screen - - `hideFloatingButton()` — Hides panel (keeps it alive for quick re-show) - - `destroyFloatingButton()` — Removes panel permanently (session ended) - - `onFloatingButtonClicked` — Callback closure, set by ContentView to bring main window to front - - `floatingButtonPanel` — Exposed `NSPanel` reference for screenshot exclusion -- `FloatingButtonView` — Private SwiftUI view with gradient circle, scale+glow hover animation, pointer cursor - -### ContentView.swift -- Receives `FloatingSessionButtonManager` via `@EnvironmentObject` -- `isMainWindowCurrentlyFocused` — Tracks main window focus state -- `configureFloatingButtonManager()` — Wires up the click callback -- `startObservingMainWindowFocusChanges()` — Sets up `NSWindow` notification observers -- `updateFloatingButtonVisibility()` — Core logic: show if running + not focused, hide otherwise -- `bringMainWindowToFront()` — Activates app and orders main window front - -### ScreenshotManager.swift -- 
`floatingButtonWindowToExcludeFromCaptures` — `NSWindow?` reference set by ContentView -- `captureScreen()` — Matches the floating window to an `SCWindow` and excludes it from capture filter - -### leanring_buddyApp.swift -- Owns `FloatingSessionButtonManager` as `@StateObject` -- Injects it into ContentView via `.environmentObject()` +# leanring-buddy — Main App Target + +> 22 Swift files, ~7,751 LOC. Organized into subdirectories by domain. + +## Directory Structure + +``` +leanring-buddy/ +├── App/ Entry point, central orchestrator, runtime config +├── Voice/ Push-to-talk, mic capture, transcription providers +├── AI/ API clients (OpenAI, Claude, ElevenLabs, element detection) +├── UI/ Menu bar panel, overlay, design system +├── Utilities/ Screenshots, permissions, analytics +├── Resources/ Assets, audio files, images +├── Info.plist +├── leanring-buddy.entitlements +└── AGENTS.md +``` + +## WHERE TO LOOK + +| Task | File(s) | Dir | Notes | +|------|---------|-----|-------| +| App bootstrap / lifecycle | `leanring_buddyApp.swift` | App/ | `@main` + `CompanionAppDelegate` inline. No separate AppDelegate file. | +| Voice state machine | `CompanionManager.swift` | App/ | Central orchestrator. Owns dictation, OpenAI API, TTS, overlay, onboarding. 9 MARK sections. | +| Runtime config | `AppBundleConfiguration.swift` | App/ | Reads keys from `Info.plist` at runtime. | +| Menu bar icon + panel | `MenuBarPanelManager.swift` | UI/ | `NSStatusItem` + custom borderless `NSPanel`. Non-activating, auto-dismiss on outside click. | +| Panel UI (dropdown) | `CompanionPanelView.swift` | UI/ | SwiftUI. Model picker, permissions, push-to-talk instructions, quit button. | +| Cursor overlay | `OverlayWindow.swift` | UI/ | Full-screen transparent `NSPanel` via `NSHostingView`. Cursor animation, bezier arc pointing, multi-monitor coordinate mapping. | +| Response bubble + waveform | `CompanionResponseOverlay.swift` | UI/ | SwiftUI view rendered in the overlay next to the cursor. | +| Design tokens | `DesignSystem.swift` | UI/ | `DS.Colors.*`, `DS.CornerRadius.*`, button styles. All UI references this. | +| Push-to-talk pipeline | `BuddyDictationManager.swift` | Voice/ | `AVAudioEngine` mic capture, provider-aware permissions, transcript finalization, contextual keyterms. | +| Global hotkey | `GlobalPushToTalkShortcutMonitor.swift` | Voice/ | Listen-only `CGEvent` tap (not AppKit global monitor). Publishes press/release transitions. | +| Transcription protocol | `BuddyTranscriptionProvider.swift` | Voice/ | Protocol + factory. Provider resolved from `Info.plist` `VoiceTranscriptionProvider` key. | +| Transcription (default) | `AssemblyAIStreamingTranscriptionProvider.swift` | Voice/ | Real-time websocket (`u3-rt-pro`). Fetches temp token from Worker. **Shares a single `URLSession`** — never create per-session. | +| Transcription (upload) | `OpenAIAudioTranscriptionProvider.swift` | Voice/ | Buffers audio, uploads WAV on key-up. | +| Transcription (local) | `AppleSpeechTranscriptionProvider.swift` | Voice/ | Apple Speech framework fallback. | +| Audio conversion | `BuddyAudioConversionSupport.swift` | Voice/ | PCM16 mono conversion, WAV payload builder. | +| OpenAI chat | `OpenAIAPI.swift` | AI/ | GPT-4o vision client with SSE streaming. Routes through Worker proxy. | +| Claude chat | `ClaudeAPI.swift` | AI/ | SSE streaming + non-streaming. Currently unused (kept as reference). | +| TTS playback | `ElevenLabsTTSClient.swift` | AI/ | Worker proxy → `AVAudioPlayer`. 
Exposes `isPlaying` for transient cursor scheduling. |
+| Element pointing | `ElementLocationDetector.swift` | AI/ | Uses Claude Computer Use API to detect UI element coordinates in screenshots. |
+| Screenshots | `CompanionScreenCaptureUtility.swift` | Utilities/ | ScreenCaptureKit multi-monitor capture. Returns labeled image data per display. |
+| Window placement + perms | `WindowPositionManager.swift` | Utilities/ | Screen Recording permission gate, accessibility permission helpers, window positioning. |
+| Analytics | `ClickyAnalytics.swift` | Utilities/ | PostHog integration. |
+
+## CODE MAP — Key Symbols
+
+| Symbol | Type | File | Role |
+|--------|------|------|------|
+| `CompanionVoiceState` | enum | CompanionManager | `.idle` / `.listening` / `.processing` / `.responding` |
+| `CompanionManager` | class | CompanionManager | Central `@MainActor ObservableObject`. Owns everything. |
+| `CompanionManager.start()` | method | CompanionManager | Bootstrap: permissions → polling → bindings → TLS warmup → overlay |
+| `CompanionManager.sendTranscriptToAIWithScreenshot` | method | CompanionManager | Core pipeline: screenshot → OpenAI SSE → parse pointing → TTS |
+| `CompanionManager.handleShortcutTransition` | method | CompanionManager | Push-to-talk state machine (pressed → record, released → finalize) |
+| `BuddyDictationManager` | class | BuddyDictationManager | Mic capture + transcript lifecycle |
+| `BuddyTranscriptionProvider` | protocol | BuddyTranscriptionProvider | Abstraction over AssemblyAI/OpenAI/Apple Speech |
+| `BuddyPushToTalkShortcut` | enum | BuddyDictationManager | Shortcut options + transition detection logic |
+| `MenuBarPanelManager` | class | MenuBarPanelManager | `NSStatusItem` + `NSPanel` lifecycle |
+| `OverlayWindowManager` | class | OverlayWindow | Creates/manages full-screen overlay panels per screen |
+| `PointingParseResult` | struct | CompanionManager | Parsed `[POINT:x,y:label:screenN]` tag data |
+| `DS` | enum | DesignSystem | Namespace for all design tokens |
+
+## CONVENTIONS (specific to this directory)
+
+- **Organized by domain**: Files grouped into `App/`, `Voice/`, `AI/`, `UI/`, `Utilities/`, `Resources/`. Xcode auto-syncs via `PBXFileSystemSynchronizedRootGroup`.
+- **MARK sections**: Large files use `// MARK: - Section Name` to organize logical subsystems (CompanionManager has 9 sections).
+- **Provider pattern**: Transcription uses protocol + factory + Info.plist key. Add new providers by implementing `BuddyTranscriptionProvider` and registering in `BuddyTranscriptionProviderFactory`.
+- **AppKit bridging**: `NSPanel` + `NSHostingView` for menu bar panel and overlay. Comments explain "why" for all bridging code.
+- **No `@EnvironmentObject`**: State flows through `CompanionManager` passed explicitly to views via init parameters.
+
+## ANTI-PATTERNS (this directory only)
+
+- **Never create/destroy `URLSession` per AssemblyAI session** — use `sharedWebSocketURLSession`. Per-session sessions corrupt the OS connection pool.
+- **Never suppress type errors** with force casts or `// swiftlint:disable` — fix them properly.
+- **Never suppress the deprecated `onChange` warning** in OverlayWindow.swift — it's a known non-blocking warning; leave it as-is.
+- **Never add features/refactor beyond what was asked** — scope discipline.
+- **Never add docstrings/comments to code you didn't change.** + +## TESTS + +- Unit tests: `leanring-buddyTests/` — Swift Testing framework, 3 tests for `WindowPositionManager` permission logic only +- UI tests: `leanring-buddyUITests/` — XCTest, placeholder/boilerplate +- Coverage: ~0.5%. Most code untested. diff --git a/leanring-buddy/ClaudeAPI.swift b/leanring-buddy/AI/ClaudeAPI.swift similarity index 100% rename from leanring-buddy/ClaudeAPI.swift rename to leanring-buddy/AI/ClaudeAPI.swift diff --git a/leanring-buddy/ElementLocationDetector.swift b/leanring-buddy/AI/ElementLocationDetector.swift similarity index 100% rename from leanring-buddy/ElementLocationDetector.swift rename to leanring-buddy/AI/ElementLocationDetector.swift diff --git a/leanring-buddy/ElevenLabsTTSClient.swift b/leanring-buddy/AI/ElevenLabsTTSClient.swift similarity index 100% rename from leanring-buddy/ElevenLabsTTSClient.swift rename to leanring-buddy/AI/ElevenLabsTTSClient.swift diff --git a/leanring-buddy/AI/OpenAIAPI.swift b/leanring-buddy/AI/OpenAIAPI.swift new file mode 100644 index 00000000..7426b2bc --- /dev/null +++ b/leanring-buddy/AI/OpenAIAPI.swift @@ -0,0 +1,253 @@ +// +// OpenAIAPI.swift +// OpenAI API Implementation with streaming support +// + +import Foundation + +/// OpenAI API helper with streaming for progressive text display. +/// Routes through the Cloudflare Worker proxy so no API key ships in the app. +class OpenAIAPI { + private static let tlsWarmupLock = NSLock() + private static var hasStartedTLSWarmup = false + + private let apiURL: URL + var model: String + private let session: URLSession + + init(proxyURL: String, model: String = "gpt-4o") { + self.apiURL = URL(string: proxyURL)! + self.model = model + + let config = URLSessionConfiguration.default + config.timeoutIntervalForRequest = 120 + config.timeoutIntervalForResource = 300 + config.waitsForConnectivity = true + config.urlCache = nil + config.httpCookieStorage = nil + self.session = URLSession(configuration: config) + + warmUpTLSConnectionIfNeeded() + } + + private func makeAPIRequest() -> URLRequest { + var request = URLRequest(url: apiURL) + request.httpMethod = "POST" + request.timeoutInterval = 120 + request.setValue("application/json", forHTTPHeaderField: "Content-Type") + return request + } + + private func warmUpTLSConnectionIfNeeded() { + Self.tlsWarmupLock.lock() + let shouldStartTLSWarmup = !Self.hasStartedTLSWarmup + if shouldStartTLSWarmup { + Self.hasStartedTLSWarmup = true + } + Self.tlsWarmupLock.unlock() + + guard shouldStartTLSWarmup else { return } + + guard var warmupURLComponents = URLComponents(url: apiURL, resolvingAgainstBaseURL: false) else { + return + } + + warmupURLComponents.path = "/" + warmupURLComponents.query = nil + warmupURLComponents.fragment = nil + + guard let warmupURL = warmupURLComponents.url else { + return + } + + var warmupRequest = URLRequest(url: warmupURL) + warmupRequest.httpMethod = "HEAD" + warmupRequest.timeoutInterval = 10 + session.dataTask(with: warmupRequest) { _, _, _ in }.resume() + } + + /// Send a vision request to OpenAI with streaming. + /// Calls `onTextChunk` on the main actor each time new text arrives so the UI updates progressively. + /// Returns the full accumulated text and total duration when the stream completes. 
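+    ///
+    /// Note: each `onTextChunk` call receives the full accumulated text so far rather than
+    /// just the newest delta, so callers can replace their displayed string wholesale.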
+ func analyzeImageStreaming( + images: [(data: Data, label: String)], + systemPrompt: String, + conversationHistory: [(userPlaceholder: String, assistantResponse: String)] = [], + userPrompt: String, + onTextChunk: @MainActor @Sendable (String) -> Void + ) async throws -> (text: String, duration: TimeInterval) { + let startTime = Date() + + var request = makeAPIRequest() + + var messages: [[String: Any]] = [] + + // OpenAI uses a system message in the messages array + messages.append(["role": "system", "content": systemPrompt]) + + for (userPlaceholder, assistantResponse) in conversationHistory { + messages.append(["role": "user", "content": userPlaceholder]) + messages.append(["role": "assistant", "content": assistantResponse]) + } + + // Build current message with all labeled images + prompt + var contentBlocks: [[String: Any]] = [] + for image in images { + contentBlocks.append([ + "type": "text", + "text": image.label + ]) + contentBlocks.append([ + "type": "image_url", + "image_url": [ + "url": "data:image/jpeg;base64,\(image.data.base64EncodedString())" + ] + ]) + } + contentBlocks.append([ + "type": "text", + "text": userPrompt + ]) + messages.append(["role": "user", "content": contentBlocks]) + + let body: [String: Any] = [ + "model": model, + "max_completion_tokens": 1024, + "stream": true, + "messages": messages + ] + + let bodyData = try JSONSerialization.data(withJSONObject: body) + request.httpBody = bodyData + let payloadMB = Double(bodyData.count) / 1_048_576.0 + print("🌐 OpenAI streaming request: \(String(format: "%.1f", payloadMB))MB, \(images.count) image(s)") + + let (byteStream, response) = try await session.bytes(for: request) + + guard let httpResponse = response as? HTTPURLResponse else { + throw NSError( + domain: "OpenAIAPI", + code: -1, + userInfo: [NSLocalizedDescriptionKey: "Invalid HTTP response"] + ) + } + + guard (200...299).contains(httpResponse.statusCode) else { + var errorBodyChunks: [String] = [] + for try await line in byteStream.lines { + errorBodyChunks.append(line) + } + let errorBody = errorBodyChunks.joined(separator: "\n") + throw NSError( + domain: "OpenAIAPI", + code: httpResponse.statusCode, + userInfo: [NSLocalizedDescriptionKey: "API Error (\(httpResponse.statusCode)): \(errorBody)"] + ) + } + + // Parse OpenAI SSE stream — each event is "data: {json}\n\n" + // OpenAI uses choices[0].delta.content for streaming text chunks + var accumulatedResponseText = "" + + for try await line in byteStream.lines { + guard line.hasPrefix("data: ") else { continue } + let jsonString = String(line.dropFirst(6)) + + guard jsonString != "[DONE]" else { break } + + guard let jsonData = jsonString.data(using: .utf8), + let eventPayload = try? JSONSerialization.jsonObject(with: jsonData) as? [String: Any], + let choices = eventPayload["choices"] as? [[String: Any]], + let firstChoice = choices.first, + let delta = firstChoice["delta"] as? [String: Any], + let textChunk = delta["content"] as? String else { + continue + } + + accumulatedResponseText += textChunk + let currentAccumulatedText = accumulatedResponseText + await onTextChunk(currentAccumulatedText) + } + + let duration = Date().timeIntervalSince(startTime) + return (text: accumulatedResponseText, duration: duration) + } + + /// Non-streaming fallback for validation requests where we don't need progressive display. 
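+    ///
+    /// Uses a smaller `max_completion_tokens` budget (256 here vs 1024 when streaming),
+    /// since validation responses only need a short answer.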
+ func analyzeImage( + images: [(data: Data, label: String)], + systemPrompt: String, + conversationHistory: [(userPlaceholder: String, assistantResponse: String)] = [], + userPrompt: String + ) async throws -> (text: String, duration: TimeInterval) { + let startTime = Date() + + var request = makeAPIRequest() + + var messages: [[String: Any]] = [] + + messages.append(["role": "system", "content": systemPrompt]) + + for (userPlaceholder, assistantResponse) in conversationHistory { + messages.append(["role": "user", "content": userPlaceholder]) + messages.append(["role": "assistant", "content": assistantResponse]) + } + + var contentBlocks: [[String: Any]] = [] + for image in images { + contentBlocks.append([ + "type": "text", + "text": image.label + ]) + contentBlocks.append([ + "type": "image_url", + "image_url": [ + "url": "data:image/jpeg;base64,\(image.data.base64EncodedString())" + ] + ]) + } + contentBlocks.append([ + "type": "text", + "text": userPrompt + ]) + messages.append(["role": "user", "content": contentBlocks]) + + let body: [String: Any] = [ + "model": model, + "max_completion_tokens": 256, + "messages": messages + ] + + let bodyData = try JSONSerialization.data(withJSONObject: body) + request.httpBody = bodyData + let payloadMB = Double(bodyData.count) / 1_048_576.0 + print("🌐 OpenAI request: \(String(format: "%.1f", payloadMB))MB, \(images.count) image(s)") + + let (data, response) = try await session.data(for: request) + + guard let httpResponse = response as? HTTPURLResponse, + (200...299).contains(httpResponse.statusCode) else { + let responseString = String(data: data, encoding: .utf8) ?? "Unknown error" + throw NSError( + domain: "OpenAIAPI", + code: (response as? HTTPURLResponse)?.statusCode ?? -1, + userInfo: [NSLocalizedDescriptionKey: "API Error: \(responseString)"] + ) + } + + let json = try JSONSerialization.jsonObject(with: data) as? [String: Any] + guard let choices = json?["choices"] as? [[String: Any]], + let firstChoice = choices.first, + let message = firstChoice["message"] as? [String: Any], + let text = message["content"] as? String else { + throw NSError( + domain: "OpenAIAPI", + code: -1, + userInfo: [NSLocalizedDescriptionKey: "Invalid response format"] + ) + } + + let duration = Date().timeIntervalSince(startTime) + return (text: text, duration: duration) + } +} diff --git a/leanring-buddy/AppBundleConfiguration.swift b/leanring-buddy/App/AppBundleConfiguration.swift similarity index 100% rename from leanring-buddy/AppBundleConfiguration.swift rename to leanring-buddy/App/AppBundleConfiguration.swift diff --git a/leanring-buddy/CompanionManager.swift b/leanring-buddy/App/CompanionManager.swift similarity index 98% rename from leanring-buddy/CompanionManager.swift rename to leanring-buddy/App/CompanionManager.swift index 0234cf19..7bcb65a1 100644 --- a/leanring-buddy/CompanionManager.swift +++ b/leanring-buddy/App/CompanionManager.swift @@ -72,16 +72,16 @@ final class CompanionManager: ObservableObject { /// through this so keys never ship in the app binary. 
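    /// NOTE: `your-worker-name.your-subdomain.workers.dev` is a placeholder;
    /// point it at your deployed Worker URL before building.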
private static let workerBaseURL = "https://your-worker-name.your-subdomain.workers.dev" - private lazy var claudeAPI: ClaudeAPI = { - return ClaudeAPI(proxyURL: "\(Self.workerBaseURL)/chat", model: selectedModel) + private lazy var openAIAPI: OpenAIAPI = { + return OpenAIAPI(proxyURL: "\(Self.workerBaseURL)/chat", model: selectedModel) }() private lazy var elevenLabsTTSClient: ElevenLabsTTSClient = { return ElevenLabsTTSClient(proxyURL: "\(Self.workerBaseURL)/tts") }() - /// Conversation history so Claude remembers prior exchanges within a session. - /// Each entry is the user's transcript and Claude's response. + /// Conversation history so the AI remembers prior exchanges within a session. + /// Each entry is the user's transcript and the AI's response. private var conversationHistory: [(userTranscript: String, assistantResponse: String)] = [] /// The currently running AI response task, if any. Cancelled when the user @@ -107,13 +107,13 @@ final class CompanionManager: ObservableObject { /// Used by the panel to show accurate status text ("Active" vs "Ready"). @Published private(set) var isOverlayVisible: Bool = false - /// The Claude model used for voice responses. Persisted to UserDefaults. - @Published var selectedModel: String = UserDefaults.standard.string(forKey: "selectedClaudeModel") ?? "claude-sonnet-4-6" + /// The AI model used for voice responses. Persisted to UserDefaults. + @Published var selectedModel: String = UserDefaults.standard.string(forKey: "selectedAIModel") ?? "gpt-4o" func setSelectedModel(_ model: String) { selectedModel = model - UserDefaults.standard.set(model, forKey: "selectedClaudeModel") - claudeAPI.model = model + UserDefaults.standard.set(model, forKey: "selectedAIModel") + openAIAPI.model = model } /// User preference for whether the Clicky cursor should be shown. @@ -179,9 +179,9 @@ final class CompanionManager: ObservableObject { bindVoiceStateObservation() bindAudioPowerLevel() bindShortcutTransitions() - // Eagerly touch the Claude API so its TLS warmup handshake completes + // Eagerly touch the OpenAI API so its TLS warmup handshake completes // well before the onboarding demo fires at ~40s into the video. - _ = claudeAPI + _ = openAIAPI // If the user already completed onboarding AND all permissions are // still granted, show the cursor overlay immediately. If permissions @@ -521,7 +521,7 @@ final class CompanionManager: ObservableObject { self?.lastTranscript = finalTranscript print("🗣️ Companion received transcript: \(finalTranscript)") ClickyAnalytics.trackUserMessageSent(transcript: finalTranscript) - self?.sendTranscriptToClaudeWithScreenshot(transcript: finalTranscript) + self?.sendTranscriptToAIWithScreenshot(transcript: finalTranscript) } ) } @@ -583,7 +583,7 @@ final class CompanionManager: ObservableObject { /// the spinner/processing state until TTS audio begins playing. /// Claude's response may include a [POINT:x,y:label] tag which triggers /// the buddy to fly to that element on screen. 
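    /// (Full tag format: `[POINT:x,y:label:screenN]`; the trailing `screenN` selects
    /// which monitor's overlay the cursor targets. Parsed into `PointingParseResult`.)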
- private func sendTranscriptToClaudeWithScreenshot(transcript: String) { + private func sendTranscriptToAIWithScreenshot(transcript: String) { currentResponseTask?.cancel() elevenLabsTTSClient.stopPlayback() @@ -610,7 +610,7 @@ final class CompanionManager: ObservableObject { (userPlaceholder: entry.userTranscript, assistantResponse: entry.assistantResponse) } - let (fullResponseText, _) = try await claudeAPI.analyzeImageStreaming( + let (fullResponseText, _) = try await openAIAPI.analyzeImageStreaming( images: labeledImages, systemPrompt: Self.companionVoiceResponseSystemPrompt, conversationHistory: historyForAPI, @@ -982,7 +982,7 @@ final class CompanionManager: ObservableObject { let dimensionInfo = " (image dimensions: \(cursorScreenCapture.screenshotWidthInPixels)x\(cursorScreenCapture.screenshotHeightInPixels) pixels)" let labeledImages = [(data: cursorScreenCapture.imageData, label: cursorScreenCapture.label + dimensionInfo)] - let (fullResponseText, _) = try await claudeAPI.analyzeImageStreaming( + let (fullResponseText, _) = try await openAIAPI.analyzeImageStreaming( images: labeledImages, systemPrompt: Self.onboardingDemoSystemPrompt, userPrompt: "look around my screen and find something interesting to point at", diff --git a/leanring-buddy/leanring_buddyApp.swift b/leanring-buddy/App/leanring_buddyApp.swift similarity index 100% rename from leanring-buddy/leanring_buddyApp.swift rename to leanring-buddy/App/leanring_buddyApp.swift diff --git a/leanring-buddy/OpenAIAPI.swift b/leanring-buddy/OpenAIAPI.swift deleted file mode 100644 index d0c3f2ae..00000000 --- a/leanring-buddy/OpenAIAPI.swift +++ /dev/null @@ -1,142 +0,0 @@ -// -// OpenAIAPI.swift -// OpenAI API Implementation -// - -import Foundation - -/// OpenAI API helper for vision analysis -class OpenAIAPI { - private let apiKey: String - private let apiURL: URL - private let model: String - private let session: URLSession - - init(apiKey: String, model: String = "gpt-5.2-2025-12-11") { - self.apiKey = apiKey - self.apiURL = URL(string: "https://api.openai.com/v1/chat/completions")! - self.model = model - - // Use .default instead of .ephemeral so TLS session tickets are cached. - // Ephemeral sessions do a full TLS handshake on every request, which causes - // transient -1200 (errSSLPeerHandshakeFail) errors with large image payloads. - // Disable URL/cookie caching to avoid storing responses or credentials on disk. - let config = URLSessionConfiguration.default - config.timeoutIntervalForRequest = 120 - config.timeoutIntervalForResource = 300 - config.waitsForConnectivity = true - config.urlCache = nil - config.httpCookieStorage = nil - self.session = URLSession(configuration: config) - - // Fire a lightweight HEAD request in the background to pre-establish the TLS - // connection. This caches the TLS session ticket so the first real API call - // (which carries a large image payload) doesn't need a cold TLS handshake. - warmUpTLSConnection() - } - - /// Sends a no-op HEAD request to the API host to establish and cache a TLS session. - /// Failures are silently ignored — this is purely an optimization. - private func warmUpTLSConnection() { - var warmupRequest = URLRequest(url: apiURL) - warmupRequest.httpMethod = "HEAD" - warmupRequest.timeoutInterval = 10 - session.dataTask(with: warmupRequest) { _, _, _ in - // Response doesn't matter — the TLS handshake is the goal - }.resume() - } - - /// Send a vision request to OpenAI with one or more labeled images. 
- func analyzeImage( - images: [(data: Data, label: String)], - systemPrompt: String, - conversationHistory: [(userPlaceholder: String, assistantResponse: String)] = [], - userPrompt: String - ) async throws -> (text: String, duration: TimeInterval) { - let startTime = Date() - - // Build request - var request = URLRequest(url: apiURL) - request.httpMethod = "POST" - request.timeoutInterval = 120 - request.setValue("Bearer \(apiKey)", forHTTPHeaderField: "Authorization") - request.setValue("application/json", forHTTPHeaderField: "Content-Type") - - // Build messages array - var messages: [[String: Any]] = [] - - // Add system message first - messages.append([ - "role": "system", - "content": systemPrompt - ]) - - // Add conversation history - for (userPlaceholder, assistantResponse) in conversationHistory { - messages.append(["role": "user", "content": userPlaceholder]) - messages.append(["role": "assistant", "content": assistantResponse]) - } - - // Build current message with all labeled images + prompt - var contentBlocks: [[String: Any]] = [] - for image in images { - contentBlocks.append([ - "type": "text", - "text": image.label - ]) - contentBlocks.append([ - "type": "image_url", - "image_url": [ - "url": "data:image/jpeg;base64,\(image.data.base64EncodedString())" - ] - ]) - } - contentBlocks.append([ - "type": "text", - "text": userPrompt - ]) - messages.append(["role": "user", "content": contentBlocks]) - - // Build request body - let body: [String: Any] = [ - "model": model, - // `max_tokens` is deprecated/incompatible for some newer OpenAI models. - "max_completion_tokens": 600, - "messages": messages - ] - - let bodyData = try JSONSerialization.data(withJSONObject: body) - request.httpBody = bodyData - let payloadMB = Double(bodyData.count) / 1_048_576.0 - print("🌐 OpenAI request: \(String(format: "%.1f", payloadMB))MB, \(images.count) image(s)") - - // Send request - let (data, response) = try await session.data(for: request) - - guard let httpResponse = response as? HTTPURLResponse, - (200...299).contains(httpResponse.statusCode) else { - let responseString = String(data: data, encoding: .utf8) ?? "Unknown error" - throw NSError( - domain: "OpenAIAPI", - code: (response as? HTTPURLResponse)?.statusCode ?? -1, - userInfo: [NSLocalizedDescriptionKey: "API Error: \(responseString)"] - ) - } - - // Parse response - let json = try JSONSerialization.jsonObject(with: data) as? [String: Any] - guard let choices = json?["choices"] as? [[String: Any]], - let firstChoice = choices.first, - let message = firstChoice["message"] as? [String: Any], - let text = message["content"] as? 
String else { - throw NSError( - domain: "OpenAIAPI", - code: -1, - userInfo: [NSLocalizedDescriptionKey: "Invalid response format"] - ) - } - - let duration = Date().timeIntervalSince(startTime) - return (text: text, duration: duration) - } -} diff --git a/leanring-buddy/Assets.xcassets/AccentColor.colorset/Contents.json b/leanring-buddy/Resources/Assets.xcassets/AccentColor.colorset/Contents.json similarity index 100% rename from leanring-buddy/Assets.xcassets/AccentColor.colorset/Contents.json rename to leanring-buddy/Resources/Assets.xcassets/AccentColor.colorset/Contents.json diff --git a/leanring-buddy/Assets.xcassets/AppIcon.appiconset/1024-mac.png b/leanring-buddy/Resources/Assets.xcassets/AppIcon.appiconset/1024-mac.png similarity index 100% rename from leanring-buddy/Assets.xcassets/AppIcon.appiconset/1024-mac.png rename to leanring-buddy/Resources/Assets.xcassets/AppIcon.appiconset/1024-mac.png diff --git a/leanring-buddy/Assets.xcassets/AppIcon.appiconset/128-mac.png b/leanring-buddy/Resources/Assets.xcassets/AppIcon.appiconset/128-mac.png similarity index 100% rename from leanring-buddy/Assets.xcassets/AppIcon.appiconset/128-mac.png rename to leanring-buddy/Resources/Assets.xcassets/AppIcon.appiconset/128-mac.png diff --git a/leanring-buddy/Assets.xcassets/AppIcon.appiconset/16-mac.png b/leanring-buddy/Resources/Assets.xcassets/AppIcon.appiconset/16-mac.png similarity index 100% rename from leanring-buddy/Assets.xcassets/AppIcon.appiconset/16-mac.png rename to leanring-buddy/Resources/Assets.xcassets/AppIcon.appiconset/16-mac.png diff --git a/leanring-buddy/Assets.xcassets/AppIcon.appiconset/256-mac.png b/leanring-buddy/Resources/Assets.xcassets/AppIcon.appiconset/256-mac.png similarity index 100% rename from leanring-buddy/Assets.xcassets/AppIcon.appiconset/256-mac.png rename to leanring-buddy/Resources/Assets.xcassets/AppIcon.appiconset/256-mac.png diff --git a/leanring-buddy/Assets.xcassets/AppIcon.appiconset/32-mac.png b/leanring-buddy/Resources/Assets.xcassets/AppIcon.appiconset/32-mac.png similarity index 100% rename from leanring-buddy/Assets.xcassets/AppIcon.appiconset/32-mac.png rename to leanring-buddy/Resources/Assets.xcassets/AppIcon.appiconset/32-mac.png diff --git a/leanring-buddy/Assets.xcassets/AppIcon.appiconset/512-mac.png b/leanring-buddy/Resources/Assets.xcassets/AppIcon.appiconset/512-mac.png similarity index 100% rename from leanring-buddy/Assets.xcassets/AppIcon.appiconset/512-mac.png rename to leanring-buddy/Resources/Assets.xcassets/AppIcon.appiconset/512-mac.png diff --git a/leanring-buddy/Assets.xcassets/AppIcon.appiconset/64-mac.png b/leanring-buddy/Resources/Assets.xcassets/AppIcon.appiconset/64-mac.png similarity index 100% rename from leanring-buddy/Assets.xcassets/AppIcon.appiconset/64-mac.png rename to leanring-buddy/Resources/Assets.xcassets/AppIcon.appiconset/64-mac.png diff --git a/leanring-buddy/Assets.xcassets/AppIcon.appiconset/Contents.json b/leanring-buddy/Resources/Assets.xcassets/AppIcon.appiconset/Contents.json similarity index 100% rename from leanring-buddy/Assets.xcassets/AppIcon.appiconset/Contents.json rename to leanring-buddy/Resources/Assets.xcassets/AppIcon.appiconset/Contents.json diff --git a/leanring-buddy/Assets.xcassets/Contents.json b/leanring-buddy/Resources/Assets.xcassets/Contents.json similarity index 100% rename from leanring-buddy/Assets.xcassets/Contents.json rename to leanring-buddy/Resources/Assets.xcassets/Contents.json diff --git 
a/leanring-buddy/Assets.xcassets/add-a-project-in-codex-from-sidebar.imageset/Contents.json b/leanring-buddy/Resources/Assets.xcassets/add-a-project-in-codex-from-sidebar.imageset/Contents.json similarity index 100% rename from leanring-buddy/Assets.xcassets/add-a-project-in-codex-from-sidebar.imageset/Contents.json rename to leanring-buddy/Resources/Assets.xcassets/add-a-project-in-codex-from-sidebar.imageset/Contents.json diff --git a/leanring-buddy/Assets.xcassets/add-a-project-in-codex-from-sidebar.imageset/Image.png b/leanring-buddy/Resources/Assets.xcassets/add-a-project-in-codex-from-sidebar.imageset/Image.png similarity index 100% rename from leanring-buddy/Assets.xcassets/add-a-project-in-codex-from-sidebar.imageset/Image.png rename to leanring-buddy/Resources/Assets.xcassets/add-a-project-in-codex-from-sidebar.imageset/Image.png diff --git a/leanring-buddy/Assets.xcassets/add-a-project-in-codex.imageset/Contents.json b/leanring-buddy/Resources/Assets.xcassets/add-a-project-in-codex.imageset/Contents.json similarity index 100% rename from leanring-buddy/Assets.xcassets/add-a-project-in-codex.imageset/Contents.json rename to leanring-buddy/Resources/Assets.xcassets/add-a-project-in-codex.imageset/Contents.json diff --git a/leanring-buddy/Assets.xcassets/add-a-project-in-codex.imageset/add-a-project-in-codex.png b/leanring-buddy/Resources/Assets.xcassets/add-a-project-in-codex.imageset/add-a-project-in-codex.png similarity index 100% rename from leanring-buddy/Assets.xcassets/add-a-project-in-codex.imageset/add-a-project-in-codex.png rename to leanring-buddy/Resources/Assets.xcassets/add-a-project-in-codex.imageset/add-a-project-in-codex.png diff --git a/leanring-buddy/Assets.xcassets/codex-app-screenshot.imageset/Contents.json b/leanring-buddy/Resources/Assets.xcassets/codex-app-screenshot.imageset/Contents.json similarity index 100% rename from leanring-buddy/Assets.xcassets/codex-app-screenshot.imageset/Contents.json rename to leanring-buddy/Resources/Assets.xcassets/codex-app-screenshot.imageset/Contents.json diff --git a/leanring-buddy/Assets.xcassets/codex-app-screenshot.imageset/codex-app-screenshot.jpg b/leanring-buddy/Resources/Assets.xcassets/codex-app-screenshot.imageset/codex-app-screenshot.jpg similarity index 100% rename from leanring-buddy/Assets.xcassets/codex-app-screenshot.imageset/codex-app-screenshot.jpg rename to leanring-buddy/Resources/Assets.xcassets/codex-app-screenshot.imageset/codex-app-screenshot.jpg diff --git a/leanring-buddy/Assets.xcassets/codex-home-screen.imageset/Contents.json b/leanring-buddy/Resources/Assets.xcassets/codex-home-screen.imageset/Contents.json similarity index 100% rename from leanring-buddy/Assets.xcassets/codex-home-screen.imageset/Contents.json rename to leanring-buddy/Resources/Assets.xcassets/codex-home-screen.imageset/Contents.json diff --git a/leanring-buddy/Assets.xcassets/codex-home-screen.imageset/codex-home-screen.png b/leanring-buddy/Resources/Assets.xcassets/codex-home-screen.imageset/codex-home-screen.png similarity index 100% rename from leanring-buddy/Assets.xcassets/codex-home-screen.imageset/codex-home-screen.png rename to leanring-buddy/Resources/Assets.xcassets/codex-home-screen.imageset/codex-home-screen.png diff --git a/leanring-buddy/Assets.xcassets/codex-permissions.imageset/Contents.json b/leanring-buddy/Resources/Assets.xcassets/codex-permissions.imageset/Contents.json similarity index 100% rename from leanring-buddy/Assets.xcassets/codex-permissions.imageset/Contents.json rename to 
leanring-buddy/Resources/Assets.xcassets/codex-permissions.imageset/Contents.json diff --git a/leanring-buddy/Assets.xcassets/codex-permissions.imageset/codex-permissions.png b/leanring-buddy/Resources/Assets.xcassets/codex-permissions.imageset/codex-permissions.png similarity index 100% rename from leanring-buddy/Assets.xcassets/codex-permissions.imageset/codex-permissions.png rename to leanring-buddy/Resources/Assets.xcassets/codex-permissions.imageset/codex-permissions.png diff --git a/leanring-buddy/Assets.xcassets/discord-logo.imageset/Contents.json b/leanring-buddy/Resources/Assets.xcassets/discord-logo.imageset/Contents.json similarity index 100% rename from leanring-buddy/Assets.xcassets/discord-logo.imageset/Contents.json rename to leanring-buddy/Resources/Assets.xcassets/discord-logo.imageset/Contents.json diff --git a/leanring-buddy/Assets.xcassets/discord-logo.imageset/discord-logo.svg b/leanring-buddy/Resources/Assets.xcassets/discord-logo.imageset/discord-logo.svg similarity index 100% rename from leanring-buddy/Assets.xcassets/discord-logo.imageset/discord-logo.svg rename to leanring-buddy/Resources/Assets.xcassets/discord-logo.imageset/discord-logo.svg diff --git a/leanring-buddy/Assets.xcassets/git-tools-prompt.imageset/Contents.json b/leanring-buddy/Resources/Assets.xcassets/git-tools-prompt.imageset/Contents.json similarity index 100% rename from leanring-buddy/Assets.xcassets/git-tools-prompt.imageset/Contents.json rename to leanring-buddy/Resources/Assets.xcassets/git-tools-prompt.imageset/Contents.json diff --git a/leanring-buddy/Assets.xcassets/git-tools-prompt.imageset/git-tools-prompt.png b/leanring-buddy/Resources/Assets.xcassets/git-tools-prompt.imageset/git-tools-prompt.png similarity index 100% rename from leanring-buddy/Assets.xcassets/git-tools-prompt.imageset/git-tools-prompt.png rename to leanring-buddy/Resources/Assets.xcassets/git-tools-prompt.imageset/git-tools-prompt.png diff --git a/leanring-buddy/Assets.xcassets/google-logo.imageset/Contents.json b/leanring-buddy/Resources/Assets.xcassets/google-logo.imageset/Contents.json similarity index 100% rename from leanring-buddy/Assets.xcassets/google-logo.imageset/Contents.json rename to leanring-buddy/Resources/Assets.xcassets/google-logo.imageset/Contents.json diff --git a/leanring-buddy/Assets.xcassets/google-logo.imageset/google-logo.svg b/leanring-buddy/Resources/Assets.xcassets/google-logo.imageset/google-logo.svg similarity index 100% rename from leanring-buddy/Assets.xcassets/google-logo.imageset/google-logo.svg rename to leanring-buddy/Resources/Assets.xcassets/google-logo.imageset/google-logo.svg diff --git a/leanring-buddy/Assets.xcassets/inside-the-makesomething-project-folder.imageset/Contents.json b/leanring-buddy/Resources/Assets.xcassets/inside-the-makesomething-project-folder.imageset/Contents.json similarity index 100% rename from leanring-buddy/Assets.xcassets/inside-the-makesomething-project-folder.imageset/Contents.json rename to leanring-buddy/Resources/Assets.xcassets/inside-the-makesomething-project-folder.imageset/Contents.json diff --git a/leanring-buddy/Assets.xcassets/inside-the-makesomething-project-folder.imageset/inside-the-makesomething-project-folder.png b/leanring-buddy/Resources/Assets.xcassets/inside-the-makesomething-project-folder.imageset/inside-the-makesomething-project-folder.png similarity index 100% rename from leanring-buddy/Assets.xcassets/inside-the-makesomething-project-folder.imageset/inside-the-makesomething-project-folder.png rename to 
leanring-buddy/Resources/Assets.xcassets/inside-the-makesomething-project-folder.imageset/inside-the-makesomething-project-folder.png diff --git a/leanring-buddy/Assets.xcassets/makesomething-project-folder-in-downloads.imageset/Contents.json b/leanring-buddy/Resources/Assets.xcassets/makesomething-project-folder-in-downloads.imageset/Contents.json similarity index 100% rename from leanring-buddy/Assets.xcassets/makesomething-project-folder-in-downloads.imageset/Contents.json rename to leanring-buddy/Resources/Assets.xcassets/makesomething-project-folder-in-downloads.imageset/Contents.json diff --git a/leanring-buddy/Assets.xcassets/makesomething-project-folder-in-downloads.imageset/makesomething-project-folder-in-downloads.png b/leanring-buddy/Resources/Assets.xcassets/makesomething-project-folder-in-downloads.imageset/makesomething-project-folder-in-downloads.png similarity index 100% rename from leanring-buddy/Assets.xcassets/makesomething-project-folder-in-downloads.imageset/makesomething-project-folder-in-downloads.png rename to leanring-buddy/Resources/Assets.xcassets/makesomething-project-folder-in-downloads.imageset/makesomething-project-folder-in-downloads.png diff --git a/leanring-buddy/Assets.xcassets/steve.imageset/Contents.json b/leanring-buddy/Resources/Assets.xcassets/steve.imageset/Contents.json similarity index 100% rename from leanring-buddy/Assets.xcassets/steve.imageset/Contents.json rename to leanring-buddy/Resources/Assets.xcassets/steve.imageset/Contents.json diff --git a/leanring-buddy/Assets.xcassets/steve.imageset/steve.jpg b/leanring-buddy/Resources/Assets.xcassets/steve.imageset/steve.jpg similarity index 100% rename from leanring-buddy/Assets.xcassets/steve.imageset/steve.jpg rename to leanring-buddy/Resources/Assets.xcassets/steve.imageset/steve.jpg diff --git a/leanring-buddy/Assets.xcassets/steve.jpg b/leanring-buddy/Resources/Assets.xcassets/steve.jpg similarity index 100% rename from leanring-buddy/Assets.xcassets/steve.jpg rename to leanring-buddy/Resources/Assets.xcassets/steve.jpg diff --git a/leanring-buddy/codex-add-project.png b/leanring-buddy/Resources/codex-add-project.png similarity index 100% rename from leanring-buddy/codex-add-project.png rename to leanring-buddy/Resources/codex-add-project.png diff --git a/leanring-buddy/enter.mp3 b/leanring-buddy/Resources/enter.mp3 similarity index 100% rename from leanring-buddy/enter.mp3 rename to leanring-buddy/Resources/enter.mp3 diff --git a/leanring-buddy/eshop.mp3 b/leanring-buddy/Resources/eshop.mp3 similarity index 100% rename from leanring-buddy/eshop.mp3 rename to leanring-buddy/Resources/eshop.mp3 diff --git a/leanring-buddy/ff.mp3 b/leanring-buddy/Resources/ff.mp3 similarity index 100% rename from leanring-buddy/ff.mp3 rename to leanring-buddy/Resources/ff.mp3 diff --git a/leanring-buddy/steve.jpg b/leanring-buddy/Resources/steve.jpg similarity index 100% rename from leanring-buddy/steve.jpg rename to leanring-buddy/Resources/steve.jpg diff --git a/leanring-buddy/CompanionPanelView.swift b/leanring-buddy/UI/CompanionPanelView.swift similarity index 99% rename from leanring-buddy/CompanionPanelView.swift rename to leanring-buddy/UI/CompanionPanelView.swift index 76789b4c..8c0e3965 100644 --- a/leanring-buddy/CompanionPanelView.swift +++ b/leanring-buddy/UI/CompanionPanelView.swift @@ -607,8 +607,8 @@ struct CompanionPanelView: View { Spacer() HStack(spacing: 0) { - modelOptionButton(label: "Sonnet", modelID: "claude-sonnet-4-6") - modelOptionButton(label: "Opus", modelID: "claude-opus-4-6") + 
modelOptionButton(label: "GPT-4o", modelID: "gpt-4o") + modelOptionButton(label: "GPT-4o mini", modelID: "gpt-4o-mini") } .background( RoundedRectangle(cornerRadius: 6, style: .continuous) diff --git a/leanring-buddy/CompanionResponseOverlay.swift b/leanring-buddy/UI/CompanionResponseOverlay.swift similarity index 100% rename from leanring-buddy/CompanionResponseOverlay.swift rename to leanring-buddy/UI/CompanionResponseOverlay.swift diff --git a/leanring-buddy/DesignSystem.swift b/leanring-buddy/UI/DesignSystem.swift similarity index 100% rename from leanring-buddy/DesignSystem.swift rename to leanring-buddy/UI/DesignSystem.swift diff --git a/leanring-buddy/MenuBarPanelManager.swift b/leanring-buddy/UI/MenuBarPanelManager.swift similarity index 100% rename from leanring-buddy/MenuBarPanelManager.swift rename to leanring-buddy/UI/MenuBarPanelManager.swift diff --git a/leanring-buddy/OverlayWindow.swift b/leanring-buddy/UI/OverlayWindow.swift similarity index 100% rename from leanring-buddy/OverlayWindow.swift rename to leanring-buddy/UI/OverlayWindow.swift diff --git a/leanring-buddy/ClickyAnalytics.swift b/leanring-buddy/Utilities/ClickyAnalytics.swift similarity index 100% rename from leanring-buddy/ClickyAnalytics.swift rename to leanring-buddy/Utilities/ClickyAnalytics.swift diff --git a/leanring-buddy/CompanionScreenCaptureUtility.swift b/leanring-buddy/Utilities/CompanionScreenCaptureUtility.swift similarity index 100% rename from leanring-buddy/CompanionScreenCaptureUtility.swift rename to leanring-buddy/Utilities/CompanionScreenCaptureUtility.swift diff --git a/leanring-buddy/WindowPositionManager.swift b/leanring-buddy/Utilities/WindowPositionManager.swift similarity index 100% rename from leanring-buddy/WindowPositionManager.swift rename to leanring-buddy/Utilities/WindowPositionManager.swift diff --git a/leanring-buddy/AppleSpeechTranscriptionProvider.swift b/leanring-buddy/Voice/AppleSpeechTranscriptionProvider.swift similarity index 100% rename from leanring-buddy/AppleSpeechTranscriptionProvider.swift rename to leanring-buddy/Voice/AppleSpeechTranscriptionProvider.swift diff --git a/leanring-buddy/AssemblyAIStreamingTranscriptionProvider.swift b/leanring-buddy/Voice/AssemblyAIStreamingTranscriptionProvider.swift similarity index 100% rename from leanring-buddy/AssemblyAIStreamingTranscriptionProvider.swift rename to leanring-buddy/Voice/AssemblyAIStreamingTranscriptionProvider.swift diff --git a/leanring-buddy/BuddyAudioConversionSupport.swift b/leanring-buddy/Voice/BuddyAudioConversionSupport.swift similarity index 100% rename from leanring-buddy/BuddyAudioConversionSupport.swift rename to leanring-buddy/Voice/BuddyAudioConversionSupport.swift diff --git a/leanring-buddy/BuddyDictationManager.swift b/leanring-buddy/Voice/BuddyDictationManager.swift similarity index 100% rename from leanring-buddy/BuddyDictationManager.swift rename to leanring-buddy/Voice/BuddyDictationManager.swift diff --git a/leanring-buddy/BuddyTranscriptionProvider.swift b/leanring-buddy/Voice/BuddyTranscriptionProvider.swift similarity index 100% rename from leanring-buddy/BuddyTranscriptionProvider.swift rename to leanring-buddy/Voice/BuddyTranscriptionProvider.swift diff --git a/leanring-buddy/GlobalPushToTalkShortcutMonitor.swift b/leanring-buddy/Voice/GlobalPushToTalkShortcutMonitor.swift similarity index 100% rename from leanring-buddy/GlobalPushToTalkShortcutMonitor.swift rename to leanring-buddy/Voice/GlobalPushToTalkShortcutMonitor.swift diff --git 
a/leanring-buddy/OpenAIAudioTranscriptionProvider.swift b/leanring-buddy/Voice/OpenAIAudioTranscriptionProvider.swift similarity index 100% rename from leanring-buddy/OpenAIAudioTranscriptionProvider.swift rename to leanring-buddy/Voice/OpenAIAudioTranscriptionProvider.swift diff --git a/worker/src/index.ts b/worker/src/index.ts index 2e3e9345..a3da23c0 100644 --- a/worker/src/index.ts +++ b/worker/src/index.ts @@ -1,16 +1,17 @@ /** * Clicky Proxy Worker * - * Proxies requests to Claude and ElevenLabs APIs so the app never + * Proxies requests to OpenAI and ElevenLabs APIs so the app never * ships with raw API keys. Keys are stored as Cloudflare secrets. * * Routes: - * POST /chat → Anthropic Messages API (streaming) + * POST /chat → OpenAI Chat Completions API (streaming) * POST /tts → ElevenLabs TTS API + * POST /transcribe-token → AssemblyAI temp token */ interface Env { - ANTHROPIC_API_KEY: string; + OPENAI_API_KEY: string; ELEVENLABS_API_KEY: string; ELEVENLABS_VOICE_ID: string; ASSEMBLYAI_API_KEY: string; @@ -51,11 +52,10 @@ export default { async function handleChat(request: Request, env: Env): Promise { const body = await request.text(); - const response = await fetch("https://api.anthropic.com/v1/messages", { + const response = await fetch("https://api.openai.com/v1/chat/completions", { method: "POST", headers: { - "x-api-key": env.ANTHROPIC_API_KEY, - "anthropic-version": "2023-06-01", + authorization: `Bearer ${env.OPENAI_API_KEY}`, "content-type": "application/json", }, body, @@ -63,7 +63,7 @@ async function handleChat(request: Request, env: Env): Promise { if (!response.ok) { const errorBody = await response.text(); - console.error(`[/chat] Anthropic API error ${response.status}: ${errorBody}`); + console.error(`[/chat] OpenAI API error ${response.status}: ${errorBody}`); return new Response(errorBody, { status: response.status, headers: { "content-type": "application/json" }, From f1947c7434832858cd60cbf178708f14beec9b38 Mon Sep 17 00:00:00 2001 From: Hari Kesavan Date: Sat, 18 Apr 2026 12:14:51 +0200 Subject: [PATCH 2/6] Skip onboarding flow by hardcoding `hasCompletedOnboarding` to always return true --- leanring-buddy/App/CompanionManager.swift | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/leanring-buddy/App/CompanionManager.swift b/leanring-buddy/App/CompanionManager.swift index 7bcb65a1..3027e241 100644 --- a/leanring-buddy/App/CompanionManager.swift +++ b/leanring-buddy/App/CompanionManager.swift @@ -139,10 +139,10 @@ final class CompanionManager: ObservableObject { } } - /// Whether the user has completed onboarding at least once. Persisted - /// to UserDefaults so the Start button only appears on first launch. + /// Onboarding is disabled — always returns true so the app skips + /// the intro video and goes straight to the companion overlay. var hasCompletedOnboarding: Bool { - get { UserDefaults.standard.bool(forKey: "hasCompletedOnboarding") } + get { return true } set { UserDefaults.standard.set(newValue, forKey: "hasCompletedOnboarding") } } From 013d6e68e992f20f00e19effbdd65dabe33a19dd Mon Sep 17 00:00:00 2001 From: Hari Kesavan Date: Sat, 18 Apr 2026 12:55:07 +0200 Subject: [PATCH 3/6] Integrate `/transcribe` route with OpenAI transcription API and migrate text-to-speech pipeline to OpenAI TTS: - Add `/transcribe` endpoint to Worker for audio transcription. - Replace ElevenLabs TTS with OpenAI TTS (`OpenAITTSClient`), updating dependencies and proxy routes. - Update proxy URLs to localhost for local testing. 
- Refactor transcription providers to use proxy-based routing. - Modify companion logic to support OpenAI TTS playback. --- .gitignore | 2 +- leanring-buddy/AI/ElevenLabsTTSClient.swift | 31 +++------- leanring-buddy/App/CompanionManager.swift | 14 ++--- leanring-buddy/Info.plist | 2 +- ...mblyAIStreamingTranscriptionProvider.swift | 2 +- .../OpenAIAudioTranscriptionProvider.swift | 42 +++++--------- worker/src/index.ts | 57 ++++++++++++++----- 7 files changed, 75 insertions(+), 75 deletions(-) diff --git a/.gitignore b/.gitignore index 3f7848cc..b20bf11e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ worker/node_modules/ worker/.dev.vars worker/.wrangler/ -.dev.vars +worker/.dev.vars .DS_Store *.xcuserstate *.xcuserdatad diff --git a/leanring-buddy/AI/ElevenLabsTTSClient.swift b/leanring-buddy/AI/ElevenLabsTTSClient.swift index 35545c9d..2fed1ff0 100644 --- a/leanring-buddy/AI/ElevenLabsTTSClient.swift +++ b/leanring-buddy/AI/ElevenLabsTTSClient.swift @@ -1,22 +1,15 @@ // -// ElevenLabsTTSClient.swift +// OpenAITTSClient.swift // leanring-buddy // -// Streams text-to-speech audio from ElevenLabs and plays it back -// through the system audio output. Uses the streaming endpoint so -// playback begins before the full audio has been generated. -// import AVFoundation import Foundation @MainActor -final class ElevenLabsTTSClient { +final class OpenAITTSClient { private let proxyURL: URL private let session: URLSession - - /// The audio player for the current TTS playback. Kept alive so the - /// audio finishes playing even if the caller doesn't hold a reference. private var audioPlayer: AVAudioPlayer? init(proxyURL: String) { @@ -28,21 +21,15 @@ final class ElevenLabsTTSClient { self.session = URLSession(configuration: configuration) } - /// Sends `text` to ElevenLabs TTS and plays the resulting audio. - /// Throws on network or decoding errors. Cancellation-safe. func speakText(_ text: String) async throws { var request = URLRequest(url: proxyURL) request.httpMethod = "POST" request.setValue("application/json", forHTTPHeaderField: "Content-Type") - request.setValue("audio/mpeg", forHTTPHeaderField: "Accept") let body: [String: Any] = [ - "text": text, - "model_id": "eleven_flash_v2_5", - "voice_settings": [ - "stability": 0.5, - "similarity_boost": 0.75 - ] + "model": "tts-1", + "voice": "alloy", + "input": text ] request.httpBody = try JSONSerialization.data(withJSONObject: body) @@ -50,13 +37,13 @@ final class ElevenLabsTTSClient { let (data, response) = try await session.data(for: request) guard let httpResponse = response as? HTTPURLResponse else { - throw NSError(domain: "ElevenLabsTTS", code: -1, + throw NSError(domain: "OpenAITTS", code: -1, userInfo: [NSLocalizedDescriptionKey: "Invalid response"]) } guard (200...299).contains(httpResponse.statusCode) else { let errorBody = String(data: data, encoding: .utf8) ?? "Unknown error" - throw NSError(domain: "ElevenLabsTTS", code: httpResponse.statusCode, + throw NSError(domain: "OpenAITTS", code: httpResponse.statusCode, userInfo: [NSLocalizedDescriptionKey: "TTS API error (\(httpResponse.statusCode)): \(errorBody)"]) } @@ -65,15 +52,13 @@ final class ElevenLabsTTSClient { let player = try AVAudioPlayer(data: data) self.audioPlayer = player player.play() - print("🔊 ElevenLabs TTS: playing \(data.count / 1024)KB audio") + print("🔊 OpenAI TTS: playing \(data.count / 1024)KB audio") } - /// Whether TTS audio is currently playing back. var isPlaying: Bool { audioPlayer?.isPlaying ?? 
false } - /// Stops any in-progress playback immediately. func stopPlayback() { audioPlayer?.stop() audioPlayer = nil diff --git a/leanring-buddy/App/CompanionManager.swift b/leanring-buddy/App/CompanionManager.swift index 3027e241..8d5b9ce8 100644 --- a/leanring-buddy/App/CompanionManager.swift +++ b/leanring-buddy/App/CompanionManager.swift @@ -70,14 +70,14 @@ final class CompanionManager: ObservableObject { /// Base URL for the Cloudflare Worker proxy. All API requests route /// through this so keys never ship in the app binary. - private static let workerBaseURL = "https://your-worker-name.your-subdomain.workers.dev" + private static let workerBaseURL = "http://localhost:8787" private lazy var openAIAPI: OpenAIAPI = { return OpenAIAPI(proxyURL: "\(Self.workerBaseURL)/chat", model: selectedModel) }() - private lazy var elevenLabsTTSClient: ElevenLabsTTSClient = { - return ElevenLabsTTSClient(proxyURL: "\(Self.workerBaseURL)/tts") + private lazy var ttsClient: OpenAITTSClient = { + return OpenAITTSClient(proxyURL: "\(Self.workerBaseURL)/tts") }() /// Conversation history so the AI remembers prior exchanges within a session. @@ -493,7 +493,7 @@ final class CompanionManager: ObservableObject { // Cancel any in-progress response and TTS from a previous utterance currentResponseTask?.cancel() - elevenLabsTTSClient.stopPlayback() + ttsClient.stopPlayback() clearDetectedElementLocation() // Dismiss the onboarding prompt if it's showing @@ -585,7 +585,7 @@ final class CompanionManager: ObservableObject { /// the buddy to fly to that element on screen. private func sendTranscriptToAIWithScreenshot(transcript: String) { currentResponseTask?.cancel() - elevenLabsTTSClient.stopPlayback() + ttsClient.stopPlayback() currentResponseTask = Task { // Stay in processing (spinner) state — no streaming text displayed @@ -701,7 +701,7 @@ final class CompanionManager: ObservableObject { // until the audio actually starts playing, then switch to responding. if !spokenText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { do { - try await elevenLabsTTSClient.speakText(spokenText) + try await ttsClient.speakText(spokenText) // speakText returns after player.play() — audio is now playing voiceState = .responding } catch { @@ -735,7 +735,7 @@ final class CompanionManager: ObservableObject { transientHideTask?.cancel() transientHideTask = Task { // Wait for TTS audio to finish playing - while elevenLabsTTSClient.isPlaying { + while ttsClient.isPlaying { try? 
await Task.sleep(nanoseconds: 200_000_000) guard !Task.isCancelled else { return } } diff --git a/leanring-buddy/Info.plist b/leanring-buddy/Info.plist index e3d2b455..db0244da 100644 --- a/leanring-buddy/Info.plist +++ b/leanring-buddy/Info.plist @@ -9,7 +9,7 @@ SUPublicEDKey /l3d2rw5ZZFRU3AadP/w2Zf8FHfhA6bKv16BQOV5OSk= VoiceTranscriptionProvider - assemblyai + openai NSMicrophoneUsageDescription Clicky uses your microphone so you can talk to it NSScreenCaptureUsageDescription diff --git a/leanring-buddy/Voice/AssemblyAIStreamingTranscriptionProvider.swift b/leanring-buddy/Voice/AssemblyAIStreamingTranscriptionProvider.swift index d21286b6..29bde66b 100644 --- a/leanring-buddy/Voice/AssemblyAIStreamingTranscriptionProvider.swift +++ b/leanring-buddy/Voice/AssemblyAIStreamingTranscriptionProvider.swift @@ -19,7 +19,7 @@ struct AssemblyAIStreamingTranscriptionProviderError: LocalizedError { final class AssemblyAIStreamingTranscriptionProvider: BuddyTranscriptionProvider { /// URL for the Cloudflare Worker endpoint that returns a short-lived /// AssemblyAI streaming token. The real API key never leaves the server. - private static let tokenProxyURL = "https://your-worker-name.your-subdomain.workers.dev/transcribe-token" + private static let tokenProxyURL = "http://localhost:8787/transcribe-token" let displayName = "AssemblyAI" let requiresSpeechRecognitionPermission = false diff --git a/leanring-buddy/Voice/OpenAIAudioTranscriptionProvider.swift b/leanring-buddy/Voice/OpenAIAudioTranscriptionProvider.swift index 75092092..3b7dde9e 100644 --- a/leanring-buddy/Voice/OpenAIAudioTranscriptionProvider.swift +++ b/leanring-buddy/Voice/OpenAIAudioTranscriptionProvider.swift @@ -17,21 +17,15 @@ struct OpenAIAudioTranscriptionProviderError: LocalizedError { } final class OpenAIAudioTranscriptionProvider: BuddyTranscriptionProvider { - private let apiKey = AppBundleConfiguration.stringValue(forKey: "OpenAIAPIKey") + private static let transcribeProxyURL = "http://localhost:8787/transcribe" private let modelName = AppBundleConfiguration.stringValue(forKey: "OpenAITranscriptionModel") ?? "gpt-4o-transcribe" let displayName = "OpenAI" let requiresSpeechRecognitionPermission = false - var isConfigured: Bool { - apiKey != nil - } - - var unavailableExplanation: String? { - guard !isConfigured else { return nil } - return "OpenAI transcription is not configured. Add OpenAIAPIKey to Info.plist." - } + var isConfigured: Bool { true } + var unavailableExplanation: String? { nil } func startStreamingSession( keyterms: [String], @@ -39,14 +33,8 @@ final class OpenAIAudioTranscriptionProvider: BuddyTranscriptionProvider { onFinalTranscriptReady: @escaping (String) -> Void, onError: @escaping (Error) -> Void ) async throws -> any BuddyStreamingTranscriptionSession { - guard let apiKey else { - throw OpenAIAudioTranscriptionProviderError( - message: unavailableExplanation ?? "OpenAI transcription is not configured." - ) - } - return OpenAIAudioTranscriptionSession( - apiKey: apiKey, + proxyURL: Self.transcribeProxyURL, modelName: modelName, keyterms: keyterms, onTranscriptUpdate: onTranscriptUpdate, @@ -63,10 +51,9 @@ private final class OpenAIAudioTranscriptionSession: BuddyStreamingTranscription let text: String } - private static let transcriptionURL = URL(string: "https://api.openai.com/v1/audio/transcriptions")! 
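For context on the upload path this hunk reworks: the session buffers PCM16 audio while the push-to-talk key is held, wraps it in a WAV container on release, and posts it as multipart/form-data (the real builder is the `makeMultipartRequestBody` call further down). A rough, self-contained sketch of such a body, with a hypothetical helper name and field names following the OpenAI transcription API:

```swift
import Foundation

// Hypothetical sketch of a multipart/form-data body for an audio
// transcription upload; the "model" and "file" fields follow the OpenAI
// transcription API, the helper name is illustrative.
func makeMultipartBody(wavData: Data, model: String, boundary: String) -> Data {
    var body = Data()
    func append(_ string: String) { body.append(Data(string.utf8)) }

    // Text field carrying the model name.
    append("--\(boundary)\r\n")
    append("Content-Disposition: form-data; name=\"model\"\r\n\r\n")
    append("\(model)\r\n")

    // Binary field carrying the WAV payload.
    append("--\(boundary)\r\n")
    append("Content-Disposition: form-data; name=\"file\"; filename=\"audio.wav\"\r\n")
    append("Content-Type: audio/wav\r\n\r\n")
    body.append(wavData)

    // Closing boundary terminates the form.
    append("\r\n--\(boundary)--\r\n")
    return body
}
```

The boundary string has to match the one in the `Content-Type` header byte for byte, which is why the diff derives both from the same `Boundary-\(UUID().uuidString)` value.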
private static let targetSampleRate = 16_000 - private let apiKey: String + private let proxyURL: URL private let modelName: String private let keyterms: [String] private let onTranscriptUpdate: (String) -> Void @@ -86,14 +73,14 @@ private final class OpenAIAudioTranscriptionSession: BuddyStreamingTranscription private var transcriptionUploadTask: Task? init( - apiKey: String, + proxyURL: String, modelName: String, keyterms: [String], onTranscriptUpdate: @escaping (String) -> Void, onFinalTranscriptReady: @escaping (String) -> Void, onError: @escaping (Error) -> Void ) { - self.apiKey = apiKey + self.proxyURL = URL(string: proxyURL)! self.modelName = modelName self.keyterms = keyterms self.onTranscriptUpdate = onTranscriptUpdate @@ -113,15 +100,15 @@ private final class OpenAIAudioTranscriptionSession: BuddyStreamingTranscription return } - stateQueue.async { - guard !self.hasRequestedFinalTranscript, !self.isCancelled else { return } + stateQueue.async { [weak self] in + guard let self, !self.hasRequestedFinalTranscript, !self.isCancelled else { return } self.bufferedPCM16AudioData.append(audioPCM16Data) } } func requestFinalTranscript() { - stateQueue.async { - guard !self.hasRequestedFinalTranscript, !self.isCancelled else { return } + stateQueue.async { [weak self] in + guard let self, !self.hasRequestedFinalTranscript, !self.isCancelled else { return } self.hasRequestedFinalTranscript = true let bufferedPCM16AudioData = self.bufferedPCM16AudioData @@ -132,13 +119,13 @@ private final class OpenAIAudioTranscriptionSession: BuddyStreamingTranscription } func cancel() { - stateQueue.async { + stateQueue.async { [weak self] in + guard let self else { return } self.isCancelled = true self.bufferedPCM16AudioData.removeAll(keepingCapacity: false) } transcriptionUploadTask?.cancel() - urlSession.invalidateAndCancel() } private func transcribeBufferedAudio(_ bufferedPCM16AudioData: Data) async { @@ -176,9 +163,8 @@ private final class OpenAIAudioTranscriptionSession: BuddyStreamingTranscription private func requestTranscription(for wavAudioData: Data) async throws -> String { let multipartBoundary = "Boundary-\(UUID().uuidString)" - var request = URLRequest(url: Self.transcriptionURL) + var request = URLRequest(url: proxyURL) request.httpMethod = "POST" - request.setValue("Bearer \(apiKey)", forHTTPHeaderField: "Authorization") request.setValue("multipart/form-data; boundary=\(multipartBoundary)", forHTTPHeaderField: "Content-Type") let requestBodyData = makeMultipartRequestBody( diff --git a/worker/src/index.ts b/worker/src/index.ts index a3da23c0..6c482c43 100644 --- a/worker/src/index.ts +++ b/worker/src/index.ts @@ -37,6 +37,10 @@ export default { if (url.pathname === "/transcribe-token") { return await handleTranscribeToken(env); } + + if (url.pathname === "/transcribe") { + return await handleTranscribe(request, env); + } } catch (error) { console.error(`[${url.pathname}] Unhandled error:`, error); return new Response( @@ -106,26 +110,51 @@ async function handleTranscribeToken(env: Env): Promise { }); } +async function handleTranscribe(request: Request, env: Env): Promise { + const body = await request.arrayBuffer(); + const contentType = request.headers.get("content-type") || "multipart/form-data"; + + const response = await fetch("https://api.openai.com/v1/audio/transcriptions", { + method: "POST", + headers: { + authorization: `Bearer ${env.OPENAI_API_KEY}`, + "content-type": contentType, + }, + body, + }); + + if (!response.ok) { + const errorBody = await response.text(); + 
console.error(`[/transcribe] OpenAI API error ${response.status}: ${errorBody}`); + return new Response(errorBody, { + status: response.status, + headers: { "content-type": "application/json" }, + }); + } + + return new Response(response.body, { + status: response.status, + headers: { + "content-type": response.headers.get("content-type") || "application/json", + }, + }); +} + async function handleTTS(request: Request, env: Env): Promise { const body = await request.text(); - const voiceId = env.ELEVENLABS_VOICE_ID; - const response = await fetch( - `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`, - { - method: "POST", - headers: { - "xi-api-key": env.ELEVENLABS_API_KEY, - "content-type": "application/json", - accept: "audio/mpeg", - }, - body, - } - ); + const response = await fetch("https://api.openai.com/v1/audio/speech", { + method: "POST", + headers: { + authorization: `Bearer ${env.OPENAI_API_KEY}`, + "content-type": "application/json", + }, + body, + }); if (!response.ok) { const errorBody = await response.text(); - console.error(`[/tts] ElevenLabs API error ${response.status}: ${errorBody}`); + console.error(`[/tts] OpenAI TTS API error ${response.status}: ${errorBody}`); return new Response(errorBody, { status: response.status, headers: { "content-type": "application/json" }, From a3ee35751d831b6115dd4da79aecfc00a948fe40 Mon Sep 17 00:00:00 2001 From: GeorgeZudikhin Date: Sat, 18 Apr 2026 11:48:52 +0200 Subject: [PATCH 4/6] setup --- .gitignore | 1 - .../xcschemes/xcschememanagement.plist | 14 ++++++++ leanring-buddy/App/CompanionManager.swift | 2 +- leanring-buddy/Info.plist | 6 ++++ ...mblyAIStreamingTranscriptionProvider.swift | 2 +- worker/LOCAL_DEV.md | 29 +++++++++++++++ worker/package-lock.json | 36 ------------------- 7 files changed, 51 insertions(+), 39 deletions(-) create mode 100644 leanring-buddy.xcodeproj/xcuserdata/macbook.xcuserdatad/xcschemes/xcschememanagement.plist create mode 100644 worker/LOCAL_DEV.md diff --git a/.gitignore b/.gitignore index b20bf11e..17aa32fc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,6 @@ worker/node_modules/ worker/.dev.vars worker/.wrangler/ -worker/.dev.vars .DS_Store *.xcuserstate *.xcuserdatad diff --git a/leanring-buddy.xcodeproj/xcuserdata/macbook.xcuserdatad/xcschemes/xcschememanagement.plist b/leanring-buddy.xcodeproj/xcuserdata/macbook.xcuserdatad/xcschemes/xcschememanagement.plist new file mode 100644 index 00000000..6c13490a --- /dev/null +++ b/leanring-buddy.xcodeproj/xcuserdata/macbook.xcuserdatad/xcschemes/xcschememanagement.plist @@ -0,0 +1,14 @@ + + + + + SchemeUserState + + leanring-buddy.xcscheme_^#shared#^_ + + orderHint + 0 + + + + diff --git a/leanring-buddy/App/CompanionManager.swift b/leanring-buddy/App/CompanionManager.swift index 8d5b9ce8..ee465efc 100644 --- a/leanring-buddy/App/CompanionManager.swift +++ b/leanring-buddy/App/CompanionManager.swift @@ -70,7 +70,7 @@ final class CompanionManager: ObservableObject { /// Base URL for the Cloudflare Worker proxy. All API requests route /// through this so keys never ship in the app binary. 
- private static let workerBaseURL = "http://localhost:8787" + private static let workerBaseURL = "http://127.0.0.1:8787" private lazy var openAIAPI: OpenAIAPI = { return OpenAIAPI(proxyURL: "\(Self.workerBaseURL)/chat", model: selectedModel) diff --git a/leanring-buddy/Info.plist b/leanring-buddy/Info.plist index db0244da..aa0306a9 100644 --- a/leanring-buddy/Info.plist +++ b/leanring-buddy/Info.plist @@ -10,6 +10,12 @@ /l3d2rw5ZZFRU3AadP/w2Zf8FHfhA6bKv16BQOV5OSk= VoiceTranscriptionProvider openai + assemblyai + NSAppTransportSecurity + + NSAllowsLocalNetworking + + NSMicrophoneUsageDescription Clicky uses your microphone so you can talk to it NSScreenCaptureUsageDescription diff --git a/leanring-buddy/Voice/AssemblyAIStreamingTranscriptionProvider.swift b/leanring-buddy/Voice/AssemblyAIStreamingTranscriptionProvider.swift index 29bde66b..619e5cf6 100644 --- a/leanring-buddy/Voice/AssemblyAIStreamingTranscriptionProvider.swift +++ b/leanring-buddy/Voice/AssemblyAIStreamingTranscriptionProvider.swift @@ -19,7 +19,7 @@ struct AssemblyAIStreamingTranscriptionProviderError: LocalizedError { final class AssemblyAIStreamingTranscriptionProvider: BuddyTranscriptionProvider { /// URL for the Cloudflare Worker endpoint that returns a short-lived /// AssemblyAI streaming token. The real API key never leaves the server. - private static let tokenProxyURL = "http://localhost:8787/transcribe-token" + private static let tokenProxyURL = "http://127.0.0.1:8787/transcribe-token" let displayName = "AssemblyAI" let requiresSpeechRecognitionPermission = false diff --git a/worker/LOCAL_DEV.md b/worker/LOCAL_DEV.md new file mode 100644 index 00000000..61cb6939 --- /dev/null +++ b/worker/LOCAL_DEV.md @@ -0,0 +1,29 @@ +# Local Worker Development + +This app is configured to call the local Worker at `http://127.0.0.1:8787`. + +## Setup + +1. Fill in real API keys in `worker/.dev.vars`. +2. Install Worker dependencies: + +```bash +npm install +``` + +3. Start the local Worker: + +```bash +npx wrangler dev +``` + +4. Run the macOS app from Xcode using the `leanring-buddy` scheme. + +Do not run `xcodebuild` from the terminal for this project, because it can disturb macOS privacy permissions. + +## What The Keys Do + +- `OPENAI_API_KEY`: sends the screen plus transcript to GPT-4o and streams the answer; also backs the `/tts` and `/transcribe` routes. +- `ASSEMBLYAI_API_KEY`: powers push-to-talk speech-to-text. +- `ELEVENLABS_API_KEY`: legacy; the `/tts` route now uses OpenAI TTS, so this key is no longer required. +- `ELEVENLABS_VOICE_ID`: legacy; unused now that the voice is chosen in the OpenAI TTS request body. 
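Before launching the app against the local Worker, it can help to confirm something is actually listening on 127.0.0.1:8787. A throwaway script along these lines works; this is a hypothetical helper, not part of the patch series, and any of the Worker routes can serve as the probe:

```swift
import Foundation

// Hypothetical smoke test: POSTs to the local wrangler dev Worker and
// prints the status, so a dead Worker or misconfigured keys show up
// before the app itself makes a request.
let workerBase = URL(string: "http://127.0.0.1:8787")!
var request = URLRequest(url: workerBase.appendingPathComponent("transcribe-token"))
request.httpMethod = "POST"

let done = DispatchSemaphore(value: 0)
URLSession.shared.dataTask(with: request) { data, response, error in
    defer { done.signal() }
    if let error {
        print("Worker unreachable: \(error.localizedDescription)")
        return
    }
    let status = (response as? HTTPURLResponse)?.statusCode ?? -1
    let body = data.flatMap { String(data: $0, encoding: .utf8) } ?? ""
    print("POST /transcribe-token -> \(status): \(body.prefix(200))")
}.resume()
done.wait()
```

A 2xx with a token usually means the Worker and the AssemblyAI key are both good; a non-2xx status with a JSON error body typically points at a missing or invalid key in `.dev.vars`.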
diff --git a/worker/package-lock.json b/worker/package-lock.json index c2383cc1..4d043455 100644 --- a/worker/package-lock.json +++ b/worker/package-lock.json @@ -643,9 +643,6 @@ "arm" ], "dev": true, - "libc": [ - "glibc" - ], "license": "LGPL-3.0-or-later", "optional": true, "os": [ @@ -663,9 +660,6 @@ "arm64" ], "dev": true, - "libc": [ - "glibc" - ], "license": "LGPL-3.0-or-later", "optional": true, "os": [ @@ -683,9 +677,6 @@ "s390x" ], "dev": true, - "libc": [ - "glibc" - ], "license": "LGPL-3.0-or-later", "optional": true, "os": [ @@ -703,9 +694,6 @@ "x64" ], "dev": true, - "libc": [ - "glibc" - ], "license": "LGPL-3.0-or-later", "optional": true, "os": [ @@ -723,9 +711,6 @@ "arm64" ], "dev": true, - "libc": [ - "musl" - ], "license": "LGPL-3.0-or-later", "optional": true, "os": [ @@ -743,9 +728,6 @@ "x64" ], "dev": true, - "libc": [ - "musl" - ], "license": "LGPL-3.0-or-later", "optional": true, "os": [ @@ -763,9 +745,6 @@ "arm" ], "dev": true, - "libc": [ - "glibc" - ], "license": "Apache-2.0", "optional": true, "os": [ @@ -789,9 +768,6 @@ "arm64" ], "dev": true, - "libc": [ - "glibc" - ], "license": "Apache-2.0", "optional": true, "os": [ @@ -815,9 +791,6 @@ "s390x" ], "dev": true, - "libc": [ - "glibc" - ], "license": "Apache-2.0", "optional": true, "os": [ @@ -841,9 +814,6 @@ "x64" ], "dev": true, - "libc": [ - "glibc" - ], "license": "Apache-2.0", "optional": true, "os": [ @@ -867,9 +837,6 @@ "arm64" ], "dev": true, - "libc": [ - "musl" - ], "license": "Apache-2.0", "optional": true, "os": [ @@ -893,9 +860,6 @@ "x64" ], "dev": true, - "libc": [ - "musl" - ], "license": "Apache-2.0", "optional": true, "os": [ From 6498e1599c531b78e50fb22049266aca222f2bb5 Mon Sep 17 00:00:00 2001 From: GeorgeZudikhin Date: Sat, 18 Apr 2026 12:53:10 +0200 Subject: [PATCH 5/6] feat: chat v1 --- AGENTS.md | 55 ++-- leanring-buddy.xcodeproj/project.pbxproj | 4 +- .../xcshareddata/swiftpm/Package.resolved | 33 --- leanring-buddy/App/CompanionManager.swift | 110 +++++++- .../GlobalTextPromptShortcutMonitor.swift | 132 ++++++++++ leanring-buddy/TextPromptWindowManager.swift | 235 ++++++++++++++++++ leanring-buddy/UI/CompanionPanelView.swift | 58 ++++- worker/src/index.ts | 13 + 8 files changed, 568 insertions(+), 72 deletions(-) delete mode 100644 leanring-buddy.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved create mode 100644 leanring-buddy/GlobalTextPromptShortcutMonitor.swift create mode 100644 leanring-buddy/TextPromptWindowManager.swift diff --git a/AGENTS.md b/AGENTS.md index f1768b57..4b05c8d7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -5,7 +5,7 @@ ## Overview -macOS menu bar companion app. Lives entirely in the macOS status bar (no dock icon, no main window). Clicking the menu bar icon opens a custom floating panel with companion voice controls. Uses push-to-talk (ctrl+option) to capture voice input, transcribes it via AssemblyAI streaming, and sends the transcript + a screenshot of the user's screen to Claude. Claude responds with text (streamed via SSE) and voice (ElevenLabs TTS). A blue cursor overlay can fly to and point at UI elements Claude references on any connected monitor. +macOS menu bar companion app. Lives entirely in the macOS status bar (no dock icon, no main window). Clicking the menu bar icon opens a custom floating panel with companion voice and text controls. Uses push-to-talk (ctrl+option) to capture voice input, transcribes it via AssemblyAI streaming, and sends the transcript + a screenshot of the user's screen to Claude. 
Users can also open a typed prompt with Command+Shift+Return; typed messages use the same screenshot → Claude → TTS → pointing pipeline as voice. Claude responds with text (streamed via SSE) and voice (ElevenLabs TTS). A blue cursor overlay can fly to and point at UI elements Claude references on any connected monitor. All API keys live on a Cloudflare Worker proxy — nothing sensitive ships in the app. @@ -19,7 +19,8 @@ All API keys live on a Cloudflare Worker proxy — nothing sensitive ships in th - **Text-to-Speech**: ElevenLabs (`eleven_flash_v2_5` model) via Cloudflare Worker proxy - **Screen Capture**: ScreenCaptureKit (macOS 14.2+), multi-monitor support - **Voice Input**: Push-to-talk via `AVAudioEngine` + pluggable transcription-provider layer. System-wide keyboard shortcut via listen-only CGEvent tap. -- **Element Pointing**: The AI embeds `[POINT:x,y:label:screenN]` tags in responses. The overlay parses these, maps coordinates to the correct monitor, and animates the blue cursor along a bezier arc to the target. +- **Text Input**: Command+Shift+Return opens a floating typed prompt. Submitted text bypasses transcription and enters the same screenshot + Claude + ElevenLabs + pointing response pipeline as voice transcripts. +- **Element Pointing**: Claude embeds `[POINT:x,y:label:screenN]` tags in responses. The overlay parses these, maps coordinates to the correct monitor, and animates the blue cursor along a bezier arc to the target. - **Concurrency**: `@MainActor` isolation, async/await throughout - **Analytics**: PostHog via `ClickyAnalytics.swift` @@ -44,6 +45,8 @@ Worker vars: `ELEVENLABS_VOICE_ID` **Global Push-To-Talk Shortcut**: Background push-to-talk uses a listen-only `CGEvent` tap instead of an AppKit global monitor so modifier-based shortcuts like `ctrl + option` are detected more reliably while the app is running in the background. +**Global Text Prompt Shortcut**: Background typed input uses a separate listen-only `CGEvent` tap for Command+Shift+Return. This avoids sharing `ctrl + option`, which starts voice recording as soon as those modifiers are pressed. + **Shared URLSession for AssemblyAI**: A single long-lived `URLSession` is shared across all AssemblyAI streaming sessions (owned by the provider, not the session). Creating and invalidating a URLSession per session corrupts the OS connection pool and causes "Socket is not connected" errors after a few rapid reconnections. **Transient Cursor Mode**: When "Show Clicky" is off, pressing the hotkey fades in the cursor overlay for the duration of the interaction (recording → response → TTS → optional pointing), then fades it out automatically after 1 second of inactivity. @@ -52,29 +55,31 @@ Worker vars: `ELEVENLABS_VOICE_ID` | File | Lines | Purpose | |------|-------|---------| -| `App/leanring_buddyApp.swift` | ~89 | Menu bar app entry point. Uses `@NSApplicationDelegateAdaptor` with `CompanionAppDelegate` which creates `MenuBarPanelManager` and starts `CompanionManager`. No main window — the app lives entirely in the status bar. | -| `App/CompanionManager.swift` | ~1026 | Central state machine. Owns dictation, shortcut monitoring, screen capture, OpenAI API, ElevenLabs TTS, and overlay management. Tracks voice state (idle/listening/processing/responding), conversation history, model selection, and cursor visibility. Coordinates the full push-to-talk → screenshot → OpenAI → TTS → pointing pipeline. 
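To make the `[POINT:x,y:label:screenN]` format from the Element Pointing note concrete, here is a minimal parsing sketch. The names and the regex are illustrative (the shipped parser lives in `CompanionManager.swift` and handles more edge cases), and numeric pixel coordinates are assumed:

```swift
import Foundation

// Illustrative parser for a "[POINT:x,y:label:screenN]" tag embedded in a
// model response. Returns the first tag found, or nil if none parses.
struct PointTag {
    let x: Double
    let y: Double
    let label: String
    let screenIndex: Int
}

func firstPointTag(in response: String) -> PointTag? {
    let pattern = #"\[POINT:([\d.]+),([\d.]+):([^:\]]+):screen(\d+)\]"#
    guard let regex = try? NSRegularExpression(pattern: pattern),
          let match = regex.firstMatch(in: response,
                                       range: NSRange(response.startIndex..., in: response)),
          let xRange = Range(match.range(at: 1), in: response),
          let yRange = Range(match.range(at: 2), in: response),
          let labelRange = Range(match.range(at: 3), in: response),
          let screenRange = Range(match.range(at: 4), in: response),
          let x = Double(response[xRange]),
          let y = Double(response[yRange]),
          let screenIndex = Int(response[screenRange])
    else { return nil }
    return PointTag(x: x, y: y,
                    label: String(response[labelRange]),
                    screenIndex: screenIndex)
}
```

Stripping the tag out of the text before it reaches TTS matters too, or the companion will read the raw coordinates aloud.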
| -| `App/AppBundleConfiguration.swift` | ~28 | Runtime configuration reader for keys stored in the app bundle Info.plist. | -| `UI/MenuBarPanelManager.swift` | ~243 | NSStatusItem + custom NSPanel lifecycle. Creates the menu bar icon, manages the floating companion panel (show/hide/position), installs click-outside-to-dismiss monitor. | -| `UI/CompanionPanelView.swift` | ~761 | SwiftUI panel content for the menu bar dropdown. Shows companion status, push-to-talk instructions, model picker (GPT-4o/GPT-4o mini), permissions UI, DM feedback button, and quit button. Dark aesthetic using `DS` design system. | -| `UI/OverlayWindow.swift` | ~881 | Full-screen transparent overlay hosting the blue cursor, response text, waveform, and spinner. Handles cursor animation, element pointing with bezier arcs, multi-monitor coordinate mapping, and fade-out transitions. | -| `UI/CompanionResponseOverlay.swift` | ~217 | SwiftUI view for the response text bubble and waveform displayed next to the cursor in the overlay. | -| `UI/DesignSystem.swift` | ~880 | Design system tokens — colors, corner radii, shared styles. All UI references `DS.Colors`, `DS.CornerRadius`, etc. | -| `Voice/BuddyDictationManager.swift` | ~866 | Push-to-talk voice pipeline. Handles microphone capture via `AVAudioEngine`, provider-aware permission checks, keyboard/button dictation sessions, transcript finalization, shortcut parsing, contextual keyterms, and live audio-level reporting for waveform feedback. | -| `Voice/GlobalPushToTalkShortcutMonitor.swift` | ~132 | System-wide push-to-talk monitor. Owns the listen-only `CGEvent` tap and publishes press/release transitions. | -| `Voice/BuddyTranscriptionProvider.swift` | ~100 | Protocol surface and provider factory for voice transcription backends. Resolves provider based on `VoiceTranscriptionProvider` in Info.plist — AssemblyAI, OpenAI, or Apple Speech. | -| `Voice/AssemblyAIStreamingTranscriptionProvider.swift` | ~478 | Streaming transcription provider. Fetches temp tokens from the Cloudflare Worker, opens an AssemblyAI v3 websocket, streams PCM16 audio, tracks turn-based transcripts, and delivers finalized text on key-up. Shares a single URLSession across all sessions. | -| `Voice/OpenAIAudioTranscriptionProvider.swift` | ~317 | Upload-based transcription provider. Buffers push-to-talk audio locally, uploads as WAV on release, returns finalized transcript. | -| `Voice/AppleSpeechTranscriptionProvider.swift` | ~147 | Local fallback transcription provider backed by Apple's Speech framework. | -| `Voice/BuddyAudioConversionSupport.swift` | ~108 | Audio conversion helpers. Converts live mic buffers to PCM16 mono audio and builds WAV payloads for upload-based providers. | -| `AI/OpenAIAPI.swift` | ~230 | OpenAI GPT-4o vision API client with SSE streaming. Routes through Cloudflare Worker proxy. | -| `AI/ClaudeAPI.swift` | ~291 | Claude vision API client with streaming (SSE) and non-streaming modes. Currently unused — kept as reference. | -| `AI/ElevenLabsTTSClient.swift` | ~81 | ElevenLabs TTS client. Sends text to the Worker proxy, plays back audio via `AVAudioPlayer`. Exposes `isPlaying` for transient cursor scheduling. | -| `AI/ElementLocationDetector.swift` | ~335 | Detects UI element locations in screenshots using Claude Computer Use API for cursor pointing. | -| `Utilities/CompanionScreenCaptureUtility.swift` | ~132 | Multi-monitor screenshot capture using ScreenCaptureKit. Returns labeled image data for each connected display. 
| -| `Utilities/WindowPositionManager.swift` | ~262 | Window placement logic, Screen Recording permission flow, and accessibility permission helpers. | -| `Utilities/ClickyAnalytics.swift` | ~121 | PostHog analytics integration for usage tracking. | -| `worker/src/index.ts` | ~141 | Cloudflare Worker proxy. Three routes: `/chat` (OpenAI), `/tts` (ElevenLabs), `/transcribe-token` (AssemblyAI temp token). | +| `leanring_buddyApp.swift` | ~89 | Menu bar app entry point. Uses `@NSApplicationDelegateAdaptor` with `CompanionAppDelegate` which creates `MenuBarPanelManager` and starts `CompanionManager`. No main window — the app lives entirely in the status bar. | +| `CompanionManager.swift` | ~1082 | Central state machine. Owns dictation, shortcut monitoring, text prompt routing, screen capture, Claude API, ElevenLabs TTS, and overlay management. Tracks voice state (idle/listening/processing/responding), conversation history, model selection, and cursor visibility. Coordinates the full voice/text → screenshot → Claude → TTS → pointing pipeline. | +| `MenuBarPanelManager.swift` | ~243 | NSStatusItem + custom NSPanel lifecycle. Creates the menu bar icon, manages the floating companion panel (show/hide/position), installs click-outside-to-dismiss monitor. | +| `CompanionPanelView.swift` | ~811 | SwiftUI panel content for the menu bar dropdown. Shows companion status, push-to-talk and typed-input instructions, model picker (Sonnet/Opus), permissions UI, DM feedback button, and quit button. Dark aesthetic using `DS` design system. | +| `OverlayWindow.swift` | ~881 | Full-screen transparent overlay hosting the blue cursor, response text, waveform, and spinner. Handles cursor animation, element pointing with bezier arcs, multi-monitor coordinate mapping, and fade-out transitions. | +| `TextPromptWindowManager.swift` | ~233 | Floating text prompt `NSPanel` and SwiftUI input view. Lets users type messages and submit them into the shared companion response pipeline. | +| `CompanionResponseOverlay.swift` | ~217 | SwiftUI view for the response text bubble and waveform displayed next to the cursor in the overlay. | +| `CompanionScreenCaptureUtility.swift` | ~132 | Multi-monitor screenshot capture using ScreenCaptureKit. Returns labeled image data for each connected display. | +| `BuddyDictationManager.swift` | ~866 | Push-to-talk voice pipeline. Handles microphone capture via `AVAudioEngine`, provider-aware permission checks, keyboard/button dictation sessions, transcript finalization, shortcut parsing, contextual keyterms, and live audio-level reporting for waveform feedback. | +| `BuddyTranscriptionProvider.swift` | ~100 | Protocol surface and provider factory for voice transcription backends. Resolves provider based on `VoiceTranscriptionProvider` in Info.plist — AssemblyAI, OpenAI, or Apple Speech. | +| `AssemblyAIStreamingTranscriptionProvider.swift` | ~478 | Streaming transcription provider. Fetches temp tokens from the Cloudflare Worker, opens an AssemblyAI v3 websocket, streams PCM16 audio, tracks turn-based transcripts, and delivers finalized text on key-up. Shares a single URLSession across all sessions. | +| `OpenAIAudioTranscriptionProvider.swift` | ~317 | Upload-based transcription provider. Buffers push-to-talk audio locally, uploads as WAV on release, returns finalized transcript. | +| `AppleSpeechTranscriptionProvider.swift` | ~147 | Local fallback transcription provider backed by Apple's Speech framework. | +| `BuddyAudioConversionSupport.swift` | ~108 | Audio conversion helpers. 
Converts live mic buffers to PCM16 mono audio and builds WAV payloads for upload-based providers. | +| `GlobalPushToTalkShortcutMonitor.swift` | ~132 | System-wide push-to-talk monitor. Owns the listen-only `CGEvent` tap and publishes press/release transitions. | +| `GlobalTextPromptShortcutMonitor.swift` | ~132 | System-wide typed prompt monitor. Owns the listen-only `CGEvent` tap for Command+Shift+Return and publishes prompt-open events. | +| `ClaudeAPI.swift` | ~291 | Claude vision API client with streaming (SSE) and non-streaming modes. TLS warmup optimization, image MIME detection, conversation history support. | +| `OpenAIAPI.swift` | ~142 | OpenAI GPT vision API client. | +| `ElevenLabsTTSClient.swift` | ~81 | ElevenLabs TTS client. Sends text to the Worker proxy, plays back audio via `AVAudioPlayer`. Exposes `isPlaying` for transient cursor scheduling. | +| `ElementLocationDetector.swift` | ~335 | Detects UI element locations in screenshots for cursor pointing. | +| `DesignSystem.swift` | ~880 | Design system tokens — colors, corner radii, shared styles. All UI references `DS.Colors`, `DS.CornerRadius`, etc. | +| `ClickyAnalytics.swift` | ~121 | PostHog analytics integration for usage tracking. | +| `WindowPositionManager.swift` | ~262 | Window placement logic, Screen Recording permission flow, and accessibility permission helpers. | +| `AppBundleConfiguration.swift` | ~28 | Runtime configuration reader for keys stored in the app bundle Info.plist. | +| `worker/src/index.ts` | ~142 | Cloudflare Worker proxy. Three routes: `/chat` (Claude), `/tts` (ElevenLabs), `/transcribe-token` (AssemblyAI temp token). | ## Build & Run diff --git a/leanring-buddy.xcodeproj/project.pbxproj b/leanring-buddy.xcodeproj/project.pbxproj index 39eacb4c..11cda144 100644 --- a/leanring-buddy.xcodeproj/project.pbxproj +++ b/leanring-buddy.xcodeproj/project.pbxproj @@ -411,7 +411,7 @@ CODE_SIGN_STYLE = Automatic; COMBINE_HIDPI_IMAGES = YES; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 6B89JTKXCY; + DEVELOPMENT_TEAM = 222KQDN363; ENABLE_APP_SANDBOX = NO; ENABLE_HARDENED_RUNTIME = YES; ENABLE_OUTGOING_NETWORK_CONNECTIONS = YES; @@ -449,7 +449,7 @@ CODE_SIGN_STYLE = Automatic; COMBINE_HIDPI_IMAGES = YES; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 6B89JTKXCY; + DEVELOPMENT_TEAM = 222KQDN363; ENABLE_APP_SANDBOX = NO; ENABLE_HARDENED_RUNTIME = YES; ENABLE_OUTGOING_NETWORK_CONNECTIONS = YES; diff --git a/leanring-buddy.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved b/leanring-buddy.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved deleted file mode 100644 index d88adb21..00000000 --- a/leanring-buddy.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved +++ /dev/null @@ -1,33 +0,0 @@ -{ - "originHash" : "3c6fb67fefedcfcd00708e24ca8088151f21dccfc0ade32ea80c406646277e89", - "pins" : [ - { - "identity" : "plcrashreporter", - "kind" : "remoteSourceControl", - "location" : "https://github.com/microsoft/plcrashreporter.git", - "state" : { - "revision" : "0254f941c646b1ed17b243654723d0f071e990d0", - "version" : "1.12.2" - } - }, - { - "identity" : "posthog-ios", - "kind" : "remoteSourceControl", - "location" : "https://github.com/PostHog/posthog-ios.git", - "state" : { - "revision" : "09da1be6a614325a6a464c6d2017a9ac858d1b5a", - "version" : "3.47.0" - } - }, - { - "identity" : "sparkle", - "kind" : "remoteSourceControl", - "location" : "https://github.com/sparkle-project/Sparkle", - "state" : { - "revision" : "21d8df80440b1ca3b65fa82e40782f1e5a9e6ba2", - 
"version" : "2.9.0" - } - } - ], - "version" : 3 -} diff --git a/leanring-buddy/App/CompanionManager.swift b/leanring-buddy/App/CompanionManager.swift index ee465efc..ba7c71b5 100644 --- a/leanring-buddy/App/CompanionManager.swift +++ b/leanring-buddy/App/CompanionManager.swift @@ -64,7 +64,9 @@ final class CompanionManager: ObservableObject { let buddyDictationManager = BuddyDictationManager() let globalPushToTalkShortcutMonitor = GlobalPushToTalkShortcutMonitor() + let globalTextPromptShortcutMonitor = GlobalTextPromptShortcutMonitor() let overlayWindowManager = OverlayWindowManager() + private let textPromptWindowManager = TextPromptWindowManager() // Response text is now displayed inline on the cursor overlay via // streamingResponseText, so no separate response overlay manager is needed. @@ -80,15 +82,18 @@ final class CompanionManager: ObservableObject { return OpenAITTSClient(proxyURL: "\(Self.workerBaseURL)/tts") }() - /// Conversation history so the AI remembers prior exchanges within a session. - /// Each entry is the user's transcript and the AI's response. + /// Conversation history so Claude remembers prior exchanges within a session. + /// Each entry is the user's transcript and Claude's response. private var conversationHistory: [(userTranscript: String, assistantResponse: String)] = [] /// The currently running AI response task, if any. Cancelled when the user /// speaks again so a new response can begin immediately. private var currentResponseTask: Task? + private var systemSpeechSynthesizer: NSSpeechSynthesizer? private var shortcutTransitionCancellable: AnyCancellable? + private var textPromptShortcutCancellable: AnyCancellable? + private var isKeyboardShortcutInteractionActive = false private var voiceStateCancellable: AnyCancellable? private var audioPowerCancellable: AnyCancellable? private var accessibilityCheckTimer: Timer? @@ -180,6 +185,8 @@ final class CompanionManager: ObservableObject { bindAudioPowerLevel() bindShortcutTransitions() // Eagerly touch the OpenAI API so its TLS warmup handshake completes + bindTextPromptShortcut() + // Eagerly touch the Claude API so its TLS warmup handshake completes // well before the onboarding demo fires at ~40s into the video. 
_ = openAIAPI @@ -289,13 +296,19 @@ final class CompanionManager: ObservableObject { func stop() { globalPushToTalkShortcutMonitor.stop() + globalTextPromptShortcutMonitor.stop() buddyDictationManager.cancelCurrentDictation() + textPromptWindowManager.hide() overlayWindowManager.hideOverlay() transientHideTask?.cancel() currentResponseTask?.cancel() currentResponseTask = nil + systemSpeechSynthesizer?.stopSpeaking() + systemSpeechSynthesizer = nil shortcutTransitionCancellable?.cancel() + textPromptShortcutCancellable?.cancel() + isKeyboardShortcutInteractionActive = false voiceStateCancellable?.cancel() audioPowerCancellable?.cancel() accessibilityCheckTimer?.invalidate() @@ -313,8 +326,10 @@ final class CompanionManager: ObservableObject { if currentlyHasAccessibility { globalPushToTalkShortcutMonitor.start() + globalTextPromptShortcutMonitor.start() } else { globalPushToTalkShortcutMonitor.stop() + globalTextPromptShortcutMonitor.stop() } hasScreenRecordingPermission = WindowPositionManager.hasScreenRecordingPermission() @@ -470,12 +485,61 @@ final class CompanionManager: ObservableObject { } } + private func bindTextPromptShortcut() { + textPromptShortcutCancellable = globalTextPromptShortcutMonitor + .shortcutPublisher + .receive(on: DispatchQueue.main) + .sink { [weak self] in + self?.showTextPromptWindow() + } + } + + func showTextPromptWindow() { + guard !showOnboardingVideo else { return } + textPromptWindowManager.show(companionManager: self) + } + + func submitTypedMessage(_ message: String) { + let trimmedMessage = message.trimmingCharacters(in: .whitespacesAndNewlines) + guard !trimmedMessage.isEmpty else { return } + guard !buddyDictationManager.isDictationInProgress else { return } + + transientHideTask?.cancel() + transientHideTask = nil + + if !isOverlayVisible { + overlayWindowManager.hasShownOverlayBefore = true + overlayWindowManager.showOverlay(onScreens: NSScreen.screens, companionManager: self) + isOverlayVisible = true + } + + NotificationCenter.default.post(name: .clickyDismissPanel, object: nil) + + if showOnboardingPrompt { + withAnimation(.easeOut(duration: 0.3)) { + onboardingPromptOpacity = 0.0 + } + DispatchQueue.main.asyncAfter(deadline: .now() + 0.35) { + self.showOnboardingPrompt = false + self.onboardingPromptText = "" + } + } + + clearDetectedElementLocation() + lastTranscript = trimmedMessage + print("⌨️ Companion received typed message: \(trimmedMessage)") + ClickyAnalytics.trackUserMessageSent(transcript: trimmedMessage) + sendTranscriptToClaudeWithScreenshot(transcript: trimmedMessage) + } + private func handleShortcutTransition(_ transition: BuddyPushToTalkShortcut.ShortcutTransition) { switch transition { case .pressed: + guard !isKeyboardShortcutInteractionActive else { return } guard !buddyDictationManager.isDictationInProgress else { return } // Don't register push-to-talk while the onboarding video is playing guard !showOnboardingVideo else { return } + isKeyboardShortcutInteractionActive = true // Cancel any pending transient hide so the overlay stays visible transientHideTask?.cancel() @@ -494,6 +558,9 @@ final class CompanionManager: ObservableObject { // Cancel any in-progress response and TTS from a previous utterance currentResponseTask?.cancel() ttsClient.stopPlayback() + elevenLabsTTSClient.stopPlayback() + systemSpeechSynthesizer?.stopSpeaking() + systemSpeechSynthesizer = nil clearDetectedElementLocation() // Dismiss the onboarding prompt if it's showing @@ -531,6 +598,7 @@ final class CompanionManager: ObservableObject { // Without 
this, a quick press-and-release drops the release event and // leaves the waveform overlay stuck on screen indefinitely. ClickyAnalytics.trackPushToTalkReleased() + isKeyboardShortcutInteractionActive = false pendingKeyboardShortcutStartTask?.cancel() pendingKeyboardShortcutStartTask = nil buddyDictationManager.stopPushToTalkFromKeyboardShortcut() @@ -586,6 +654,9 @@ final class CompanionManager: ObservableObject { private func sendTranscriptToAIWithScreenshot(transcript: String) { currentResponseTask?.cancel() ttsClient.stopPlayback() + elevenLabsTTSClient.stopPlayback() + systemSpeechSynthesizer?.stopSpeaking() + systemSpeechSynthesizer = nil currentResponseTask = Task { // Stay in processing (spinner) state — no streaming text displayed @@ -708,6 +779,19 @@ final class CompanionManager: ObservableObject { ClickyAnalytics.trackTTSError(error: error.localizedDescription) print("⚠️ ElevenLabs TTS error: \(error)") speakCreditsErrorFallback() + if Self.shouldUseElevenLabsTTS { + do { + try await elevenLabsTTSClient.speakText(spokenText) + // speakText returns after player.play() — audio is now playing + voiceState = .responding + } catch { + ClickyAnalytics.trackTTSError(error: error.localizedDescription) + print("⚠️ ElevenLabs TTS error: \(error)") + speakWithSystemVoice(spokenText) + } + } else { + print("🔊 System TTS: ElevenLabs disabled for local development") + speakWithSystemVoice(spokenText) } } } catch is CancellationError { @@ -715,7 +799,7 @@ final class CompanionManager: ObservableObject { } catch { ClickyAnalytics.trackResponseError(error: error.localizedDescription) print("⚠️ Companion response error: \(error)") - speakCreditsErrorFallback() + speakResponseErrorFallback() } if !Task.isCancelled { @@ -755,16 +839,26 @@ final class CompanionManager: ObservableObject { } } - /// Speaks a hardcoded error message using macOS system TTS when API - /// credits run out. Uses NSSpeechSynthesizer so it works even when - /// ElevenLabs is down. - private func speakCreditsErrorFallback() { - let utterance = "I'm all out of credits. Please DM Farza and tell him to bring me back to life." + /// Uses macOS system TTS when ElevenLabs is unavailable, so local + /// development can still verify that Claude answered correctly. + private func speakWithSystemVoice(_ text: String) { + let utterance = text.trimmingCharacters(in: .whitespacesAndNewlines) + guard !utterance.isEmpty else { return } + + systemSpeechSynthesizer?.stopSpeaking() let synthesizer = NSSpeechSynthesizer() + systemSpeechSynthesizer = synthesizer + print("🔊 System TTS: speaking fallback response") synthesizer.startSpeaking(utterance) voiceState = .responding } + /// Speaks a generic error using macOS system TTS when the AI response + /// request fails before Clicky has any real answer to read aloud. + private func speakResponseErrorFallback() { + speakWithSystemVoice("I couldn't get a response from the AI service. Check the local Worker logs for the exact error.") + } + // MARK: - Point Tag Parsing /// Result of parsing a [POINT:...] tag from Claude's response. diff --git a/leanring-buddy/GlobalTextPromptShortcutMonitor.swift b/leanring-buddy/GlobalTextPromptShortcutMonitor.swift new file mode 100644 index 00000000..512c055b --- /dev/null +++ b/leanring-buddy/GlobalTextPromptShortcutMonitor.swift @@ -0,0 +1,132 @@ +// +// GlobalTextPromptShortcutMonitor.swift +// leanring-buddy +// +// Opens the typed prompt window from a global keyboard shortcut while Clicky +// is running in the background. 
+//
+
+import AppKit
+import Combine
+import CoreGraphics
+import Foundation
+
+final class GlobalTextPromptShortcutMonitor: ObservableObject {
+    static let displayText = "Command+Shift+Return"
+
+    let shortcutPublisher = PassthroughSubject<Void, Never>()
+
+    private var globalEventTap: CFMachPort?
+    private var globalEventTapRunLoopSource: CFRunLoopSource?
+    private var isShortcutCurrentlyPressed = false
+
+    deinit {
+        stop()
+    }
+
+    func start() {
+        guard globalEventTap == nil else { return }
+
+        let monitoredEventTypes: [CGEventType] = [.keyDown, .keyUp]
+        let eventMask = monitoredEventTypes.reduce(CGEventMask(0)) { currentMask, eventType in
+            currentMask | (CGEventMask(1) << eventType.rawValue)
+        }
+
+        let eventTapCallback: CGEventTapCallBack = { _, eventType, event, userInfo in
+            guard let userInfo else {
+                return Unmanaged.passUnretained(event)
+            }
+
+            let shortcutMonitor = Unmanaged<GlobalTextPromptShortcutMonitor>
+                .fromOpaque(userInfo)
+                .takeUnretainedValue()
+
+            return shortcutMonitor.handleGlobalEventTap(
+                eventType: eventType,
+                event: event
+            )
+        }
+
+        guard let globalEventTap = CGEvent.tapCreate(
+            tap: .cgSessionEventTap,
+            place: .headInsertEventTap,
+            options: .listenOnly,
+            eventsOfInterest: eventMask,
+            callback: eventTapCallback,
+            userInfo: Unmanaged.passUnretained(self).toOpaque()
+        ) else {
+            print("⚠️ Global text prompt: couldn't create CGEvent tap")
+            return
+        }
+
+        guard let globalEventTapRunLoopSource = CFMachPortCreateRunLoopSource(
+            kCFAllocatorDefault,
+            globalEventTap,
+            0
+        ) else {
+            CFMachPortInvalidate(globalEventTap)
+            print("⚠️ Global text prompt: couldn't create event tap run loop source")
+            return
+        }
+
+        self.globalEventTap = globalEventTap
+        self.globalEventTapRunLoopSource = globalEventTapRunLoopSource
+
+        CFRunLoopAddSource(CFRunLoopGetMain(), globalEventTapRunLoopSource, .commonModes)
+        CGEvent.tapEnable(tap: globalEventTap, enable: true)
+    }
+
+    func stop() {
+        isShortcutCurrentlyPressed = false
+
+        if let globalEventTapRunLoopSource {
+            CFRunLoopRemoveSource(CFRunLoopGetMain(), globalEventTapRunLoopSource, .commonModes)
+            self.globalEventTapRunLoopSource = nil
+        }
+
+        if let globalEventTap {
+            CFMachPortInvalidate(globalEventTap)
+            self.globalEventTap = nil
+        }
+    }
+
+    private func handleGlobalEventTap(
+        eventType: CGEventType,
+        event: CGEvent
+    ) -> Unmanaged<CGEvent>? 
{ + if eventType == .tapDisabledByTimeout || eventType == .tapDisabledByUserInput { + if let globalEventTap { + CGEvent.tapEnable(tap: globalEventTap, enable: true) + } + return Unmanaged.passUnretained(event) + } + + let eventKeyCode = UInt16(event.getIntegerValueField(.keyboardEventKeycode)) + let isReturnKey = eventKeyCode == 36 + let modifierFlags = NSEvent.ModifierFlags(rawValue: UInt(event.flags.rawValue)) + .intersection(.deviceIndependentFlagsMask) + let hasRequiredModifiers = modifierFlags.contains([.command, .shift]) + + guard isReturnKey && hasRequiredModifiers else { + if eventType == .keyUp { + isShortcutCurrentlyPressed = false + } + return Unmanaged.passUnretained(event) + } + + switch eventType { + case .keyDown: + let isRepeat = event.getIntegerValueField(.keyboardEventAutorepeat) != 0 + if !isShortcutCurrentlyPressed && !isRepeat { + isShortcutCurrentlyPressed = true + shortcutPublisher.send() + } + case .keyUp: + isShortcutCurrentlyPressed = false + default: + break + } + + return Unmanaged.passUnretained(event) + } +} diff --git a/leanring-buddy/TextPromptWindowManager.swift b/leanring-buddy/TextPromptWindowManager.swift new file mode 100644 index 00000000..2a880720 --- /dev/null +++ b/leanring-buddy/TextPromptWindowManager.swift @@ -0,0 +1,235 @@ +// +// TextPromptWindowManager.swift +// leanring-buddy +// +// A small floating prompt for sending typed messages through the same +// screenshot -> Claude -> speech/pointing pipeline as push-to-talk. +// + +import AppKit +import SwiftUI + +private final class TextPromptPanel: NSPanel { + override var canBecomeKey: Bool { true } + override var canBecomeMain: Bool { true } + + override func cancelOperation(_ sender: Any?) { + orderOut(nil) + } +} + +@MainActor +final class TextPromptWindowManager { + private var panel: NSPanel? 
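+    // Created lazily by show() and kept alive for reuse; hide() only orders
+    // the panel out, so the SwiftUI hosting view survives between prompts.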
+ + func show(companionManager: CompanionManager) { + if panel == nil { + createPanel(companionManager: companionManager) + } + + positionPanelNearCursor() + NSApp.activate(ignoringOtherApps: true) + panel?.makeKeyAndOrderFront(nil) + panel?.orderFrontRegardless() + } + + func hide() { + panel?.orderOut(nil) + } + + private func createPanel(companionManager: CompanionManager) { + let promptPanel = TextPromptPanel( + contentRect: NSRect(x: 0, y: 0, width: 520, height: 250), + styleMask: [.borderless, .nonactivatingPanel], + backing: .buffered, + defer: false + ) + + let promptView = TextPromptPanelView( + companionManager: companionManager, + onClose: { [weak self] in + Task { @MainActor in + self?.hide() + } + } + ) + .frame(width: 520, height: 250) + + let hostingView = NSHostingView(rootView: promptView) + hostingView.frame = NSRect(x: 0, y: 0, width: 520, height: 250) + hostingView.wantsLayer = true + hostingView.layer?.backgroundColor = .clear + + promptPanel.isFloatingPanel = true + promptPanel.level = .floating + promptPanel.isOpaque = false + promptPanel.backgroundColor = .clear + promptPanel.hasShadow = true + promptPanel.hidesOnDeactivate = false + promptPanel.isExcludedFromWindowsMenu = true + promptPanel.collectionBehavior = [.canJoinAllSpaces, .fullScreenAuxiliary] + promptPanel.isMovableByWindowBackground = true + promptPanel.titleVisibility = .hidden + promptPanel.titlebarAppearsTransparent = true + promptPanel.contentView = hostingView + + panel = promptPanel + } + + private func positionPanelNearCursor() { + guard let panel else { return } + + let panelSize = panel.frame.size + let cursorLocation = NSEvent.mouseLocation + let targetScreen = NSScreen.screens.first { screen in + screen.frame.contains(cursorLocation) + } ?? NSScreen.main + + guard let targetScreen else { return } + + let visibleFrame = targetScreen.visibleFrame + let proposedOriginX = cursorLocation.x - (panelSize.width / 2) + let proposedOriginY = cursorLocation.y - panelSize.height - 24 + let clampedOriginX = min( + max(proposedOriginX, visibleFrame.minX + 16), + visibleFrame.maxX - panelSize.width - 16 + ) + let clampedOriginY = min( + max(proposedOriginY, visibleFrame.minY + 16), + visibleFrame.maxY - panelSize.height - 16 + ) + + panel.setFrame( + NSRect( + x: clampedOriginX, + y: clampedOriginY, + width: panelSize.width, + height: panelSize.height + ), + display: true + ) + } +} + +private struct TextPromptPanelView: View { + @ObservedObject var companionManager: CompanionManager + let onClose: () -> Void + + @State private var promptText = "" + @FocusState private var isPromptFocused: Bool + + private var trimmedPromptText: String { + promptText.trimmingCharacters(in: .whitespacesAndNewlines) + } + + var body: some View { + VStack(alignment: .leading, spacing: 14) { + header + promptEditor + footer + } + .padding(16) + .background(panelBackground) + .onAppear { + DispatchQueue.main.asyncAfter(deadline: .now() + 0.1) { + isPromptFocused = true + } + } + } + + private var header: some View { + HStack(spacing: 10) { + Image(systemName: "keyboard") + .font(.system(size: 13, weight: .semibold)) + .foregroundColor(DS.Colors.accentText) + + VStack(alignment: .leading, spacing: 2) { + Text("Ask Clicky") + .font(.system(size: 14, weight: .semibold)) + .foregroundColor(DS.Colors.textPrimary) + + Text("Type a message. 
Clicky will still look at your screen.") + .font(.system(size: 11, weight: .medium)) + .foregroundColor(DS.Colors.textTertiary) + } + + Spacer() + + Button(action: onClose) { + Image(systemName: "xmark") + .font(.system(size: 10, weight: .semibold)) + .foregroundColor(DS.Colors.textTertiary) + .frame(width: 22, height: 22) + .background( + Circle() + .fill(Color.white.opacity(0.08)) + ) + } + .buttonStyle(.plain) + .pointerCursor() + } + } + + private var promptEditor: some View { + TextEditor(text: $promptText) + .font(.system(size: 13)) + .foregroundColor(DS.Colors.textPrimary) + .scrollContentBackground(.hidden) + .background(DS.Colors.surface2) + .focused($isPromptFocused) + .frame(height: 118) + .padding(8) + .background( + RoundedRectangle(cornerRadius: 8, style: .continuous) + .fill(DS.Colors.surface2) + ) + .overlay( + RoundedRectangle(cornerRadius: 8, style: .continuous) + .stroke(DS.Colors.borderSubtle, lineWidth: 1) + ) + } + + private var footer: some View { + HStack { + Text("Command+Return sends") + .font(.system(size: 11, weight: .medium)) + .foregroundColor(DS.Colors.textTertiary) + + Spacer() + + Button(action: submitPrompt) { + Text("Send") + .font(.system(size: 12, weight: .semibold)) + .foregroundColor(DS.Colors.textOnAccent) + .padding(.horizontal, 16) + .padding(.vertical, 8) + .background( + RoundedRectangle(cornerRadius: 8, style: .continuous) + .fill(trimmedPromptText.isEmpty ? DS.Colors.surface4 : DS.Colors.accent) + ) + } + .buttonStyle(.plain) + .keyboardShortcut(.return, modifiers: .command) + .disabled(trimmedPromptText.isEmpty) + .pointerCursor(isEnabled: !trimmedPromptText.isEmpty) + } + } + + private var panelBackground: some View { + RoundedRectangle(cornerRadius: 8, style: .continuous) + .fill(DS.Colors.surface1) + .overlay( + RoundedRectangle(cornerRadius: 8, style: .continuous) + .stroke(DS.Colors.borderSubtle, lineWidth: 1) + ) + } + + private func submitPrompt() { + let textToSubmit = trimmedPromptText + guard !textToSubmit.isEmpty else { return } + + companionManager.submitTypedMessage(textToSubmit) + promptText = "" + onClose() + } +} diff --git a/leanring-buddy/UI/CompanionPanelView.swift b/leanring-buddy/UI/CompanionPanelView.swift index 8c0e3965..bac7e8f0 100644 --- a/leanring-buddy/UI/CompanionPanelView.swift +++ b/leanring-buddy/UI/CompanionPanelView.swift @@ -62,6 +62,14 @@ struct CompanionPanelView: View { Spacer() .frame(height: 16) + typeMessageButton + .padding(.horizontal, 16) + } + + if companionManager.hasCompletedOnboarding && companionManager.allPermissionsGranted { + Spacer() + .frame(height: 12) + dmFarzaButton .padding(.horizontal, 16) } @@ -127,10 +135,16 @@ struct CompanionPanelView: View { @ViewBuilder private var permissionsCopySection: some View { if companionManager.hasCompletedOnboarding && companionManager.allPermissionsGranted { - Text("Hold Control+Option to talk.") - .font(.system(size: 12, weight: .medium)) - .foregroundColor(DS.Colors.textSecondary) - .frame(maxWidth: .infinity, alignment: .leading) + VStack(alignment: .leading, spacing: 4) { + Text("Hold Control+Option to talk.") + .font(.system(size: 12, weight: .medium)) + .foregroundColor(DS.Colors.textSecondary) + + Text("Press \(GlobalTextPromptShortcutMonitor.displayText) to type.") + .font(.system(size: 11, weight: .medium)) + .foregroundColor(DS.Colors.textTertiary) + } + .frame(maxWidth: .infinity, alignment: .leading) } else if companionManager.allPermissionsGranted && !companionManager.hasSubmittedEmail { VStack(alignment: .leading, spacing: 4) { Text("Drop 
your email to get started.")
@@ -641,6 +655,42 @@ struct CompanionPanelView: View {
         .pointerCursor()
     }
 
+    // MARK: - Typed Input
+
+    private var typeMessageButton: some View {
+        Button(action: {
+            companionManager.showTextPromptWindow()
+        }) {
+            HStack(spacing: 8) {
+                Image(systemName: "keyboard")
+                    .font(.system(size: 12, weight: .medium))
+
+                VStack(alignment: .leading, spacing: 2) {
+                    Text("Type a message")
+                        .font(.system(size: 12, weight: .semibold))
+
+                    Text(GlobalTextPromptShortcutMonitor.displayText)
+                        .font(.system(size: 10, weight: .medium))
+                        .foregroundColor(DS.Colors.textTertiary)
+                }
+            }
+            .foregroundColor(DS.Colors.textSecondary)
+            .frame(maxWidth: .infinity, alignment: .leading)
+            .padding(.horizontal, 12)
+            .padding(.vertical, 10)
+            .background(
+                RoundedRectangle(cornerRadius: DS.CornerRadius.medium, style: .continuous)
+                    .fill(Color.white.opacity(0.06))
+            )
+            .overlay(
+                RoundedRectangle(cornerRadius: DS.CornerRadius.medium, style: .continuous)
+                    .stroke(DS.Colors.borderSubtle, lineWidth: 0.5)
+            )
+        }
+        .buttonStyle(.plain)
+        .pointerCursor()
+    }
+
     // MARK: - DM Farza Button
 
     private var dmFarzaButton: some View {
diff --git a/worker/src/index.ts b/worker/src/index.ts
index 6c482c43..6d381f1f 100644
--- a/worker/src/index.ts
+++ b/worker/src/index.ts
@@ -21,6 +21,19 @@ export default {
   async fetch(request: Request, env: Env): Promise<Response> {
     const url = new URL(request.url);
 
+    if (request.method === "GET" && url.pathname === "/health") {
+      return new Response(
+        JSON.stringify({
+          ok: true,
+          hasAnthropicKey: Boolean(env.ANTHROPIC_API_KEY),
+          hasAssemblyAIKey: Boolean(env.ASSEMBLYAI_API_KEY),
+          hasElevenLabsKey: Boolean(env.ELEVENLABS_API_KEY),
+          hasElevenLabsVoiceId: Boolean(env.ELEVENLABS_VOICE_ID),
+        }),
+        { status: 200, headers: { "content-type": "application/json" } }
+      );
+    }
+
     if (request.method !== "POST") {
       return new Response("Method not allowed", { status: 405 });
     }

From b6fa04298437de69b6aa07485b54a296c867c785 Mon Sep 17 00:00:00 2001
From: GeorgeZudikhin
Date: Sat, 18 Apr 2026 13:39:40 +0200
Subject: [PATCH 6/6] fix: merge conflicts & remove elevenlabs & assembly

---
 AGENTS.md                                     |  36 +-
 .../xcshareddata/swiftpm/Package.resolved     |  33 ++
 leanring-buddy/AGENTS.md                      |  22 +-
 ...sTTSClient.swift => OpenAITTSClient.swift} |   0
 leanring-buddy/App/CompanionManager.swift     |  90 +---
 leanring-buddy/Info.plist                     |   1 -
 leanring-buddy/UI/OverlayWindow.swift         |   2 +-
 .../{ => UI}/TextPromptWindowManager.swift    |   8 +-
 ...mblyAIStreamingTranscriptionProvider.swift | 478 ------------------
 leanring-buddy/Voice/BuddyDictationManager.swift |   2 -
 .../Voice/BuddyTranscriptionProvider.swift    |  27 -
 .../GlobalTextPromptShortcutMonitor.swift     |   0
 worker/.dev.vars.example                      |   1 +
 worker/LOCAL_DEV.md                           |   5 +-
 worker/src/index.ts                           |  47 +-
 worker/wrangler.toml                          |   3 -
 16 files changed, 94 insertions(+), 661 deletions(-)
 create mode 100644 leanring-buddy.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved
 rename leanring-buddy/AI/{ElevenLabsTTSClient.swift => OpenAITTSClient.swift} (100%)
 rename leanring-buddy/{ => UI}/TextPromptWindowManager.swift (97%)
 delete mode 100644 leanring-buddy/Voice/AssemblyAIStreamingTranscriptionProvider.swift
 rename leanring-buddy/{ => Voice}/GlobalTextPromptShortcutMonitor.swift (100%)
 create mode 100644 worker/.dev.vars.example

diff --git a/AGENTS.md b/AGENTS.md
index 4b05c8d7..6176427b 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -5,7 +5,7 @@
 
 ## Overview
 
-macOS menu bar companion app. Lives entirely in the macOS status bar (no dock icon, no main window). 
Clicking the menu bar icon opens a custom floating panel with companion voice and text controls. Uses push-to-talk (ctrl+option) to capture voice input, transcribes it via AssemblyAI streaming, and sends the transcript + a screenshot of the user's screen to Claude. Users can also open a typed prompt with Command+Shift+Return; typed messages use the same screenshot → Claude → TTS → pointing pipeline as voice. Claude responds with text (streamed via SSE) and voice (ElevenLabs TTS). A blue cursor overlay can fly to and point at UI elements Claude references on any connected monitor. +macOS menu bar companion app. Lives entirely in the macOS status bar (no dock icon, no main window). Clicking the menu bar icon opens a custom floating panel with companion voice and text controls. Uses push-to-talk (ctrl+option) to capture voice input, transcribes it via OpenAI audio transcription, and sends the transcript + a screenshot of the user's screen to OpenAI. Users can also open a typed prompt with Command+Shift+Return; typed messages use the same screenshot → OpenAI → TTS → pointing pipeline as voice. OpenAI responds with text (streamed via SSE) and voice (OpenAI TTS). A blue cursor overlay can fly to and point at UI elements the AI references on any connected monitor. All API keys live on a Cloudflare Worker proxy — nothing sensitive ships in the app. @@ -15,12 +15,12 @@ All API keys live on a Cloudflare Worker proxy — nothing sensitive ships in th - **Framework**: SwiftUI (macOS native) with AppKit bridging for menu bar panel and cursor overlay - **Pattern**: MVVM with `@StateObject` / `@Published` state management - **AI Chat**: OpenAI (GPT-4o default, GPT-4o mini optional) via Cloudflare Worker proxy with SSE streaming -- **Speech-to-Text**: AssemblyAI real-time streaming (`u3-rt-pro` model) via websocket, with OpenAI and Apple Speech as fallbacks -- **Text-to-Speech**: ElevenLabs (`eleven_flash_v2_5` model) via Cloudflare Worker proxy +- **Speech-to-Text**: OpenAI audio transcription via Cloudflare Worker proxy, with Apple Speech as a local fallback +- **Text-to-Speech**: OpenAI speech API via Cloudflare Worker proxy - **Screen Capture**: ScreenCaptureKit (macOS 14.2+), multi-monitor support - **Voice Input**: Push-to-talk via `AVAudioEngine` + pluggable transcription-provider layer. System-wide keyboard shortcut via listen-only CGEvent tap. -- **Text Input**: Command+Shift+Return opens a floating typed prompt. Submitted text bypasses transcription and enters the same screenshot + Claude + ElevenLabs + pointing response pipeline as voice transcripts. -- **Element Pointing**: Claude embeds `[POINT:x,y:label:screenN]` tags in responses. The overlay parses these, maps coordinates to the correct monitor, and animates the blue cursor along a bezier arc to the target. +- **Text Input**: Command+Shift+Return opens a floating typed prompt. Submitted text bypasses transcription and enters the same screenshot + OpenAI + TTS + pointing response pipeline as voice transcripts. +- **Element Pointing**: The AI embeds `[POINT:x,y:label:screenN]` tags in responses. The overlay parses these, maps coordinates to the correct monitor, and animates the blue cursor along a bezier arc to the target. - **Concurrency**: `@MainActor` isolation, async/await throughout - **Analytics**: PostHog via `ClickyAnalytics.swift` @@ -31,11 +31,10 @@ The app never calls external APIs directly. 
All requests go through a Cloudflare | Route | Upstream | Purpose | |-------|----------|---------| | `POST /chat` | `api.openai.com/v1/chat/completions` | OpenAI vision + streaming chat | -| `POST /tts` | `api.elevenlabs.io/v1/text-to-speech/{voiceId}` | ElevenLabs TTS audio | -| `POST /transcribe-token` | `streaming.assemblyai.com/v3/token` | Fetches a short-lived (480s) AssemblyAI websocket token | +| `POST /tts` | `api.openai.com/v1/audio/speech` | OpenAI TTS audio | +| `POST /transcribe` | `api.openai.com/v1/audio/transcriptions` | OpenAI audio transcription | -Worker secrets: `OPENAI_API_KEY`, `ASSEMBLYAI_API_KEY`, `ELEVENLABS_API_KEY` -Worker vars: `ELEVENLABS_VOICE_ID` +Worker secrets: `OPENAI_API_KEY` ### Key Architecture Decisions @@ -47,8 +46,6 @@ Worker vars: `ELEVENLABS_VOICE_ID` **Global Text Prompt Shortcut**: Background typed input uses a separate listen-only `CGEvent` tap for Command+Shift+Return. This avoids sharing `ctrl + option`, which starts voice recording as soon as those modifiers are pressed. -**Shared URLSession for AssemblyAI**: A single long-lived `URLSession` is shared across all AssemblyAI streaming sessions (owned by the provider, not the session). Creating and invalidating a URLSession per session corrupts the OS connection pool and causes "Socket is not connected" errors after a few rapid reconnections. - **Transient Cursor Mode**: When "Show Clicky" is off, pressing the hotkey fades in the cursor overlay for the duration of the interaction (recording → response → TTS → optional pointing), then fades it out automatically after 1 second of inactivity. ## Key Files @@ -56,30 +53,27 @@ Worker vars: `ELEVENLABS_VOICE_ID` | File | Lines | Purpose | |------|-------|---------| | `leanring_buddyApp.swift` | ~89 | Menu bar app entry point. Uses `@NSApplicationDelegateAdaptor` with `CompanionAppDelegate` which creates `MenuBarPanelManager` and starts `CompanionManager`. No main window — the app lives entirely in the status bar. | -| `CompanionManager.swift` | ~1082 | Central state machine. Owns dictation, shortcut monitoring, text prompt routing, screen capture, Claude API, ElevenLabs TTS, and overlay management. Tracks voice state (idle/listening/processing/responding), conversation history, model selection, and cursor visibility. Coordinates the full voice/text → screenshot → Claude → TTS → pointing pipeline. | +| `CompanionManager.swift` | ~1076 | Central state machine. Owns dictation, shortcut monitoring, text prompt routing, screen capture, OpenAI API, OpenAI TTS, and overlay management. Tracks voice state (idle/listening/processing/responding), conversation history, model selection, and cursor visibility. Coordinates the full voice/text → screenshot → OpenAI → TTS → pointing pipeline. | | `MenuBarPanelManager.swift` | ~243 | NSStatusItem + custom NSPanel lifecycle. Creates the menu bar icon, manages the floating companion panel (show/hide/position), installs click-outside-to-dismiss monitor. | -| `CompanionPanelView.swift` | ~811 | SwiftUI panel content for the menu bar dropdown. Shows companion status, push-to-talk and typed-input instructions, model picker (Sonnet/Opus), permissions UI, DM feedback button, and quit button. Dark aesthetic using `DS` design system. | +| `CompanionPanelView.swift` | ~811 | SwiftUI panel content for the menu bar dropdown. Shows companion status, push-to-talk and typed-input instructions, model picker, permissions UI, DM feedback button, and quit button. Dark aesthetic using `DS` design system. 
| | `OverlayWindow.swift` | ~881 | Full-screen transparent overlay hosting the blue cursor, response text, waveform, and spinner. Handles cursor animation, element pointing with bezier arcs, multi-monitor coordinate mapping, and fade-out transitions. | | `TextPromptWindowManager.swift` | ~233 | Floating text prompt `NSPanel` and SwiftUI input view. Lets users type messages and submit them into the shared companion response pipeline. | | `CompanionResponseOverlay.swift` | ~217 | SwiftUI view for the response text bubble and waveform displayed next to the cursor in the overlay. | | `CompanionScreenCaptureUtility.swift` | ~132 | Multi-monitor screenshot capture using ScreenCaptureKit. Returns labeled image data for each connected display. | | `BuddyDictationManager.swift` | ~866 | Push-to-talk voice pipeline. Handles microphone capture via `AVAudioEngine`, provider-aware permission checks, keyboard/button dictation sessions, transcript finalization, shortcut parsing, contextual keyterms, and live audio-level reporting for waveform feedback. | -| `BuddyTranscriptionProvider.swift` | ~100 | Protocol surface and provider factory for voice transcription backends. Resolves provider based on `VoiceTranscriptionProvider` in Info.plist — AssemblyAI, OpenAI, or Apple Speech. | -| `AssemblyAIStreamingTranscriptionProvider.swift` | ~478 | Streaming transcription provider. Fetches temp tokens from the Cloudflare Worker, opens an AssemblyAI v3 websocket, streams PCM16 audio, tracks turn-based transcripts, and delivers finalized text on key-up. Shares a single URLSession across all sessions. | +| `BuddyTranscriptionProvider.swift` | ~73 | Protocol surface and provider factory for voice transcription backends. Resolves provider based on `VoiceTranscriptionProvider` in Info.plist — OpenAI or Apple Speech. | | `OpenAIAudioTranscriptionProvider.swift` | ~317 | Upload-based transcription provider. Buffers push-to-talk audio locally, uploads as WAV on release, returns finalized transcript. | | `AppleSpeechTranscriptionProvider.swift` | ~147 | Local fallback transcription provider backed by Apple's Speech framework. | | `BuddyAudioConversionSupport.swift` | ~108 | Audio conversion helpers. Converts live mic buffers to PCM16 mono audio and builds WAV payloads for upload-based providers. | | `GlobalPushToTalkShortcutMonitor.swift` | ~132 | System-wide push-to-talk monitor. Owns the listen-only `CGEvent` tap and publishes press/release transitions. | | `GlobalTextPromptShortcutMonitor.swift` | ~132 | System-wide typed prompt monitor. Owns the listen-only `CGEvent` tap for Command+Shift+Return and publishes prompt-open events. | -| `ClaudeAPI.swift` | ~291 | Claude vision API client with streaming (SSE) and non-streaming modes. TLS warmup optimization, image MIME detection, conversation history support. | -| `OpenAIAPI.swift` | ~142 | OpenAI GPT vision API client. | -| `ElevenLabsTTSClient.swift` | ~81 | ElevenLabs TTS client. Sends text to the Worker proxy, plays back audio via `AVAudioPlayer`. Exposes `isPlaying` for transient cursor scheduling. | -| `ElementLocationDetector.swift` | ~335 | Detects UI element locations in screenshots for cursor pointing. | +| `OpenAIAPI.swift` | ~253 | OpenAI GPT vision API client with streaming (SSE) and non-streaming modes. | +| `OpenAITTSClient.swift` | ~66 | OpenAI TTS client. Sends text to the Worker proxy, plays back audio via `AVAudioPlayer`. Exposes `isPlaying` for transient cursor scheduling. 
| | `DesignSystem.swift` | ~880 | Design system tokens — colors, corner radii, shared styles. All UI references `DS.Colors`, `DS.CornerRadius`, etc. | | `ClickyAnalytics.swift` | ~121 | PostHog analytics integration for usage tracking. | | `WindowPositionManager.swift` | ~262 | Window placement logic, Screen Recording permission flow, and accessibility permission helpers. | | `AppBundleConfiguration.swift` | ~28 | Runtime configuration reader for keys stored in the app bundle Info.plist. | -| `worker/src/index.ts` | ~142 | Cloudflare Worker proxy. Three routes: `/chat` (Claude), `/tts` (ElevenLabs), `/transcribe-token` (AssemblyAI temp token). | +| `worker/src/index.ts` | ~146 | Cloudflare Worker proxy. Routes: `/chat`, `/tts`, `/transcribe`, and `/health`, all using OpenAI except `/health`. | ## Build & Run @@ -103,8 +97,6 @@ npm install # Add secrets npx wrangler secret put OPENAI_API_KEY -npx wrangler secret put ASSEMBLYAI_API_KEY -npx wrangler secret put ELEVENLABS_API_KEY # Deploy npx wrangler deploy diff --git a/leanring-buddy.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved b/leanring-buddy.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved new file mode 100644 index 00000000..d88adb21 --- /dev/null +++ b/leanring-buddy.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved @@ -0,0 +1,33 @@ +{ + "originHash" : "3c6fb67fefedcfcd00708e24ca8088151f21dccfc0ade32ea80c406646277e89", + "pins" : [ + { + "identity" : "plcrashreporter", + "kind" : "remoteSourceControl", + "location" : "https://github.com/microsoft/plcrashreporter.git", + "state" : { + "revision" : "0254f941c646b1ed17b243654723d0f071e990d0", + "version" : "1.12.2" + } + }, + { + "identity" : "posthog-ios", + "kind" : "remoteSourceControl", + "location" : "https://github.com/PostHog/posthog-ios.git", + "state" : { + "revision" : "09da1be6a614325a6a464c6d2017a9ac858d1b5a", + "version" : "3.47.0" + } + }, + { + "identity" : "sparkle", + "kind" : "remoteSourceControl", + "location" : "https://github.com/sparkle-project/Sparkle", + "state" : { + "revision" : "21d8df80440b1ca3b65fa82e40782f1e5a9e6ba2", + "version" : "2.9.0" + } + } + ], + "version" : 3 +} diff --git a/leanring-buddy/AGENTS.md b/leanring-buddy/AGENTS.md index bde23b56..50408a9e 100644 --- a/leanring-buddy/AGENTS.md +++ b/leanring-buddy/AGENTS.md @@ -1,6 +1,6 @@ # leanring-buddy — Main App Target -> 22 Swift files, ~7,751 LOC. Organized into subdirectories by domain. +> Swift files are organized into subdirectories by domain. ## Directory Structure @@ -8,7 +8,7 @@ leanring-buddy/ ├── App/ Entry point, central orchestrator, runtime config ├── Voice/ Push-to-talk, mic capture, transcription providers -├── AI/ API clients (OpenAI, Claude, ElevenLabs, element detection) +├── AI/ API clients (OpenAI chat + OpenAI TTS) ├── UI/ Menu bar panel, overlay, design system ├── Utilities/ Screenshots, permissions, analytics ├── Resources/ Assets, audio files, images @@ -25,21 +25,20 @@ leanring-buddy/ | Voice state machine | `CompanionManager.swift` | App/ | Central orchestrator. Owns dictation, OpenAI API, TTS, overlay, onboarding. 9 MARK sections. | | Runtime config | `AppBundleConfiguration.swift` | App/ | Reads keys from `Info.plist` at runtime. | | Menu bar icon + panel | `MenuBarPanelManager.swift` | UI/ | `NSStatusItem` + custom borderless `NSPanel`. Non-activating, auto-dismiss on outside click. | -| Panel UI (dropdown) | `CompanionPanelView.swift` | UI/ | SwiftUI. 
Model picker, permissions, push-to-talk instructions, quit button. | +| Panel UI (dropdown) | `CompanionPanelView.swift` | UI/ | SwiftUI. Model picker, permissions, push-to-talk and typed-input instructions, quit button. | +| Typed prompt window | `TextPromptWindowManager.swift` | UI/ | Floating `NSPanel` text input. Submits typed messages into the shared screenshot → OpenAI → TTS pipeline. | | Cursor overlay | `OverlayWindow.swift` | UI/ | Full-screen transparent `NSPanel` via `NSHostingView`. Cursor animation, bezier arc pointing, multi-monitor coordinate mapping. | | Response bubble + waveform | `CompanionResponseOverlay.swift` | UI/ | SwiftUI view rendered in the overlay next to the cursor. | | Design tokens | `DesignSystem.swift` | UI/ | `DS.Colors.*`, `DS.CornerRadius.*`, button styles. All UI references this. | | Push-to-talk pipeline | `BuddyDictationManager.swift` | Voice/ | `AVAudioEngine` mic capture, provider-aware permissions, transcript finalization, contextual keyterms. | | Global hotkey | `GlobalPushToTalkShortcutMonitor.swift` | Voice/ | Listen-only `CGEvent` tap (not AppKit global monitor). Publishes press/release transitions. | +| Text prompt hotkey | `GlobalTextPromptShortcutMonitor.swift` | Voice/ | Listen-only `CGEvent` tap for Command+Shift+Return. Publishes typed-prompt open events. | | Transcription protocol | `BuddyTranscriptionProvider.swift` | Voice/ | Protocol + factory. Provider resolved from `Info.plist` `VoiceTranscriptionProvider` key. | -| Transcription (default) | `AssemblyAIStreamingTranscriptionProvider.swift` | Voice/ | Real-time websocket (`u3-rt-pro`). Fetches temp token from Worker. **Shares a single `URLSession`** — never create per-session. | -| Transcription (upload) | `OpenAIAudioTranscriptionProvider.swift` | Voice/ | Buffers audio, uploads WAV on key-up. | +| Transcription (default) | `OpenAIAudioTranscriptionProvider.swift` | Voice/ | Buffers audio, uploads WAV to OpenAI on key-up. | | Transcription (local) | `AppleSpeechTranscriptionProvider.swift` | Voice/ | Apple Speech framework fallback. | | Audio conversion | `BuddyAudioConversionSupport.swift` | Voice/ | PCM16 mono conversion, WAV payload builder. | | OpenAI chat | `OpenAIAPI.swift` | AI/ | GPT-4o vision client with SSE streaming. Routes through Worker proxy. | -| Claude chat | `ClaudeAPI.swift` | AI/ | SSE streaming + non-streaming. Currently unused (kept as reference). | -| TTS playback | `ElevenLabsTTSClient.swift` | AI/ | Worker proxy → `AVAudioPlayer`. Exposes `isPlaying` for transient cursor scheduling. | -| Element pointing | `ElementLocationDetector.swift` | AI/ | Uses Claude Computer Use API to detect UI element coordinates in screenshots. | +| OpenAI TTS playback | `OpenAITTSClient.swift` | AI/ | Worker proxy → OpenAI speech API → `AVAudioPlayer`. Exposes `isPlaying` for transient cursor scheduling. | | Screenshots | `CompanionScreenCaptureUtility.swift` | Utilities/ | ScreenCaptureKit multi-monitor capture. Returns labeled image data per display. | | Window placement + perms | `WindowPositionManager.swift` | Utilities/ | Screen Recording permission gate, accessibility permission helpers, window positioning. | | Analytics | `ClickyAnalytics.swift` | Utilities/ | PostHog integration. | @@ -51,10 +50,10 @@ leanring-buddy/ | `CompanionVoiceState` | enum | CompanionManager | `.idle` / `.listening` / `.processing` / `.responding` | | `CompanionManager` | class | CompanionManager | Central `@MainActor ObservableObject`. Owns everything. 
| | `CompanionManager.start()` | method | CompanionManager | Bootstrap: permissions → polling → bindings → TLS warmup → overlay | -| `CompanionManager.sendTranscriptToAIWithScreenshot` | method | CompanionManager | Core pipeline: screenshot → OpenAI SSE → parse pointing → TTS | +| `CompanionManager.sendTranscriptToAIWithScreenshot` | method | CompanionManager | Core pipeline: screenshot → OpenAI SSE → parse pointing → OpenAI TTS | | `CompanionManager.handleShortcutTransition` | method | CompanionManager | Push-to-talk state machine (pressed → record, released → finalize) | | `BuddyDictationManager` | class | BuddyDictationManager | Mic capture + transcript lifecycle | -| `BuddyTranscriptionProvider` | protocol | BuddyTranscriptionProvider | Abstraction over AssemblyAI/OpenAI/Apple Speech | +| `BuddyTranscriptionProvider` | protocol | BuddyTranscriptionProvider | Abstraction over OpenAI/Apple Speech | | `BuddyPushToTalkShortcut` | enum | BuddyDictationManager | Shortcut options + transition detection logic | | `MenuBarPanelManager` | class | MenuBarPanelManager | `NSStatusItem` + `NSPanel` lifecycle | | `OverlayWindowManager` | class | OverlayWindow | Creates/manages full-screen overlay panels per screen | @@ -65,13 +64,12 @@ leanring-buddy/ - **Organized by domain**: Files grouped into `App/`, `Voice/`, `AI/`, `UI/`, `Utilities/`, `Resources/`. Xcode auto-syncs via `PBXFileSystemSynchronizedRootGroup`. - **MARK sections**: Large files use `// MARK: - Section Name` to organize logical subsystems (CompanionManager has 9 sections). -- **Provider pattern**: Transcription uses protocol + factory + Info.plist key. Add new providers by implementing `BuddyTranscriptionProvider` and registering in `BuddyTranscriptionProviderFactory`. +- **Provider pattern**: Transcription uses protocol + factory + Info.plist key. OpenAI is the default provider; Apple Speech remains a local fallback. - **AppKit bridging**: `NSPanel` + `NSHostingView` for menu bar panel and overlay. Comments explain "why" for all bridging code. - **No `@EnvironmentObject`**: State flows through `CompanionManager` passed explicitly to views via init parameters. ## ANTI-PATTERNS (this directory only) -- **Never create/destroy `URLSession` per AssemblyAI session** — use `sharedWebSocketURLSession`. Per-session sessions corrupt the OS connection pool. - **Never suppress type errors** with force casts or `// swiftlint:disable` — fix them properly. - **Never suppress or ignore the deprecated onChange warning** in OverlayWindow.swift — it's a known non-blocking warning, leave it. - **Never add features/refactor beyond what was asked** — scope discipline. diff --git a/leanring-buddy/AI/ElevenLabsTTSClient.swift b/leanring-buddy/AI/OpenAITTSClient.swift similarity index 100% rename from leanring-buddy/AI/ElevenLabsTTSClient.swift rename to leanring-buddy/AI/OpenAITTSClient.swift diff --git a/leanring-buddy/App/CompanionManager.swift b/leanring-buddy/App/CompanionManager.swift index ba7c71b5..1d649e9f 100644 --- a/leanring-buddy/App/CompanionManager.swift +++ b/leanring-buddy/App/CompanionManager.swift @@ -32,7 +32,7 @@ final class CompanionManager: ObservableObject { @Published private(set) var hasScreenContentPermission = false /// Screen location (global AppKit coords) of a detected UI element the - /// buddy should fly to and point at. Parsed from Claude's response; + /// buddy should fly to and point at. Parsed from the AI response; /// observed by BlueCursorView to trigger the flight animation. 
@Published var detectedElementScreenLocation: CGPoint?
 
     /// The display frame (global AppKit coords) of the screen the detected
@@ -82,14 +82,13 @@ final class CompanionManager: ObservableObject {
         return OpenAITTSClient(proxyURL: "\(Self.workerBaseURL)/tts")
     }()
 
-    /// Conversation history so Claude remembers prior exchanges within a session.
-    /// Each entry is the user's transcript and Claude's response.
+    /// Conversation history so the AI remembers prior exchanges within a session.
+    /// Each entry is the user's transcript and the assistant's response.
     private var conversationHistory: [(userTranscript: String, assistantResponse: String)] = []
 
     /// The currently running AI response task, if any. Cancelled when the user
     /// speaks again so a new response can begin immediately.
     private var currentResponseTask: Task<Void, Never>?
-    private var systemSpeechSynthesizer: NSSpeechSynthesizer?
 
     private var shortcutTransitionCancellable: AnyCancellable?
     private var textPromptShortcutCancellable: AnyCancellable?
@@ -186,7 +185,7 @@ final class CompanionManager: ObservableObject {
         bindShortcutTransitions()
         // Eagerly touch the OpenAI API so its TLS warmup handshake completes
         bindTextPromptShortcut()
-        // Eagerly touch the Claude API so its TLS warmup handshake completes
+        // Eagerly touch the OpenAI API so its TLS warmup handshake completes
        // well before the onboarding demo fires at ~40s into the video.
         _ = openAIAPI
 
@@ -304,8 +303,6 @@ final class CompanionManager: ObservableObject {
         currentResponseTask?.cancel()
         currentResponseTask = nil
-        systemSpeechSynthesizer?.stopSpeaking()
-        systemSpeechSynthesizer = nil
         shortcutTransitionCancellable?.cancel()
         textPromptShortcutCancellable?.cancel()
         isKeyboardShortcutInteractionActive = false
@@ -529,7 +526,7 @@ final class CompanionManager: ObservableObject {
         lastTranscript = trimmedMessage
         print("⌨️ Companion received typed message: \(trimmedMessage)")
         ClickyAnalytics.trackUserMessageSent(transcript: trimmedMessage)
-        sendTranscriptToClaudeWithScreenshot(transcript: trimmedMessage)
+        sendTranscriptToAIWithScreenshot(transcript: trimmedMessage)
     }
 
     private func handleShortcutTransition(_ transition: BuddyPushToTalkShortcut.ShortcutTransition) {
@@ -558,9 +555,6 @@ final class CompanionManager: ObservableObject {
             // Cancel any in-progress response and TTS from a previous utterance
             currentResponseTask?.cancel()
             ttsClient.stopPlayback()
-            elevenLabsTTSClient.stopPlayback()
-            systemSpeechSynthesizer?.stopSpeaking()
-            systemSpeechSynthesizer = nil
             clearDetectedElementLocation()
 
             // Dismiss the onboarding prompt if it's showing
@@ -627,7 +621,7 @@ final class CompanionManager: ObservableObject {
         - if you receive multiple screen images, the one labeled "primary focus" is where the cursor is — prioritize that one but reference others if relevant.
 
         element pointing:
-        you have a small blue triangle cursor that can fly to and point at things on screen. use it whenever pointing would genuinely help the user — if they're asking how to do something, looking for a menu, trying to find a button, or need help navigating an app, point at the relevant element.
+        you have a small blue triangle cursor that can fly to and point at things on screen. use it whenever pointing would genuinely help the user — if they're asking how to do something, looking for a menu, trying to find a button, or need help navigating an app, point at the relevant element. 
err on the side of pointing rather than not pointing, because it makes your help way more useful and concrete. if the user asks anything like "where", "what do i click", "show me", "point at", "find", "open", "press", "select", or "how do i", you should almost always return a real coordinate tag instead of [POINT:none]. don't point at things when it would be pointless — like if the user asks a general knowledge question, or the conversation has nothing to do with what's on screen, or you'd just be pointing at something obvious they're already looking at. but if there's a specific UI element, menu, button, or area on screen that's relevant to what you're helping with, point at it. @@ -646,17 +640,14 @@ final class CompanionManager: ObservableObject { // MARK: - AI Response Pipeline - /// Captures a screenshot, sends it along with the transcript to Claude, - /// and plays the response aloud via ElevenLabs TTS. The cursor stays in + /// Captures a screenshot, sends it along with the transcript to OpenAI, + /// and plays the response aloud via OpenAI TTS. The cursor stays in /// the spinner/processing state until TTS audio begins playing. - /// Claude's response may include a [POINT:x,y:label] tag which triggers + /// The AI response may include a [POINT:x,y:label] tag which triggers /// the buddy to fly to that element on screen. private func sendTranscriptToAIWithScreenshot(transcript: String) { currentResponseTask?.cancel() ttsClient.stopPlayback() - elevenLabsTTSClient.stopPlayback() - systemSpeechSynthesizer?.stopSpeaking() - systemSpeechSynthesizer = nil currentResponseTask = Task { // Stay in processing (spinner) state — no streaming text displayed @@ -669,14 +660,15 @@ final class CompanionManager: ObservableObject { guard !Task.isCancelled else { return } // Build image labels with the actual screenshot pixel dimensions - // so Claude's coordinate space matches the image it sees. We + // so the AI's coordinate space matches the image it sees. We // scale from screenshot pixels to display points ourselves. let labeledImages = screenCaptures.map { capture in let dimensionInfo = " (image dimensions: \(capture.screenshotWidthInPixels)x\(capture.screenshotHeightInPixels) pixels)" return (data: capture.imageData, label: capture.label + dimensionInfo) } + print("📸 Captured \(labeledImages.count) screenshot(s) for OpenAI: \(labeledImages.map { $0.label }.joined(separator: " | "))") - // Pass conversation history so Claude remembers prior exchanges + // Pass conversation history so the AI remembers prior exchanges let historyForAPI = conversationHistory.map { entry in (userPlaceholder: entry.userTranscript, assistantResponse: entry.assistantResponse) } @@ -693,11 +685,12 @@ final class CompanionManager: ObservableObject { guard !Task.isCancelled else { return } - // Parse the [POINT:...] tag from Claude's response + // Parse the [POINT:...] tag from the AI response let parseResult = Self.parsePointingCoordinates(from: fullResponseText) let spokenText = parseResult.spokenText + print("🧭 Point tag parse: coordinate=\(parseResult.coordinate.map { "\(Int($0.x)),\(Int($0.y))" } ?? "none"), label=\(parseResult.elementLabel ?? "none"), screen=\(parseResult.screenNumber.map(String.init) ?? "cursor")") - // Handle element pointing if Claude returned coordinates. + // Handle element pointing if the AI returned coordinates. // Switch to idle BEFORE setting the location so the triangle // becomes visible and can fly to the target. 
Without this, the // spinner hides the triangle and the flight animation is invisible. @@ -706,7 +699,7 @@ final class CompanionManager: ObservableObject { voiceState = .idle } - // Pick the screen capture matching Claude's screen number, + // Pick the screen capture matching the AI's screen number, // falling back to the cursor screen if not specified. let targetScreenCapture: CompanionScreenCapture? = { if let screenNumber = parseResult.screenNumber, @@ -718,7 +711,7 @@ final class CompanionManager: ObservableObject { if let pointCoordinate = parseResult.coordinate, let targetScreenCapture { - // Claude's coordinates are in the screenshot's pixel space + // The AI's coordinates are in the screenshot's pixel space // (top-left origin, e.g. 1280x831). Scale to the display's // point space (e.g. 1512x982), then convert to AppKit global coords. let screenshotWidth = CGFloat(targetScreenCapture.screenshotWidthInPixels) @@ -777,21 +770,7 @@ final class CompanionManager: ObservableObject { voiceState = .responding } catch { ClickyAnalytics.trackTTSError(error: error.localizedDescription) - print("⚠️ ElevenLabs TTS error: \(error)") - speakCreditsErrorFallback() - if Self.shouldUseElevenLabsTTS { - do { - try await elevenLabsTTSClient.speakText(spokenText) - // speakText returns after player.play() — audio is now playing - voiceState = .responding - } catch { - ClickyAnalytics.trackTTSError(error: error.localizedDescription) - print("⚠️ ElevenLabs TTS error: \(error)") - speakWithSystemVoice(spokenText) - } - } else { - print("🔊 System TTS: ElevenLabs disabled for local development") - speakWithSystemVoice(spokenText) + print("⚠️ OpenAI TTS error: \(error)") } } } catch is CancellationError { @@ -799,7 +778,6 @@ final class CompanionManager: ObservableObject { } catch { ClickyAnalytics.trackResponseError(error: error.localizedDescription) print("⚠️ Companion response error: \(error)") - speakResponseErrorFallback() } if !Task.isCancelled { @@ -839,33 +817,13 @@ final class CompanionManager: ObservableObject { } } - /// Uses macOS system TTS when ElevenLabs is unavailable, so local - /// development can still verify that Claude answered correctly. - private func speakWithSystemVoice(_ text: String) { - let utterance = text.trimmingCharacters(in: .whitespacesAndNewlines) - guard !utterance.isEmpty else { return } - - systemSpeechSynthesizer?.stopSpeaking() - let synthesizer = NSSpeechSynthesizer() - systemSpeechSynthesizer = synthesizer - print("🔊 System TTS: speaking fallback response") - synthesizer.startSpeaking(utterance) - voiceState = .responding - } - - /// Speaks a generic error using macOS system TTS when the AI response - /// request fails before Clicky has any real answer to read aloud. - private func speakResponseErrorFallback() { - speakWithSystemVoice("I couldn't get a response from the AI service. Check the local Worker logs for the exact error.") - } - // MARK: - Point Tag Parsing - /// Result of parsing a [POINT:...] tag from Claude's response. + /// Result of parsing a [POINT:...] tag from the AI response. struct PointingParseResult { /// The response text with the [POINT:...] tag removed — this is what gets spoken. let spokenText: String - /// The parsed pixel coordinate, or nil if Claude said "none" or no tag was found. + /// The parsed pixel coordinate, or nil if the AI said "none" or no tag was found. let coordinate: CGPoint? /// Short label describing the element (e.g. "run button"), or "none". let elementLabel: String? 
@@ -873,7 +831,7 @@ final class CompanionManager: ObservableObject {
         let screenNumber: Int?
     }
 
-    /// Parses a [POINT:x,y:label:screenN] or [POINT:none] tag from the end of Claude's response.
+    /// Parses a [POINT:x,y:label:screenN] or [POINT:none] tag from the end of the AI response.
     /// Returns the spoken text (tag removed) and the optional coordinate + label + screen number.
     static func parsePointingCoordinates(from responseText: String) -> PointingParseResult {
         // Match [POINT:none] or [POINT:123,456:label] or [POINT:123,456:label:screen2]
@@ -1055,7 +1013,7 @@ final class CompanionManager: ObservableObject {
         the screenshot images are labeled with their pixel dimensions. use those dimensions as the coordinate space. origin (0,0) is top-left. x increases rightward, y increases downward.
         """
 
-    /// Captures a screenshot and asks Claude to find something interesting to
+    /// Captures a screenshot and asks the AI to find something interesting to
     /// point at, then triggers the buddy's flight animation. Used during
     /// onboarding to demo the pointing feature while the intro video plays.
     func performOnboardingDemoInteraction() {
@@ -1066,7 +1024,7 @@ final class CompanionManager: ObservableObject {
         do {
             let screenCaptures = try await CompanionScreenCaptureUtility.captureAllScreensAsJPEG()
 
-            // Only send the cursor screen so Claude can't pick something
+            // Only send the cursor screen so the AI can't pick something
             // on a different monitor that we can't point at.
             guard let cursorScreenCapture = screenCaptures.first(where: { $0.isCursorScreen }) else {
                 print("🎯 Onboarding demo: no cursor screen found")
@@ -1106,7 +1064,7 @@ final class CompanionManager: ObservableObject {
                 y: appKitY + displayFrame.origin.y
             )
 
-            // Set custom bubble text so the pointing animation uses Claude's
+            // Set custom bubble text so the pointing animation uses the AI's
             // comment instead of a random phrase
             detectedElementBubbleText = parseResult.spokenText
             detectedElementScreenLocation = globalLocation
diff --git a/leanring-buddy/Info.plist b/leanring-buddy/Info.plist
index aa0306a9..a5caa6eb 100644
--- a/leanring-buddy/Info.plist
+++ b/leanring-buddy/Info.plist
@@ -10,7 +10,6 @@
 	<string>/l3d2rw5ZZFRU3AadP/w2Zf8FHfhA6bKv16BQOV5OSk=</string>
 	<key>VoiceTranscriptionProvider</key>
 	<string>openai</string>
-	<string>assemblyai</string>
 	<key>NSAppTransportSecurity</key>
 	<dict>
 		<key>NSAllowsLocalNetworking</key>
diff --git a/leanring-buddy/UI/OverlayWindow.swift b/leanring-buddy/UI/OverlayWindow.swift
index 884ebcbf..cdbce4c3 100644
--- a/leanring-buddy/UI/OverlayWindow.swift
+++ b/leanring-buddy/UI/OverlayWindow.swift
@@ -329,7 +329,7 @@ struct BlueCursorView: View {
             .animation(.spring(response: 0.2, dampingFraction: 0.6, blendDuration: 0), value: cursorPosition)
             .animation(.easeIn(duration: 0.15), value: companionManager.voiceState)
 
-        // Blue spinner — shown while the AI is processing (transcription + Claude + waiting for TTS)
+        // Blue spinner — shown while the AI is processing (transcription + response + waiting for TTS)
         BlueCursorSpinnerView()
             .opacity(buddyIsVisibleOnThisScreen && companionManager.voiceState == .processing ? 
cursorOpacity : 0) .position(cursorPosition) diff --git a/leanring-buddy/TextPromptWindowManager.swift b/leanring-buddy/UI/TextPromptWindowManager.swift similarity index 97% rename from leanring-buddy/TextPromptWindowManager.swift rename to leanring-buddy/UI/TextPromptWindowManager.swift index 2a880720..ab4f1cfc 100644 --- a/leanring-buddy/TextPromptWindowManager.swift +++ b/leanring-buddy/UI/TextPromptWindowManager.swift @@ -3,7 +3,7 @@ // leanring-buddy // // A small floating prompt for sending typed messages through the same -// screenshot -> Claude -> speech/pointing pipeline as push-to-talk. +// screenshot -> AI -> speech/pointing pipeline as push-to-talk. // import AppKit @@ -27,7 +27,9 @@ final class TextPromptWindowManager { createPanel(companionManager: companionManager) } - positionPanelNearCursor() + if panel?.isVisible != true { + positionPanelNearCursor() + } NSApp.activate(ignoringOtherApps: true) panel?.makeKeyAndOrderFront(nil) panel?.orderFrontRegardless() @@ -230,6 +232,6 @@ private struct TextPromptPanelView: View { companionManager.submitTypedMessage(textToSubmit) promptText = "" - onClose() + isPromptFocused = true } } diff --git a/leanring-buddy/Voice/AssemblyAIStreamingTranscriptionProvider.swift b/leanring-buddy/Voice/AssemblyAIStreamingTranscriptionProvider.swift deleted file mode 100644 index 619e5cf6..00000000 --- a/leanring-buddy/Voice/AssemblyAIStreamingTranscriptionProvider.swift +++ /dev/null @@ -1,478 +0,0 @@ -// -// AssemblyAIStreamingTranscriptionProvider.swift -// leanring-buddy -// -// Streaming AI transcription provider backed by AssemblyAI's websocket API. -// - -import AVFoundation -import Foundation - -struct AssemblyAIStreamingTranscriptionProviderError: LocalizedError { - let message: String - - var errorDescription: String? { - message - } -} - -final class AssemblyAIStreamingTranscriptionProvider: BuddyTranscriptionProvider { - /// URL for the Cloudflare Worker endpoint that returns a short-lived - /// AssemblyAI streaming token. The real API key never leaves the server. - private static let tokenProxyURL = "http://127.0.0.1:8787/transcribe-token" - - let displayName = "AssemblyAI" - let requiresSpeechRecognitionPermission = false - - var isConfigured: Bool { true } - var unavailableExplanation: String? { nil } - - /// Single long-lived URLSession shared across all streaming sessions. - /// Creating and invalidating a URLSession per session corrupts the OS - /// connection pool and causes "Socket is not connected" errors after - /// a few rapid reconnections to the same host. - private let sharedWebSocketURLSession = URLSession(configuration: .default) - - func startStreamingSession( - keyterms: [String], - onTranscriptUpdate: @escaping (String) -> Void, - onFinalTranscriptReady: @escaping (String) -> Void, - onError: @escaping (Error) -> Void - ) async throws -> any BuddyStreamingTranscriptionSession { - // Fetch a fresh temporary token from the proxy before each session - let temporaryToken = try await fetchTemporaryToken() - print("🎙️ AssemblyAI: fetched temporary token (\(temporaryToken.prefix(20))...)") - - let session = AssemblyAIStreamingTranscriptionSession( - apiKey: nil, - temporaryToken: temporaryToken, - urlSession: sharedWebSocketURLSession, - keyterms: keyterms, - onTranscriptUpdate: onTranscriptUpdate, - onFinalTranscriptReady: onFinalTranscriptReady, - onError: onError - ) - - try await session.open() - return session - } - - /// Calls the Cloudflare Worker to get a short-lived AssemblyAI token. 
-    private func fetchTemporaryToken() async throws -> String {
-        var request = URLRequest(url: URL(string: Self.tokenProxyURL)!)
-        request.httpMethod = "POST"
-
-        let (data, response) = try await URLSession.shared.data(for: request)
-
-        guard let httpResponse = response as? HTTPURLResponse,
-              (200...299).contains(httpResponse.statusCode) else {
-            let statusCode = (response as? HTTPURLResponse)?.statusCode ?? -1
-            let body = String(data: data, encoding: .utf8) ?? "unknown"
-            throw AssemblyAIStreamingTranscriptionProviderError(
-                message: "Failed to fetch AssemblyAI token (HTTP \(statusCode)): \(body)"
-            )
-        }
-
-        guard let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
-              let token = json["token"] as? String else {
-            throw AssemblyAIStreamingTranscriptionProviderError(
-                message: "Invalid token response from proxy."
-            )
-        }
-
-        return token
-    }
-}
-
-private final class AssemblyAIStreamingTranscriptionSession: NSObject, BuddyStreamingTranscriptionSession {
-    private struct MessageEnvelope: Decodable {
-        let type: String
-    }
-
-    private struct TurnMessage: Decodable {
-        let type: String
-        let transcript: String?
-        let turn_order: Int?
-        let end_of_turn: Bool?
-        let turn_is_formatted: Bool?
-    }
-
-    private struct ErrorMessage: Decodable {
-        let type: String
-        let error: String?
-        let message: String?
-    }
-
-    private struct StoredTurnTranscript {
-        var transcriptText: String
-        var isFormatted: Bool
-    }
-
-    private static let websocketBaseURLString = "wss://streaming.assemblyai.com/v3/ws"
-    private static let targetSampleRate = 16_000.0
-    private static let explicitFinalTranscriptGracePeriodSeconds = 1.4
-
-    let finalTranscriptFallbackDelaySeconds: TimeInterval = 2.8
-
-    private let apiKey: String?
-    private let temporaryToken: String?
-    private let keyterms: [String]
-    private let onTranscriptUpdate: (String) -> Void
-    private let onFinalTranscriptReady: (String) -> Void
-    private let onError: (Error) -> Void
-
-    private let stateQueue = DispatchQueue(label: "com.learningbuddy.assemblyai.state")
-    private let sendQueue = DispatchQueue(label: "com.learningbuddy.assemblyai.send")
-    private let audioPCM16Converter = BuddyPCM16AudioConverter(targetSampleRate: targetSampleRate)
-    private let urlSession: URLSession
-
-    private var webSocketTask: URLSessionWebSocketTask?
-    private var readyContinuation: CheckedContinuation<Void, Error>?
-    private var hasResolvedReadyContinuation = false
-    private var hasDeliveredFinalTranscript = false
-    private var isAwaitingExplicitFinalTranscript = false
-    private var latestTranscriptText = ""
-    private var activeTurnOrder: Int?
-    private var activeTurnTranscriptText = ""
-    private var storedTurnTranscriptsByOrder: [Int: StoredTurnTranscript] = [:]
-    private var explicitFinalTranscriptDeadlineWorkItem: DispatchWorkItem? 
- - init( - apiKey: String?, - temporaryToken: String?, - urlSession: URLSession, - keyterms: [String], - onTranscriptUpdate: @escaping (String) -> Void, - onFinalTranscriptReady: @escaping (String) -> Void, - onError: @escaping (Error) -> Void - ) { - self.apiKey = apiKey - self.temporaryToken = temporaryToken - self.urlSession = urlSession - self.keyterms = keyterms - self.onTranscriptUpdate = onTranscriptUpdate - self.onFinalTranscriptReady = onFinalTranscriptReady - self.onError = onError - } - - func open() async throws { - let websocketURL = try Self.makeWebsocketURL( - temporaryToken: temporaryToken, - keyterms: keyterms - ) - - var websocketRequest = URLRequest(url: websocketURL) - if let apiKey { - websocketRequest.setValue(apiKey, forHTTPHeaderField: "Authorization") - } - - let webSocketTask = urlSession.webSocketTask(with: websocketRequest) - self.webSocketTask = webSocketTask - webSocketTask.resume() - - receiveNextMessage() - - try await withCheckedThrowingContinuation { continuation in - stateQueue.async { - self.readyContinuation = continuation - } - } - } - - func appendAudioBuffer(_ audioBuffer: AVAudioPCMBuffer) { - guard let audioPCM16Data = audioPCM16Converter.convertToPCM16Data(from: audioBuffer), - !audioPCM16Data.isEmpty else { - return - } - - sendQueue.async { [weak self] in - guard let self, let webSocketTask = self.webSocketTask else { return } - webSocketTask.send(.data(audioPCM16Data)) { [weak self] error in - if let error { - self?.failSession(with: error) - } - } - } - } - - func requestFinalTranscript() { - stateQueue.async { - guard !self.hasDeliveredFinalTranscript else { return } - self.isAwaitingExplicitFinalTranscript = true - self.scheduleExplicitFinalTranscriptDeadline() - } - - sendJSONMessage(["type": "ForceEndpoint"]) - } - - func cancel() { - stateQueue.async { - self.explicitFinalTranscriptDeadlineWorkItem?.cancel() - self.explicitFinalTranscriptDeadlineWorkItem = nil - } - - sendJSONMessage(["type": "Terminate"]) - webSocketTask?.cancel(with: .goingAway, reason: nil) - } - - private func receiveNextMessage() { - webSocketTask?.receive { [weak self] result in - guard let self else { return } - - switch result { - case .success(let message): - switch message { - case .string(let text): - self.handleIncomingTextMessage(text) - case .data(let data): - if let text = String(data: data, encoding: .utf8) { - self.handleIncomingTextMessage(text) - } - @unknown default: - break - } - - self.receiveNextMessage() - case .failure(let error): - self.failSession(with: error) - } - } - } - - private func handleIncomingTextMessage(_ text: String) { - guard let messageData = text.data(using: .utf8) else { return } - - do { - let envelope = try JSONDecoder().decode(MessageEnvelope.self, from: messageData) - - switch envelope.type.lowercased() { - case "begin": - resolveReadyContinuationIfNeeded(with: .success(())) - case "turn": - let turnMessage = try JSONDecoder().decode(TurnMessage.self, from: messageData) - handleTurnMessage(turnMessage) - case "termination": - resolveReadyContinuationIfNeeded(with: .success(())) - stateQueue.async { - if self.isAwaitingExplicitFinalTranscript && !self.hasDeliveredFinalTranscript { - self.deliverFinalTranscriptIfNeeded(self.bestAvailableTranscriptText()) - } - } - case "error": - let errorMessage = try JSONDecoder().decode(ErrorMessage.self, from: messageData) - let messageText = errorMessage.error ?? errorMessage.message ?? "AssemblyAI returned an error." 
- failSession(with: AssemblyAIStreamingTranscriptionProviderError(message: messageText)) - default: - break - } - } catch { - failSession(with: error) - } - } - - private func handleTurnMessage(_ turnMessage: TurnMessage) { - let transcriptText = turnMessage.transcript? - .trimmingCharacters(in: .whitespacesAndNewlines) ?? "" - - stateQueue.async { - let turnOrder = turnMessage.turn_order - ?? self.activeTurnOrder - ?? ((self.storedTurnTranscriptsByOrder.keys.max() ?? -1) + 1) - - if turnMessage.end_of_turn == true || turnMessage.turn_is_formatted == true { - self.activeTurnOrder = nil - self.activeTurnTranscriptText = "" - self.storeTurnTranscript( - transcriptText, - forTurnOrder: turnOrder, - isFormatted: turnMessage.turn_is_formatted == true - ) - } else { - self.activeTurnOrder = turnOrder - self.activeTurnTranscriptText = transcriptText - } - - let fullTranscriptText = self.composeFullTranscript() - self.latestTranscriptText = fullTranscriptText - - if !fullTranscriptText.isEmpty { - self.onTranscriptUpdate(fullTranscriptText) - } - - guard self.isAwaitingExplicitFinalTranscript else { return } - - if turnMessage.end_of_turn == true || turnMessage.turn_is_formatted == true { - self.explicitFinalTranscriptDeadlineWorkItem?.cancel() - self.explicitFinalTranscriptDeadlineWorkItem = nil - self.deliverFinalTranscriptIfNeeded(self.bestAvailableTranscriptText()) - } - } - } - - private func storeTurnTranscript( - _ transcriptText: String, - forTurnOrder turnOrder: Int, - isFormatted: Bool - ) { - guard !transcriptText.isEmpty else { return } - - if let existingTurnTranscript = storedTurnTranscriptsByOrder[turnOrder] { - if existingTurnTranscript.isFormatted && !isFormatted { - return - } - } - - storedTurnTranscriptsByOrder[turnOrder] = StoredTurnTranscript( - transcriptText: transcriptText, - isFormatted: isFormatted - ) - } - - private func composeFullTranscript() -> String { - let committedTranscriptSegments = storedTurnTranscriptsByOrder - .sorted(by: { $0.key < $1.key }) - .map(\.value.transcriptText) - .filter { !$0.isEmpty } - - var transcriptSegments = committedTranscriptSegments - - let trimmedActiveTurnTranscriptText = activeTurnTranscriptText - .trimmingCharacters(in: .whitespacesAndNewlines) - - if !trimmedActiveTurnTranscriptText.isEmpty { - transcriptSegments.append(trimmedActiveTurnTranscriptText) - } - - return transcriptSegments.joined(separator: " ") - } - - private func scheduleExplicitFinalTranscriptDeadline() { - explicitFinalTranscriptDeadlineWorkItem?.cancel() - - let deadlineWorkItem = DispatchWorkItem { [weak self] in - self?.stateQueue.async { - guard let self else { return } - self.deliverFinalTranscriptIfNeeded(self.bestAvailableTranscriptText()) - } - } - - explicitFinalTranscriptDeadlineWorkItem = deadlineWorkItem - - DispatchQueue.main.asyncAfter( - deadline: .now() + Self.explicitFinalTranscriptGracePeriodSeconds, - execute: deadlineWorkItem - ) - } - - private func deliverFinalTranscriptIfNeeded(_ transcriptText: String) { - guard !hasDeliveredFinalTranscript else { return } - hasDeliveredFinalTranscript = true - explicitFinalTranscriptDeadlineWorkItem?.cancel() - explicitFinalTranscriptDeadlineWorkItem = nil - onFinalTranscriptReady(transcriptText) - sendJSONMessage(["type": "Terminate"]) - } - - private func sendJSONMessage(_ payload: [String: Any]) { - guard let jsonData = try? 
JSONSerialization.data(withJSONObject: payload), - let jsonString = String(data: jsonData, encoding: .utf8) else { - return - } - - sendQueue.async { [weak self] in - guard let self, let webSocketTask = self.webSocketTask else { return } - webSocketTask.send(.string(jsonString)) { [weak self] error in - if let error { - self?.failSession(with: error) - } - } - } - } - - private func failSession(with error: Error) { - resolveReadyContinuationIfNeeded(with: .failure(error)) - stateQueue.async { - let latestTranscriptText = self.bestAvailableTranscriptText() - - if self.isAwaitingExplicitFinalTranscript - && !self.hasDeliveredFinalTranscript - && !latestTranscriptText.isEmpty { - print("[AssemblyAI] ⚠️ WebSocket error during active session, delivering partial transcript as fallback: \(error.localizedDescription)") - self.deliverFinalTranscriptIfNeeded(latestTranscriptText) - return - } - print("[AssemblyAI] ❌ Session failed with error: \(error.localizedDescription)") - - self.onError(error) - } - } - - private func bestAvailableTranscriptText() -> String { - let composedTranscriptText = composeFullTranscript() - .trimmingCharacters(in: .whitespacesAndNewlines) - - if !composedTranscriptText.isEmpty { - return composedTranscriptText - } - - return latestTranscriptText.trimmingCharacters(in: .whitespacesAndNewlines) - } - - private func resolveReadyContinuationIfNeeded(with result: Result) { - stateQueue.async { - guard !self.hasResolvedReadyContinuation else { return } - self.hasResolvedReadyContinuation = true - - switch result { - case .success: - self.readyContinuation?.resume() - case .failure(let error): - self.readyContinuation?.resume(throwing: error) - } - - self.readyContinuation = nil - } - } - - private static func makeWebsocketURL( - temporaryToken: String?, - keyterms: [String] - ) throws -> URL { - guard var websocketURLComponents = URLComponents(string: websocketBaseURLString) else { - throw AssemblyAIStreamingTranscriptionProviderError( - message: "AssemblyAI websocket URL is invalid." - ) - } - - var queryItems = [ - URLQueryItem(name: "sample_rate", value: "16000"), - URLQueryItem(name: "encoding", value: "pcm_s16le"), - URLQueryItem(name: "format_turns", value: "true"), - URLQueryItem(name: "speech_model", value: "u3-rt-pro") - ] - - let normalizedKeyterms = keyterms - .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) } - .filter { !$0.isEmpty } - - if !normalizedKeyterms.isEmpty, - let keytermsData = try? JSONSerialization.data(withJSONObject: normalizedKeyterms), - let keytermsJSONString = String(data: keytermsData, encoding: .utf8) { - queryItems.append(URLQueryItem(name: "keyterms_prompt", value: keytermsJSONString)) - } - - if let temporaryToken { - queryItems.append(URLQueryItem(name: "token", value: temporaryToken)) - } - - websocketURLComponents.queryItems = queryItems - - guard let websocketURL = websocketURLComponents.url else { - throw AssemblyAIStreamingTranscriptionProviderError( - message: "AssemblyAI websocket URL could not be created." 
-            )
-        }
-
-        return websocketURL
-    }
-}
diff --git a/leanring-buddy/Voice/BuddyDictationManager.swift b/leanring-buddy/Voice/BuddyDictationManager.swift
index 5bca2677..3ee268a0 100644
--- a/leanring-buddy/Voice/BuddyDictationManager.swift
+++ b/leanring-buddy/Voice/BuddyDictationManager.swift
@@ -654,8 +654,6 @@ final class BuddyDictationManager: NSObject, ObservableObject {
         "makesomething",
         "Learning Buddy",
         "Codex",
-        "Claude",
-        "Anthropic",
         "OpenAI",
         "SwiftUI",
         "Xcode",
diff --git a/leanring-buddy/Voice/BuddyTranscriptionProvider.swift b/leanring-buddy/Voice/BuddyTranscriptionProvider.swift
index 0a75715d..1a7bfb1d 100644
--- a/leanring-buddy/Voice/BuddyTranscriptionProvider.swift
+++ b/leanring-buddy/Voice/BuddyTranscriptionProvider.swift
@@ -31,7 +31,6 @@ protocol BuddyTranscriptionProvider {
 
 enum BuddyTranscriptionProviderFactory {
     private enum PreferredProvider: String {
-        case assemblyAI = "assemblyai"
         case openAI = "openai"
         case appleSpeech = "apple"
     }
@@ -48,29 +47,12 @@ enum BuddyTranscriptionProviderFactory {
             .lowercased()
 
         let preferredProvider = preferredProviderRawValue.flatMap(PreferredProvider.init(rawValue:))
-        let assemblyAIProvider = AssemblyAIStreamingTranscriptionProvider()
         let openAIProvider = OpenAIAudioTranscriptionProvider()
 
         if preferredProvider == .appleSpeech {
             return AppleSpeechTranscriptionProvider()
         }
 
-        if preferredProvider == .assemblyAI {
-            if assemblyAIProvider.isConfigured {
-                return assemblyAIProvider
-            }
-
-            print("⚠️ Transcription: AssemblyAI preferred but not configured, falling back")
-
-            if openAIProvider.isConfigured {
-                print("⚠️ Transcription: using OpenAI as fallback")
-                return openAIProvider
-            }
-
-            print("⚠️ Transcription: using Apple Speech as fallback")
-            return AppleSpeechTranscriptionProvider()
-        }
-
         if preferredProvider == .openAI {
             if openAIProvider.isConfigured {
                 return openAIProvider
@@ -78,19 +60,10 @@
 
             print("⚠️ Transcription: OpenAI preferred but not configured, falling back")
 
-            if assemblyAIProvider.isConfigured {
-                print("⚠️ Transcription: using AssemblyAI as fallback")
-                return assemblyAIProvider
-            }
-
             print("⚠️ Transcription: using Apple Speech as fallback")
             return AppleSpeechTranscriptionProvider()
         }
 
-        if assemblyAIProvider.isConfigured {
-            return assemblyAIProvider
-        }
-
         if openAIProvider.isConfigured {
             return openAIProvider
         }
diff --git a/leanring-buddy/GlobalTextPromptShortcutMonitor.swift b/leanring-buddy/Voice/GlobalTextPromptShortcutMonitor.swift
similarity index 100%
rename from leanring-buddy/GlobalTextPromptShortcutMonitor.swift
rename to leanring-buddy/Voice/GlobalTextPromptShortcutMonitor.swift
diff --git a/worker/.dev.vars.example b/worker/.dev.vars.example
new file mode 100644
index 00000000..28d02e3c
--- /dev/null
+++ b/worker/.dev.vars.example
@@ -0,0 +1 @@
+OPENAI_API_KEY=sk-your-openai-key-here
diff --git a/worker/LOCAL_DEV.md b/worker/LOCAL_DEV.md
index 61cb6939..b32180a9 100644
--- a/worker/LOCAL_DEV.md
+++ b/worker/LOCAL_DEV.md
@@ -23,7 +23,4 @@ Do not run `xcodebuild` from the terminal for this project, because it can distu
 
 ## What The Keys Do
 
-- `ANTHROPIC_API_KEY`: sends the screen plus transcript to Claude and streams the answer.
-- `ASSEMBLYAI_API_KEY`: powers push-to-talk speech-to-text.
-- `ELEVENLABS_API_KEY`: generates the spoken reply audio.
-- `ELEVENLABS_VOICE_ID`: chooses which ElevenLabs voice speaks the reply.
+- `OPENAI_API_KEY`: sends the screen plus transcript to OpenAI, transcribes push-to-talk audio, and generates spoken reply audio.
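
Note: with the AssemblyAI token exchange and websocket session deleted above, push-to-talk audio now goes through the Worker's /transcribe route as a single request per utterance, which is why the streaming session plumbing could be removed wholesale. A minimal client-side sketch of that round-trip — the localhost port is wrangler dev's default, and the model name and `{ text }` response shape are assumptions based on OpenAI's transcription API, not shown in this patch:

    // Sketch: send one recorded utterance to the proxy and read back the text.
    // Assumes a local `wrangler dev` session; FormData/Blob/fetch are globals
    // in Workers, browsers, and Node 18+.
    async function transcribeSample(audio: ArrayBuffer): Promise<string> {
      const form = new FormData();
      form.append("file", new Blob([audio], { type: "audio/wav" }), "sample.wav");
      form.append("model", "whisper-1"); // assumed; the app may pin another model

      const response = await fetch("http://localhost:8787/transcribe", {
        method: "POST",
        body: form, // fetch sets the multipart/form-data boundary for us
      });

      if (!response.ok) {
        throw new Error(`transcribe failed: ${response.status}`);
      }

      const json = (await response.json()) as { text?: string };
      return json.text ?? "";
    }
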
diff --git a/worker/src/index.ts b/worker/src/index.ts
index 6d381f1f..77820bf0 100644
--- a/worker/src/index.ts
+++ b/worker/src/index.ts
@@ -1,20 +1,17 @@
 /**
  * Clicky Proxy Worker
  *
- * Proxies requests to OpenAI and ElevenLabs APIs so the app never
- * ships with raw API keys. Keys are stored as Cloudflare secrets.
+ * Proxies requests to OpenAI so the app never ships with raw API keys.
+ * Keys are stored as Cloudflare secrets.
  *
  * Routes:
  *   POST /chat → OpenAI Chat Completions API (streaming)
- *   POST /tts → ElevenLabs TTS API
- *   POST /transcribe-token → AssemblyAI temp token
+ *   POST /tts → OpenAI Speech API
+ *   POST /transcribe → OpenAI Audio Transcriptions API
  */
 
 interface Env {
   OPENAI_API_KEY: string;
-  ELEVENLABS_API_KEY: string;
-  ELEVENLABS_VOICE_ID: string;
-  ASSEMBLYAI_API_KEY: string;
 }
 
 export default {
@@ -25,10 +22,7 @@ export default {
       return new Response(
         JSON.stringify({
           ok: true,
-          hasAnthropicKey: Boolean(env.ANTHROPIC_API_KEY),
-          hasAssemblyAIKey: Boolean(env.ASSEMBLYAI_API_KEY),
-          hasElevenLabsKey: Boolean(env.ELEVENLABS_API_KEY),
-          hasElevenLabsVoiceId: Boolean(env.ELEVENLABS_VOICE_ID),
+          hasOpenAIKey: Boolean(env.OPENAI_API_KEY),
         }),
         { status: 200, headers: { "content-type": "application/json" } }
       );
@@ -47,10 +41,6 @@ export default {
       return await handleTTS(request, env);
     }
 
-    if (url.pathname === "/transcribe-token") {
-      return await handleTranscribeToken(env);
-    }
-
     if (url.pathname === "/transcribe") {
       return await handleTranscribe(request, env);
     }
@@ -96,33 +86,6 @@ async function handleChat(request: Request, env: Env): Promise<Response> {
   });
 }
 
-async function handleTranscribeToken(env: Env): Promise<Response> {
-  const response = await fetch(
-    "https://streaming.assemblyai.com/v3/token?expires_in_seconds=480",
-    {
-      method: "GET",
-      headers: {
-        authorization: env.ASSEMBLYAI_API_KEY,
-      },
-    }
-  );
-
-  if (!response.ok) {
-    const errorBody = await response.text();
-    console.error(`[/transcribe-token] AssemblyAI token error ${response.status}: ${errorBody}`);
-    return new Response(errorBody, {
-      status: response.status,
-      headers: { "content-type": "application/json" },
-    });
-  }
-
-  const data = await response.text();
-  return new Response(data, {
-    status: 200,
-    headers: { "content-type": "application/json" },
-  });
-}
-
 async function handleTranscribe(request: Request, env: Env): Promise<Response> {
   const body = await request.arrayBuffer();
   const contentType = request.headers.get("content-type") || "multipart/form-data";
diff --git a/worker/wrangler.toml b/worker/wrangler.toml
index b4bdbf38..f5f6726f 100644
--- a/worker/wrangler.toml
+++ b/worker/wrangler.toml
@@ -1,6 +1,3 @@
 name = "clicky-proxy"
 main = "src/index.ts"
 compatibility_date = "2024-01-01"
-
-[vars]
-ELEVENLABS_VOICE_ID = "kPzsL2i3teMYv0FxEYQ6"
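
Note: the route table now points /tts at OpenAI's Speech API, but the updated handleTTS body falls outside these hunks. A hedged sketch of what such a proxy handler could look like — the endpoint is OpenAI's documented /v1/audio/speech, while the model, voice, request shape, and output format are illustrative assumptions rather than the Worker's actual code:

    // Hypothetical shape of a /tts handler that forwards to OpenAI's Speech
    // API using the single OPENAI_API_KEY secret from Env above.
    async function handleTTSSketch(request: Request, env: Env): Promise<Response> {
      const { text } = (await request.json()) as { text: string };

      const upstream = await fetch("https://api.openai.com/v1/audio/speech", {
        method: "POST",
        headers: {
          authorization: `Bearer ${env.OPENAI_API_KEY}`,
          "content-type": "application/json",
        },
        body: JSON.stringify({
          model: "tts-1", // illustrative; the Worker may pin a different model
          voice: "alloy", // illustrative voice name
          input: text,
        }),
      });

      // Stream the audio bytes straight back to the app without buffering.
      return new Response(upstream.body, {
        status: upstream.status,
        headers: { "content-type": "audio/mpeg" },
      });
    }

Passing `upstream.body` directly to the Response keeps the Worker memory-flat, the same pass-through pattern the existing handlers use for proxied responses.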