diff --git a/frontend/public/audio/mic-off-full.wav b/frontend/public/audio/mic-off-full.wav new file mode 100644 index 00000000..8935145c Binary files /dev/null and b/frontend/public/audio/mic-off-full.wav differ diff --git a/frontend/public/audio/mic-on-full.wav b/frontend/public/audio/mic-on-full.wav new file mode 100644 index 00000000..28c94340 Binary files /dev/null and b/frontend/public/audio/mic-on-full.wav differ diff --git a/frontend/src/components/RecordingOverlay.tsx b/frontend/src/components/RecordingOverlay.tsx index caa7fb26..73e0bb26 100644 --- a/frontend/src/components/RecordingOverlay.tsx +++ b/frontend/src/components/RecordingOverlay.tsx @@ -1,8 +1,17 @@ import { useEffect, useState, useRef, useMemo } from "react"; -import { X, CornerRightUp, Loader2 } from "lucide-react"; +import { X, CornerRightUp, Loader2, RotateCcw, Trash2 } from "lucide-react"; import { Button } from "@/components/ui/button"; import { cn } from "@/utils/utils"; +/** The current phase shown by the overlay. 
*/ +export type VoiceOverlayState = + | "recording" + | "processing" + | "error" + | "waiting" + | "generating" + | "playing"; + interface RecordingOverlayProps { isRecording: boolean; isProcessing?: boolean; @@ -10,6 +19,18 @@ interface RecordingOverlayProps { onCancel: () => void; isCompact?: boolean; className?: string; + + // Voice-mode extensions + /** Current voice-mode state (when omitted, falls back to "processing" if isProcessing is set, otherwise "recording") */ + voiceState?: VoiceOverlayState; + /** Error message to display in error state */ + errorMessage?: string; + /** Duration in seconds of the recording that failed (shown in error state) */ + savedDuration?: number; + /** Called when user taps Retry in error state */ + onRetry?: () => void; + /** Called when user taps Discard in error state */ + onDiscard?: () => void; } export function RecordingOverlay({ @@ -18,14 +39,38 @@ export function RecordingOverlay({ onSend, onCancel, isCompact = false, - className + className, + voiceState: voiceStateProp, + errorMessage, + savedDuration, + onRetry, + onDiscard }: RecordingOverlayProps) { + // Derive the effective state: use voiceState prop if provided, otherwise fall back + const effectiveState: VoiceOverlayState = voiceStateProp + ? voiceStateProp + : isProcessing + ?
"processing" + : "recording"; + const [duration, setDuration] = useState(0); const startTimeRef = useRef(0); const animationFrameRef = useRef(); + // Reset duration immediately when effectiveState changes to recording + // so a newly started recording doesn't flash the previous value + const prevEffectiveStateRef = useRef(effectiveState); + if (prevEffectiveStateRef.current !== effectiveState) { + prevEffectiveStateRef.current = effectiveState; + if (effectiveState === "recording") { + // Synchronous state reset — avoids the one-frame flash of stale duration + // that happens when setDuration(0) is called only inside useEffect + setDuration(0); + } + } + useEffect(() => { - if (isRecording && !isProcessing) { + if (effectiveState === "recording") { startTimeRef.current = Date.now(); const updateTimer = () => { @@ -42,7 +87,7 @@ export function RecordingOverlay({ } }; } - }, [isRecording, isProcessing]); + }, [effectiveState]); const formatTime = (seconds: number) => { const mins = Math.floor(seconds / 60); @@ -50,6 +95,9 @@ export function RecordingOverlay({ return `${mins}:${secs.toString().padStart(2, "0")}`; }; + // Determine color scheme based on state + const isPlaybackStyle = effectiveState === "generating" || effectiveState === "playing"; + // Generate stable bar configurations once when component mounts const waveformBars = useMemo(() => { const barCount = 30; @@ -83,16 +131,24 @@ export function RecordingOverlay({ return bars; }, []); // Empty deps = generated once + const shouldAnimate = + effectiveState === "recording" || + effectiveState === "generating" || + effectiveState === "playing"; + const renderWaveformBars = () => { + const barColorClass = isPlaybackStyle ? "bg-blue-400/50" : "bg-primary/40"; + const animName = isPlaybackStyle ? "pulse-blue" : "pulse"; + return waveformBars.map((bar, i) => (
{ + switch (effectiveState) { + case "recording": + return ( + <> +
+ Recording + + ); + case "processing": + return ( + <> + + Processing... + + ); + case "error": + return ( +
+
+ {errorMessage || "Transcription failed"} +
+ {savedDuration !== undefined && ( +
+ Recording: {formatTime(savedDuration)} +
+ )} +
+ {onRetry && ( + + )} + {onDiscard && ( + + )} +
+
+ ); + case "waiting": + return ( +
+
+ Waiting for response... +
+ ); + case "generating": + return ( +
+ + Generating audio... +
+ ); + case "playing": + return ( +
+
+ Playing +
+ ); + } + }; return (
-
- {/* Top buttons - Cancel on left, Send on right */} +
+ {/* Top buttons */}
- + {showSendButton && ( + + )}
-
- {/* Waveform visualization - only show when not compact */} - {!isCompact && ( +
+ {/* Waveform visualization - show for recording (non-compact), generating, and playing (always) */} + {((!isCompact && effectiveState === "recording") || + effectiveState === "generating" || + effectiveState === "playing") && (
{renderWaveformBars()}
)} - {/* Timer */} -
{formatTime(duration)}
+ {/* Timer - show during recording and processing */} + {(effectiveState === "recording" || effectiveState === "processing") && (
{formatTime(duration)}
+ )} - {/* Status indicator - only show when not compact */} - {!isCompact && ( + {/* Status indicator - show in all modes for voice states, only non-compact for recording and processing */} + {(!isCompact || (effectiveState !== "recording" && effectiveState !== "processing")) && (
- {isProcessing ? ( - <> - - Processing... - - ) : ( - <> -
- Recording - - )} + {renderStatusContent()}
)}
diff --git a/frontend/src/components/UnifiedChat.tsx b/frontend/src/components/UnifiedChat.tsx index 41ed4874..e81cd248 100644 --- a/frontend/src/components/UnifiedChat.tsx +++ b/frontend/src/components/UnifiedChat.tsx @@ -39,7 +39,8 @@ import { Maximize2, Minimize2, Volume2, - Square + Square, + Download } from "lucide-react"; import RecordRTC from "recordrtc"; import { useQueryClient } from "@tanstack/react-query"; @@ -59,7 +60,7 @@ import { useOpenSecret } from "@opensecret/react"; import { UpgradePromptDialog } from "@/components/UpgradePromptDialog"; import { DocumentPlatformDialog } from "@/components/DocumentPlatformDialog"; import { ContextLimitDialog } from "@/components/ContextLimitDialog"; -import { RecordingOverlay } from "@/components/RecordingOverlay"; +import { RecordingOverlay, type VoiceOverlayState } from "@/components/RecordingOverlay"; import { WebSearchInfoDialog } from "@/components/WebSearchInfoDialog"; import { TTSDownloadDialog } from "@/components/TTSDownloadDialog"; import { useTTS } from "@/services/tts/TTSContext"; @@ -71,7 +72,7 @@ import { DropdownMenuItem, DropdownMenuTrigger } from "@/components/ui/dropdown-menu"; -import { isTauri } from "@/utils/platform"; +import { isTauri, isTauriDesktop, isIOS } from "@/utils/platform"; import type { InputTextContent, OutputTextContent, @@ -201,8 +202,18 @@ function TTSButton({ onNeedsSetup: () => void; onManage: () => void; }) { - const { status, isPlaying, currentPlayingId, speak, stop, isTauriEnv } = useTTS(); + const { + status, + isPlaying, + isGenerating, + currentPlayingId, + speak, + stop, + cancelGeneration, + isTauriEnv + } = useTTS(); const isThisPlaying = isPlaying && currentPlayingId === messageId; + const isThisGenerating = isGenerating && currentPlayingId === messageId; const longPressTimer = useRef | null>(null); // Cleanup timer on unmount @@ -226,7 +237,10 @@ function TTSButton({ } if (status === "ready") { - if (isThisPlaying) { + if (isThisGenerating) { + // Tap during 
generation cancels it + cancelGeneration(); + } else if (isThisPlaying) { stop(); } else { await speak(text, messageId); @@ -258,18 +272,22 @@ function TTSButton({ @@ -3059,14 +3550,31 @@ export function UnifiedChat() {
{/* Recording overlay for centered input */} - {isRecording && ( + {(isRecording || voiceState) && ( stopRecording(true)} - onCancel={() => stopRecording(false)} + onCancel={() => { + if (voiceState === "error" && !voiceModeRef.current) { + // Dismiss error overlay in non-voice mode + setVoiceState(null); + setRecordingBlob(null); + setVoiceErrorMessage(null); + setVoiceRetryCount(0); + setRecordingDuration(0); + } else { + stopRecording(false); + if (voiceModeRef.current) exitVoiceMode(); + } + }} isCompact={false} className="absolute inset-0 rounded-xl" + voiceState={voiceState || undefined} + errorMessage={voiceErrorMessage || undefined} + savedDuration={recordingDuration ?? undefined} + onRetry={handleVoiceRetry} + onDiscard={handleVoiceDiscard} /> )}
@@ -3269,11 +3777,13 @@ export function UnifiedChat() { {/* Mic button */} @@ -3302,14 +3812,30 @@ export function UnifiedChat() {
{/* Recording overlay for bottom input */} - {isRecording && ( + {(isRecording || voiceState) && ( stopRecording(true)} - onCancel={() => stopRecording(false)} + onCancel={() => { + if (voiceState === "error" && !voiceModeRef.current) { + setVoiceState(null); + setRecordingBlob(null); + setVoiceErrorMessage(null); + setVoiceRetryCount(0); + setRecordingDuration(0); + } else { + stopRecording(false); + if (voiceModeRef.current) exitVoiceMode(); + } + }} isCompact={true} className="absolute inset-0 rounded-xl" + voiceState={voiceState || undefined} + errorMessage={voiceErrorMessage || undefined} + savedDuration={recordingDuration ?? undefined} + onRetry={handleVoiceRetry} + onDiscard={handleVoiceDiscard} /> )}
@@ -3372,6 +3898,42 @@ export function UnifiedChat() { {/* TTS setup dialog */} + {/* TTS discovery prompt - shown once after first voice message on supported platforms */} + {showTTSDiscovery && ( +
+
+
+

Enable voice responses?

+

+ Download a text-to-speech model (~264 MB) to hear Maple read responses aloud. +

+
+
+ + +
+
+
+ )} + {/* Hidden file inputs - must be outside conditional rendering to work in both views */} Promise; deleteModels: () => Promise; speak: (text: string, messageId: string) => Promise; + /** Like speak(), but returns a promise that resolves when playback ends (for voice mode loop). */ + speakAndWait: (text: string, messageId: string) => Promise; stop: () => void; + /** Cancel an in-progress TTS generation (returns to idle without playing). */ + cancelGeneration: () => void; clearPlaybackError: () => void; } @@ -72,6 +77,7 @@ export function TTSProvider({ children }: { children: ReactNode }) { const [downloadDetail, setDownloadDetail] = useState(""); const [totalSizeMB, setTotalSizeMB] = useState(264); const [isPlaying, setIsPlaying] = useState(false); + const [isGenerating, setIsGenerating] = useState(false); const [currentPlayingId, setCurrentPlayingId] = useState(null); const [playbackError, setPlaybackError] = useState(null); @@ -85,6 +91,9 @@ export function TTSProvider({ children }: { children: ReactNode }) { } | null>(null); const unlistenRef = useRef<(() => void) | null>(null); + // Generation sequence ID to detect stale results + const generationSeqRef = useRef(0); + const cleanupDownloadListener = useCallback(() => { if (unlistenRef.current) { unlistenRef.current(); @@ -171,7 +180,7 @@ export function TTSProvider({ children }: { children: ReactNode }) { } }, [isTauriEnv, cleanupDownloadListener]); - const stop = useCallback(() => { + const stopPlayback = useCallback(() => { if (audioUrlRef.current) { URL.revokeObjectURL(audioUrlRef.current); audioUrlRef.current = null; @@ -219,6 +228,22 @@ export function TTSProvider({ children }: { children: ReactNode }) { setCurrentPlayingId(null); }, []); + const stop = useCallback(() => { + // Invalidate any in-flight generation + generationSeqRef.current++; + setIsGenerating(false); + stopPlayback(); + }, [stopPlayback]); + + const cancelGeneration = useCallback(() => { + // Bump the sequence so the in-flight 
tts_synthesize result is discarded + generationSeqRef.current++; + setIsGenerating(false); + // Also stop playback to close the AudioContext — this prevents audio from + // playing if cancelGeneration is called during the async decoding window. + stopPlayback(); + }, [stopPlayback]); + const deleteModels = useCallback(async () => { if (!isTauriEnv) return; @@ -238,27 +263,44 @@ export function TTSProvider({ children }: { children: ReactNode }) { } }, [isTauriEnv, stop]); - const speak = useCallback( - async (text: string, messageId: string) => { + /** + * Internal speak implementation. Returns a promise that resolves when playback ends + * if `waitForEnd` is true, otherwise resolves immediately after starting playback. + */ + const speakInternal = useCallback( + async (text: string, messageId: string, waitForEnd: boolean): Promise => { if (!isTauriEnv || status !== "ready") return; - // Stop any currently playing audio + // Stop any currently playing audio and invalidate previous generation stop(); // Preprocess text to remove think blocks and other non-speakable content const processedText = preprocessTextForTTS(text); if (!processedText) { - return; + // Signal to callers that there was nothing to play (not a real error). + // Voice loop catches this to restart recording instead of exiting. 
+ throw new Error("no_speakable_text"); } + // Capture a sequence ID for this generation so we can detect staleness + const mySeq = ++generationSeqRef.current; + try { - setIsPlaying(true); + setIsGenerating(true); setCurrentPlayingId(messageId); const result = await invoke("tts_synthesize", { text: processedText }); + // Check if this generation is still current + if (generationSeqRef.current !== mySeq) { + // Stale - a newer speak() call or cancelGeneration() happened + return; + } + + setIsGenerating(false); + // Create audio from base64 const audioBlob = base64ToBlob(result.audio_base64, "audio/wav"); const audioUrl = URL.createObjectURL(audioBlob); @@ -304,7 +346,7 @@ export function TTSProvider({ children }: { children: ReactNode }) { } // iOS: try to force media playback routing (speaker) for Web Audio. - // This helps avoid “only works with headphones / earpiece” routing issues. + // This helps avoid "only works with headphones / earpiece" routing issues. try { // eslint-disable-next-line @typescript-eslint/no-explicit-any const nav = navigator as any; @@ -318,11 +360,25 @@ export function TTSProvider({ children }: { children: ReactNode }) { const audioContext = new AudioContextClass() as AudioContext; + // Store context ref immediately so stopPlayback() can clean it up + // if any operation below throws (prevents AudioContext resource leak) + audioContextRef.current = audioContext; + // iOS requires user interaction to start audio - resume if suspended if (audioContext.state === "suspended") { await audioContext.resume(); } + // Re-check staleness after async work + if (generationSeqRef.current !== mySeq) { + void audioContext.close().catch(() => {}); + if (audioContextRef.current === audioContext) { + audioContextRef.current = null; + } + URL.revokeObjectURL(audioUrl); + return; + } + const arrayBuffer = await audioBlob.arrayBuffer(); const audioBuffer = await audioContext.decodeAudioData(arrayBuffer); @@ -330,62 +386,97 @@ export function TTSProvider({ 
children }: { children: ReactNode }) { source.buffer = audioBuffer; source.connect(audioContext.destination); - // Store context and source for stop functionality - audioContextRef.current = audioContext; sourceNodeRef.current = source; - source.onended = () => { - if (sourceNodeRef.current !== source) { - return; - } - setIsPlaying(false); - setCurrentPlayingId(null); + setIsPlaying(true); - if (audioUrlRef.current === audioUrl) { - URL.revokeObjectURL(audioUrlRef.current); - audioUrlRef.current = null; - } - void audioContext.close().catch(() => { - // Ignore - }); - audioContextRef.current = null; - sourceNodeRef.current = null; - - if (audioSessionPrevTypeRef.current) { - try { - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const nav = navigator as any; - if (nav.audioSession && typeof nav.audioSession.type === "string") { - nav.audioSession.type = audioSessionPrevTypeRef.current; - } - } catch { + // Wrap playback in a promise that resolves when it ends (for waitForEnd mode) + const playbackPromise = new Promise((resolve) => { + source.onended = () => { + if (sourceNodeRef.current !== source) { + resolve(); + return; + } + setIsPlaying(false); + setCurrentPlayingId(null); + + if (audioUrlRef.current === audioUrl) { + URL.revokeObjectURL(audioUrlRef.current); + audioUrlRef.current = null; + } + void audioContext.close().catch(() => { // Ignore + }); + audioContextRef.current = null; + sourceNodeRef.current = null; + + if (audioSessionPrevTypeRef.current) { + try { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const nav = navigator as any; + if (nav.audioSession && typeof nav.audioSession.type === "string") { + nav.audioSession.type = audioSessionPrevTypeRef.current; + } + } catch { + // Ignore + } + audioSessionPrevTypeRef.current = null; } - audioSessionPrevTypeRef.current = null; - } - if (mediaSessionPrevStateRef.current) { - try { - if ("mediaSession" in navigator) { - navigator.mediaSession.metadata = 
mediaSessionPrevStateRef.current.metadata; - navigator.mediaSession.playbackState = - mediaSessionPrevStateRef.current.playbackState; + if (mediaSessionPrevStateRef.current) { + try { + if ("mediaSession" in navigator) { + navigator.mediaSession.metadata = mediaSessionPrevStateRef.current.metadata; + navigator.mediaSession.playbackState = + mediaSessionPrevStateRef.current.playbackState; + } + } catch { + // Ignore } - } catch { - // Ignore + mediaSessionPrevStateRef.current = null; } - mediaSessionPrevStateRef.current = null; - } - }; + + resolve(); + }; + }); source.start(0); + + if (waitForEnd) { + await playbackPromise; + } } catch (err) { + // Check staleness before setting error state + if (generationSeqRef.current !== mySeq) return; + console.error("TTS playback failed:", err); + setIsGenerating(false); setPlaybackError(err instanceof Error ? err.message : "TTS playback failed"); - stop(); + stopPlayback(); + // Re-throw so speakAndWait callers (e.g. voice mode loop) can catch and exit + throw err; } }, - [isTauriEnv, status, stop] + [isTauriEnv, status, stop, stopPlayback] + ); + + const speak = useCallback( + async (text: string, messageId: string) => { + try { + await speakInternal(text, messageId, false); + } catch { + // Error already handled by speakInternal (playbackError state set). + // Only speakAndWait needs to propagate for voice mode loop. + } + }, + [speakInternal] + ); + + const speakAndWait = useCallback( + async (text: string, messageId: string) => { + await speakInternal(text, messageId, true); + }, + [speakInternal] ); const clearPlaybackError = useCallback(() => { @@ -452,13 +543,16 @@ export function TTSProvider({ children }: { children: ReactNode }) { downloadDetail, totalSizeMB, isPlaying, + isGenerating, currentPlayingId, isTauriEnv, checkStatus, startDownload, deleteModels, speak, + speakAndWait, stop, + cancelGeneration, clearPlaybackError }} >