diff --git a/frontend/public/audio/mic-off-full.wav b/frontend/public/audio/mic-off-full.wav
new file mode 100644
index 00000000..8935145c
Binary files /dev/null and b/frontend/public/audio/mic-off-full.wav differ
diff --git a/frontend/public/audio/mic-on-full.wav b/frontend/public/audio/mic-on-full.wav
new file mode 100644
index 00000000..28c94340
Binary files /dev/null and b/frontend/public/audio/mic-on-full.wav differ
diff --git a/frontend/src/components/RecordingOverlay.tsx b/frontend/src/components/RecordingOverlay.tsx
index caa7fb26..73e0bb26 100644
--- a/frontend/src/components/RecordingOverlay.tsx
+++ b/frontend/src/components/RecordingOverlay.tsx
@@ -1,8 +1,17 @@
 import { useEffect, useState, useRef, useMemo } from "react";
-import { X, CornerRightUp, Loader2 } from "lucide-react";
+import { X, CornerRightUp, Loader2, RotateCcw, Trash2 } from "lucide-react";
 import { Button } from "@/components/ui/button";
 import { cn } from "@/utils/utils";
 
+/** The current phase shown by the overlay. */
+export type VoiceOverlayState =
+  | "recording"
+  | "processing"
+  | "error"
+  | "waiting"
+  | "generating"
+  | "playing";
+
 interface RecordingOverlayProps {
   isRecording: boolean;
   isProcessing?: boolean;
@@ -10,6 +19,18 @@ interface RecordingOverlayProps {
   onCancel: () => void;
   isCompact?: boolean;
   className?: string;
+
+  // Voice-mode extensions
+  /** Current voice-mode state; when omitted, the overlay shows "processing" if isProcessing is set and "recording" otherwise */
+  voiceState?: VoiceOverlayState;
+  /** Error message to display in error state */
+  errorMessage?: string;
+  /** Duration of the recording that failed (shown in error state) */
+  savedDuration?: number;
+  /** Called when user taps Retry in error state */
+  onRetry?: () => void;
+  /** Called when user taps Discard in error state */
+  onDiscard?: () => void;
 }
 
 export function RecordingOverlay({
@@ -18,14 +39,38 @@ export function RecordingOverlay({
   onSend,
   onCancel,
   isCompact = false,
-  className
+  className,
+  voiceState: voiceStateProp,
+  errorMessage,
+  savedDuration,
+  onRetry,
+  onDiscard
 }: RecordingOverlayProps) {
+  // Derive the effective state: use the voiceState prop if provided, otherwise fall back to isProcessing
+  const effectiveState: VoiceOverlayState = voiceStateProp
+    ? voiceStateProp
+    : isProcessing
+      ? "processing"
+      : "recording";
+
   const [duration, setDuration] = useState(0);
   const startTimeRef = useRef<number>(0);
   const animationFrameRef = useRef<number>();
 
+  // Reset duration during render whenever effectiveState transitions into "recording"
+  // so a newly started recording doesn't flash the previous recording's elapsed time
+  const prevEffectiveStateRef = useRef(effectiveState);
+  if (prevEffectiveStateRef.current !== effectiveState) {
+    prevEffectiveStateRef.current = effectiveState;
+    if (effectiveState === "recording") {
+      // Synchronous state reset — avoids the one-frame flash of stale duration
+      // that happens when setDuration(0) is called only inside useEffect
+      setDuration(0);
+    }
+  }
+
   useEffect(() => {
-    if (isRecording && !isProcessing) {
+    if (effectiveState === "recording") {
       startTimeRef.current = Date.now();
 
       const updateTimer = () => {
@@ -42,7 +87,7 @@ export function RecordingOverlay({
         }
       };
     }
-  }, [isRecording, isProcessing]);
+  }, [effectiveState]);
 
   const formatTime = (seconds: number) => {
     const mins = Math.floor(seconds / 60);
@@ -50,6 +95,9 @@ export function RecordingOverlay({
     return `${mins}:${secs.toString().padStart(2, "0")}`;
   };
 
+  // Determine color scheme based on state
+  const isPlaybackStyle = effectiveState === "generating" || effectiveState === "playing";
+
   // Generate stable bar configurations once when component mounts
   const waveformBars = useMemo(() => {
     const barCount = 30;
@@ -83,16 +131,24 @@ export function RecordingOverlay({
     return bars;
   }, []); // Empty deps = generated once
 
+  const shouldAnimate =
+    effectiveState === "recording" ||
+    effectiveState === "generating" ||
+    effectiveState === "playing";
+
   const renderWaveformBars = () => {
+    const barColorClass = isPlaybackStyle ? "bg-blue-400/50" : "bg-primary/40";
+    const animName = isPlaybackStyle ? "pulse-blue" : "pulse";
+
     return waveformBars.map((bar, i) => (
       <div
         key={i}
-        className="flex-shrink-0 bg-primary/40 rounded-full"
+        className={cn("flex-shrink-0 rounded-full", barColorClass)}
         style={{
           width: "2px",
           height: `${bar.height}%`,
-          animation: isRecording
-            ? `pulse ${bar.animationDuration}s ease-in-out ${bar.animationDelay}s infinite`
+          animation: shouldAnimate
+            ? `${animName} ${bar.animationDuration}s ease-in-out ${bar.animationDelay}s infinite`
             : "none",
           transition: "height 0.3s ease-out"
         }}
@@ -100,7 +156,84 @@ export function RecordingOverlay({
     ));
   };
 
-  if (!isRecording) return null;
+  // Show the overlay when recording OR when in any voice-mode state
+  const isVisible = isRecording || !!voiceStateProp;
+  if (!isVisible) return null;
+
+  // Whether the top-right send button should be shown (recording and processing states only)
+  const showSendButton = effectiveState === "recording" || effectiveState === "processing";
+
+  const renderStatusContent = () => {
+    switch (effectiveState) {
+      case "recording":
+        return (
+          <>
+            <div className="w-2 h-2 bg-destructive rounded-full animate-pulse" />
+            Recording
+          </>
+        );
+      case "processing":
+        return (
+          <>
+            <Loader2 className="w-4 h-4 animate-spin" />
+            Processing...
+          </>
+        );
+      case "error":
+        return (
+          <div className="flex flex-col items-center gap-3">
+            <div className="text-destructive text-sm text-center max-w-xs">
+              {errorMessage || "Transcription failed"}
+            </div>
+            {savedDuration !== undefined && (
+              <div className="text-xs text-muted-foreground">
+                Recording: {formatTime(savedDuration)}
+              </div>
+            )}
+            <div className="flex items-center gap-2">
+              {onRetry && (
+                <Button onClick={onRetry} variant="outline" size="sm" className="gap-1.5">
+                  <RotateCcw className="h-3.5 w-3.5" />
+                  Retry
+                </Button>
+              )}
+              {onDiscard && (
+                <Button
+                  onClick={onDiscard}
+                  variant="ghost"
+                  size="sm"
+                  className="gap-1.5 text-muted-foreground"
+                >
+                  <Trash2 className="h-3.5 w-3.5" />
+                  Discard
+                </Button>
+              )}
+            </div>
+          </div>
+        );
+      case "waiting":
+        return (
+          <div className="flex items-center gap-2">
+            <div className="w-2 h-2 bg-blue-400 rounded-full animate-[breathing_2s_ease-in-out_infinite]" />
+            Waiting for response...
+          </div>
+        );
+      case "generating":
+        return (
+          <div className="flex items-center gap-2 text-blue-400">
+            <Loader2 className="w-4 h-4 animate-spin" />
+            Generating audio...
+          </div>
+        );
+      case "playing":
+        return (
+          <div className="flex items-center gap-2 text-blue-400">
+            <div className="w-2 h-2 bg-blue-400 rounded-full animate-pulse" />
+            Playing
+          </div>
+        );
+    }
+  };
 
   return (
     <div
@@ -116,68 +249,82 @@ export function RecordingOverlay({
             0%, 100% { transform: scaleY(0.5); opacity: 0.6; }
             50% { transform: scaleY(1); opacity: 1; }
           }
+          @keyframes pulse-blue {
+            0%, 100% { transform: scaleY(0.5); opacity: 0.5; }
+            50% { transform: scaleY(1); opacity: 0.9; }
+          }
+          @keyframes breathing {
+            0%, 100% { opacity: 0.4; transform: scale(0.9); }
+            50% { opacity: 1; transform: scale(1.1); }
+          }
         `}
       </style>
 
-      <div className="w-full h-full rounded-lg bg-background/95 backdrop-blur-sm border border-primary/20 relative overflow-hidden flex flex-col items-center justify-center p-4">
-        {/* Top buttons - Cancel on left, Send on right */}
+      <div
+        className={cn(
+          "w-full h-full rounded-lg bg-background/95 backdrop-blur-sm border relative overflow-hidden flex flex-col items-center justify-center p-4",
+          isPlaybackStyle ? "border-blue-400/30" : "border-primary/20"
+        )}
+      >
+        {/* Top buttons */}
         <div className="absolute top-3 left-3 right-3 flex justify-between">
           <Button
             onClick={onCancel}
             variant="ghost"
             size="icon"
             className="rounded-full hover:bg-muted"
-            aria-label="Cancel recording"
-            disabled={isProcessing}
+            aria-label={effectiveState === "recording" ? "Cancel recording" : "Exit voice mode"}
+            disabled={effectiveState === "processing"}
           >
             <X className="h-4 w-4" />
           </Button>
 
-          <Button
-            onClick={onSend}
-            size={isCompact ? "icon" : "sm"}
-            className={cn(isCompact ? "rounded-full" : "gap-1.5")}
-            aria-label="Send recording"
-            disabled={isProcessing}
-          >
-            {isProcessing ? (
-              <Loader2 className={cn(isCompact ? "h-4 w-4" : "h-3.5 w-3.5", "animate-spin")} />
-            ) : isCompact ? (
-              <CornerRightUp className="h-4 w-4" />
-            ) : (
-              <>
-                <CornerRightUp className="h-3.5 w-3.5" />
-                Send
-              </>
-            )}
-          </Button>
+          {showSendButton && (
+            <Button
+              onClick={onSend}
+              size={isCompact ? "icon" : "sm"}
+              className={cn(isCompact ? "rounded-full" : "gap-1.5")}
+              aria-label="Send recording"
+              disabled={effectiveState === "processing"}
+            >
+              {effectiveState === "processing" ? (
+                <Loader2 className={cn(isCompact ? "h-4 w-4" : "h-3.5 w-3.5", "animate-spin")} />
+              ) : isCompact ? (
+                <CornerRightUp className="h-4 w-4" />
+              ) : (
+                <>
+                  <CornerRightUp className="h-3.5 w-3.5" />
+                  Send
+                </>
+              )}
+            </Button>
+          )}
         </div>
 
-        <div className="flex flex-col items-center gap-6 max-w-md w-full">
-          {/* Waveform visualization - only show when not compact */}
-          {!isCompact && (
+        <div
+          className={cn(
+            "flex flex-col items-center max-w-md w-full",
+            isCompact ? "gap-2" : "gap-6"
+          )}
+        >
+          {/* Waveform visualization - show for recording (non-compact), generating, and playing (always) */}
+          {((!isCompact && effectiveState === "recording") ||
+            effectiveState === "generating" ||
+            effectiveState === "playing") && (
             <div className="flex items-center justify-center h-12 w-full gap-0.5 px-4">
               {renderWaveformBars()}
             </div>
           )}
 
-          {/* Timer */}
-          <div className="text-2xl font-mono text-muted-foreground">{formatTime(duration)}</div>
+          {/* Timer - show during recording and processing */}
+          {(effectiveState === "recording" || effectiveState === "processing") && (
+            <div className="text-2xl font-mono text-muted-foreground">{formatTime(duration)}</div>
+          )}
 
-          {/* Status indicator - only show when not compact */}
-          {!isCompact && (
+          {/* Status indicator - always shown for the voice states; for recording/processing only when not compact */}
+          {(!isCompact || (effectiveState !== "recording" && effectiveState !== "processing")) && (
             <div className="flex items-center gap-2 text-sm text-muted-foreground">
-              {isProcessing ? (
-                <>
-                  <Loader2 className="w-4 h-4 animate-spin" />
-                  Processing...
-                </>
-              ) : (
-                <>
-                  <div className="w-2 h-2 bg-destructive rounded-full animate-pulse" />
-                  Recording
-                </>
-              )}
+              {renderStatusContent()}
             </div>
           )}
         </div>
diff --git a/frontend/src/components/UnifiedChat.tsx b/frontend/src/components/UnifiedChat.tsx
index 41ed4874..e81cd248 100644
--- a/frontend/src/components/UnifiedChat.tsx
+++ b/frontend/src/components/UnifiedChat.tsx
@@ -39,7 +39,8 @@ import {
   Maximize2,
   Minimize2,
   Volume2,
-  Square
+  Square,
+  Download
 } from "lucide-react";
 import RecordRTC from "recordrtc";
 import { useQueryClient } from "@tanstack/react-query";
@@ -59,7 +60,7 @@ import { useOpenSecret } from "@opensecret/react";
 import { UpgradePromptDialog } from "@/components/UpgradePromptDialog";
 import { DocumentPlatformDialog } from "@/components/DocumentPlatformDialog";
 import { ContextLimitDialog } from "@/components/ContextLimitDialog";
-import { RecordingOverlay } from "@/components/RecordingOverlay";
+import { RecordingOverlay, type VoiceOverlayState } from "@/components/RecordingOverlay";
 import { WebSearchInfoDialog } from "@/components/WebSearchInfoDialog";
 import { TTSDownloadDialog } from "@/components/TTSDownloadDialog";
 import { useTTS } from "@/services/tts/TTSContext";
@@ -71,7 +72,7 @@ import {
   DropdownMenuItem,
   DropdownMenuTrigger
 } from "@/components/ui/dropdown-menu";
-import { isTauri } from "@/utils/platform";
+import { isTauri, isTauriDesktop, isIOS } from "@/utils/platform";
 import type {
   InputTextContent,
   OutputTextContent,
@@ -201,8 +202,18 @@ function TTSButton({
   onNeedsSetup: () => void;
   onManage: () => void;
 }) {
-  const { status, isPlaying, currentPlayingId, speak, stop, isTauriEnv } = useTTS();
+  const {
+    status,
+    isPlaying,
+    isGenerating,
+    currentPlayingId,
+    speak,
+    stop,
+    cancelGeneration,
+    isTauriEnv
+  } = useTTS();
   const isThisPlaying = isPlaying && currentPlayingId === messageId;
+  const isThisGenerating = isGenerating && currentPlayingId === messageId;
   const longPressTimer = useRef<ReturnType<typeof setTimeout> | null>(null);
 
   // Cleanup timer on unmount
@@ -226,7 +237,10 @@ function TTSButton({
     }
 
     if (status === "ready") {
-      if (isThisPlaying) {
+      if (isThisGenerating) {
+        // Tap during generation cancels it
+        cancelGeneration();
+      } else if (isThisPlaying) {
         stop();
       } else {
         await speak(text, messageId);
@@ -258,18 +272,22 @@ function TTSButton({
     <Button
       variant="ghost"
       size="sm"
-      className="h-7 w-7 p-0 text-muted-foreground hover:text-foreground"
+      className="h-7 w-7 p-0 text-muted-foreground hover:text-foreground relative"
       onClick={handleClick}
       onPointerDown={handlePointerDown}
       onPointerUp={handlePointerUp}
       onPointerLeave={handlePointerUp}
       disabled={isDisabled}
-      aria-label={isThisPlaying ? "Stop speaking" : "Read aloud"}
+      aria-label={
+        isThisGenerating ? "Cancel generation" : isThisPlaying ? "Stop speaking" : "Read aloud"
+      }
     >
       {showSpinner ? (
         <Loader2 className="h-3.5 w-3.5 animate-spin" />
       ) : isThisPlaying ? (
         <Square className="h-3.5 w-3.5" />
+      ) : isThisGenerating ? (
+        <Volume2 className="h-3.5 w-3.5 animate-pulse text-primary" />
       ) : (
         <Volume2 className="h-3.5 w-3.5" />
       )}
@@ -894,7 +912,16 @@ export function UnifiedChat() {
   const os = useOpenSecret();
   const isTauriEnv = isTauri();
   const queryClient = useQueryClient();
-  const { playbackError, clearPlaybackError } = useTTS();
+  const {
+    playbackError,
+    clearPlaybackError,
+    status: ttsStatus,
+    speakAndWait,
+    cancelGeneration: cancelTTSGeneration,
+    stop: stopTTS,
+    isPlaying: ttsIsPlaying,
+    isGenerating: ttsIsGenerating
+  } = useTTS();
 
   // Track chatId from URL - use state so we can update it
   const [chatId, setChatId] = useState<string | undefined>(() => {
@@ -909,6 +936,9 @@ export function UnifiedChat() {
   const [isGenerating, setIsGenerating] = useState(false);
   const [isSidebarOpen, setIsSidebarOpen] = useState(!isMobile);
   const [error, setError] = useState<string | null>(null);
+  const errorRef = useRef<string | null>(null);
+  // Keep errorRef in sync so voice continuation effect can check for errors without stale closures
+  errorRef.current = error;
   const [lastSeenItemId, setLastSeenItemId] = useState<string | undefined>();
   const [isNewConversationJustCreated, setIsNewConversationJustCreated] = useState(false);
   const [currentResponseId, setCurrentResponseId] = useState<string | undefined>();
@@ -941,6 +971,16 @@ export function UnifiedChat() {
   const [isProcessingSend, setIsProcessingSend] = useState(false);
   const [audioError, setAudioError] = useState<string | null>(null);
 
+  // Voice mode states
+  const [voiceMode, setVoiceMode] = useState(false);
+  const [voiceState, setVoiceState] = useState<VoiceOverlayState | null>(null);
+  const [recordingBlob, setRecordingBlob] = useState<Blob | null>(null);
+  const [recordingDuration, setRecordingDuration] = useState(0);
+  const [voiceRetryCount, setVoiceRetryCount] = useState(0);
+  const [voiceErrorMessage, setVoiceErrorMessage] = useState<string | null>(null);
+  const [showTTSDiscovery, setShowTTSDiscovery] = useState(false);
+  const voiceModeRef = useRef(false);
+
   // Web search toggle state - persisted in localStorage
   const [isWebSearchEnabled, setIsWebSearchEnabled] = useState(() => {
     return localStorage.getItem("webSearchEnabled") === "true";
@@ -981,6 +1021,12 @@ export function UnifiedChat() {
   const [hasNewPolledMessages, setHasNewPolledMessages] = useState(false);
   const recorderRef = useRef<RecordRTC | null>(null);
   const streamRef = useRef<MediaStream | null>(null);
+  const recordingStartTimeRef = useRef<number>(0);
+  const startRecordingRef = useRef<() => void>(() => {});
+  const handleSendMessageRef = useRef<
+    (e?: React.FormEvent, overrideInput?: string) => Promise<void>
+  >(async () => {});
+  const handleTTSDiscoveryRef = useRef<() => void>(() => {});
   const assistantStreamingRef = useRef(false);
 
   const abortControllerRef = useRef<AbortController | null>(null);
@@ -1164,6 +1210,10 @@ export function UnifiedChat() {
       clearAllAttachments();
       // Reset scroll tracking
       prevMessageCountRef.current = 0;
+      // Exit voice mode on new chat
+      if (voiceModeRef.current) {
+        exitVoiceMode();
+      }
     };
 
     // Handle conversation selection from sidebar
@@ -1175,6 +1225,10 @@ export function UnifiedChat() {
         // Update our local chatId state to trigger load
         setChatId(conversationId);
         setError(null);
+        // Exit voice mode on conversation switch
+        if (voiceModeRef.current) {
+          exitVoiceMode();
+        }
       }
     };
 
@@ -1201,6 +1255,7 @@ export function UnifiedChat() {
       );
       window.removeEventListener("popstate", handlePopState);
     };
+    // eslint-disable-next-line react-hooks/exhaustive-deps
   }, [chatId, clearAllAttachments]);
 
   // Cancel the current response
@@ -1797,6 +1852,99 @@ export function UnifiedChat() {
     setDocumentName("");
   }, []);
 
+  // --- Voice mode helpers ---
+
+  /** Play an audio cue file from /audio/ directory using Web Audio API for iOS compatibility.
+   *  Returns a Promise that resolves when the sound finishes playing (or immediately on error). */
+  const playAudioCue = useCallback((file: "mic-on" | "mic-off"): Promise<void> => {
+    return new Promise((resolve) => {
+      try {
+        // Use 'play-and-record' audio session (Safari 17+) to bypass the iOS silent
+        // switch while remaining compatible with getUserMedia / mic capture. Unlike
+        // 'playback', this type does NOT cause an InvalidStateError when getUserMedia
+        // is called afterward, so no save/restore dance is needed.
+        try {
+          // eslint-disable-next-line @typescript-eslint/no-explicit-any
+          const nav = navigator as any;
+          if (nav.audioSession && typeof nav.audioSession.type === "string") {
+            nav.audioSession.type = "play-and-record";
+          }
+        } catch {
+          // audioSession API not available — ignore
+        }
+
+        // Use Web Audio API instead of new Audio() for better iOS WebView compatibility
+        const ctx = new (window.AudioContext ||
+          (window as unknown as { webkitAudioContext: typeof AudioContext }).webkitAudioContext)();
+        // iOS AudioContext starts in "suspended" state — must resume before playing
+        const ready = ctx.state === "suspended" ? ctx.resume() : Promise.resolve();
+        ready
+          .then(() => fetch(`/audio/${file}-full.wav`))
+          .then((res) => res.arrayBuffer())
+          .then((buf) => ctx.decodeAudioData(buf))
+          .then((decoded) => {
+            const source = ctx.createBufferSource();
+            const gain = ctx.createGain();
+            gain.gain.value = 7.0;
+            source.buffer = decoded;
+            source.connect(gain);
+            gain.connect(ctx.destination);
+            source.onended = () => {
+              void ctx.close().catch(() => {});
+              resolve();
+            };
+            source.start(0);
+          })
+          .catch(() => {
+            void ctx.close().catch(() => {});
+            resolve();
+          });
+      } catch {
+        resolve();
+      }
+    });
+  }, []);
+
+  /** Check whether TTS is available on this platform (desktop or iOS in Tauri) */
+  const isTTSPlatform = useMemo(() => {
+    return isTauriDesktop() || (isTauri() && isIOS());
+  }, []);
+
+  /** Exit voice mode and clean up all in-flight work */
+  const exitVoiceMode = useCallback(() => {
+    setVoiceMode(false);
+    voiceModeRef.current = false;
+    setVoiceState(null);
+    setRecordingBlob(null);
+    setRecordingDuration(0);
+    setVoiceRetryCount(0);
+    setVoiceErrorMessage(null);
+
+    // Stop any in-progress TTS (call unconditionally — these are idempotent
+    // and guarding on booleans risks stale closures)
+    cancelTTSGeneration();
+    stopTTS();
+
+    // Stop recording if active (use ref instead of isRecording to avoid stale closure).
+    // Capture and clear refs synchronously so an async stopRecording callback
+    // cannot clobber a newly-started recording's refs.
+    if (recorderRef.current) {
+      const recorderToCleanup = recorderRef.current;
+      const streamToCleanup = streamRef.current;
+      recorderRef.current = null;
+      streamRef.current = null;
+      if (streamToCleanup) {
+        streamToCleanup.getTracks().forEach((track) => track.stop());
+      }
+      recorderToCleanup.stopRecording(() => {
+        // Resources already cleaned up synchronously above.
+      });
+      setIsRecording(false);
+      setIsTranscribing(false);
+      setIsProcessingSend(false);
+    }
+  }, [cancelTTSGeneration, stopTTS]);
+
   // Audio recording functions
   const startRecording = async () => {
     // Prevent duplicate starts
@@ -1819,6 +1967,11 @@ export function UnifiedChat() {
         return;
       }
 
+      // Play mic-on audio cue BEFORE activating microphone.
+      // On iOS, getUserMedia switches the audio session to 'play-and-record'
+      // which can mute/interrupt any in-progress Web Audio playback.
+      await playAudioCue("mic-on");
+
       const stream = await navigator.mediaDevices.getUserMedia({
         audio: {
           echoCancellation: false,
@@ -1841,8 +1994,16 @@ export function UnifiedChat() {
 
       recorderRef.current = recorder;
       recorder.startRecording();
+      recordingStartTimeRef.current = Date.now();
       setIsRecording(true);
       setAudioError(null);
+
+      // If TTS is available, activate voice mode (continuous loop)
+      if (isTTSPlatform && ttsStatus === "ready") {
+        setVoiceMode(true);
+        voiceModeRef.current = true;
+        setVoiceState("recording");
+      }
     } catch (error) {
       console.error("Failed to start recording:", error);
       const err = error as Error & { name?: string };
@@ -1863,18 +2024,128 @@ export function UnifiedChat() {
       }
 
       setTimeout(() => setAudioError(null), 5000);
+
+      // If voice mode is active, exit cleanly so the UI doesn't show
+      // a "Recording" overlay when no recording is actually happening
+      if (voiceModeRef.current) {
+        exitVoiceMode();
+      }
     }
   };
 
+  // Keep startRecordingRef in sync so callbacks with stale closures can call it
+  startRecordingRef.current = startRecording;
+
+  /** Transcribe a blob and handle error recovery (used by both normal and voice mode) */
+  const transcribeAndSend = useCallback(
+    async (blob: Blob, currentDuration: number) => {
+      const audioFile = new File([blob], "recording.wav", { type: "audio/wav" });
+
+      try {
+        const result = await os.transcribeAudio(audioFile, "whisper-large-v3");
+        const transcribedText = result.text.trim();
+
+        if (transcribedText) {
+          // Play mic-off cue once transcription yields text (fire-and-forget: not awaited, starts before the send)
+          playAudioCue("mic-off");
+
+          // In voice mode, transition to waiting state; otherwise clear overlay
+          if (voiceModeRef.current) {
+            setVoiceState("waiting");
+          } else {
+            setVoiceState(null);
+          }
+
+          // Combine with existing input if any
+          const newValue = input ? `${input} ${transcribedText}` : transcribedText;
+
+          // Clear states before sending
+          setInput("");
+          clearAllAttachments();
+          setIsRecording(false);
+          setIsTranscribing(false);
+          setIsProcessingSend(false);
+          setRecordingBlob(null);
+          setVoiceRetryCount(0);
+
+          // Send the message directly with the transcribed text (use ref for latest closure)
+          await handleSendMessageRef.current(undefined, newValue);
+
+          // Voice mode: after message sent, wait for response then TTS
+          if (voiceModeRef.current) {
+            // The voice mode continuation is handled by the effect that watches
+            // isGenerating (chat generation) transitions
+          } else {
+            // Not in voice mode: check if we should show TTS discovery prompt
+            handleTTSDiscoveryRef.current();
+          }
+
+          return true; // success
+        } else {
+          // No speech detected - treat as error with retry option
+          const errMsg = "No speech detected. Please try again.";
+          if (voiceModeRef.current) {
+            setVoiceState("error");
+            setVoiceErrorMessage(errMsg);
+            setRecordingDuration(currentDuration);
+            setRecordingBlob(blob);
+          } else {
+            // Error recovery for non-voice mode too
+            setVoiceState("error");
+            setVoiceErrorMessage(errMsg);
+            setRecordingDuration(currentDuration);
+            setRecordingBlob(blob);
+          }
+          return false;
+        }
+      } catch (error) {
+        console.error("Transcription failed:", error);
+        const newRetryCount = voiceRetryCount + 1;
+        setVoiceRetryCount(newRetryCount);
+
+        let errMsg = "Failed to transcribe audio. Please try again.";
+        if (newRetryCount >= 3) {
+          errMsg += " If this keeps failing, check your internet connection and try again later.";
+        }
+
+        // Transition to error state (works for both voice mode and single recording)
+        setVoiceState("error");
+        setVoiceErrorMessage(errMsg);
+        setRecordingDuration(currentDuration);
+        setRecordingBlob(blob);
+        return false;
+      } finally {
+        setIsTranscribing(false);
+        setIsProcessingSend(false);
+        setIsRecording(false);
+      }
+    },
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+    [os, input, playAudioCue, clearAllAttachments, voiceRetryCount]
+  );
+
   const stopRecording = (shouldSend: boolean = false) => {
     if (recorderRef.current && isRecording) {
       // Only hide immediately if canceling, keep visible if sending
       if (!shouldSend) {
+        // If in voice mode and canceling, exit voice mode entirely
+        if (voiceModeRef.current) {
+          exitVoiceMode();
+          return;
+        }
         setIsRecording(false);
       } else {
         setIsProcessingSend(true);
+        if (voiceModeRef.current) {
+          setVoiceState("processing");
+        }
       }
 
+      // Capture duration before stopping
+      const capturedDuration = Math.floor(
+        (Date.now() - (recordingStartTimeRef.current || Date.now())) / 1000
+      );
+
       recorderRef.current.stopRecording(async () => {
         const blob = recorderRef.current?.getBlob();
 
@@ -1892,58 +2163,58 @@ export function UnifiedChat() {
           recorderRef.current = null;
           setIsProcessingSend(false);
           setIsRecording(false);
-          return;
-        }
-
-        // Create a proper WAV file
-        const audioFile = new File([blob], "recording.wav", {
-          type: "audio/wav"
-        });
-
-        if (shouldSend) {
-          setIsTranscribing(true);
-          try {
-            const result = await os.transcribeAudio(audioFile, "whisper-large-v3");
-            const transcribedText = result.text.trim();
-
-            if (transcribedText) {
-              // Combine with existing input if any
-              const newValue = input ? `${input} ${transcribedText}` : transcribedText;
-
-              // Clear states before sending
-              setInput("");
-              clearAllAttachments();
-              setIsRecording(false);
-              setIsTranscribing(false);
-              setIsProcessingSend(false);
-
-              // Send the message directly with the transcribed text
-              await handleSendMessage(undefined, newValue);
-            } else {
-              setAudioError("No speech detected. Please try again.");
-              setTimeout(() => setAudioError(null), 5000);
-            }
-          } catch (error) {
-            console.error("Transcription failed:", error);
-            setAudioError("Failed to transcribe audio. Please try again.");
-            setTimeout(() => setAudioError(null), 5000);
-          } finally {
-            setIsTranscribing(false);
-            setIsProcessingSend(false);
-            setIsRecording(false);
+          if (voiceModeRef.current) {
+            setVoiceState("recording");
+            // Defer to allow React batch (setIsRecording(false)) to commit
+            // before startRecording checks the isRecording guard.
+            setTimeout(() => startRecordingRef.current(), 0);
           }
+          return;
         }
 
-        // Clean up resources
+        // Stop microphone stream BEFORE playing mic-off cue or transcribing.
+        // On iOS, the active mic session can mute/interrupt Web Audio playback,
+        // so we must release the mic first.
         if (streamRef.current) {
           streamRef.current.getTracks().forEach((track) => track.stop());
           streamRef.current = null;
         }
         recorderRef.current = null;
+
+        if (shouldSend) {
+          setIsTranscribing(true);
+          await transcribeAndSend(blob, capturedDuration);
+        }
       });
     }
   };
 
+  /** Retry transcription with the saved blob */
+  const handleVoiceRetry = useCallback(() => {
+    if (!recordingBlob) return;
+    setVoiceState("processing");
+    setVoiceErrorMessage(null);
+    setIsTranscribing(true);
+    setIsProcessingSend(true);
+    transcribeAndSend(recordingBlob, recordingDuration);
+  }, [recordingBlob, recordingDuration, transcribeAndSend]);
+
+  /** Discard the saved blob; in voice mode restart recording, otherwise dismiss the overlay */
+  const handleVoiceDiscard = useCallback(() => {
+    setRecordingBlob(null);
+    setVoiceRetryCount(0);
+    setVoiceErrorMessage(null);
+    setRecordingDuration(0);
+    if (voiceModeRef.current) {
+      // In voice mode, go back to recording
+      setVoiceState("recording");
+      startRecordingRef.current();
+    } else {
+      // Outside voice mode, just dismiss the overlay
+      setVoiceState(null);
+    }
+  }, []);
+
   // Helper function to process streaming response - used by both initial request and retry
   const processStreamingResponse = useCallback(async (stream: AsyncIterable<unknown>) => {
     let serverAssistantId: string | undefined;
@@ -2656,6 +2927,224 @@ export function UnifiedChat() {
     ]
   );
 
+  // Re-point the ref at the current handleSendMessage so transcribeAndSend always uses the latest closure
+  handleSendMessageRef.current = handleSendMessage;
+
+  // Voice mode continuation effect: when chat generation finishes (isGenerating true → false) while
+  // voice mode is in the "waiting" state, speak the latest reply via TTS, then loop back to recording.
+  const prevIsGeneratingRef = useRef(false);
+  useEffect(() => {
+    const wasGenerating = prevIsGeneratingRef.current;
+    prevIsGeneratingRef.current = isGenerating;
+
+    // Act only on the true → false edge; the ref holds isGenerating from this effect's previous run
+    if (wasGenerating && !isGenerating && voiceModeRef.current && voiceState === "waiting") {
+      // If an error was recorded during generation, exit voice mode instead of speaking a stale message
+      if (errorRef.current) {
+        exitVoiceMode();
+        return;
+      }
+
+      // Find the most recent assistant message (searching a reversed copy of messages)
+      const lastAssistantMsg = [...messages]
+        .reverse()
+        .find(
+          (m): m is ExtendedMessage => "role" in m && m.role === "assistant" && m.type === "message"
+        );
+
+      if (lastAssistantMsg) {
+        const msgContent = lastAssistantMsg.content;
+        const textParts = msgContent?.filter(
+          (p: ConversationContent) => p.type === "output_text" || p.type === "text"
+        );
+        const fullText = textParts
+          ?.map((p: ConversationContent) => (p as OutputTextContent | TextContent).text)
+          .join("\n");
+
+        if (fullText && voiceModeRef.current) {
+          // Switch the overlay to the "generating" state
+          setVoiceState("generating");
+
+          // Generate TTS with speakAndWait and wait for playback to complete before continuing
+          speakAndWait(fullText, lastAssistantMsg.id)
+            .then(() => {
+              if (!voiceModeRef.current) return; // exited during playback
+
+              // Playback finished → 500 ms pause → start recording again (unless voice mode ended)
+              setTimeout(() => {
+                if (!voiceModeRef.current) return;
+                setVoiceState("recording");
+                startRecordingRef.current();
+              }, 500);
+            })
+            .catch((err: unknown) => {
+              if (!voiceModeRef.current) return;
+              // "no_speakable_text" means nothing to play: restart recording. Anything else exits.
+              if (err instanceof Error && err.message === "no_speakable_text") {
+                setVoiceState("recording");
+                startRecordingRef.current();
+              } else {
+                exitVoiceMode();
+              }
+            });
+        } else {
+          // No text to speak, just go back to recording
+          setVoiceState("recording");
+          startRecordingRef.current();
+        }
+      } else {
+        // No assistant message found, go back to recording
+        setVoiceState("recording");
+        startRecordingRef.current();
+      }
+    }
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [isGenerating]);
+
+  // Mirror TTS progress into the voice overlay: move to "playing" once ttsIsPlaying is set
+  useEffect(() => {
+    if (!voiceModeRef.current) return;
+    if (ttsIsGenerating && voiceState === "generating") {
+      // TTS is still generating and the overlay already says so — nothing to change
+    } else if (ttsIsPlaying && (voiceState === "generating" || voiceState === "playing")) {
+      setVoiceState("playing");
+    }
+  }, [ttsIsGenerating, ttsIsPlaying, voiceState]);
+
+  // TTS became ready while voice mode is active: speak the last assistant message.
+  // This handles the case where the user downloads a TTS model mid-voice-loop —
+  // the continuation effect already ran and moved back to recording without TTS,
+  // so we retroactively speak the latest response (skipped while chat or TTS is busy).
+  const prevTtsStatusRef = useRef(ttsStatus);
+  useEffect(() => {
+    const prev = prevTtsStatusRef.current;
+    prevTtsStatusRef.current = ttsStatus;
+
+    if (
+      prev !== "ready" &&
+      ttsStatus === "ready" &&
+      voiceModeRef.current &&
+      !isGenerating &&
+      !ttsIsPlaying &&
+      !ttsIsGenerating
+    ) {
+      // Find the last assistant message and join its text parts with newlines
+      const lastAssistantMsg = [...messages]
+        .reverse()
+        .find(
+          (m): m is ExtendedMessage => "role" in m && m.role === "assistant" && m.type === "message"
+        );
+
+      if (lastAssistantMsg) {
+        const msgContent = lastAssistantMsg.content;
+        const textParts = msgContent?.filter(
+          (p: ConversationContent) => p.type === "output_text" || p.type === "text"
+        );
+        const fullText = textParts
+          ?.map((p: ConversationContent) => (p as OutputTextContent | TextContent).text)
+          .join("\n");
+
+        if (fullText && voiceModeRef.current) {
+          // The voice loop may have already cycled back to recording —
+          // stop the active mic so we can play TTS without overlap.
+          if (recorderRef.current) {
+            const recorderToCleanup = recorderRef.current;
+            const streamToCleanup = streamRef.current;
+            recorderRef.current = null;
+            streamRef.current = null;
+            if (streamToCleanup) {
+              streamToCleanup.getTracks().forEach((track) => track.stop());
+            }
+            recorderToCleanup.stopRecording(() => {
+              // Intentionally empty: the recording is discarded; refs and tracks were cleaned up above.
+            });
+            setIsRecording(false);
+          }
+
+          setVoiceState("generating");
+          speakAndWait(fullText, lastAssistantMsg.id)
+            .then(() => {
+              if (!voiceModeRef.current) return;
+              setTimeout(() => {
+                if (!voiceModeRef.current) return;
+                setVoiceState("recording");
+                startRecordingRef.current();
+              }, 500);
+            })
+            .catch((err: unknown) => {
+              if (!voiceModeRef.current) return;
+              if (err instanceof Error && err.message === "no_speakable_text") {
+                setVoiceState("recording");
+                startRecordingRef.current();
+              } else {
+                exitVoiceMode();
+              }
+            });
+        }
+      }
+    }
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [ttsStatus]);
+
+  // Show the TTS discovery prompt unless: unsupported platform, TTS already ready, or seen before
+  const handleTTSDiscovery = useCallback(() => {
+    if (!isTTSPlatform) return;
+    if (ttsStatus === "ready") return; // Already installed
+    const hasSeen = localStorage.getItem("hasSeenTTSDiscoveryPrompt") === "true";
+    if (hasSeen) return;
+    setShowTTSDiscovery(true);
+  }, [isTTSPlatform, ttsStatus]);
+
+  // Re-point the ref at the current handleTTSDiscovery so transcribeAndSend always uses the latest closure
+  handleTTSDiscoveryRef.current = handleTTSDiscovery;
+
+  const dismissTTSDiscovery = useCallback(() => {
+    localStorage.setItem("hasSeenTTSDiscoveryPrompt", "true"); // remembered: never prompt again
+    setShowTTSDiscovery(false); // hide the prompt now
+  }, []);
+
+  // Handle app backgrounding / foregrounding.
+  // iOS kills mic streams when the app is backgrounded, but TTS (described as native Tauri audio —
+  // NOTE(review): confirm the playback path) continues playing via the system media controller.
+  //   - Going to background: stop the mic recording if active; do NOT stop TTS playback.
+  //   - Returning to foreground: reset any stuck recording flags so the user can start fresh.
+  // The listener is attached once (empty deps) and removed by the effect cleanup.
+  useEffect(() => {
+    const handleVisibilityChange = () => {
+      if (document.visibilityState === "hidden") {
+        // App went to background — if actively recording, stop the mic cleanly.
+        // iOS will kill the stream anyway; this prevents corrupted state.
+        // Do NOT call exitVoiceMode() — that would also stop TTS playback
+        // which should continue in the background via the system media controller.
+        if (recorderRef.current) {
+          const recorderToCleanup = recorderRef.current;
+          const streamToCleanup = streamRef.current;
+          recorderRef.current = null;
+          streamRef.current = null;
+          if (streamToCleanup) {
+            streamToCleanup.getTracks().forEach((track) => track.stop());
+          }
+          recorderToCleanup.stopRecording(() => {
+            // Intentionally empty: the recording is discarded; refs and tracks were cleaned up above.
+          });
+          setIsRecording(false);
+        }
+      } else if (document.visibilityState === "visible") {
+        // App came back to foreground — with no recorder and no stream left, clear the
+        // recording/transcribing/sending flags in case they got stuck while backgrounded.
+        // NOTE(review): this also runs while a transcription may still be in flight — confirm.
+        if (!recorderRef.current && !streamRef.current) {
+          setIsRecording(false);
+          setIsTranscribing(false);
+          setIsProcessingSend(false);
+        }
+      }
+    };
+
+    document.addEventListener("visibilitychange", handleVisibilityChange);
+    return () => document.removeEventListener("visibilitychange", handleVisibilityChange);
+  }, []);
+
   const handleKeyDown = (e: React.KeyboardEvent<HTMLTextAreaElement>) => {
     // On desktop: Enter submits, Shift+Enter for new line
     // On mobile: Enter for new line, no keyboard shortcut to submit (use button)
@@ -3026,11 +3515,13 @@ export function UnifiedChat() {
                           {/* Mic button */}
                           <Button
                             type="button"
-                            onClick={startRecording}
-                            disabled={isGenerating || isRecording || !canUseVoice}
+                            onClick={voiceMode ? () => exitVoiceMode() : startRecording}
+                            disabled={isGenerating || (isRecording && !voiceMode) || !canUseVoice}
                             size="icon"
                             variant="ghost"
-                            className="h-9 w-9 rounded-lg hover:bg-muted"
+                            className={`h-9 w-9 rounded-lg hover:bg-muted ${
+                              voiceMode ? "text-primary" : ""
+                            }`}
                           >
                             <Mic className="h-4 w-4" />
                           </Button>
@@ -3059,14 +3550,31 @@ export function UnifiedChat() {
                       </div>
 
                       {/* Recording overlay for centered input */}
-                      {isRecording && (
+                      {(isRecording || voiceState) && (
                         <RecordingOverlay
                           isRecording={isRecording}
                           isProcessing={isProcessingSend || isTranscribing}
                           onSend={() => stopRecording(true)}
-                          onCancel={() => stopRecording(false)}
+                          onCancel={() => {
+                            if (voiceState === "error" && !voiceModeRef.current) {
+                              // Dismiss error overlay in non-voice mode
+                              setVoiceState(null);
+                              setRecordingBlob(null);
+                              setVoiceErrorMessage(null);
+                              setVoiceRetryCount(0);
+                              setRecordingDuration(0);
+                            } else {
+                              stopRecording(false);
+                              if (voiceModeRef.current) exitVoiceMode();
+                            }
+                          }}
                           isCompact={false}
                           className="absolute inset-0 rounded-xl"
+                          voiceState={voiceState || undefined}
+                          errorMessage={voiceErrorMessage || undefined}
+                          savedDuration={recordingDuration ?? undefined}
+                          onRetry={handleVoiceRetry}
+                          onDiscard={handleVoiceDiscard}
                         />
                       )}
                     </div>
@@ -3269,11 +3777,13 @@ export function UnifiedChat() {
                         {/* Mic button */}
                         <Button
                           type="button"
-                          onClick={startRecording}
-                          disabled={isGenerating || isRecording || !canUseVoice}
+                          onClick={voiceMode ? () => exitVoiceMode() : startRecording}
+                          disabled={isGenerating || (isRecording && !voiceMode) || !canUseVoice}
                           size="icon"
                           variant="ghost"
-                          className="h-8 w-8 rounded-lg hover:bg-muted"
+                          className={`h-8 w-8 rounded-lg hover:bg-muted ${
+                            voiceMode ? "text-primary" : ""
+                          }`}
                         >
                           <Mic className="h-4 w-4" />
                         </Button>
@@ -3302,14 +3812,30 @@ export function UnifiedChat() {
                     </div>
 
                     {/* Recording overlay for bottom input */}
-                    {isRecording && (
+                    {(isRecording || voiceState) && (
                       <RecordingOverlay
                         isRecording={isRecording}
                         isProcessing={isProcessingSend || isTranscribing}
                         onSend={() => stopRecording(true)}
-                        onCancel={() => stopRecording(false)}
+                        onCancel={() => {
+                          if (voiceState === "error" && !voiceModeRef.current) {
+                            setVoiceState(null);
+                            setRecordingBlob(null);
+                            setVoiceErrorMessage(null);
+                            setVoiceRetryCount(0);
+                            setRecordingDuration(0);
+                          } else {
+                            stopRecording(false);
+                            if (voiceModeRef.current) exitVoiceMode();
+                          }
+                        }}
                         isCompact={true}
                         className="absolute inset-0 rounded-xl"
+                        voiceState={voiceState || undefined}
+                        errorMessage={voiceErrorMessage || undefined}
+                        savedDuration={recordingDuration ?? undefined}
+                        onRetry={handleVoiceRetry}
+                        onDiscard={handleVoiceDiscard}
                       />
                     )}
                   </div>
@@ -3372,6 +3898,42 @@ export function UnifiedChat() {
         {/* TTS setup dialog */}
         <TTSDownloadDialog open={ttsSetupDialogOpen} onOpenChange={setTtsSetupDialogOpen} />
 
+        {/* TTS discovery prompt - shown once after first voice message on supported platforms */}
+        {showTTSDiscovery && (
+          <div className="fixed bottom-24 left-1/2 -translate-x-1/2 z-50 w-full max-w-md px-4">
+            <div className="bg-background border rounded-xl shadow-lg p-4 flex items-center gap-3">
+              <div className="flex-1">
+                <p className="text-sm font-medium">Enable voice responses?</p>
+                <p className="text-xs text-muted-foreground mt-0.5">
+                  Download a text-to-speech model (~264 MB) to hear Maple read responses aloud.
+                </p>
+              </div>
+              <div className="flex items-center gap-2 flex-shrink-0">
+                <Button
+                  variant="default"
+                  size="sm"
+                  className="gap-1.5"
+                  onClick={() => {
+                    dismissTTSDiscovery();
+                    setTtsSetupDialogOpen(true);
+                  }}
+                >
+                  <Download className="h-3.5 w-3.5" />
+                  Download
+                </Button>
+                <Button
+                  variant="ghost"
+                  size="icon"
+                  className="h-8 w-8"
+                  onClick={dismissTTSDiscovery}
+                >
+                  <X className="h-4 w-4" />
+                </Button>
+              </div>
+            </div>
+          </div>
+        )}
+
         {/* Hidden file inputs - must be outside conditional rendering to work in both views */}
         <input
           type="file"
diff --git a/frontend/src/services/tts/TTSContext.tsx b/frontend/src/services/tts/TTSContext.tsx
index 2a5089d5..19b65a13 100644
--- a/frontend/src/services/tts/TTSContext.tsx
+++ b/frontend/src/services/tts/TTSContext.tsx
@@ -48,6 +48,7 @@ interface TTSContextValue {
   downloadDetail: string;
   totalSizeMB: number;
   isPlaying: boolean;
+  isGenerating: boolean;
   currentPlayingId: string | null;
   isTauriEnv: boolean;
 
@@ -55,7 +56,11 @@ interface TTSContextValue {
   startDownload: () => Promise<void>;
   deleteModels: () => Promise<void>;
   speak: (text: string, messageId: string) => Promise<void>;
+  /** Like speak(), but returns a promise that resolves when playback ends (for voice mode loop). */
+  speakAndWait: (text: string, messageId: string) => Promise<void>;
   stop: () => void;
+  /** Cancel an in-progress TTS generation (returns to idle without playing). */
+  cancelGeneration: () => void;
   clearPlaybackError: () => void;
 }
 
@@ -72,6 +77,7 @@ export function TTSProvider({ children }: { children: ReactNode }) {
   const [downloadDetail, setDownloadDetail] = useState("");
   const [totalSizeMB, setTotalSizeMB] = useState(264);
   const [isPlaying, setIsPlaying] = useState(false);
+  const [isGenerating, setIsGenerating] = useState(false);
   const [currentPlayingId, setCurrentPlayingId] = useState<string | null>(null);
   const [playbackError, setPlaybackError] = useState<string | null>(null);
 
@@ -85,6 +91,9 @@ export function TTSProvider({ children }: { children: ReactNode }) {
   } | null>(null);
   const unlistenRef = useRef<(() => void) | null>(null);
 
+  // Generation sequence ID to detect stale results
+  const generationSeqRef = useRef(0);
+
   const cleanupDownloadListener = useCallback(() => {
     if (unlistenRef.current) {
       unlistenRef.current();
@@ -171,7 +180,7 @@ export function TTSProvider({ children }: { children: ReactNode }) {
     }
   }, [isTauriEnv, cleanupDownloadListener]);
 
-  const stop = useCallback(() => {
+  const stopPlayback = useCallback(() => {
     if (audioUrlRef.current) {
       URL.revokeObjectURL(audioUrlRef.current);
       audioUrlRef.current = null;
@@ -219,6 +228,22 @@ export function TTSProvider({ children }: { children: ReactNode }) {
     setCurrentPlayingId(null);
   }, []);
 
+  const stop = useCallback(() => {
+    // Bump the sequence so any in-flight generation sees itself as stale, then stop audio
+    generationSeqRef.current++;
+    setIsGenerating(false);
+    stopPlayback();
+  }, [stopPlayback]);
+
+  const cancelGeneration = useCallback(() => {
+    // Bump the sequence so the in-flight tts_synthesize result is discarded as stale
+    generationSeqRef.current++;
+    setIsGenerating(false);
+    // Also stop playback (same steps as stop()) to close the AudioContext — this prevents audio
+    // from playing if cancelGeneration is called during the async decoding window.
+    stopPlayback();
+  }, [stopPlayback]);
+
   const deleteModels = useCallback(async () => {
     if (!isTauriEnv) return;
 
@@ -238,27 +263,44 @@ export function TTSProvider({ children }: { children: ReactNode }) {
     }
   }, [isTauriEnv, stop]);
 
-  const speak = useCallback(
-    async (text: string, messageId: string) => {
+  /**
+   * Internal speak implementation. Returns a promise that resolves when playback ends
+   * if `waitForEnd` is true, otherwise resolves immediately after starting playback.
+   */
+  const speakInternal = useCallback(
+    async (text: string, messageId: string, waitForEnd: boolean): Promise<void> => {
       if (!isTauriEnv || status !== "ready") return;
 
-      // Stop any currently playing audio
+      // Stop any currently playing audio and invalidate previous generation
       stop();
 
       // Preprocess text to remove think blocks and other non-speakable content
       const processedText = preprocessTextForTTS(text);
       if (!processedText) {
-        return;
+        // Signal to callers that there was nothing to play (not a real error).
+        // Voice loop catches this to restart recording instead of exiting.
+        throw new Error("no_speakable_text");
       }
 
+      // Capture a sequence ID for this generation so we can detect staleness
+      const mySeq = ++generationSeqRef.current;
+
       try {
-        setIsPlaying(true);
+        setIsGenerating(true);
         setCurrentPlayingId(messageId);
 
         const result = await invoke<TTSSynthesizeResponse>("tts_synthesize", {
           text: processedText
         });
 
+        // Check if this generation is still current
+        if (generationSeqRef.current !== mySeq) {
+          // Stale - a newer speak() call or cancelGeneration() happened
+          return;
+        }
+
+        setIsGenerating(false);
+
         // Create audio from base64
         const audioBlob = base64ToBlob(result.audio_base64, "audio/wav");
         const audioUrl = URL.createObjectURL(audioBlob);
@@ -304,7 +346,7 @@ export function TTSProvider({ children }: { children: ReactNode }) {
         }
 
         // iOS: try to force media playback routing (speaker) for Web Audio.
-        // This helps avoid “only works with headphones / earpiece” routing issues.
+        // This helps avoid "only works with headphones / earpiece" routing issues.
         try {
           // eslint-disable-next-line @typescript-eslint/no-explicit-any
           const nav = navigator as any;
@@ -318,11 +360,25 @@ export function TTSProvider({ children }: { children: ReactNode }) {
 
         const audioContext = new AudioContextClass() as AudioContext;
 
+        // Store context ref immediately so stopPlayback() can clean it up
+        // if any operation below throws (prevents AudioContext resource leak)
+        audioContextRef.current = audioContext;
+
         // iOS requires user interaction to start audio - resume if suspended
         if (audioContext.state === "suspended") {
           await audioContext.resume();
         }
 
+        // Re-check staleness after async work
+        if (generationSeqRef.current !== mySeq) {
+          void audioContext.close().catch(() => {});
+          if (audioContextRef.current === audioContext) {
+            audioContextRef.current = null;
+          }
+          URL.revokeObjectURL(audioUrl);
+          return;
+        }
+
         const arrayBuffer = await audioBlob.arrayBuffer();
         const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
 
@@ -330,62 +386,97 @@ export function TTSProvider({ children }: { children: ReactNode }) {
         source.buffer = audioBuffer;
         source.connect(audioContext.destination);
 
-        // Store context and source for stop functionality
-        audioContextRef.current = audioContext;
         sourceNodeRef.current = source;
 
-        source.onended = () => {
-          if (sourceNodeRef.current !== source) {
-            return;
-          }
-          setIsPlaying(false);
-          setCurrentPlayingId(null);
+        setIsPlaying(true);
 
-          if (audioUrlRef.current === audioUrl) {
-            URL.revokeObjectURL(audioUrlRef.current);
-            audioUrlRef.current = null;
-          }
-          void audioContext.close().catch(() => {
-            // Ignore
-          });
-          audioContextRef.current = null;
-          sourceNodeRef.current = null;
-
-          if (audioSessionPrevTypeRef.current) {
-            try {
-              // eslint-disable-next-line @typescript-eslint/no-explicit-any
-              const nav = navigator as any;
-              if (nav.audioSession && typeof nav.audioSession.type === "string") {
-                nav.audioSession.type = audioSessionPrevTypeRef.current;
-              }
-            } catch {
+        // Wrap playback in a promise that resolves when it ends (for waitForEnd mode)
+        const playbackPromise = new Promise<void>((resolve) => {
+          source.onended = () => {
+            if (sourceNodeRef.current !== source) {
+              resolve();
+              return;
+            }
+            setIsPlaying(false);
+            setCurrentPlayingId(null);
+
+            if (audioUrlRef.current === audioUrl) {
+              URL.revokeObjectURL(audioUrlRef.current);
+              audioUrlRef.current = null;
+            }
+            void audioContext.close().catch(() => {
               // Ignore
+            });
+            audioContextRef.current = null;
+            sourceNodeRef.current = null;
+
+            if (audioSessionPrevTypeRef.current) {
+              try {
+                // eslint-disable-next-line @typescript-eslint/no-explicit-any
+                const nav = navigator as any;
+                if (nav.audioSession && typeof nav.audioSession.type === "string") {
+                  nav.audioSession.type = audioSessionPrevTypeRef.current;
+                }
+              } catch {
+                // Ignore
+              }
+              audioSessionPrevTypeRef.current = null;
             }
-            audioSessionPrevTypeRef.current = null;
-          }
 
-          if (mediaSessionPrevStateRef.current) {
-            try {
-              if ("mediaSession" in navigator) {
-                navigator.mediaSession.metadata = mediaSessionPrevStateRef.current.metadata;
-                navigator.mediaSession.playbackState =
-                  mediaSessionPrevStateRef.current.playbackState;
+            if (mediaSessionPrevStateRef.current) {
+              try {
+                if ("mediaSession" in navigator) {
+                  navigator.mediaSession.metadata = mediaSessionPrevStateRef.current.metadata;
+                  navigator.mediaSession.playbackState =
+                    mediaSessionPrevStateRef.current.playbackState;
+                }
+              } catch {
+                // Ignore
               }
-            } catch {
-              // Ignore
+              mediaSessionPrevStateRef.current = null;
             }
-            mediaSessionPrevStateRef.current = null;
-          }
-        };
+
+            resolve();
+          };
+        });
 
         source.start(0);
+
+        if (waitForEnd) {
+          await playbackPromise;
+        }
       } catch (err) {
+        // Check staleness before setting error state
+        if (generationSeqRef.current !== mySeq) return;
+
         console.error("TTS playback failed:", err);
+        setIsGenerating(false);
         setPlaybackError(err instanceof Error ? err.message : "TTS playback failed");
-        stop();
+        stopPlayback();
+        // Re-throw so speakAndWait callers (e.g. voice mode loop) can catch and exit
+        throw err;
       }
     },
-    [isTauriEnv, status, stop]
+    [isTauriEnv, status, stop, stopPlayback]
+  );
+
+  const speak = useCallback(
+    async (text: string, messageId: string) => {
+      try {
+        await speakInternal(text, messageId, false);
+      } catch {
+        // Swallowed on purpose: playback failures already set playbackError inside speakInternal,
+        // and "no_speakable_text" just means nothing to play. Only speakAndWait propagates errors.
+      }
+    },
+    [speakInternal]
+  );
+
+  const speakAndWait = useCallback(
+    async (text: string, messageId: string) => {
+      await speakInternal(text, messageId, true); // waitForEnd=true; errors propagate to the caller
+    },
+    [speakInternal]
+  );
 
   const clearPlaybackError = useCallback(() => {
@@ -452,13 +543,16 @@ export function TTSProvider({ children }: { children: ReactNode }) {
         downloadDetail,
         totalSizeMB,
         isPlaying,
+        isGenerating,
         currentPlayingId,
         isTauriEnv,
         checkStatus,
         startDownload,
         deleteModels,
         speak,
+        speakAndWait,
         stop,
+        cancelGeneration,
         clearPlaybackError
       }}
     >