From b96e00ec60863f5d86d33d130e04822b9aedc63e Mon Sep 17 00:00:00 2001
From: BuffMcBigHuge <marco@bymar.co>
Date: Tue, 17 Feb 2026 17:39:31 -0500
Subject: [PATCH 1/9] Audio with NDI, audio buffer in frame loop, added audio
 track and media clock in webrtc.

Signed-off-by: BuffMcBigHuge <marco@bymar.co>
---
 src/scope/core/ndi/lib.py              |   6 +
 src/scope/core/outputs/ndi.py          |  58 ++++++++-
 src/scope/server/frame_processor.py    | 156 +++++++++++++++++++++++++
 src/scope/server/media_clock.py        |  61 ++++++++++
 src/scope/server/pipeline_processor.py |  35 ++++++
 src/scope/server/tracks.py             |  78 ++++++++++++-
 src/scope/server/webrtc.py             |  27 ++++-
 7 files changed, 415 insertions(+), 6 deletions(-)
 create mode 100644 src/scope/server/media_clock.py

diff --git a/src/scope/core/ndi/lib.py b/src/scope/core/ndi/lib.py
index f30c4737e..a0df91a38 100644
--- a/src/scope/core/ndi/lib.py
+++ b/src/scope/core/ndi/lib.py
@@ -275,3 +275,9 @@ def setup_send_functions(lib: ctypes.CDLL) -> None:
         ctypes.c_void_p,
         ctypes.POINTER(NDIlib_video_frame_v2_t),
     ]
+
+    lib.NDIlib_send_send_audio_v2.restype = None
+    lib.NDIlib_send_send_audio_v2.argtypes = [
+        ctypes.c_void_p,
+        ctypes.POINTER(NDIlib_audio_frame_v2_t),
+    ]
diff --git a/src/scope/core/outputs/ndi.py b/src/scope/core/outputs/ndi.py
index 8afd0c700..22c3d3c5b 100644
--- a/src/scope/core/outputs/ndi.py
+++ b/src/scope/core/outputs/ndi.py
@@ -1,6 +1,6 @@
 """NDI output sink implementation.
 
-Sends processed video frames over the network via NDI.
+Sends processed video frames and audio over the network via NDI.
 Uses the shared NDI ctypes bindings from scope.core.ndi.
 """
 
@@ -13,6 +13,7 @@
 
 from scope.core.ndi import (
     NDI_FOURCC_RGBA,
+    NDIlib_audio_frame_v2_t,
     NDIlib_send_create_t,
     NDIlib_video_frame_v2_t,
     load_library,
@@ -162,6 +163,61 @@ def send_frame(self, frame: np.ndarray | torch.Tensor) -> bool:
             logger.error(f"Error sending NDI frame: {e}")
             return False
 
+    def send_audio(
+        self,
+        audio: np.ndarray | torch.Tensor,
+        sample_rate: int,
+        num_channels: int,
+    ) -> bool:
+        """Send one block of float32 audio over NDI.
+
+        Args:
+            audio: Samples in [-1.0, 1.0]. Shape (S,) for mono, or planar (C, S)
+                   with one contiguous row per channel for multi-channel.
+            sample_rate: Audio sample rate in Hz (e.g. 48000).
+            num_channels: Channel count; must match the layout of ``audio``.
+
+        Returns:
+            True if the frame was handed to NDI, False if not initialized or on error.
+        """
+        if self._send_instance is None or self._lib is None:
+            return False
+
+        try:
+            if isinstance(audio, torch.Tensor):
+                # detach() first: numpy() raises on tensors that require grad
+                audio = audio.detach().cpu().numpy()
+
+            audio = np.ascontiguousarray(audio, dtype=np.float32)
+
+            # NDIlib_audio_frame_v2_t is planar float32 (one row per channel), so a
+            # channel count that disagrees with the buffer would read out of bounds.
+            channels_in_buffer = audio.shape[0] if audio.ndim == 2 else 1
+            if audio.ndim not in (1, 2) or channels_in_buffer != num_channels:
+                raise ValueError(
+                    f"audio shape {audio.shape} does not match {num_channels} channel(s)"
+                )
+            num_samples = audio.shape[-1]
+
+            audio_frame = NDIlib_audio_frame_v2_t()
+            audio_frame.sample_rate = sample_rate
+            audio_frame.no_channels = num_channels
+            audio_frame.no_samples = num_samples
+            audio_frame.timecode = -1  # auto
+            audio_frame.p_data = audio.ctypes.data
+            audio_frame.channel_stride_in_bytes = num_samples * 4  # float32 = 4 bytes
+            audio_frame.p_metadata = None
+            audio_frame.timestamp = -1  # auto
+
+            self._lib.NDIlib_send_send_audio_v2(
+                self._send_instance, ctypes.byref(audio_frame)
+            )
+            return True
+
+        except Exception as e:
+            logger.error(f"Error sending NDI audio: {e}")
+            return False
+
     def resize(self, width: int, height: int):
         """Update output dimensions (NDI rebuilds frame struct per-send)."""
         self._width = width
diff --git a/src/scope/server/frame_processor.py b/src/scope/server/frame_processor.py
index ef411c3e8..63812a541 100644
--- a/src/scope/server/frame_processor.py
+++ b/src/scope/server/frame_processor.py
@@ -3,8 +3,10 @@
 import threading
 import time
 import uuid
+from collections import deque
 from typing import TYPE_CHECKING, Any
 
+import numpy as np
 import torch
 from aiortc.mediastreams import VideoFrame
 
@@ -18,6 +20,13 @@
 
     from .cloud_connection import CloudConnectionManager
 
+# Audio constants
+WEBRTC_AUDIO_SAMPLE_RATE = 48000  # WebRTC standard output sample rate
+AUDIO_FRAME_DURATION_MS = 20  # Standard WebRTC audio frame duration
+AUDIO_SAMPLES_PER_FRAME = int(
+    WEBRTC_AUDIO_SAMPLE_RATE * AUDIO_FRAME_DURATION_MS / 1000
+)  # 960 samples
+
 logger = logging.getLogger(__name__)
 
 
@@ -109,6 +118,15 @@ def __init__(
         self._playback_ready_emitted = False
         self._stream_start_time: float | None = None
 
+        # Audio buffer: accumulates resampled audio samples ready for WebRTC output.
+        # Stores mono float32 chunks at WEBRTC_AUDIO_SAMPLE_RATE (48kHz).
+        # AudioProcessingTrack calls get_audio() to drain 20ms chunks.
+        self._audio_buffer = deque()  # deque of np.ndarray chunks (mono, float32)
+        self._audio_buffer_lock = threading.Lock()
+        self._audio_buffer_samples = 0  # total samples buffered
+        self._audio_drain_thread: threading.Thread | None = None
+        self._audio_chunks_out = 0
+
         # Store pipeline_ids from initial_parameters if provided
         pipeline_ids = (initial_parameters or {}).get("pipeline_ids")
         if pipeline_ids is not None:
@@ -206,6 +224,12 @@ def start(self):
             )
             return
 
+        # Start audio drain thread to move audio from pipeline processor queue to buffer
+        self._audio_drain_thread = threading.Thread(
+            target=self._audio_drain_loop, daemon=True
+        )
+        self._audio_drain_thread.start()
+
         logger.info(
             f"[FRAME-PROCESSOR] Started with {len(self.pipeline_ids)} pipeline(s): {self.pipeline_ids}"
         )
@@ -234,6 +258,15 @@ def stop(self, error_message: str = None):
         # Clear pipeline processors
         self.pipeline_processors.clear()
 
+        # Wait for audio drain thread to finish
+        if self._audio_drain_thread and self._audio_drain_thread.is_alive():
+            self._audio_drain_thread.join(timeout=2.0)
+
+        # Clear audio buffer
+        with self._audio_buffer_lock:
+            self._audio_buffer.clear()
+            self._audio_buffer_samples = 0
+
         # Clean up all output sinks
         for sink_type, entry in list(self.output_sinks.items()):
             q = entry["queue"]
@@ -474,6 +507,129 @@ def get(self) -> torch.Tensor | None:
 
         return frame
 
+    def _audio_drain_loop(self):
+        """Background thread that drains audio from the last pipeline processor's
+        audio_output_queue, mixes to mono, resamples to 48kHz, buffers and fans out.
+        """
+        logger.info("[FRAME-PROCESSOR] Audio drain thread started")
+
+        while self.running:
+            if not self.pipeline_processors:
+                time.sleep(0.01)
+                continue
+
+            last_processor = self.pipeline_processors[-1]
+            try:
+                audio_tensor, sample_rate = last_processor.audio_output_queue.get(
+                    timeout=0.1
+                )
+            except queue.Empty:
+                continue
+
+            try:
+                # Convert torch tensor to numpy float32
+                if isinstance(audio_tensor, torch.Tensor):
+                    audio_np = audio_tensor.float().numpy()
+                else:
+                    audio_np = np.asarray(audio_tensor, dtype=np.float32)
+
+                # Ensure shape is [C, S] (channels, samples)
+                if audio_np.ndim == 1:
+                    audio_np = audio_np[np.newaxis, :]  # mono -> [1, S]
+
+                # Mix down to mono for WebRTC (average channels)
+                if audio_np.shape[0] > 1:
+                    audio_mono = audio_np.mean(axis=0)
+                else:
+                    audio_mono = audio_np[0]
+
+                # Resample to 48kHz if necessary
+                if sample_rate != WEBRTC_AUDIO_SAMPLE_RATE:
+                    audio_mono = self._resample_audio(
+                        audio_mono, sample_rate, WEBRTC_AUDIO_SAMPLE_RATE
+                    )
+
+                # Append to buffer (NOTE(review): no size cap; grows if get_audio() is not called)
+                with self._audio_buffer_lock:
+                    self._audio_buffer.append(audio_mono)
+                    self._audio_buffer_samples += len(audio_mono)
+
+                # Also fan out to output sinks that support audio (they get the 48kHz mono mix)
+                if self.output_sinks:
+                    for _sink_type, entry in self.output_sinks.items():
+                        sink = entry["sink"]
+                        if hasattr(sink, "send_audio"):
+                            try:
+                                sink.send_audio(audio_mono, WEBRTC_AUDIO_SAMPLE_RATE, 1)
+                            except Exception as e:
+                                logger.debug(
+                                    f"Error sending audio to sink '{_sink_type}': {e}"
+                                )
+
+            except Exception as e:
+                logger.error(f"[FRAME-PROCESSOR] Error processing audio chunk: {e}")
+
+        logger.info(
+            f"[FRAME-PROCESSOR] Audio drain thread stopped ({self._audio_chunks_out} chunks served)"
+        )
+
+    @staticmethod
+    def _resample_audio(audio: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray:
+        """Resample 1-D audio from src_rate to dst_rate by linear interpolation.
+
+        No anti-aliasing filter is applied, so quality is basic. The input is
+        returned unchanged when the rates match or when it is empty.
+        """
+        # np.interp raises ValueError when given an empty array of sample points
+        if src_rate == dst_rate or len(audio) == 0:
+            return audio
+        num_output_samples = int(len(audio) / src_rate * dst_rate)
+        indices = np.linspace(0, len(audio) - 1, num_output_samples)
+        return np.interp(indices, np.arange(len(audio)), audio).astype(np.float32)
+
+    def get_audio(
+        self, num_samples: int = AUDIO_SAMPLES_PER_FRAME
+    ) -> np.ndarray | None:
+        """Pop num_samples buffered audio samples for WebRTC output.
+
+        Returns a mono float32 numpy array of length num_samples (default 960 = 20ms
+        at 48kHz), or None if the processor is stopped or fewer than num_samples are
+        buffered; in that case the buffer is left untouched.
+        Called by AudioProcessingTrack.recv().
+        """
+        if not self.running:
+            return None
+
+        with self._audio_buffer_lock:
+            if self._audio_buffer_samples < num_samples:
+                return None
+
+            # Pop whole chunks from the left until the request is satisfied
+            collected = []
+            remaining = num_samples
+            while remaining > 0 and self._audio_buffer:
+                chunk = self._audio_buffer[0]
+                if len(chunk) <= remaining:
+                    collected.append(self._audio_buffer.popleft())
+                    self._audio_buffer_samples -= len(chunk)
+                    remaining -= len(chunk)
+                else:
+                    # Chunk is larger than needed: take a prefix, keep the rest queued
+                    collected.append(chunk[:remaining])
+                    self._audio_buffer[0] = chunk[remaining:]
+                    self._audio_buffer_samples -= remaining
+                    remaining = 0
+
+            self._audio_chunks_out += 1
+
+        return np.concatenate(collected) if collected else None
+
+    @property
+    def has_audio(self) -> bool:
+        """Return True when at least one audio sample is currently buffered."""
+        with self._audio_buffer_lock:
+            return self._audio_buffer_samples > 0
+
     def _on_frame_from_cloud(self, frame: "VideoFrame") -> None:
         """Callback when a processed frame is received from cloud (cloud mode)."""
         self._frames_from_cloud += 1
diff --git a/src/scope/server/media_clock.py b/src/scope/server/media_clock.py
new file mode 100644
index 000000000..237ebe3c3
--- /dev/null
+++ b/src/scope/server/media_clock.py
@@ -0,0 +1,61 @@
+"""Shared media clock for synchronizing audio and video streams.
+
+Provides a single source of truth for media timing so that audio and video
+WebRTC tracks produce correlated PTS values. aiortc's RTCP Sender Reports
+then map these to NTP wallclock time for receiver-side A/V sync.
+"""
+
+import threading
+import time
+
+# Standard WebRTC clock rates
+AUDIO_CLOCK_RATE = 48000  # WebRTC audio: 48 kHz
+VIDEO_CLOCK_RATE = 90000  # WebRTC video: 90 kHz
+
+
+class MediaClock:
+    """Shared clock for synchronizing audio and video streams.
+
+    Intended to be shared by the audio and video tracks of one session so their
+    PTS values come from the same timeline. The clock starts when the first
+    media frame is ready to play; get_media_time() returns seconds since then.
+
+    Elapsed time is measured with time.monotonic(), so system clock changes
+    (NTP steps, manual adjustments) cannot make media time jump or run backwards.
+    """
+
+    def __init__(self):
+        self._start_time: float | None = None
+        self._lock = threading.Lock()
+
+    def start(self):
+        """Start the clock. Call when the first media frame is ready to play.
+
+        Safe to call multiple times; only the first call takes effect.
+        """
+        with self._lock:
+            if self._start_time is None:
+                self._start_time = time.monotonic()
+
+    @property
+    def is_started(self) -> bool:
+        with self._lock:
+            return self._start_time is not None
+
+    def get_media_time(self) -> float:
+        """Get elapsed media time in seconds since the clock started.
+
+        Returns 0.0 if the clock hasn't been started yet.
+        """
+        with self._lock:
+            if self._start_time is None:
+                return 0.0
+            return time.monotonic() - self._start_time
+
+    def media_time_to_video_pts(self, media_time: float) -> int:
+        """Convert media time (seconds) to video PTS in 90 kHz clock units."""
+        return int(media_time * VIDEO_CLOCK_RATE)
+
+    def media_time_to_audio_pts(self, media_time: float) -> int:
+        """Convert media time (seconds) to audio PTS in 48 kHz sample units."""
+        return int(media_time * AUDIO_CLOCK_RATE)
diff --git a/src/scope/server/pipeline_processor.py b/src/scope/server/pipeline_processor.py
index 971f99b9f..f47ff9ca7 100644
--- a/src/scope/server/pipeline_processor.py
+++ b/src/scope/server/pipeline_processor.py
@@ -67,6 +67,13 @@ def __init__(
         # Lock to protect input_queue assignment for thread-safe reference swapping
         self.input_queue_lock = threading.Lock()
 
+        # Audio output queue: stores (audio_tensor, sample_rate) tuples from pipeline output.
+        # Pipelines that produce audio return {"video": ..., "audio": ..., "audio_sample_rate": ...}.
+        # Only the last processor in a chain is read by FrameProcessor for audio output.
+        self.audio_output_queue: queue.Queue[tuple[torch.Tensor, int]] = queue.Queue(
+            maxsize=8
+        )
+
         # Current parameters used by processing thread
         self.parameters = initial_parameters or {}
         # Queue for parameter updates from external threads
@@ -206,6 +213,13 @@ def stop(self):
                 except queue.Empty:
                     break
 
+        # Clear audio output queue
+        while not self.audio_output_queue.empty():
+            try:
+                self.audio_output_queue.get_nowait()
+            except queue.Empty:
+                break
+
         logger.info(f"PipelineProcessor stopped for pipeline: {self.pipeline_id}")
 
     def update_parameters(self, parameters: dict[str, Any]):
@@ -371,6 +385,12 @@ def process_chunk(self):
                         self.output_queue.get_nowait()
                     except queue.Empty:
                         break
+            # Clear audio output queue
+            while not self.audio_output_queue.empty():
+                try:
+                    self.audio_output_queue.get_nowait()
+                except queue.Empty:
+                    break
 
         requirements = None
         if hasattr(self.pipeline, "prepare"):
@@ -509,6 +529,21 @@ def process_chunk(self):
                     )
                     continue
 
+            # Extract audio from pipeline output and queue it
+            audio_output = output_dict.get("audio")
+            audio_sample_rate = output_dict.get("audio_sample_rate")
+            if audio_output is not None and audio_sample_rate is not None:
+                # Detach and move to CPU for downstream consumption
+                audio_output = audio_output.detach().cpu()
+                try:
+                    self.audio_output_queue.put_nowait(
+                        (audio_output, audio_sample_rate)
+                    )
+                except queue.Full:
+                    logger.debug(
+                        f"Audio output queue full for {self.pipeline_id}, dropping audio chunk"
+                    )
+
             # Apply throttling if this pipeline is producing faster than next can consume
             # Only throttle if: (1) has video input, (2) has next processor
             if video_input is not None and self.next_processor is not None:
diff --git a/src/scope/server/tracks.py b/src/scope/server/tracks.py
index 3167a71e3..742262197 100644
--- a/src/scope/server/tracks.py
+++ b/src/scope/server/tracks.py
@@ -4,11 +4,17 @@
 import threading
 import time
 
+import numpy as np
 from aiortc import MediaStreamTrack
 from aiortc.mediastreams import VIDEO_CLOCK_RATE, VIDEO_TIME_BASE, MediaStreamError
-from av import VideoFrame
-
-from .frame_processor import FrameProcessor
+from av import AudioFrame, VideoFrame
+
+from .frame_processor import (
+    AUDIO_SAMPLES_PER_FRAME,
+    WEBRTC_AUDIO_SAMPLE_RATE,
+    FrameProcessor,
+)
+from .media_clock import MediaClock
 from .pipeline_manager import PipelineManager
 
 logger = logging.getLogger(__name__)
@@ -189,3 +195,69 @@ async def stop(self):
             self.frame_processor.stop()
 
         super().stop()
+
+
+class AudioProcessingTrack(MediaStreamTrack):
+    """WebRTC audio track that reads from FrameProcessor's audio buffer.
+
+    Produces 20ms audio frames (960 samples at 48kHz) paced on a fixed schedule,
+    with PTS taken from the shared MediaClock. When no audio data is available,
+    silence frames are returned to keep the track alive.
+    """
+
+    kind = "audio"
+
+    AUDIO_PTIME = AUDIO_SAMPLES_PER_FRAME / WEBRTC_AUDIO_SAMPLE_RATE  # 0.02s (20ms)
+
+    def __init__(
+        self,
+        frame_processor: FrameProcessor,
+        media_clock: MediaClock,
+    ):
+        super().__init__()
+        self.frame_processor = frame_processor
+        self.media_clock = media_clock
+        self._timestamp = 0
+        self._started = False
+        self._last_frame_time: float | None = None  # scheduled time of last frame
+
+    async def recv(self) -> AudioFrame:
+        if self.readyState != "live":
+            raise MediaStreamError
+
+        # Pace on a fixed 20ms schedule so sleep overshoot cannot accumulate as drift
+        now = time.monotonic()
+        if self._last_frame_time is not None:
+            due = self._last_frame_time + self.AUDIO_PTIME
+            if due > now:
+                await asyncio.sleep(due - now)
+            now = max(due, now - self.AUDIO_PTIME)  # cap catch-up to one frame
+        self._last_frame_time = now
+
+        # Start the shared media clock on first audio frame
+        if not self._started:
+            self.media_clock.start()
+            self._started = True
+
+        # Try to get audio data from the frame processor
+        audio_data = self.frame_processor.get_audio(AUDIO_SAMPLES_PER_FRAME)
+
+        if audio_data is not None:
+            # Convert float32 [-1, 1] to int16 for WebRTC
+            audio_int16 = (np.clip(audio_data, -1.0, 1.0) * 32767.0).astype(np.int16)
+        else:
+            # Return silence when no audio is available
+            audio_int16 = np.zeros(AUDIO_SAMPLES_PER_FRAME, dtype=np.int16)
+
+        # Create AudioFrame: shape must be (1, num_samples) for mono s16 layout
+        frame = AudioFrame.from_ndarray(
+            audio_int16.reshape(1, -1), format="s16", layout="mono"
+        )
+        frame.sample_rate = WEBRTC_AUDIO_SAMPLE_RATE
+
+        # Set PTS from shared media clock for A/V sync
+        media_time = self.media_clock.get_media_time()
+        frame.pts = self.media_clock.media_time_to_audio_pts(media_time)
+        frame.time_base = fractions.Fraction(1, WEBRTC_AUDIO_SAMPLE_RATE)
+
+        return frame
diff --git a/src/scope/server/webrtc.py b/src/scope/server/webrtc.py
index cf10c7779..a05a65853 100644
--- a/src/scope/server/webrtc.py
+++ b/src/scope/server/webrtc.py
@@ -48,12 +48,14 @@
 
 
 class Session:
-    """WebRTC Session containing peer connection and associated video track."""
+    """WebRTC Session containing peer connection and associated tracks."""
 
     def __init__(
         self,
         pc: RTCPeerConnection,
         video_track: MediaStreamTrack | None = None,
+        audio_track: "AudioProcessingTrack | None" = None,
+        media_clock: "MediaClock | None" = None,
         data_channel: RTCDataChannel | None = None,
         relay: MediaRelay | None = None,
         recording_manager: RecordingManager | None = None,
@@ -64,6 +66,8 @@ def __init__(
         self.id = str(uuid.uuid4())
         self.pc = pc
         self.video_track = video_track
+        self.audio_track = audio_track
+        self.media_clock = media_clock
         self.data_channel = data_channel
         self.relay = relay
         self.recording_manager = recording_manager
@@ -226,6 +230,9 @@ async def handle_offer(
             # Create NotificationSender for this session to send notifications to the frontend
             notification_sender = NotificationSender()
 
+            # Create shared media clock for A/V synchronization
+            media_clock = MediaClock()
+
             video_track = VideoProcessingTrack(
                 pipeline_manager,
                 initial_parameters=initial_parameters,
@@ -236,6 +243,19 @@ async def handle_offer(
                 connection_info=request.connection_info,
             )
             session.video_track = video_track
+            session.media_clock = media_clock
+
+            # Eagerly initialize the FrameProcessor so the AudioProcessingTrack
+            # can share it. VideoProcessingTrack.recv() normally does this lazily,
+            # but we need the reference now to wire audio.
+            video_track.initialize_output_processing()
+
+            # Create AudioProcessingTrack sharing the same FrameProcessor and MediaClock
+            audio_track = AudioProcessingTrack(
+                frame_processor=video_track.frame_processor,
+                media_clock=media_clock,
+            )
+            session.audio_track = audio_track
 
             # Create a MediaRelay to allow multiple consumers (WebRTC and recording)
             relay = MediaRelay()
@@ -260,9 +280,12 @@ async def handle_offer(
             else:
                 session.recording_manager = None
 
-            # Add the relayed track to WebRTC connection
+            # Add the relayed video track to WebRTC connection
             pc.addTrack(relayed_track)
 
+            # Add audio track to WebRTC connection
+            pc.addTrack(audio_track)
+
             # Store relay for cleanup
             session.relay = relay
 

From ad8f5d02f9c8cc4773588dc92110bfa23af39575 Mon Sep 17 00:00:00 2001
From: BuffMcBigHuge <marco@bymar.co>
Date: Tue, 17 Feb 2026 17:39:44 -0500
Subject: [PATCH 2/9] Frontend audio work.

Signed-off-by: BuffMcBigHuge <marco@bymar.co>
---
 frontend/src/components/VideoOutput.tsx | 49 ++++++++++++++++++++++++-
 frontend/src/hooks/useUnifiedWebRTC.ts  | 15 ++++++--
 2 files changed, 59 insertions(+), 5 deletions(-)

diff --git a/frontend/src/components/VideoOutput.tsx b/frontend/src/components/VideoOutput.tsx
index 312477ca8..fa9bf80a2 100644
--- a/frontend/src/components/VideoOutput.tsx
+++ b/frontend/src/components/VideoOutput.tsx
@@ -1,4 +1,5 @@
 import { useEffect, useRef, useState, useCallback } from "react";
+import { Volume2, VolumeX } from "lucide-react";
 import { Card, CardContent, CardHeader, CardTitle } from "./ui/card";
 import { Spinner } from "./ui/spinner";
 import { PlayOverlay } from "./ui/play-overlay";
@@ -49,15 +50,47 @@ export function VideoOutput({
   const [isFadingOut, setIsFadingOut] = useState(false);
   const overlayTimeoutRef = useRef<number | null>(null);
 
+  // Audio state: start muted to comply with browser autoplay policy.
+  // User can click the speaker icon to unmute once the stream is playing.
+  const [isMuted, setIsMuted] = useState(true);
+  const [hasAudioTrack, setHasAudioTrack] = useState(false);
+
   // Use external ref if provided, otherwise use internal
   const containerRef = videoContainerRef || internalContainerRef;
 
   useEffect(() => {
     if (videoRef.current && remoteStream) {
       videoRef.current.srcObject = remoteStream;
+
+      // Check if the stream contains an audio track
+      const audioTracks = remoteStream.getAudioTracks();
+      setHasAudioTrack(audioTracks.length > 0);
+
+      // Listen for tracks being added later (audio may arrive after video)
+      const handleTrackAdded = () => {
+        const tracks = remoteStream.getAudioTracks();
+        setHasAudioTrack(tracks.length > 0);
+      };
+      remoteStream.addEventListener("addtrack", handleTrackAdded);
+
+      return () => {
+        remoteStream.removeEventListener("addtrack", handleTrackAdded);
+      };
     }
   }, [remoteStream]);
 
+  // Sync muted state to the video element
+  useEffect(() => {
+    if (videoRef.current) {
+      videoRef.current.muted = isMuted;
+    }
+  }, [isMuted]);
+
+  const toggleMute = useCallback((e: React.MouseEvent) => {
+    e.stopPropagation(); // Don't trigger play/pause or pointer lock
+    setIsMuted(prev => !prev);
+  }, []);
+
   // Listen for video playing event to notify parent
   useEffect(() => {
     const video = videoRef.current;
@@ -174,9 +207,23 @@ export function VideoOutput({
                   : "max-w-full max-h-full object-contain"
               }
               autoPlay
-              muted
+              muted={isMuted}
               playsInline
             />
+            {/* Audio mute/unmute toggle - only shown when stream has audio */}
+            {hasAudioTrack && (
+              <button
+                onClick={toggleMute}
+                className="absolute bottom-4 right-4 p-2 rounded-lg bg-black/60 hover:bg-black/80 text-white transition-colors z-10"
+                title={isMuted ? "Unmute audio" : "Mute audio"}
+              >
+                {isMuted ? (
+                  <VolumeX className="w-5 h-5" />
+                ) : (
+                  <Volume2 className="w-5 h-5" />
+                )}
+              </button>
+            )}
             {/* Play/Pause Overlay */}
             {showOverlay && (
               <div className="absolute inset-0 flex items-center justify-center pointer-events-none">
diff --git a/frontend/src/hooks/useUnifiedWebRTC.ts b/frontend/src/hooks/useUnifiedWebRTC.ts
index 190d1d41f..18be17e09 100644
--- a/frontend/src/hooks/useUnifiedWebRTC.ts
+++ b/frontend/src/hooks/useUnifiedWebRTC.ts
@@ -221,11 +221,18 @@ export function useUnifiedWebRTC(options?: UseUnifiedWebRTCOptions) {
         }
 
         // Event handlers
+        // Collect all incoming tracks (video + audio) into a single MediaStream.
+        // The backend may send video and audio as separate streams, so we
+        // merge them into one MediaStream for the <video> element.
+        const combinedStream = new MediaStream();
         pc.ontrack = (evt: RTCTrackEvent) => {
-          if (evt.streams && evt.streams[0]) {
-            console.log("[UnifiedWebRTC] Setting remote stream");
-            setRemoteStream(evt.streams[0]);
-          }
+          console.log(
+            `[UnifiedWebRTC] Track received: ${evt.track.kind} (id: ${evt.track.id})`
+          );
+          combinedStream.addTrack(evt.track);
+          // Publish a new MediaStream per track: React skips an unchanged state
+          // reference, and script addTrack() does not fire the "addtrack" event.
+          setRemoteStream(new MediaStream(combinedStream.getTracks()));
         };
 
         pc.onconnectionstatechange = () => {

From c1b098ab8290204b19a7666215ce5aeef7502826 Mon Sep 17 00:00:00 2001
From: BuffMcBigHuge <marco@bymar.co>
Date: Tue, 17 Feb 2026 17:48:37 -0500
Subject: [PATCH 3/9] Import fixes.

Signed-off-by: BuffMcBigHuge <marco@bymar.co>
---
 src/scope/server/webrtc.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/scope/server/webrtc.py b/src/scope/server/webrtc.py
index a05a65853..a54dab951 100644
--- a/src/scope/server/webrtc.py
+++ b/src/scope/server/webrtc.py
@@ -22,10 +22,11 @@
 from .cloud_track import CloudTrack
 from .credentials import get_turn_credentials
 from .kafka_publisher import publish_event
+from .media_clock import MediaClock
 from .pipeline_manager import PipelineManager
 from .recording import RecordingManager
 from .schema import WebRTCOfferRequest
-from .tracks import VideoProcessingTrack
+from .tracks import AudioProcessingTrack, VideoProcessingTrack
 
 if TYPE_CHECKING:
     from .cloud_connection import CloudConnectionManager

From ef3aff20f755a6756b5536df92703f9a5ccaa8ba Mon Sep 17 00:00:00 2001
From: BuffMcBigHuge <marco@bymar.co>
Date: Tue, 17 Feb 2026 17:57:20 -0500
Subject: [PATCH 4/9] Modified order of operations for audio track.

Signed-off-by: BuffMcBigHuge <marco@bymar.co>
---
 src/scope/server/webrtc.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/scope/server/webrtc.py b/src/scope/server/webrtc.py
index a54dab951..e370ee9c6 100644
--- a/src/scope/server/webrtc.py
+++ b/src/scope/server/webrtc.py
@@ -284,9 +284,6 @@ async def handle_offer(
             # Add the relayed video track to WebRTC connection
             pc.addTrack(relayed_track)
 
-            # Add audio track to WebRTC connection
-            pc.addTrack(audio_track)
-
             # Store relay for cleanup
             session.relay = relay
 
@@ -393,6 +390,9 @@ def on_data_channel_message(message):
             offer_sdp = RTCSessionDescription(sdp=request.sdp, type=request.type)
             await pc.setRemoteDescription(offer_sdp)
 
+            # Add audio track to WebRTC connection
+            pc.addTrack(audio_track)
+
             # Create answer
             answer = await pc.createAnswer()
             await pc.setLocalDescription(answer)

From 32c883394a481063a7d2f4822d8d51e772154d24 Mon Sep 17 00:00:00 2001
From: BuffMcBigHuge <marco@bymar.co>
Date: Tue, 17 Feb 2026 18:07:42 -0500
Subject: [PATCH 5/9] Modification to audio handshake.

Signed-off-by: BuffMcBigHuge <marco@bymar.co>
---
 frontend/src/hooks/useUnifiedWebRTC.ts | 5 +++++
 src/scope/server/webrtc.py             | 9 +++++----
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/frontend/src/hooks/useUnifiedWebRTC.ts b/frontend/src/hooks/useUnifiedWebRTC.ts
index 18be17e09..9e21e96e0 100644
--- a/frontend/src/hooks/useUnifiedWebRTC.ts
+++ b/frontend/src/hooks/useUnifiedWebRTC.ts
@@ -208,6 +208,11 @@ export function useUnifiedWebRTC(options?: UseUnifiedWebRTCOptions) {
           transceiver = pc.addTransceiver("video");
         }
 
+        // Add a receive-only audio transceiver so the SDP offer includes an
+        // audio m-line. The backend will match this and send audio back.
+        pc.addTransceiver("audio", { direction: "recvonly" });
+        console.log("[UnifiedWebRTC] Added recvonly audio transceiver");
+
         // Force VP8-only for aiortc compatibility
         if (transceiver) {
           const codecs = RTCRtpReceiver.getCapabilities("video")?.codecs || [];
diff --git a/src/scope/server/webrtc.py b/src/scope/server/webrtc.py
index e370ee9c6..a11079dbf 100644
--- a/src/scope/server/webrtc.py
+++ b/src/scope/server/webrtc.py
@@ -284,6 +284,10 @@ async def handle_offer(
             # Add the relayed video track to WebRTC connection
             pc.addTrack(relayed_track)
 
+            # Add audio track to WebRTC connection. The browser's offer includes
+            # a recvonly audio m-line, so aiortc will match this transceiver to it.
+            pc.addTrack(audio_track)
+
             # Store relay for cleanup
             session.relay = relay
 
@@ -390,9 +394,6 @@ def on_data_channel_message(message):
             offer_sdp = RTCSessionDescription(sdp=request.sdp, type=request.type)
             await pc.setRemoteDescription(offer_sdp)
 
-            # Add audio track to WebRTC connection
-            pc.addTrack(audio_track)
-
             # Create answer
             answer = await pc.createAnswer()
             await pc.setLocalDescription(answer)
@@ -416,7 +417,7 @@ def on_data_channel_message(message):
             }
 
         except Exception as e:
-            logger.error(f"Error handling WebRTC offer: {e}")
+            logger.error(f"Error handling WebRTC offer: {e}", exc_info=True)
             _publish_connection_error(
                 session.id if "session" in locals() else None,
                 request.connection_id,

From acd30d51ab38e935d0d72e5e5b7bc47980e56938 Mon Sep 17 00:00:00 2001
From: BuffMcBigHuge <marco@bymar.co>
Date: Tue, 17 Feb 2026 18:11:38 -0500
Subject: [PATCH 6/9] Solving issues with audio handshake.

Signed-off-by: BuffMcBigHuge <marco@bymar.co>
---
 src/scope/server/webrtc.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/scope/server/webrtc.py b/src/scope/server/webrtc.py
index a11079dbf..39273437b 100644
--- a/src/scope/server/webrtc.py
+++ b/src/scope/server/webrtc.py
@@ -284,10 +284,6 @@ async def handle_offer(
             # Add the relayed video track to WebRTC connection
             pc.addTrack(relayed_track)
 
-            # Add audio track to WebRTC connection. The browser's offer includes
-            # a recvonly audio m-line, so aiortc will match this transceiver to it.
-            pc.addTrack(audio_track)
-
             # Store relay for cleanup
             session.relay = relay
 
@@ -394,6 +390,16 @@ def on_data_channel_message(message):
             offer_sdp = RTCSessionDescription(sdp=request.sdp, type=request.type)
             await pc.setRemoteDescription(offer_sdp)
 
+            # Attach our audio track to the audio transceiver that aiortc created
+            # from the browser's recvonly audio m-line. We find it by kind and
+            # assign our track to its sender, then flip direction to sendonly so
+            # the answer tells the browser we'll be sending audio.
+            for t in pc.getTransceivers():
+                if t.kind == "audio" and t.sender.track is None:
+                    t.sender.replaceTrack(audio_track)
+                    t.direction = "sendonly"
+                    break
+
             # Create answer
             answer = await pc.createAnswer()
             await pc.setLocalDescription(answer)

From 9f9662fe4b4d0d14691f539eae9618e5959aba5b Mon Sep 17 00:00:00 2001
From: BuffMcBigHuge <marco@bymar.co>
Date: Tue, 17 Feb 2026 18:35:10 -0500
Subject: [PATCH 7/9] Audio support testing and logging.

Signed-off-by: BuffMcBigHuge <marco@bymar.co>
---
 frontend/src/hooks/useUnifiedWebRTC.ts | 15 +++++----------
 src/scope/server/frame_processor.py    |  4 ++++
 src/scope/server/pipeline_processor.py |  4 ++++
 src/scope/server/tracks.py             | 20 +++++++++++---------
 src/scope/server/webrtc.py             | 15 +++++----------
 5 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/frontend/src/hooks/useUnifiedWebRTC.ts b/frontend/src/hooks/useUnifiedWebRTC.ts
index 9e21e96e0..32ae60da3 100644
--- a/frontend/src/hooks/useUnifiedWebRTC.ts
+++ b/frontend/src/hooks/useUnifiedWebRTC.ts
@@ -208,11 +208,6 @@ export function useUnifiedWebRTC(options?: UseUnifiedWebRTCOptions) {
           transceiver = pc.addTransceiver("video");
         }
 
-        // Add a receive-only audio transceiver so the SDP offer includes an
-        // audio m-line. The backend will match this and send audio back.
-        pc.addTransceiver("audio", { direction: "recvonly" });
-        console.log("[UnifiedWebRTC] Added recvonly audio transceiver");
-
         // Force VP8-only for aiortc compatibility
         if (transceiver) {
           const codecs = RTCRtpReceiver.getCapabilities("video")?.codecs || [];
@@ -227,17 +222,17 @@ export function useUnifiedWebRTC(options?: UseUnifiedWebRTCOptions) {
 
         // Event handlers
         // Collect all incoming tracks (video + audio) into a single MediaStream.
-        // The backend may send video and audio as separate streams, so we
-        // merge them into one MediaStream for the <video> element.
+        // The backend sends video and audio as separate tracks; we merge them
+        // into one MediaStream for the <video> element.
         const combinedStream = new MediaStream();
         pc.ontrack = (evt: RTCTrackEvent) => {
           console.log(
             `[UnifiedWebRTC] Track received: ${evt.track.kind} (id: ${evt.track.id})`
           );
           combinedStream.addTrack(evt.track);
-          // Update remoteStream reference each time a track is added so the
-          // component re-renders and picks up the latest tracks
-          setRemoteStream(combinedStream);
+          // Create a new MediaStream wrapper so React detects the state change
+          // (same object reference would not trigger a re-render)
+          setRemoteStream(new MediaStream(combinedStream.getTracks()));
         };
 
         pc.onconnectionstatechange = () => {
diff --git a/src/scope/server/frame_processor.py b/src/scope/server/frame_processor.py
index 63812a541..6efcbe585 100644
--- a/src/scope/server/frame_processor.py
+++ b/src/scope/server/frame_processor.py
@@ -553,6 +553,10 @@ def _audio_drain_loop(self):
                 with self._audio_buffer_lock:
                     self._audio_buffer.append(audio_mono)
                     self._audio_buffer_samples += len(audio_mono)
+                    logger.info(
+                        f"[FRAME-PROCESSOR] Audio buffered: {len(audio_mono)} samples "
+                        f"(total: {self._audio_buffer_samples}, sr={sample_rate})"
+                    )
 
                 # Also fan out to output sinks that support audio
                 if self.output_sinks:
diff --git a/src/scope/server/pipeline_processor.py b/src/scope/server/pipeline_processor.py
index f47ff9ca7..3ca88223e 100644
--- a/src/scope/server/pipeline_processor.py
+++ b/src/scope/server/pipeline_processor.py
@@ -535,6 +535,10 @@ def process_chunk(self):
             if audio_output is not None and audio_sample_rate is not None:
                 # Detach and move to CPU for downstream consumption
                 audio_output = audio_output.detach().cpu()
+                logger.info(
+                    f"[PIPELINE-PROC] Audio from {self.pipeline_id}: "
+                    f"shape={audio_output.shape}, sr={audio_sample_rate}"
+                )
                 try:
                     self.audio_output_queue.put_nowait(
                         (audio_output, audio_sample_rate)
diff --git a/src/scope/server/tracks.py b/src/scope/server/tracks.py
index 742262197..6feb73ed4 100644
--- a/src/scope/server/tracks.py
+++ b/src/scope/server/tracks.py
@@ -9,16 +9,18 @@
 from aiortc.mediastreams import VIDEO_CLOCK_RATE, VIDEO_TIME_BASE, MediaStreamError
 from av import AudioFrame, VideoFrame
 
-from .frame_processor import (
-    AUDIO_SAMPLES_PER_FRAME,
-    WEBRTC_AUDIO_SAMPLE_RATE,
-    FrameProcessor,
-)
+from .frame_processor import FrameProcessor
 from .media_clock import MediaClock
 from .pipeline_manager import PipelineManager
 
 logger = logging.getLogger(__name__)
 
+# Audio constants
+AUDIO_PTIME = 0.020  # 20ms audio frames (standard for WebRTC)
+AUDIO_CLOCK_RATE = 48000  # WebRTC typically uses 48kHz for Opus codec
+AUDIO_TIME_BASE = fractions.Fraction(1, AUDIO_CLOCK_RATE)
+AUDIO_SAMPLES_PER_FRAME = int(AUDIO_CLOCK_RATE * AUDIO_PTIME)  # 960 samples
+
 
 class VideoProcessingTrack(MediaStreamTrack):
     kind = "video"
@@ -207,7 +209,7 @@ class AudioProcessingTrack(MediaStreamTrack):
 
     kind = "audio"
 
-    AUDIO_PTIME = AUDIO_SAMPLES_PER_FRAME / WEBRTC_AUDIO_SAMPLE_RATE  # 0.02s (20ms)
+    AUDIO_PTIME_S = AUDIO_SAMPLES_PER_FRAME / AUDIO_CLOCK_RATE  # 0.02s (20ms)
 
     def __init__(
         self,
@@ -228,7 +230,7 @@ async def recv(self) -> AudioFrame:
         # Pace audio output at 20ms intervals
         if self._last_frame_time is not None:
             elapsed = time.time() - self._last_frame_time
-            wait = self.AUDIO_PTIME - elapsed
+            wait = self.AUDIO_PTIME_S - elapsed
             if wait > 0:
                 await asyncio.sleep(wait)
 
@@ -253,11 +255,11 @@ async def recv(self) -> AudioFrame:
         frame = AudioFrame.from_ndarray(
             audio_int16.reshape(1, -1), format="s16", layout="mono"
         )
-        frame.sample_rate = WEBRTC_AUDIO_SAMPLE_RATE
+        frame.sample_rate = AUDIO_CLOCK_RATE
 
         # Set PTS from shared media clock for A/V sync
         media_time = self.media_clock.get_media_time()
         frame.pts = self.media_clock.media_time_to_audio_pts(media_time)
-        frame.time_base = fractions.Fraction(1, WEBRTC_AUDIO_SAMPLE_RATE)
+        frame.time_base = AUDIO_TIME_BASE
 
         return frame
diff --git a/src/scope/server/webrtc.py b/src/scope/server/webrtc.py
index 39273437b..30efd791d 100644
--- a/src/scope/server/webrtc.py
+++ b/src/scope/server/webrtc.py
@@ -386,20 +386,15 @@ def on_data_channel_message(message):
                     except Exception as e:
                         logger.error(f"Error handling parameter update: {e}")
 
+            # Add audio track BEFORE setRemoteDescription. The browser's offer
+            # includes a recvonly audio m-line (from addTransceiver("audio")).
+            # aiortc's setRemoteDescription will match our audio transceiver to it.
+            pc.addTrack(audio_track)
+
             # Set remote description (the offer)
             offer_sdp = RTCSessionDescription(sdp=request.sdp, type=request.type)
             await pc.setRemoteDescription(offer_sdp)
 
-            # Attach our audio track to the audio transceiver that aiortc created
-            # from the browser's recvonly audio m-line. We find it by kind and
-            # assign our track to its sender, then flip direction to sendonly so
-            # the answer tells the browser we'll be sending audio.
-            for t in pc.getTransceivers():
-                if t.kind == "audio" and t.sender.track is None:
-                    t.sender.replaceTrack(audio_track)
-                    t.direction = "sendonly"
-                    break
-
             # Create answer
             answer = await pc.createAnswer()
             await pc.setLocalDescription(answer)

From fb3501168ae170a22573fe4fe862dc41f25cd683 Mon Sep 17 00:00:00 2001
From: BuffMcBigHuge <marco@bymar.co>
Date: Tue, 17 Feb 2026 18:43:25 -0500
Subject: [PATCH 8/9] Fighting with audio connection handshake issue.

Signed-off-by: BuffMcBigHuge <marco@bymar.co>
---
 frontend/src/hooks/useUnifiedWebRTC.ts |  5 +++++
 src/scope/server/webrtc.py             | 20 ++++++++++++++------
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/frontend/src/hooks/useUnifiedWebRTC.ts b/frontend/src/hooks/useUnifiedWebRTC.ts
index 32ae60da3..27cbf9788 100644
--- a/frontend/src/hooks/useUnifiedWebRTC.ts
+++ b/frontend/src/hooks/useUnifiedWebRTC.ts
@@ -208,6 +208,11 @@ export function useUnifiedWebRTC(options?: UseUnifiedWebRTCOptions) {
           transceiver = pc.addTransceiver("video");
         }
 
+        // Add a receive-only audio transceiver so the SDP offer includes an
+        // audio m-line. The backend will attach its audio track to this
+        // transceiver after processing the offer.
+        pc.addTransceiver("audio", { direction: "recvonly" });
+
         // Force VP8-only for aiortc compatibility
         if (transceiver) {
           const codecs = RTCRtpReceiver.getCapabilities("video")?.codecs || [];
diff --git a/src/scope/server/webrtc.py b/src/scope/server/webrtc.py
index 30efd791d..cb1c317e5 100644
--- a/src/scope/server/webrtc.py
+++ b/src/scope/server/webrtc.py
@@ -386,15 +386,23 @@ def on_data_channel_message(message):
                     except Exception as e:
                         logger.error(f"Error handling parameter update: {e}")
 
-            # Add audio track BEFORE setRemoteDescription. The browser's offer
-            # includes a recvonly audio m-line (from addTransceiver("audio")).
-            # aiortc's setRemoteDescription will match our audio transceiver to it.
-            pc.addTrack(audio_track)
-
-            # Set remote description (the offer)
+            # Set remote description (the offer).
+            # The browser's offer includes a recvonly audio m-line (from
+            # addTransceiver("audio", {direction: "recvonly"})). aiortc will
+            # create an audio transceiver for it during setRemoteDescription.
             offer_sdp = RTCSessionDescription(sdp=request.sdp, type=request.type)
             await pc.setRemoteDescription(offer_sdp)
 
+            # Attach our audio track to the transceiver that aiortc created
+            # from the browser's recvonly audio m-line. We find it by kind,
+            # assign our track to its sender, and set direction to sendonly.
+            for t in pc.getTransceivers():
+                if t.kind == "audio":
+                    t.sender.replaceTrack(audio_track)
+                    t.direction = "sendonly"
+                    logger.info(f"Audio track attached to transceiver (mid={t.mid})")
+                    break
+
             # Create answer
             answer = await pc.createAnswer()
             await pc.setLocalDescription(answer)

From 073a6a26e3dc854da7e65a4ea45ea6c1058ef237 Mon Sep 17 00:00:00 2001
From: BuffMcBigHuge <marco@bymar.co>
Date: Mon, 23 Feb 2026 19:49:07 -0500
Subject: [PATCH 9/9] Mediaclock rework.

Signed-off-by: BuffMcBigHuge <marco@bymar.co>
---
 src/scope/server/frame_processor.py    |  2 +-
 src/scope/server/media_clock.py        | 12 +++----
 src/scope/server/pipeline_processor.py |  8 +++--
 src/scope/server/tracks.py             | 50 +++++++++++++++-----------
 src/scope/server/webrtc.py             |  1 +
 5 files changed, 43 insertions(+), 30 deletions(-)

diff --git a/src/scope/server/frame_processor.py b/src/scope/server/frame_processor.py
index 6efcbe585..78846b3e5 100644
--- a/src/scope/server/frame_processor.py
+++ b/src/scope/server/frame_processor.py
@@ -553,7 +553,7 @@ def _audio_drain_loop(self):
                 with self._audio_buffer_lock:
                     self._audio_buffer.append(audio_mono)
                     self._audio_buffer_samples += len(audio_mono)
-                    logger.info(
+                    logger.debug(
                         f"[FRAME-PROCESSOR] Audio buffered: {len(audio_mono)} samples "
                         f"(total: {self._audio_buffer_samples}, sr={sample_rate})"
                     )
diff --git a/src/scope/server/media_clock.py b/src/scope/server/media_clock.py
index 237ebe3c3..2010858ba 100644
--- a/src/scope/server/media_clock.py
+++ b/src/scope/server/media_clock.py
@@ -52,10 +52,10 @@ def get_media_time(self) -> float:
                 return 0.0
             return time.time() - self._start_time
 
-    def media_time_to_video_pts(self, media_time: float) -> int:
-        """Convert media time (seconds) to video PTS in 90 kHz clock units."""
-        return int(media_time * VIDEO_CLOCK_RATE)
+    def to_pts(self, clock_rate: int) -> int:
+        """Get current media time as PTS in the given clock rate.
 
-    def media_time_to_audio_pts(self, media_time: float) -> int:
-        """Convert media time (seconds) to audio PTS in 48 kHz sample units."""
-        return int(media_time * AUDIO_CLOCK_RATE)
+        Combines get_media_time() and conversion in a single call to minimize
+        drift between the time read and the PTS calculation.
+        """
+        return int(self.get_media_time() * clock_rate)
diff --git a/src/scope/server/pipeline_processor.py b/src/scope/server/pipeline_processor.py
index 05ba58330..b871de821 100644
--- a/src/scope/server/pipeline_processor.py
+++ b/src/scope/server/pipeline_processor.py
@@ -476,7 +476,11 @@ def process_chunk(self):
 
             # Forward extra params to downstream pipeline (dual-output pattern)
             # Preprocessors return {"video": frames, "vace_input_frames": ..., "vace_input_masks": ...}
-            extra_params = {k: v for k, v in output_dict.items() if k != "video"}
+            # Audio keys are handled separately via audio_output_queue, not as pipeline params.
+            _non_param_keys = {"video", "audio", "audio_sample_rate"}
+            extra_params = {
+                k: v for k, v in output_dict.items() if k not in _non_param_keys
+            }
             if extra_params and self.next_processor is not None:
                 self.next_processor.update_parameters(extra_params)
 
@@ -548,7 +552,7 @@ def process_chunk(self):
             if audio_output is not None and audio_sample_rate is not None:
                 # Detach and move to CPU for downstream consumption
                 audio_output = audio_output.detach().cpu()
-                logger.info(
+                logger.debug(
                     f"[PIPELINE-PROC] Audio from {self.pipeline_id}: "
                     f"shape={audio_output.shape}, sr={audio_sample_rate}"
                 )
diff --git a/src/scope/server/tracks.py b/src/scope/server/tracks.py
index 6feb73ed4..b4e47c961 100644
--- a/src/scope/server/tracks.py
+++ b/src/scope/server/tracks.py
@@ -31,6 +31,7 @@ def __init__(
         fps: int = 30,
         initial_parameters: dict = None,
         notification_callback: callable = None,
+        media_clock: MediaClock | None = None,
         session_id: str | None = None,
         user_id: str | None = None,
         connection_id: str | None = None,
@@ -40,6 +41,7 @@ def __init__(
         self.pipeline_manager = pipeline_manager
         self.initial_parameters = initial_parameters or {}
         self.notification_callback = notification_callback
+        self.media_clock = media_clock
         self.session_id = session_id
         self.user_id = user_id
         self.connection_id = connection_id
@@ -54,6 +56,8 @@ def __init__(
         self._paused = False
         self._paused_lock = threading.Lock()
         self._last_frame = None
+        self._last_send_time: float | None = None
+        self._clock_started = False
 
         # Server-side input mode - when enabled, frames come from the backend
         # instead of WebRTC (no browser video track needed)
@@ -83,33 +87,38 @@ async def input_loop(self):
                 self.input_task_running = False
                 break
 
-    # Copied from https://github.com/livepeer/fastworld/blob/e649ef788cd33d78af6d8e1da915cd933761535e/backend/track.py#L267
     async def next_timestamp(self) -> tuple[int, fractions.Fraction]:
-        """Override to control frame rate"""
+        """Pace output at the target frame rate and return ``(pts, time_base)``.
+
+        PTS comes from the shared MediaClock when set (keeps video correlated with
+        audio PTS); else a local counter. Raises MediaStreamError if not live.
+        """
         if self.readyState != "live":
             raise MediaStreamError
 
-        if hasattr(self, "timestamp"):
-            # Calculate wait time based on current frame rate
-            current_time = time.time()
-            time_since_last_frame = current_time - self.last_frame_time
+        # Pace frames at the target interval; monotonic time ignores wall-clock jumps
+        if self._last_send_time is not None:
+            elapsed = time.monotonic() - self._last_send_time
+            wait = self.frame_ptime - elapsed
+            if wait > 0:
+                await asyncio.sleep(wait)
 
-            # Wait for the appropriate interval based on current FPS
-            target_interval = self.frame_ptime  # Current frame period
-            wait_time = target_interval - time_since_last_frame
+        self._last_send_time = time.monotonic()
 
-            if wait_time > 0:
-                await asyncio.sleep(wait_time)
+        # Start the shared clock on the first frame (flag keeps this a one-time call)
+        if self.media_clock is not None and not self._clock_started:
+            self.media_clock.start()
+            self._clock_started = True
 
-            # Update timestamp and last frame time
-            self.timestamp += int(self.frame_ptime * VIDEO_CLOCK_RATE)
-            self.last_frame_time = time.time()
-        else:
-            self.start = time.time()
-            self.last_frame_time = time.time()
-            self.timestamp = 0
+        if self.media_clock is not None:
+            return self.media_clock.to_pts(VIDEO_CLOCK_RATE), VIDEO_TIME_BASE
 
-        return self.timestamp, VIDEO_TIME_BASE
+        # Fallback for cases without a media clock (shouldn't happen in normal flow)
+        if not hasattr(self, "_fallback_pts"):
+            self._fallback_pts = 0
+        else:
+            self._fallback_pts += int(self.frame_ptime * VIDEO_CLOCK_RATE)
+        return self._fallback_pts, VIDEO_TIME_BASE
 
     def initialize_output_processing(self):
         if not self.frame_processor:
@@ -258,8 +267,7 @@ async def recv(self) -> AudioFrame:
         frame.sample_rate = AUDIO_CLOCK_RATE
 
         # Set PTS from shared media clock for A/V sync
-        media_time = self.media_clock.get_media_time()
-        frame.pts = self.media_clock.media_time_to_audio_pts(media_time)
+        frame.pts = self.media_clock.to_pts(AUDIO_CLOCK_RATE)
         frame.time_base = AUDIO_TIME_BASE
 
         return frame
diff --git a/src/scope/server/webrtc.py b/src/scope/server/webrtc.py
index cb1c317e5..ccab1260d 100644
--- a/src/scope/server/webrtc.py
+++ b/src/scope/server/webrtc.py
@@ -238,6 +238,7 @@ async def handle_offer(
                 pipeline_manager,
                 initial_parameters=initial_parameters,
                 notification_callback=notification_sender.call,
+                media_clock=media_clock,
                 session_id=session.id,
                 user_id=request.user_id,
                 connection_id=request.connection_id,