diff --git a/fern/assets/components/TurnDetectionVisualizer.tsx b/fern/assets/components/TurnDetectionVisualizer.tsx new file mode 100644 index 00000000..c435d456 --- /dev/null +++ b/fern/assets/components/TurnDetectionVisualizer.tsx @@ -0,0 +1,344 @@ +"use client"; +import * as React from "react"; + /* Demo speech segments placed on the timeline. start and end are in seconds (silence values given in ms are divided by 1000 before being added to end); confidence is compared against the preset threshold. */ +const SEGMENTS = [ + { start: 0.5, end: 2.8, confidence: 0.85 }, + { start: 4.2, end: 7.1, confidence: 0.35 }, + { start: 9.0, end: 11.5, confidence: 0.92 }, +]; + /* Timeline length in the same unit as segment start and end; EoT times beyond it are dropped. */ +const DURATION = 14; + /* Named parameter presets; the component reads one by key. NOTE(review): Record appears without type arguments here (and Set likewise further down) - generic arguments were probably lost when this diff was extracted; confirm against the real file. */ +const PRESETS: Record = { + aggressive: { confidence: 0.4, minSilence: 160, maxSilence: 400, label: "Aggressive" }, + balanced: { confidence: 0.4, minSilence: 400, maxSilence: 1280, label: "Balanced" }, + conservative: { confidence: 0.7, minSilence: 800, maxSilence: 3600, label: "Conservative" }, +}; + /* One end-of-turn marker: its timeline time, the rule that produced it, and the index of its segment in SEGMENTS. */ +interface EotResult { + time: number; + type: "semantic" | "acoustic"; + segIndex: number; +} + /** Computes end-of-turn markers for every entry in SEGMENTS. A segment whose confidence is at least conf waits minMs after its end and is tagged semantic; any other segment waits maxMs and is tagged acoustic. A marker is emitted only when its time is within DURATION and strictly before the start of the following segment, if there is one. @param conf confidence threshold compared with seg.confidence. @param minMs silence in milliseconds used at or above the threshold. @param maxMs silence in milliseconds used below the threshold. @returns markers in segment order; segments whose silence window is cut off produce none. */ +function computeEot(conf: number, minMs: number, maxMs: number): EotResult[] { + const results: EotResult[] = []; + for (let i = 0; i < SEGMENTS.length; i++) { + const seg = SEGMENTS[i]; + const nextSeg = i < SEGMENTS.length - 1 ? 
SEGMENTS[i + 1] : null; + if (seg.confidence >= conf) { + const t = seg.end + minMs / 1000; + if (t <= DURATION && (!nextSeg || t < nextSeg.start)) { + results.push({ time: t, type: "semantic", segIndex: i }); + } + } else { + const t = seg.end + maxMs / 1000; + if (t <= DURATION && (!nextSeg || t < nextSeg.start)) { + results.push({ time: t, type: "acoustic", segIndex: i }); + } + } + } + return results; +} + /** Produces deterministic bar heights for one segment. The bar count is max(8, floor((end - start) * 12)). Each step advances s with a multiply-add update masked to 31 bits, takes s % 100 as noise, shapes it with a half-sine envelope over the bar position, and clamps the height to the range 0.15 to 1. @param start segment start time. @param end segment end time. @param seed initial generator state; the same seed yields the same bars. @returns one height per bar. */ +function generateWaveformBars(start: number, end: number, seed: number): number[] { + const count = Math.max(8, Math.floor((end - start) * 12)); + const bars: number[] = []; + let s = seed; + for (let i = 0; i < count; i++) { + s = (s * 1103515245 + 12345) & 0x7fffffff; + const frac = i / count; + const envelope = Math.sin(frac * Math.PI) * 0.6 + 0.3; + const noise = (s % 100) / 100; + bars.push(Math.max(0.15, Math.min(1, envelope * (0.5 + noise * 0.5)))); + } + return bars; +} + /** Visualizer component. Keeps the selected preset key in state (initially balanced), recomputes EoT markers when the preset values change, builds the waveform bars once, and records which segments have a silence window that reaches the next segment. NOTE(review): the element tags of the returned markup are missing in this view, presumably stripped during extraction - confirm against the real file. */ +export function TurnDetectionVisualizer() { + const [preset, setPreset] = React.useState("balanced"); + + const config = PRESETS[preset]; + const eotMarkers = React.useMemo( + () => computeEot(config.confidence, config.minSilence, config.maxSilence), + [config.confidence, config.minSilence, config.maxSilence] + ); + + const waveforms = React.useMemo( + () => SEGMENTS.map((seg, i) => generateWaveformBars(seg.start, seg.end, (i + 1) * 7919)), + [] + ); + /* Scales a timeline time to the range 0 to 100 across DURATION; presumably used as a percentage offset - the consuming markup is not visible here. */ + const toX= (t: number) => (t / DURATION) * 100; + /* Mirrors the rule in computeEot: index i is recorded when the EoT time for segment i is at or after the start of the next segment, i.e. speech resumes before the turn could end. */ + const turnsNotEnded: Set = new Set(); + for (let i = 0; i < SEGMENTS.length; i++) { + const seg = SEGMENTS[i]; + const nextSeg = i < SEGMENTS.length - 1 ? SEGMENTS[i + 1] : null; + const isAbove = seg.confidence >= config.confidence; + const silenceMs = isAbove ? config.minSilence : config.maxSilence; + const eotTime = seg.end + silenceMs / 1000; + if (nextSeg && eotTime >= nextSeg.start) { + turnsNotEnded.add(i); + } + } + + return ( +
+
+ + Turn Detection Visualizer + +
+ +
+ {Object.entries(PRESETS).map(([key, p]) => { + const isActive = preset === key; + return ( + + ); + })} +
+ +
+
+ {SEGMENTS.map((seg, i) => { + const left = toX(seg.start); + const width = toX(seg.end) - left; + const isAbove = seg.confidence >= config.confidence; + const bars = waveforms[i]; + const continued = turnsNotEnded.has(i); + return ( +
+
+ {bars.map((h, bi) => ( +
+ ))} +
+
+ {"conf: " + seg.confidence} +
+ {continued && ( +
+ )} +
+ ); + })} + + {SEGMENTS.map((seg, i) => { /* Silence span after each segment; nothing is rendered when the span would run past DURATION or reach the next segment. */ + const isAbove = seg.confidence >= config.confidence; + const silenceMs = isAbove ? config.minSilence : config.maxSilence; + const silenceEnd = seg.end + silenceMs / 1000; + const nextSeg = i < SEGMENTS.length - 1 ? SEGMENTS[i + 1] : null; + if (silenceEnd > DURATION) return null; + if (nextSeg && silenceEnd >= nextSeg.start) return null; + const startX = toX(seg.end); + const endX = toX(silenceEnd); + const color = isAbove ? "#22c55e" : "#f59e0b"; + return ( +
+
+
+ ); + })} + + {eotMarkers.map((m, i) => { /* One marker per computed EoT; color and label are chosen by the marker type. */ + const x = toX(m.time); + const color = m.type === "semantic" ? "#22c55e" : "#f59e0b"; + const label = m.type === "semantic" ? "min_silence" : "max_silence"; + return ( +
+
+
+ {label} +
+ {"EoT"} +
+
+ ); + })} + + {turnsNotEnded.size > 0 && Array.from(turnsNotEnded).map((idx) => { + const seg = SEGMENTS[idx]; + const nextSeg = SEGMENTS[idx + 1]; /* idx + 1 is in range: only segments that have a following segment are ever added to turnsNotEnded. */ + const midX = toX((seg.end + nextSeg.start) / 2); + return ( +
+ turn continues +
+ ); + })} +
+
+ +
+ + + EoT (min_silence_when_confident) + + + + EoT (max_turn_silence) + + {turnsNotEnded.size > 0 && ( + + + Turn continues + + )} +
+ +
+ end_of_turn_confidence_threshold + {config.confidence} + min_end_of_turn_silence_when_confident + {config.minSilence} ms + max_turn_silence + {config.maxSilence} ms +
+ +

+ When confidence {"\u2265"} threshold, EoT triggers after min_end_of_turn_silence_when_confident.{" "} + When confidence {"<"} threshold, EoT triggers after max_turn_silence.{" "} + If the silence period extends into the next speech segment, the turn continues. +

+
+ ); +} diff --git a/fern/pages/02-speech-to-text/universal-streaming/turn-detection.mdx b/fern/pages/02-speech-to-text/universal-streaming/turn-detection.mdx index 0163173b..5d4571f0 100644 --- a/fern/pages/02-speech-to-text/universal-streaming/turn-detection.mdx +++ b/fern/pages/02-speech-to-text/universal-streaming/turn-detection.mdx @@ -3,6 +3,8 @@ title: "Turn detection" description: "Intelligent turn detection with Streaming Speech-to-Text" --- +import { TurnDetectionVisualizer } from "../../../assets/components/TurnDetectionVisualizer"; + ### Overview AssemblyAI's turn detection model uses a neural network to detect when someone has finished speaking. Unlike traditional voice activity detection that only listens for silence, our model understands the meaning and flow of speech to make better decisions about when a turn has ended. @@ -74,6 +76,8 @@ These configurations are just starting points and can be fine-tuned based on you ### How it works + + The turn detection model uses a neural network to detect when someone has finished speaking. It has two ways to detect end-of-turn: