diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt index 2e5092801..283af1481 100644 --- a/.cspell-wordlist.txt +++ b/.cspell-wordlist.txt @@ -104,3 +104,5 @@ POTTEDPLANT TVMONITOR sublist TTFT +timestamping +logprob diff --git a/apps/llm/app/voice_chat/index.tsx b/apps/llm/app/voice_chat/index.tsx index 79a713c93..27b4ad35d 100644 --- a/apps/llm/app/voice_chat/index.tsx +++ b/apps/llm/app/voice_chat/index.tsx @@ -1,4 +1,4 @@ -import { useContext, useEffect, useRef, useState } from 'react'; +import { useContext, useEffect, useState } from 'react'; import { Keyboard, KeyboardAvoidingView, @@ -35,6 +35,8 @@ export default function VoiceChatScreenWrapper() { function VoiceChatScreen() { const [isRecording, setIsRecording] = useState(false); + const [liveTranscription, setLiveTranscription] = useState(''); + const [recorder] = useState( () => new AudioRecorder({ @@ -42,7 +44,7 @@ function VoiceChatScreen() { bufferLengthInSamples: 1600, }) ); - const messageRecorded = useRef(false); + const { setGlobalGenerating } = useContext(GeneratingContext); const llm = useLLM({ model: QWEN3_0_6B_QUANTIZED }); @@ -67,16 +69,32 @@ function VoiceChatScreen() { if (isRecording) { setIsRecording(false); recorder.stop(); - messageRecorded.current = true; speechToText.streamStop(); } else { setIsRecording(true); + setLiveTranscription(''); + recorder.onAudioReady(({ buffer }) => { speechToText.streamInsert(buffer.getChannelData(0)); }); recorder.start(); - const transcription = await speechToText.stream(); - await llm.sendMessage(transcription); + + let finalResult = ''; + + try { + for await (const result of speechToText.stream()) { + const text = result.committed.text + result.nonCommitted.text; + setLiveTranscription(text); + finalResult = text; + } + } catch (e) { + console.error('Streaming error:', e); + } finally { + if (finalResult.trim().length > 0) { + await llm.sendMessage(finalResult); + setLiveTranscription(''); + } + } } }; @@ -96,16 +114,17 @@ function VoiceChatScreen() { Qwen 3 x Whisper - {llm.messageHistory.length || speechToText.committedTranscription ? ( + + {llm.messageHistory.length > 0 || liveTranscription.length > 0 ? ( 0 ? [ ...llm.messageHistory, { role: 'user', - content: speechToText.committedTranscription, + content: liveTranscription, }, ] : llm.messageHistory @@ -123,6 +142,7 @@ function VoiceChatScreen() { )} + {DeviceInfo.isEmulatorSync() ? ( diff --git a/apps/speech/app.json b/apps/speech/app.json index 693c815cb..1e6e36464 100644 --- a/apps/speech/app.json +++ b/apps/speech/app.json @@ -17,6 +17,9 @@ "bundleIdentifier": "com.anonymous.speech", "infoPlist": { "NSMicrophoneUsageDescription": "This app needs access to your microphone to record audio." 
+ }, + "entitlements": { + "com.apple.developer.kernel.increased-memory-limit": true } }, "android": { @@ -24,11 +27,34 @@ "foregroundImage": "./assets/adaptive-icon.png", "backgroundColor": "#ffffff" }, - "package": "com.anonymous.speech" + "package": "com.anonymous.speech", + "permissions": [ + "android.permission.RECORD_AUDIO", + "android.permission.MODIFY_AUDIO_SETTINGS", + "android.permission.FOREGROUND_SERVICE", + "android.permission.FOREGROUND_SERVICE_MEDIA_PLAYBACK" + ] }, "web": { "favicon": "./assets/favicon.png" }, - "plugins": ["expo-font"] + "plugins": [ + "expo-font", + [ + "react-native-audio-api", + { + "iosBackgroundMode": true, + "iosMicrophonePermission": "This app requires access to the microphone to record audio.", + "androidPermissions": [ + "android.permission.MODIFY_AUDIO_SETTINGS", + "android.permission.FOREGROUND_SERVICE", + "android.permission.FOREGROUND_SERVICE_MEDIA_PLAYBACK", + "android.permission.RECORD_AUDIO" + ], + "androidForegroundService": true, + "androidFSTypes": ["mediaPlayback", "microphone"] + } + ] + ] } } diff --git a/apps/speech/components/VerboseTranscription.tsx b/apps/speech/components/VerboseTranscription.tsx new file mode 100644 index 000000000..1093b2bd1 --- /dev/null +++ b/apps/speech/components/VerboseTranscription.tsx @@ -0,0 +1,241 @@ +import React from 'react'; +import { View, Text, StyleSheet } from 'react-native'; +import { TranscriptionResult } from 'react-native-executorch'; + +export const VerboseTranscription = ({ + data, +}: { + data: TranscriptionResult; +}) => { + if (!data) return null; + + const hasSegments = Array.isArray(data.segments) && data.segments.length > 0; + + const hasLanguage = + !!data.language && data.language !== 'N/A' && data.language.trim() !== ''; + + const hasDuration = typeof data.duration === 'number' && data.duration > 0; + + const hasMetadata = hasLanguage || hasDuration; + + return ( + + + Full Text: + {data.text || ''} + + {hasMetadata && ( + + {hasLanguage && ( + Language: {data.language} + )} + {hasDuration && ( + + Duration: {data.duration?.toFixed(2)}s + + )} + + )} + + + {hasSegments && ( + <> + + Segments ({data.segments?.length}) + + + {data.segments?.map((seg, index) => ( + + + + {seg.start.toFixed(2)}s - {seg.end.toFixed(2)}s + + ID: {index} + + + "{seg.text}" + + {seg.words && seg.words.length > 0 && ( + + Word Timestamps: + + {seg.words.map((w, wIdx) => ( + + {w.word.trim()} + + {w.start.toFixed(2)}s + + + ))} + + + )} + + + + Avg LogProb + + {data.task === 'transcribe' + ? seg.avgLogprob?.toFixed(4) + : 'N/A'} + + + + Temp + + {data.task === 'transcribe' + ? seg.temperature?.toFixed(2) + : 'N/A'} + + + + {/*eslint-disable-next-line @cspell/spellchecker*/} + Compr. + + {data.task === 'transcribe' + ? 
seg.compressionRatio?.toFixed(2) + : 'N/A'} + + + + + ))} + + )} + + ); +}; + +const styles = StyleSheet.create({ + container: { + padding: 4, + }, + metaContainer: { + marginBottom: 16, + padding: 12, + backgroundColor: '#f0f2f5', + borderRadius: 8, + }, + label: { + fontWeight: 'bold', + color: '#0f186e', + marginBottom: 4, + }, + text: { + fontSize: 16, + color: '#333', + marginBottom: 8, + }, + row: { + flexDirection: 'row', + gap: 10, + marginTop: 8, + }, + metaItem: { + fontSize: 12, + color: '#666', + backgroundColor: '#e1e4e8', + paddingHorizontal: 8, + paddingVertical: 2, + borderRadius: 4, + overflow: 'hidden', + }, + sectionHeader: { + fontSize: 18, + fontWeight: 'bold', + color: '#0f186e', + marginBottom: 8, + marginTop: 8, + }, + segmentCard: { + backgroundColor: '#fff', + borderRadius: 8, + borderWidth: 1, + borderColor: '#e1e4e8', + marginBottom: 12, + padding: 12, + shadowColor: '#000', + shadowOffset: { width: 0, height: 1 }, + shadowOpacity: 0.1, + shadowRadius: 2, + elevation: 2, + }, + segmentHeader: { + flexDirection: 'row', + justifyContent: 'space-between', + marginBottom: 8, + }, + timeBadge: { + fontSize: 12, + fontWeight: 'bold', + color: '#fff', + backgroundColor: '#0f186e', + paddingHorizontal: 8, + paddingVertical: 2, + borderRadius: 12, + overflow: 'hidden', + }, + segmentId: { + fontSize: 12, + color: '#888', + }, + segmentText: { + fontSize: 15, + fontStyle: 'italic', + color: '#333', + marginBottom: 12, + }, + statsGrid: { + flexDirection: 'row', + flexWrap: 'wrap', + gap: 8, + borderTopWidth: 1, + borderTopColor: '#f0f0f0', + paddingTop: 8, + }, + statItem: { + flex: 1, + minWidth: '45%', + flexDirection: 'row', + justifyContent: 'space-between', + }, + statLabel: { + fontSize: 11, + color: '#888', + }, + statValue: { + fontSize: 11, + fontWeight: '600', + color: '#444', + }, + wordsContainer: { + marginVertical: 8, + backgroundColor: '#f8f9fa', + padding: 8, + borderRadius: 6, + }, + wordsGrid: { + flexDirection: 'row', + flexWrap: 'wrap', + gap: 6, + marginTop: 4, + }, + wordChip: { + backgroundColor: '#ffffff', + borderWidth: 1, + borderColor: '#e1e4e8', + borderRadius: 4, + paddingHorizontal: 6, + paddingVertical: 2, + alignItems: 'center', + }, + wordText: { + fontSize: 12, + color: '#333', + }, + wordTime: { + fontSize: 9, + color: '#888', + marginTop: 1, + }, +}); diff --git a/apps/speech/package.json b/apps/speech/package.json index 094fa2b78..1c0607be2 100644 --- a/apps/speech/package.json +++ b/apps/speech/package.json @@ -19,7 +19,7 @@ "metro-config": "^0.81.0", "react": "19.1.0", "react-native": "0.81.5", - "react-native-audio-api": "0.6.5", + "react-native-audio-api": "0.11.3", "react-native-device-info": "^14.0.4", "react-native-executorch": "workspace:*", "react-native-reanimated": "~4.1.1", diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index da7ed0f7e..06813dfcd 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -8,9 +8,14 @@ import { TextInput, KeyboardAvoidingView, Platform, + Switch, } from 'react-native'; import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; -import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch'; +import { + useSpeechToText, + WHISPER_TINY_EN, + TranscriptionResult, +} from 'react-native-executorch'; import FontAwesome from '@expo/vector-icons/FontAwesome'; import { AudioManager, @@ -21,6 +26,8 @@ import * as FileSystem from 'expo-file-system/legacy'; import 
SWMIcon from '../assets/swm_icon.svg'; import DeviceInfo from 'react-native-device-info'; +import { VerboseTranscription } from '../components/VerboseTranscription'; + const isSimulator = DeviceInfo.isEmulatorSync(); export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { @@ -28,26 +35,34 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { model: WHISPER_TINY_EN, }); - const [transcription, setTranscription] = useState(''); + const [transcription, setTranscription] = + useState(null); + + const [liveResult, setLiveResult] = useState<{ + fullText: string; + segments: any[]; + } | null>(null); + + const [enableTimestamps, setEnableTimestamps] = useState(false); const [audioURL, setAudioURL] = useState(''); + + const isRecordingRef = useRef(false); const [liveTranscribing, setLiveTranscribing] = useState(false); const scrollViewRef = useRef(null); - const [recorder] = useState( - () => - new AudioRecorder({ - sampleRate: 16000, - bufferLengthInSamples: 1600, - }) - ); + const recorder = new AudioRecorder(); useEffect(() => { AudioManager.setAudioSessionOptions({ iosCategory: 'playAndRecord', iosMode: 'spokenAudio', - iosOptions: ['allowBluetooth', 'defaultToSpeaker'], + iosOptions: ['allowBluetoothHFP', 'defaultToSpeaker'], }); - AudioManager.requestRecordingPermissions(); + const checkPerms = async () => { + const granted = await AudioManager.requestRecordingPermissions(); + if (!granted) console.warn('Microphone permission denied!'); + }; + checkPerms(); }, []); async function getAudioFile(sourceUri: string) { @@ -72,40 +87,110 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { } const uri = await getAudioFile(audioURL); + // Reset previous states + setTranscription(null); + setLiveResult(null); const audioContext = new AudioContext({ sampleRate: 16000 }); try { - const decodedAudioData = await audioContext.decodeAudioDataSource(uri); + const decodedAudioData = await audioContext.decodeAudioData(uri); const audioBuffer = decodedAudioData.getChannelData(0); - setTranscription(await model.transcribe(audioBuffer)); + const result = await model.transcribe(audioBuffer, { + verbose: enableTimestamps, + }); + setTranscription(result); } catch (error) { console.error('Error decoding audio data', error); - console.warn('Note: Supported file formats: mp3, wav, flac'); return; } }; const handleStartTranscribeFromMicrophone = async () => { + isRecordingRef.current = true; setLiveTranscribing(true); - setTranscription(''); - recorder.onAudioReady(({ buffer }) => { - model.streamInsert(buffer.getChannelData(0)); - }); - recorder.start(); + + setTranscription(null); + setLiveResult({ fullText: '', segments: [] }); + + const sampleRate = 16000; + + recorder.onAudioReady( + { + sampleRate, + bufferLength: 0.1 * sampleRate, + channelCount: 1, + }, + ({ buffer }) => { + model.streamInsert(buffer.getChannelData(0)); + } + ); + + try { + const success = await AudioManager.setAudioSessionActivity(true); + if (!success) { + console.warn('Cannot start audio session correctly'); + } + const result = recorder.start(); + if (result.status === 'error') { + console.warn('Recording problems: ', result.message); + } + } catch (e) { + console.error('Failed to start recorder', e); + isRecordingRef.current = false; + setLiveTranscribing(false); + return; + } + + let accumulatedText = ''; + let accumulatedSegments: any[] = []; try { - await model.stream(); + const streamIter = model.stream({ + verbose: enableTimestamps, + }); + + for await (const { 
committed, nonCommitted } of streamIter) { + if (!isRecordingRef.current) break; + + if (committed.text) { + accumulatedText += committed.text; + } + if (committed.segments) { + accumulatedSegments = [...accumulatedSegments, ...committed.segments]; + } + + const currentDisplay = { + fullText: accumulatedText + nonCommitted.text, + segments: [...accumulatedSegments, ...(nonCommitted.segments || [])], + }; + + setLiveResult(currentDisplay); + } } catch (error) { console.error('Error during live transcription:', error); + } finally { + setLiveTranscribing(false); } }; const handleStopTranscribeFromMicrophone = () => { + isRecordingRef.current = false; + recorder.stop(); model.streamStop(); console.log('Live transcription stopped'); setLiveTranscribing(false); + + if (liveResult) { + setTranscription({ + text: liveResult.fullText, + segments: liveResult.segments, + language: 'en', + duration: 0, + }); + setLiveResult(null); + } }; const getModelStatus = () => { @@ -118,6 +203,20 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const readyToTranscribe = !model.isGenerating && model.isReady; const recordingButtonDisabled = isSimulator || !readyToTranscribe; + const getDisplayData = (): TranscriptionResult | null => { + if (liveTranscribing && liveResult) { + return { + text: liveResult.fullText, + segments: liveResult.segments, + language: 'en', + duration: 0, + }; + } + return transcription; + }; + + const displayData = getDisplayData(); + return ( @@ -138,6 +237,21 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { Status: {getModelStatus()} + + Enable Timestamps (Verbose) + { + setEnableTimestamps(val); + setTranscription(null); + setLiveResult(null); + }} + trackColor={{ false: '#767577', true: '#0f186e' }} + thumbColor={enableTimestamps ? '#fff' : '#f4f3f4'} + disabled={model.isGenerating} + /> + + Transcription void }) => { scrollViewRef.current?.scrollToEnd({ animated: true }) } > - - {transcription !== '' - ? transcription - : model.committedTranscription + - model.nonCommittedTranscription} - + {displayData ? ( + + ) : ( + + {liveTranscribing + ? 'Listening...' 
+                : 'No transcription yet...'}
+
+          )}
+
@@ -241,6 +358,17 @@ const styles = StyleSheet.create({
     marginTop: 12,
     alignItems: 'center',
   },
+  toggleContainer: {
+    flexDirection: 'row',
+    alignItems: 'center',
+    marginTop: 10,
+    marginBottom: 5,
+  },
+  toggleLabel: {
+    fontSize: 16,
+    marginRight: 10,
+    color: '#0f186e',
+  },
   transcriptionContainer: {
     flex: 1,
     width: '100%',
@@ -256,6 +384,11 @@ const styles = StyleSheet.create({
     borderWidth: 1,
     borderColor: '#0f186e',
     padding: 12,
+    maxHeight: 400,
+  },
+  placeholderText: {
+    color: '#aaa',
+    fontStyle: 'italic',
   },
   inputContainer: {
     marginBottom: 12,
diff --git a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md
index 5b0545cf2..ce1aa3f06 100644
--- a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md
+++ b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md
@@ -47,12 +47,12 @@ const { uri } = await FileSystem.downloadAsync(
 );
 
 const audioContext = new AudioContext({ sampleRate: 16000 });
-const decodedAudioData = await audioContext.decodeAudioDataSource(uri);
+const decodedAudioData = await audioContext.decodeAudioData(uri);
 const audioBuffer = decodedAudioData.getChannelData(0);
 
 try {
   const transcription = await model.transcribe(audioBuffer);
-  console.log(transcription);
+  console.log(transcription.text);
 } catch (error) {
   console.error('Error during audio transcription', error);
 }
@@ -101,12 +101,52 @@ const model = useSpeechToText({
 const transcription = await model.transcribe(spanishAudio, { language: 'es' });
 ```
 
+### Timestamps & Transcription Stat Data
+
+You can obtain word-level timestamps and additional transcription metadata from the [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe) and [`stream`](../../06-api-reference/interfaces/SpeechToTextType.md#stream) methods by setting `verbose: true` in the options. The result mimics the _verbose_json_ response format of the OpenAI Whisper API. For more information, see the [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe), [`stream`](../../06-api-reference/interfaces/SpeechToTextType.md#stream), and [`TranscriptionResult`](../../06-api-reference/interfaces/TranscriptionResult.md) API references.
+
+```typescript
+const transcription = await model.transcribe(audioBuffer, { verbose: true });
+// Example result
+//
+// transcription: {
+//   task: "transcribe",
+//   text: "Example text for a ...",
+//   duration: 9.05,
+//   language: "en",
+//   segments: [
+//     {
+//       start: 0,
+//       end: 5.4,
+//       text: "Example text for",
+//       words: [
+//         {
+//           word: "Example",
+//           start: 0,
+//           end: 1.4
+//         },
+//         ...
+//       ],
+//       tokens: [1, 32, 45, ...],
+//       temperature: 0.0,
+//       avgLogprob: -1.235,
+//       compressionRatio: 1.632
+//     },
+//     ...
+//   ]
+// }
+```
+
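+The same `verbose` option can be passed to [`stream`](../../06-api-reference/interfaces/SpeechToTextType.md#stream). A minimal sketch of consuming verbose streaming results, assuming audio chunks are already being fed through `streamInsert` (e.g. from a microphone recorder):
+
+```typescript
+for await (const { committed, nonCommitted } of model.stream({ verbose: true })) {
+  // Committed segments are final and carry timestamps plus per-segment stats.
+  for (const segment of committed.segments ?? []) {
+    console.log(`[${segment.start.toFixed(2)}s - ${segment.end.toFixed(2)}s] ${segment.text}`);
+  }
+  // Non-committed text is a preview that may still change.
+  console.log('pending:', nonCommitted.text);
+}
+```
+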
 ## Example
 
 ```tsx
 import React, { useState } from 'react';
-import { Button, Text } from 'react-native';
-import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch';
+import { Button, Text, View } from 'react-native';
+import {
+  useSpeechToText,
+  WHISPER_TINY_EN,
+  TranscriptionResult,
+} from 'react-native-executorch';
 import { AudioContext } from 'react-native-audio-api';
 import * as FileSystem from 'expo-file-system';
@@ -115,7 +155,7 @@ function App() {
     model: WHISPER_TINY_EN,
   });
 
-  const [transcription, setTranscription] = useState('');
+  const [transcription, setTranscription] = useState<TranscriptionResult | null>(null);
 
   const loadAudio = async () => {
     const { uri } = await FileSystem.downloadAsync(
@@ -132,14 +172,45 @@ const handleTranscribe = async () => {
     const audio = await loadAudio();
-    await model.transcribe(audio);
+    // Default text transcription
+    const result = await model.transcribe(audio);
+    setTranscription(result);
+  };
+
+  const handleTranscribeWithTimestamps = async () => {
+    const audio = await loadAudio();
+    // Transcription with timestamps
+    const result = await model.transcribe(audio, { verbose: true });
+    setTranscription(result);
+  };
+
+  // Custom logic for rendering the transcription, e.g.:
+  const renderContent = () => {
+    if (!transcription) return <Text>Press a button to transcribe</Text>;
+
+    if (transcription.segments && transcription.segments.length > 0) {
+      return (
+        <Text>
+          {transcription.text +
+            '\n\nNum segments: ' +
+            transcription.segments.length.toString()}
+        </Text>
+      );
+    }
+    return <Text>{transcription.text}</Text>;
   };
 
   return (
-    <>
-      {transcription}
-